Esempio n. 1
0
 def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
     '''
     Generate PPI datasets from pairwise data.

     Saves one fasta file per hub of self.dict['PPI'] as YeastPPI/<hub>.fas, containing the self.dict['SeqDict']
     entries of that hub's spokes. A hub is skipped if it has fewer than 3 spokes, or fewer than 3 spokes that
     are found in SeqDict. Any error is logged and then re-raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self, 'YeastPPI/')
         seqdict = self.dict['SeqDict']
         ppi = self.dict['PPI']
         ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         htot = len(ppi)
         fx = 0  # Number of fasta files saved so far
         hx = 0.0  # Progress counter: grows by 100 per hub so hx/htot is a percentage
         for hub in rje.sortKeys(ppi):
             progress = 'Generating %s PPI fasta files: %.2f' % (rje.integerString(fx), hx / htot)
             self.progLog('\r#FAS', progress)
             hx += 100.0
             spokes = ppi[hub]
             if len(spokes) < 3: continue
             seqs = [seqdict[spoke] for spoke in spokes if spoke in seqdict]
             if len(seqs) < 3: continue
             fasfile = rje.makePath('YeastPPI/%s.fas' % hub, wholepath=True)
             self.obj['SeqList'].saveFasta(seqs, fasfile, log=False)
             fx += 1
         summary = 'Generation of %s PPI fasta files from %s hubs complete.' % (
             rje.integerString(fx), rje.integerString(htot))
         self.printLog('\r#FAS', summary)
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 2
0
 def startElement(self, tag, attributes):  ### Called when a new element begins
     '''
     Called when a new element begins.
     >> tag:str = Name of the element being opened
     >> attributes:obj = Element attributes; must provide getQNames() and getValueByQName()

     Creates (or, for the very first element, reuses self.xml as) the XML object for this element, pushes it onto
     self.parsing and records its tag path and attribute names in self.schemalist / self.schatts.
     '''
     ### ~ [1] Generate XML object for element ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     counts = (self.xml.info['Name'], rje.integerString(self.e),
               rje.integerString(self.x), rje.integerString(self.r))
     self.printLog('\r#PARSE', 'Parsing %s: %s elements (%s level 1; %s retained)' % counts, False, False)
     self.e += 1
     if self.parsing:
         myxml = XML(log=self.xml.log, cmd_list=self.xml.cmd_list)
         parent = self.parsing[-1]
         myxml.obj['Parent'] = parent
         myxml.list['ParentTags'] = parent.list['ParentTags'] + [tag]
         if parent == self.xml: self.x += 1  # Direct child of the main XML object
     else:
         myxml = self.xml  # Very first Element goes in main XML Object
         myxml.list['ParentTags'] = [tag]
     wanted = self.xml.list['Elements']
     if not wanted or tag in wanted:  # An empty Elements list means keep every tag
         if myxml.obj['Parent']: self.parsing[-1].list['XML'].append(myxml)
         tagpath = myxml.list['ParentTags']
         if tagpath not in self.schemalist: self.schemalist.append(tagpath)
     myxml.info['Name'] = tag
     self.parsing.append(myxml)
     myxml.stat['Level'] = len(self.parsing)
     ### ~ [2] Update Attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     schjoin = ':'.join(myxml.list['ParentTags'])
     if schjoin not in self.schatts: self.schatts[schjoin] = []
     wantatt = self.xml.list['Attributes']
     for q in attributes.getQNames():
         if wantatt and q not in wantatt: continue  # Only add if wanted
         myxml.dict['Attributes'][q] = attributes.getValueByQName(q)
         if q not in self.schatts[schjoin]: self.schatts[schjoin].append(q)
Esempio n. 3
0
 def dpi(self):  ### Domain-protein interactions
     '''
     Domain-protein interactions.

     For each domain in self.dict['Domain'], pools the PPI of every hub containing that domain and keeps the
     interactors that occur 2+ times in the pool. Those interactors are removed from the PPI lists of the
     domain's hubs (plus the reciprocal PPI where present) and their accession numbers are written to
     SLiMPID_DPI/<domain>.dpi.acc. Hubs left without PPI are then dropped from self.dict['PPI'].
     Returns None; does nothing if self.dict['Domain'] is empty. Errors are passed to self.errorLog().
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self, outdir)
         dpi = {}  # Dictionary of {domain:[interactors]}
         badname = []  # Interactor names not found in self.dict['Seq']
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             ## ~ [2a] Pool interactors of all hubs with this domain ~~~~~~~~~~~~~~~~~~~ ##
             pool = []
             for hub in self.dict['Domain'][dom]:
                 if hub in self.dict['PPI']:
                     pool += self.dict['PPI'][hub]  # Add with redundancy
             counts = {}  # Single counting pass instead of list.count() per interactor
             for spoke in pool:
                 counts[spoke] = counts.get(spoke, 0) + 1
             dpi[dom] = [spoke for spoke in pool if counts[spoke] > 1]  # Must have 2+ domain interactions
             ## ~ [2b] Remove DPI (and reciprocal PPI) from PPI data ~~~~~~~~~~~~~~~~~~~ ##
             dpiset = set(dpi[dom])
             for hub in self.dict['Domain'][dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:  # Iterate over a copy: list is edited in loop
                     if spoke not in dpiset: continue
                     self.dict['PPI'][hub].remove(spoke)
                     if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]:
                         self.dict['PPI'][spoke].remove(hub)
             ## ~ [2c] Output accession numbers of interactors ~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             dpi[dom] = rje.sortUnique(dpi[dom], False, False)
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']:
                     acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname:
                     badname.append(name)
             ACCFILE = open('%s/%s.dpi.acc' % (outdir, dom), 'w')
             try:
                 ACCFILE.write(string.join(acc, '\n'))
             finally:
                 ACCFILE.close()  # Always release the handle, even if the write fails
             self.printLog('#DPI',
                           '%s domain => %d interactors' % (dom, len(acc)))
         if badname:
             badname.sort()
             self.printLog(
                 '#BAD', '%d "bad" protein names: %s' %
                 (len(badname), string.join(badname, '; ')))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])  # Hub count before empty hubs are dropped
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DPI',
                           'No %s PPI left after DPI removed' % hub,
                           screen=False)
         self.printLog(
             '#PPX', '%s of %s PPI hubs remain after DPI removed' %
             (rje.integerString(len(
                 self.dict['PPI'])), rje.integerString(hx)))
     except:
         self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
Esempio n. 4
0
 def loadFeatures(self, ftfile):  ### Loads features from given file
     '''
     Loads features from given file.
     >> ftfile:str = Delimited features file; '' or 'none' means there is nothing to load

     The file needs 'feature', 'start'/'end' (or 'ft_start'/'ft_end') and 'description' fields (matched
     case-insensitively); its first field is used as the ID. Each feature is appended to
     self.dict['Features'][ID] as a {'Type','Start','End','Desc'} dictionary.
     Returns None, or the value of self.printLog() when the file or a required field is missing.
     Errors are passed to self.errorLog().
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['', 'none']: return
         if not os.path.exists(ftfile):
             return self.printLog('#ERR',
                                  'Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile, 'r')
         try:
             headline = FTFILE.readline()
         finally:
             FTFILE.close()  # Only the header line is needed here
         headers = rje.readDelimit(headline, delimit)
         mainkeys = [headers[0]]
         hmap = {}  # Maps lower-case header to the header as it appears in the file
         for h in headers:
             hmap[h.lower()] = h
         pos = ''  # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in [
                 'feature',
                 '%sstart' % pos,
                 '%send' % pos, 'description'
         ]:
             if h not in hmap:
                 return self.printLog(
                     '#ERR', 'No %s field detected in "%s" features file' %
                     (h, ftfile))
             mainkeys.append(hmap[h])
         mainkeys.remove(hmap['description'])  # Description is required but is data, not a key
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ftdata = rje.dataDict(self,
                               ftfile,
                               mainkeys, ['description'],
                               delimit,
                               headers,
                               lists=True)
         (mx, mtot, fx) = (0.0, len(ftdata), 0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog(
                 '\r#FT',
                 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
             mx += 100.0
             (ftid, ft, start, end) = string.split(mainkey, delimit)
             if ftid == mainkeys[0]: continue  # Skip the header row
             if ftid not in self.dict['Features']:
                 self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][hmap['description']]:
                 fx += 1
                 self.dict['Features'][ftid].append({
                     'Type': ft,
                     'Start': int(start),
                     'End': int(end),
                     'Desc': desc
                 })
         self.printLog(
             '\r#FT', 'Loaded %s features for %s IDs from %s' %
             (rje.integerString(fx),
              rje.integerString(len(self.dict['Features'])), ftfile))
     except:
         self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
Esempio n. 5
0
 def ddi(self):  ### Domain-domain interactions
     '''
     Domain-domain interactions.

     For every pair of interacting domains in self.dict['DDI'], removes from self.dict['PPI'] each hub-spoke
     pair where one protein has the first domain and the other has the second (checked in both directions).
     Hubs left with no PPI are then dropped from self.dict['PPI'].
     Returns None; does nothing if DDI or Domain data is empty. Errors are passed to self.errorLog().
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ddx = 0  # Number of PPI removed
         ddidict = self.dict['DDI']
         dx = 0.0
         dtot = len(ddidict)
         if not ddidict or not self.dict['Domain']: return
         domains = self.dict['Domain']
         ppi = self.dict['PPI']
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(ddidict):
             progress = 'Screening domain-domain interactions: %.1f%%; %s removed' % (
                 (dx / dtot), rje.integerString(ddx))
             self.progLog('\r#DDI', progress)
             dx += 100
             if dom not in domains:
                 self.printLog('#DOM', 'No sequences with "%s" domains' % dom)
                 continue
             for ddi in ddidict[dom]:
                 if ddi not in domains: continue
                 # Screen hubs with dom against spokes with ddi, then the reverse direction
                 for (hubdom, spokedom) in [(dom, ddi), (ddi, dom)]:
                     for hub in domains[hubdom]:
                         if hub not in ppi: continue
                         for spoke in ppi[hub][0:]:  # Copy: the list is edited while looping
                             if spoke in domains[spokedom]:
                                 ddx += 1
                                 ppi[hub].remove(spoke)
         self.printLog(
             '\r#DDI',
             'Screening domain-domain interactions complete: %s removed.' % (rje.integerString(ddx)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(ppi)  # Hub count before empty hubs are dropped
         for hub in rje.sortKeys(ppi):
             if not hub or not ppi[hub]:
                 ppi.pop(hub)
                 self.printLog('#DDI', 'No %s interactions left after DDI removed' % hub, screen=False)
         remaining = '%s of %s PPI hubs remain after DDI removed' % (
             rje.integerString(len(ppi)), rje.integerString(hx))
         self.printLog('#PPX', remaining)
     except:
         self.errorLog('Problem with SLiMPID.ddi()', quitchoice=True)
Esempio n. 6
0
 def loadPPI(self):  ### Load pairwise interaction data
     '''
     Load pairwise interaction data.

     Reads protein pairs from self.info['PPIFile'] (first two whitespace-separated fields of each line) and
     stores each interaction in both directions in self.dict['PPI'] as {protein:[interactors]}, without
     duplicates. Lines with fewer than two fields are skipped.
     << False if rje.checkForFile() fails for the PPI file; None otherwise. Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.checkForFile(self.info['PPIFile']): return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         PPIFILE = open(self.info['PPIFile'],'r')
         try: ppilines = PPIFILE.readlines()
         finally: PPIFILE.close()    # Release the handle as soon as the data is read
         for line in ppilines:
             fields = string.split(rje.chomp(line))
             if len(fields) < 2: continue    # Blank or single-field line: no pair to load
             (pa,pb) = fields[:2]
             for (hub,spoke) in [(pa,pb),(pb,pa)]:   # Store interaction in both directions
                 if hub not in self.dict['PPI']: self.dict['PPI'][hub] = []
                 if spoke not in self.dict['PPI'][hub]: self.dict['PPI'][hub].append(spoke)
             self.progLog('\r#PPI','Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI'])))
         self.printLog('\r#PPI','Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI'])))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Esempio n. 7
0
 def loadPillars(self):  ### Load YGOB Pillar data
     '''
     Load YGOB Pillar data.

     Reads self.info['Pillars'] line by line. Lines with fewer than 17 whitespace-separated fields are ignored.
     For the rest, the sixth field (ancestral gene) and all '---' placeholders are dropped and any remaining
     loci are appended to self.list['Pillars'] as one list per line.
     << False if rje.checkForFile() fails for the Pillars file; None otherwise. Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.checkForFile(self.info['Pillars']): return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for line in self.loadFromFile(filename=self.info['Pillars'],chomplines=True):
             fields = string.split(line)
             if len(fields) >= 17:
                 del fields[5]       # Remove ancestral gene
                 loci = [locus for locus in fields if locus != '---']
                 if loci: self.list['Pillars'].append(loci)
                 self.progLog('\r#YGOB','Loading Pillar data: %s loci' % rje.integerString(len(self.list['Pillars'])))
         self.printLog('\r#YGOB','Loaded Pillar data for %s loci' % rje.integerString(len(self.list['Pillars'])))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Esempio n. 8
0
 def loadAlias(self, sourcefile):  ### Loads Alias data
     '''
     Loads Alias data.
     >> sourcefile:str = Source filename; '' or 'none' (any case) means there is nothing to load

     Each comma-separated alias in the 'Aliases' field of the file is passed to self.addAlias() with its ID,
     and the resulting self.dict['Aliases'] list for that ID is sorted.
     Returns None, or the value of self.log.errorLog() if the file is missing. Errors are logged, not raised.
     '''
     try:  ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if sourcefile.lower() in ('', 'none'): return
         if not os.path.exists(sourcefile):
             missing = 'Alias file "%s" not found' % (sourcefile)
             return self.log.errorLog(missing, printerror=False)
         data = rje.dataDict(self, sourcefile, datakeys=['Aliases'], lists=True)
         ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         htot = len(data)
         hx = 0.0  # Progress counter: grows by 100 per ID so hx/htot is a percentage
         for entry in data:
             progress = 'Processing %s: %.1f%%' % (sourcefile, hx / htot)
             self.log.printLog('\r#ALIAS', progress, newline=False, log=False)
             hx += 100.0
             ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for alist in data[entry]['Aliases']:
                 for alias in alist.split(','):
                     self.addAlias(entry, alias)
             if entry in self.dict['Aliases']:
                 self.dict['Aliases'][entry].sort()
         summary = 'Processed %s: %s IDs with aliases' % (
             sourcefile, rje.integerString(len(self.dict['Aliases'])))
         self.log.printLog('\r#ALIAS', summary)
     except:
         self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 9
0
 def ddi(self):  ### Domain-domain interactions
     '''
     Domain-domain interactions.

     Screens self.dict['PPI'] against the interacting domain pairs in self.dict['DDI']: a spoke is removed from
     a hub's PPI list when the hub has one domain of a pair and the spoke has the other (both directions).
     Hubs left with no PPI are then dropped from self.dict['PPI'].
     Returns None; does nothing if DDI or Domain data is empty. Errors are passed to self.errorLog().
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         removed = 0
         ddidict = self.dict['DDI']
         (done,total) = (0.0,len(ddidict))
         if not ddidict or not self.dict['Domain']: return
         domdict = self.dict['Domain']
         ppi = self.dict['PPI']

         def screen(hubdom,spokedom):
             '''Remove PPI where the hub has hubdom and the spoke has spokedom. Returns number removed.'''
             rx = 0
             for hub in domdict[hubdom]:
                 if hub in ppi:
                     for spoke in list(ppi[hub]):    # Copy: the list is edited while looping
                         if spoke in domdict[spokedom]:
                             rx += 1
                             ppi[hub].remove(spoke)
             return rx

         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(ddidict):
             self.progLog('\r#DDI','Screening domain-domain interactions: %.1f%%; %s removed' % ((done/total),rje.integerString(removed)))
             done += 100
             if dom not in domdict:
                 self.printLog('#DOM','No sequences with "%s" domains' % dom)
                 continue
             for ddi in ddidict[dom]:
                 if ddi in domdict:
                     removed += screen(dom,ddi)
                     removed += screen(ddi,dom)
         self.printLog('\r#DDI','Screening domain-domain interactions complete: %s removed.' % (rje.integerString(removed)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(ppi)   # Hub count before empty hubs are dropped
         for hub in rje.sortKeys(ppi):
             if not (hub and ppi[hub]):
                 ppi.pop(hub)
                 self.printLog('#DDI','No %s interactions left after DDI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DDI removed' % (rje.integerString(len(ppi)),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.ddi()',quitchoice=True)
Esempio n. 10
0
 def loadFeatures(self,ftfile):  ### Loads features from given file
     '''
     Loads features from given file.
     >> ftfile:str = Delimited features file; '' or 'none' means there is nothing to load

     The file needs 'feature', 'start'/'end' (or 'ft_start'/'ft_end') and 'description' fields (matched
     case-insensitively); its first field is used as the ID. Each feature is appended to
     self.dict['Features'][ID] as a {'Type','Start','End','Desc'} dictionary.
     Returns None, or the value of self.printLog() when the file or a required field is missing.
     Errors are passed to self.errorLog().
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['','none']: return
         if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         FTFILE = open(ftfile,'r')
         try: headline = FTFILE.readline()
         finally: FTFILE.close()     # Only the header line is needed here
         headers = rje.readDelimit(headline,delimit)
         mainkeys = [headers[0]]
         hmap = {}   # Maps lower-case header to the header as it appears in the file
         for h in headers: hmap[h.lower()] = h
         pos = ''    # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
             if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
             mainkeys.append(hmap[h])
         mainkeys.remove(hmap['description'])    # Description is required but is data, not a key
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
         (mx,mtot,fx) = (0.0,len(ftdata),0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
             mx += 100.0
             (ftid,ft,start,end) = string.split(mainkey,delimit)
             if ftid == mainkeys[0]: continue    # Skip the header row
             if ftid not in self.dict['Features']: self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][hmap['description']]:
                 fx += 1
                 self.dict['Features'][ftid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
         self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
     except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
Esempio n. 11
0
 def seqBySeq(self):  ### Runs in SeqBySeq Mode                                                   #V1.0
     '''
     In SeqBySeq mode, the program assumes that seqin=FILE and basefile=X are given and farm states the program to
     be run. Seqin is worked through in turn and each sequence is farmed out to the farm program. Outputs given by
     OutList are then compiled, as is the Log, into the correct basefile=X given. For *.csv and *.tdt files, the
     header row is copied for the first file and then excluded for all subsequent files. For all other file
     extensions, the whole output is copied.
     << True if the jobs were run; False if an error was logged. SystemExit is re-raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.getStr('Farm').endswith('.py'):  # Strip .py extension from program name
             self.str['Farm'] = self.str['Farm'][:-3]
         seqcmd = self.cmd_list + ['autoload=T', 'accnr=F', 'seqnr=F']
         self.list['Seq'] = rje_seq.SeqList(self.log, seqcmd).seq[0:]
         # Drop leading sequences until the StartFrom sequence (if any) is first in the list
         while self.getStrLC('StartFrom') and self.list['Seq']:
             if self.list['Seq'][0].shortName() == self.getStr('StartFrom'):
                 self.str['StartFrom'] = ''
             else:
                 self.list['Seq'] = self.list['Seq'][1:]
         seqnum = rje.integerString(len(self.list['Seq']))
         self.printLog('#SEQ', '%s query sequences to farm out' % seqnum)
         self.list['Pickup'] = self.pickupList()
         ### ~ [2] ~ Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.runJobs()
         return True
     except SystemExit:
         raise  # Child
     except:
         self.errorLog('JobFarmer.seqBySeq error')
         return False
Esempio n. 12
0
 def ensLoci(self):  ### Reads from EnsLoci file if it exists and parses into dictionaries.
     '''
     Reads from EnsLoci file if it exists and parses into dictionaries.

     Resets self.dict['EnsLoci'], self.dict['EnsDesc'] and self.dict['UniEns'], then fills them from the '>'
     header lines of self.info['EnsLoci']. Headers must contain "gene:X]"; an optional "[acc:X" gives the
     UniEns key. If self.opt['FullEns'] is set, new genes are also added to self.list['Genes'] and
     self.dict['GeneCard']. Returns None.
     '''
     self.dict['EnsLoci'] = {}    # Dictionary of {EnsGene:shortName()}
     self.dict['EnsDesc'] = {}    # Dictionary of {EnsGene:Description}
     self.dict['UniEns'] = {}     # Dictionary of {UniProt?:EnsGene}
     if not os.path.exists(self.info['EnsLoci']): return
     elines = self.loadFromFile(self.info['EnsLoci'])
     (ex,etot) = (0.0,len(elines))
     for line in elines:     # Plain iteration: repeated pop(0) is quadratic on a long list
         ex += 100.0
         if line[:1] != '>': continue
         match = rje.matchExp(r'^>(\S+).+ gene:(\S+)\]',line)
         if not match:
             self.log.errorLog('Problem with EnsLoci line: %s' % line,printerror=False)
             continue
         (name,gene) = match
         acc = ''
         accmatch = rje.matchExp(r'\[acc:(\S+)',line)
         if accmatch: acc = accmatch[0]
         if acc: self.dict['UniEns'][acc] = gene
         self.dict['EnsLoci'][gene] = name
         header = string.split(line,' [acc:')[0][1:]     # Text before ' [acc:' without the leading '>'
         self.dict['EnsDesc'][gene] = string.join(string.split(header)[1:])    # Drop the name (first word)
         if self.opt['FullEns'] and gene not in self.list['Genes']:
             self.list['Genes'].append(gene)
         if self.opt['FullEns'] and gene not in self.dict['GeneCard']:
             self.dict['GeneCard'][gene] = {'EnsEMBL':gene,'Symbol':'!FAILED!'}
         self.log.printLog('\r#ENS','Parsing EnsLoci %.1f%%: %s genes' % (ex/etot,rje.integerString(len(self.dict['EnsLoci']))),newline=False,log=False)
     self.log.printLog('\r#ENS','Parsing EnsLoci complete: %s genes' % (rje.integerString(len(self.dict['EnsLoci']))))
Esempio n. 13
0
    def convert(self,filelist=[],outfile=None):      ### Converts scansite output files in FileList to Outfile
        '''
        Converts scansite output files in FileList to Outfile.
        >> filelist:list = Scansite output files to convert [self.list['FileList'] if empty]
        >> outfile:str = Delimited output file [self.info['Name'] if not given]
        << True if the conversion completed; False if there were no files to convert.

        Only input lines matching re_scansite are written and counted. Missing input files are reported and
        skipped. Unless self.opt['Append'] is set and outfile exists, outfile is created with a header row.
        Errors are logged with the current stage and re-raised.
        '''
        _stage = 'Setup'
        try:
            ### Setup ###
            if not filelist:    # Default list is never modified, only replaced
                filelist = self.list['FileList']
            if not outfile:
                outfile = self.info['Name']
            if not filelist:
                self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False)
                return False
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            if ext != outfile[-3:]:
                newfile = outfile[:-3] + ext
                if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)):
                    outfile = newfile
            self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile))

            ### Output File ###
            _stage = 'Output File'
            makehead = not self.opt['Append'] or not os.path.exists(outfile)   # Create with header
            if makehead: OUTFILE = open(outfile,'w')
            else: OUTFILE = open(outfile,'a')
            sx = 0  # Total results output
            try:
                if makehead:
                    headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa']
                    rje.writeDelimit(OUTFILE,headers,delimit)

                ### Conversion ###
                _stage = 'Conversion'
                for infile in filelist:
                    if not os.path.exists(infile):
                        self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False)
                        continue
                    fx = 0  # Results output from this file
                    INFILE = open(infile,'r')
                    try:
                        inline = rje.nextLine(INFILE)
                        while inline is not None:
                            scanlist = rje.matchExp(re_scansite,inline)
                            if scanlist:    # Non-matching lines must not re-output the previous result
                                rje.writeDelimit(OUTFILE,scanlist,delimit)
                                sx += 1
                                fx += 1
                                rje.progressPrint(self,sx)
                            inline = rje.nextLine(INFILE)
                    finally:
                        INFILE.close()
                    self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx)))
            finally:
                OUTFILE.close()     # Close output even if conversion fails part-way

            ### End ###
            _stage = 'End'
            self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile))
            return True
        except:
            self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False)
            raise
Esempio n. 14
0
    def loadMutations(self):    ### Inputs parsed mutations back into dictionaries
        '''
        Inputs parsed mutations back into dictionaries.

        Resets self.dict['Records'] ({gene:[OMIM IDs]}) and self.dict['Mutations']
        ({gene:{SubID:(disease,mutation)}}) and refills them from omim_mutations.tdt in the working directory.
        << True if the file was loaded; False if it does not exist or an error was logged.
        '''
        try:### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            records = self.dict['Records'] = {}
            mutations = self.dict['Mutations'] = {}
            headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            infile = 'omim_mutations.tdt'
            if not os.path.exists(infile): return False
            datadict = rje.dataDict(self,infile,headers[:2],headers,'\t')
            mx = len(datadict)  # Count before entries are popped off below

            ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for dkey in list(datadict.keys()):     # Key copy: entries are popped while looping
                data = datadict.pop(dkey)
                (record,subid,gene) = (data['OMIM_ID'],data['SubID'],data['Gene'])
                mutation = '%s%s%s' % (data['WildAA'],data['Pos'],data['MutAA'])
                disease = data['Disease']
                generecs = records.setdefault(gene,[])
                if record not in generecs: generecs.append(record)
                mutations.setdefault(gene,{})[subid] = (disease,mutation)
            summary = 'Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx),rje.integerString(len(self.dict['Records'])))
            self.log.printLog('\r#OMIM',summary)
            return True
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            return False
Esempio n. 15
0
 def outputCards(self):  ### Outputs cards to delimited file
     '''
     Outputs cards to delimited file.

     Appends one row per self.dict['GeneCard'] entry to self.info['CardOut'], using self.list['Headers'].
     With self.opt['Restrict'], only genes in self.list['Genes'] are output. With self.opt['Purify'], entries
     whose 'Symbol' is neither the gene itself nor '!FAILED!' are skipped; with both options, such genes are
     first replaced by their symbol in self.list['Genes'] (the list is edited in place).
     '''
     ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     genelist = self.list['Genes']
     if self.opt['Purify'] and self.opt['Restrict']:
         for gene in genelist[0:]:   # Copy: the list is edited while looping
             symbol = self.dict['GeneCard'][gene]['Symbol']
             if symbol not in [gene,'!FAILED!']:  # Replace with symbol
                 genelist.remove(gene)
                 if symbol not in genelist: genelist.append(symbol)
     delimit = rje.delimitFromExt(filename=self.info['CardOut'])
     (noens,noloci,ox) = (0,0,0)
     CARDOUT = open(self.info['CardOut'],'a')
     ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     try:
         for gene in rje.sortKeys(self.dict['GeneCard']):
             card = self.dict['GeneCard'][gene]
             if self.opt['Restrict'] and gene not in genelist: continue
             elif self.opt['Purify'] and card['Symbol'] not in [gene,'!FAILED!']: continue
             self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1
             card['Alias'] = gene
             card['Species'] = self.info['Species']
             rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,card)
             if card['Symbol'] == gene:   # Not an alias
                 if 'EnsEMBL' not in card or not card['EnsEMBL']: noens += 1
                 if 'EnsLoci' not in card or not card['EnsLoci']: noloci += 1
     finally:
         CARDOUT.close()     # Close output even if a card fails to output
     # NOTE(review): the count reported is len(self.list['Genes']), not the number of rows output (ox) - confirm intended
     self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut']))
     self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
0
 def run(self,batch=False):  ### Main run method
     '''
     Main run method.
     >> batch:bool [False] = Whether this is a nested call from a batch or all-by-all run. Nested calls skip
        self.setupResults() and the search for batch files.

     If not nested and no sequences are loaded, each file from self.list['Batch'] is loaded and run in turn.
     If self.info['Special'] is 'allbyall', each pair of sequences is run as query vs fitness/phenotype.
     Otherwise the fitness, phenotype and grouping vectors are generated, checked against the query length
     and the self.price() results are output to self.info['ResFile'].
     Errors are logged and re-raised; ValueError is raised if a vector length does not match the query length.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch: self.setupResults()
         ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
             batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
             self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
             if not batchfiles: self.errorLog('No input files found!',printerror=False)
             else:
                 bx = 0
                 for infile in batchfiles:
                     bx += 1
                     self.printLog('#BATCH','Batch running %s' % infile)
                     bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                     self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                     self.run(batch=True)
                     self.opt['Append'] = True   # Later runs add to the results of the first
                     self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
             if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
             return
         ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.info['Special'].lower() == 'allbyall':
             self.printLog('#RUN','Performing special "all-by-all" pairwise run')
             self.info['Special'] = ''   # Cleared so that the nested runs below are normal runs
             for i in range(len(self.seqs())-1):
                 self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                 for j in range(i+1,len(self.seqs())):
                     self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)
                     self.run(batch=True)
                     self.opt['Append'] = True
             self.info['Special'] = 'allbyall'; return
         ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.setup()
         ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.fitness()
         self.phenotype()
         self.grouping()
         for vector in ['Fitness','Phenotype','SeqGroup']:
             if len(self.list[vector]) != self.qry().seqLen():
                 # Four values for the four %s fields: vector, its length, query name, query length
                 self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),self.qry().shortName(),self.qry().seqLen()),printerror=False)
                 raise ValueError
         results = self.price()
         ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
         results['Query'] = self.qry().shortName()
         results['Fitness'] = self.info['Fmethod']
         results['Phenotype'] = self.info['Pmethod']
         results['SeqGroup'] = self.info['SeqGroup']
         rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
         self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Esempio n. 17
0
 def caseChange(self):     ### Returns groupings based on Case boundaries of query
     '''
     Returns groupings based on Case boundaries of query.
     << grplist:list = one entry per query position. Positions are numbered 1,2,3... with the number increasing each
         time the case (upper/lower) differs from the previous position. Query gaps are left as 0 (no group) when
         opt['QryGaps'] is False.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         qry = self.qry()
         self.deBug(qry.getSequence(case=True))
         seqlen = qry.seqLen()
         caselist = ['UC'] * seqlen      # Case of each position: 'UC' unless inside a 'Lower' region
         grplist = [0] * seqlen          # List of groups (0 = no group)
         ### ~ [1] Map Case ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for (start,end) in qry.dict['Case']['Lower']:
             # The code converts start to a 0-based index and treats end as inclusive
             for i in range(start-1,end): caselist[i] = 'LC'
         ### ~ [2] Number groups ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sequence = qry.info['Sequence']     # Only compared with '-', so no case conversion is needed
         gx = 1
         for r in range(seqlen):
             # Gap positions keep group 0; previously the 0 was overwritten by the group number straight away
             # NOTE(review): codons()/triplets() skip gaps when QryGaps is True, the opposite test - confirm intent
             if not self.opt['QryGaps'] and sequence[r] == '-': continue
             if r > 0 and caselist[r] != caselist[r-1]: gx += 1
             grplist[r] = gx
         self.printLog('#GRP','%s case groups from %s%s' % (rje.integerString(gx),rje.integerString(seqlen),self.obj['SeqList'].units()))
         self.deBug(grplist)
         return grplist
     except: self.errorLog(rje_zen.Zen().wisdom()); raise           
Esempio n. 18
0
    def setup(self):    ### Main class setup method.
        '''
        Main class setup method. Loads the input sequences, sets up the Xmer (Markov) background and the SCAP
        background sequence lists, then unpickles or (re)builds the Markov chains.
        << True if setup completed; False if any error occurred (the error is logged, not raised).
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] ~ Sequence file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)   #!# Add code for memsaver/autoload=F #!#
            self.printLog('#SCAP','%s sequences loaded for SCAP analysis' % rje.integerString(seqlist.seqNum()))
            ## ~ [1b] ~ Xmer background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            mseqfile = self.info['XmerBack']
            # No Xmer background given: fall back to the input sequence file
            if mseqfile.lower() in ['','none']: mseqfile = self.info['XmerBack'] = seqlist.info['Name']
            markov = self.obj['Markov'] = rje_markov.Markov(self.log,['autoload=T','accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % mseqfile,'direction=both','markov=F','scap=T'])
            markov.setup()
            maxx = markov.stat['MaxXmer']   # Requested Xmer length, kept for comparison with any unpickled object
            if self.info['Basefile'].lower() in ['','none']:
                self.info['Basefile'] = '%s.scap' % rje.baseFile(seqlist.info['Name'],True)
                if markov.opt['Sorted']: self.info['Basefile'] = '%s.sorted' % self.info['Basefile']
            basefile = self.info['Basefile']
            self.printLog('#MARKOV','Markov setup complete')
            ## ~ [1c] ~ SCAP Background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            scapfile = self.info['ScapBack']
            # Reuse an already-loaded sequence list where the background is the same file
            if scapfile.lower() in ['','none',seqlist.info['Name'].lower()]: self.obj['ScapBack'] = self.obj['SeqList']
            elif scapfile == mseqfile: self.obj['ScapBack'] = markov.obj['SeqList'] 
            else: self.obj['ScapBack'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % scapfile])
            # Report the size of the background actually selected (previously the input sequence count was reported)
            self.printLog('#SCAP','%s sequences for SCAP Background' % rje.integerString(self.obj['ScapBack'].seqNum()))

            ### ~ [2] Markov Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if mseqfile == seqlist.info['Name']: markov.obj['SeqList'] = seqlist
            elif mseqfile == self.obj['ScapBack'].info['Name']: markov.obj['SeqList'] = self.obj['ScapBack']
            mpickle = markov.unpickleMe()
            if mpickle: markov = self.obj['Markov'] = mpickle
            # Rebuild when either tree is missing or the stored chains are shorter than the requested Xmer length
            if not markov.suftree() or not markov.pretree() or maxx > markov.stat['MaxXmer']:
                markov.run()
                markov.pickleMe()
            markov.opt['DeBug'] = self.opt['DeBug']
            self.deBug(markov.opt)
            self.deBug(markov.stat)
            #self.deBug(markov.suftree())
            #self.deBug(markov.pretree())
            return True     # Setup successful
        except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Esempio n. 19
0
    def loadMutations(
            self):  ### Inputs parsed mutations back into dictionaries
        '''
        Inputs parsed mutations back into dictionaries.
        Reads omim_mutations.tdt from the current directory and rebuilds:
        - self.dict['Records'] = {gene:[OMIM_ID]}
        - self.dict['Mutations'] = {gene:{SubID:(Disease,mutation)}} where mutation is WildAA+Pos+MutAA
        << True if the file was loaded; False if it is missing or an error occurred (error is logged).
        '''
        try:  ### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            headers = [
                'OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease'
            ]
            infile = 'omim_mutations.tdt'
            if not os.path.exists(infile): return False
            datadict = rje.dataDict(self, infile, headers[:2], headers, '\t')
            mx = len(datadict)

            ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Iterate over a snapshot of the keys because entries are popped from datadict as they are processed.
            # list() works whether keys() returns a list or a view (slicing a view raises TypeError).
            for dkey in list(datadict.keys()):
                data = datadict.pop(dkey)
                record = data['OMIM_ID']
                gene = data['Gene']
                mutation = '%s%s%s' % (data['WildAA'], data['Pos'],
                                       data['MutAA'])
                generecords = self.dict['Records'].setdefault(gene, [])
                if record not in generecords: generecords.append(record)
                self.dict['Mutations'].setdefault(
                    gene, {})[data['SubID']] = (data['Disease'], mutation)
            self.log.printLog(
                '\r#OMIM', 'Loaded %s OMIM mutations (%s genes).' %
                (rje.integerString(mx),
                 rje.integerString(len(self.dict['Records']))))
            return True
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            return False
Esempio n. 20
0
 def loadPPI(self):  ### Load pairwise interaction data
     '''
     Load pairwise interaction data.
     Each line of info['PPIFile'] is split on whitespace and its first two fields are taken as an interacting pair.
     Interactions are stored in both directions in self.dict['PPI'] = {protein:[interactors]} without duplicates.
     Lines with fewer than two fields are skipped.
     << False if the file is missing. No value is returned after a successful load.
     NOTE(review): success returns None, which is also false in a boolean test - confirm what callers expect.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.checkForFile(self.info['PPIFile']): return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         PPIFILE = open(self.info['PPIFile'], 'r')
         try:    # Make sure the file handle is released even if parsing fails
             for line in PPIFILE:
                 fields = rje.chomp(line).split()
                 # Explicit length check replaces a bare except that hid every kind of error
                 if len(fields) < 2: continue
                 (pa, pb) = fields[:2]
                 for (hub, spoke) in [(pa, pb), (pb, pa)]:
                     if hub not in self.dict['PPI']:
                         self.dict['PPI'][hub] = []
                     if spoke not in self.dict['PPI'][hub]:
                         self.dict['PPI'][hub].append(spoke)
                 self.progLog(
                     '\r#PPI', 'Loading PPI data: %s proteins' %
                     rje.integerString(len(self.dict['PPI'])))
         finally:
             PPIFILE.close()
         self.printLog(
             '\r#PPI', 'Loaded PPI data for %s proteins' %
             rje.integerString(len(self.dict['PPI'])))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 21
0
 def loadPillars(self):  ### Load YGOB Pillar data
     '''
     Load YGOB Pillar data.
     Lines of info['Pillars'] with at least 17 whitespace-separated fields are kept. The sixth field (ancestral gene)
     and all '---' placeholders are dropped, and any non-empty remainder is appended to self.list['Pillars'].
     << False if the file is missing. No value is returned otherwise.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not rje.checkForFile(self.info['Pillars']): return False
         ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pillarlines = self.loadFromFile(filename=self.info['Pillars'],
                                         chomplines=True)
         for pline in pillarlines:
             fields = pline.split()
             if len(fields) >= 17:
                 # Drop the ancestral gene column, then any missing-locus placeholders
                 loci = [
                     locus for locus in fields[:5] + fields[6:]
                     if locus != '---'
                 ]
                 if loci: self.list['Pillars'].append(loci)
                 self.progLog(
                     '\r#YGOB', 'Loading Pillar data: %s loci' %
                     rje.integerString(len(self.list['Pillars'])))
         self.printLog(
             '\r#YGOB', 'Loaded Pillar data for %s loci' %
             rje.integerString(len(self.list['Pillars'])))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 22
0
 def test(self): ### Development method
     '''
     Development method.
     Reads GO data, maps it onto EnsEMBL genes and writes one row per gene/GO term pair to test.go.tdt.
     '''
     self.readGO()
     self.mapEnsGO()
     ## ~ Output file setup ~ ##
     outfile = 'test.go.tdt'
     fields = ['EnsG','GO_ID','GO_Type','GO_Desc']
     rje.delimitedFileOutput(self,outfile,fields,rje_backup=True)    # Header row only
     ## ~ One row per gene-GO pair ~ ##
     ensgo = self.dict['EnsGO']
     total = len(ensgo)
     done = 0.0      # Incremented by 100 per gene so that done/total is a percentage
     for ensg in rje.sortKeys(ensgo):
         self.progLog('\r#ENSGO','Compiling %s: %.2f%%' % (outfile,done/total))
         done += 100.0
         for goid in ensgo[ensg]:
             row = {'EnsG':ensg, 'GO_ID':goid}
             row['GO_Type'] = self.dict['GO'][goid]['type']
             row['GO_Desc'] = self.dict['GO'][goid]['name']
             rje.delimitedFileOutput(self,outfile,fields,datadict=row)
     self.printLog('\r#ENSGO','Compiling %s all done: %s genes.' % (outfile,rje.integerString(total)))
Esempio n. 23
0
 def codons(self): ### Returns grouping vector based on DNA codon positions (three groups)
     '''
     Returns grouping vector based on DNA codon positions (three groups).
     << grplist:list = one entry per query position, cycling 1,2,3,1,2,3... When opt['QryGaps'] is True, gap
         positions in the query are left as 0 (no group) and do not advance the cycle.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         qry = self.qry()
         seqlen = qry.seqLen()
         grplist = [0] * seqlen    # List of groups (0 = no group)
         # Fetched once: upper-casing the whole sequence at every position made the loop quadratic, and case is
         # irrelevant when only testing for the gap character
         sequence = qry.info['Sequence']
         skipgaps = self.opt['QryGaps']
         ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         trip = 1
         for r in range(seqlen):
             if skipgaps and sequence[r] == '-': continue
             grplist[r] = trip
             trip = trip % 3 + 1     # 1 -> 2 -> 3 -> 1
         self.printLog('#GRP','3 codon groups from %s%s' % (rje.integerString(seqlen),self.obj['SeqList'].units()))
         return grplist
     except: self.errorLog(rje_zen.Zen().wisdom()); raise           
Esempio n. 24
0
 def fpi(self):  ### Family-protein interactions
     '''
     Family-protein interactions.
     For each protein in self.dict['PPI'] with a family of 2+ members (self.dict['Fam']), interactors shared by 2+
     family members are collected, removed (in both directions) from self.dict['PPI'] and their accession numbers
     written to SLiMPID_FPI/GENE.fpi.acc. Hubs left with no interactors are then removed from self.dict['PPI'].
     Errors are logged with a quit option rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE(review): this guard tests dict['Domain'] although the method only uses dict['Fam'] - looks copied
         # from dpi(); confirm whether dict['Fam'] was intended
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_FPI'
         rje.mkDir(self,outdir)
         fpi = {}            # Dictionary of {family:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qry in rje.sortKeys(self.dict['PPI']):
             try:
                 fam = self.dict['Fam'][qry]
                 if len(fam) < 2: continue
             except: self.errorLog('Problem with "%s" protein family' % qry); continue
             ## ~ [2a] Pool interactors of all family members and keep those seen 2+ times ~ ##
             pooled = []
             for hub in fam:
                 if hub in self.dict['PPI']: pooled += self.dict['PPI'][hub]      # Add with redundancy
             counts = {}     # Single-pass tally replaces repeated list.count()/list.remove() calls
             for spoke in pooled: counts[spoke] = counts.get(spoke,0) + 1
             fpi[qry] = [spoke for spoke in pooled if counts[spoke] > 1]   # Must have 2+ family interactions
             ## ~ [2b] Remove the shared interactions from the PPI data, in both directions ~ ##
             for hub in fam:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited inside the loop
                     if counts.get(spoke,0) > 1:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             ## ~ [2c] Output accession numbers ~ ##
             fpi[qry] = rje.sortUnique(fpi[qry],False,False)
             acc = []
             gene = self.dict['Gene'][qry]
             for name in fpi[qry]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name)                     
             ACCFILE = open('%s/%s.fpi.acc' % (outdir,gene),'w')
             try: ACCFILE.write('\n'.join(acc))
             finally: ACCFILE.close()    # Explicit close rather than relying on garbage collection
             self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#FPI','No %s PPI left after FPI removed' % hub)
         self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
Esempio n. 25
0
 def triplets(self): ### Returns grouping vector based on DNA triplets
     '''
     Returns grouping vector based on DNA triplets.
     << grplist:list = one entry per query position; each run of three counted positions shares a group number
         (1,1,1,2,2,2...). When opt['QryGaps'] is True, gap positions in the query are left as 0 (no group) and are
         not counted towards a triplet.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         qry = self.qry()
         seqlen = qry.seqLen()
         grplist = [0] * seqlen    # List of groups (0 = no group)
         # Fetched once: upper-casing the whole sequence at every position made the loop quadratic, and case is
         # irrelevant when only testing for the gap character
         sequence = qry.info['Sequence']
         skipgaps = self.opt['QryGaps']
         gx = 0
         ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         trip = 0    # Position within the current triplet (0-2)
         for r in range(seqlen):
             if skipgaps and sequence[r] == '-': continue
             if not trip: gx += 1    # First position of a new triplet
             grplist[r] = gx
             trip = (trip + 1) % 3
         self.printLog('#GRP','%s triplet groups from %s%s' % (rje.integerString(gx),rje.integerString(seqlen),self.obj['SeqList'].units()))
         return grplist
     except: self.errorLog(rje_zen.Zen().wisdom()); raise           
Esempio n. 26
0
 def makePPIDatasets(self):  ### Generate PPI datasets from pairwise data
     '''
     Generate PPI datasets from pairwise data.
     For each hub in self.dict['PPI'] with 3+ interactors, of which 3+ are found in self.dict['SeqDict'], the
     interactor sequences are saved to YeastPPI/HUB.fas. Errors are logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rje.mkDir(self,'YeastPPI/')
         seqdict = self.dict['SeqDict']
         ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (hx,htot,fx) = (0.0,len(self.dict['PPI']),0)
         for hub in rje.sortKeys(self.dict['PPI']):
             # hx rises by 100 per hub, so hx/htot is a percentage: show it with a % sign
             self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f%%' % (rje.integerString(fx),hx/htot)); hx += 100.0
             if len(self.dict['PPI'][hub]) < 3: continue
             seqs = [seqdict[spoke] for spoke in self.dict['PPI'][hub] if spoke in seqdict]
             if len(seqs) < 3: continue      # Too few interactors with sequences
             self.obj['SeqList'].saveFasta(seqs,rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True),log=False); fx+=1
         self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx),rje.integerString(htot)))
     except: self.errorLog(rje_zen.Zen().wisdom()); raise   # Delete this if method error not terrible
Esempio n. 27
0
 def dpi(self):  ### Domain-protein interactions
     '''
     Domain-protein interactions.
     For each domain in self.dict['Domain'], interactors shared by 2+ domain-containing hubs are collected, removed
     (in both directions) from self.dict['PPI'] and their accession numbers written to SLiMPID_DPI/DOMAIN.dpi.acc.
     Hubs left with no interactors are then removed from self.dict['PPI'].
     Errors are logged with a quit option rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.dict['Domain']: return
         outdir = 'SLiMPID_DPI'
         rje.mkDir(self,outdir)
         dpi = {}            # Dictionary of {domain:[interactors]}
         badname = []
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for dom in rje.sortKeys(self.dict['Domain']):
             ## ~ [2a] Pool interactors of all domain proteins and keep those seen 2+ times ~ ##
             pooled = []
             for hub in self.dict['Domain'][dom]:
                 if hub in self.dict['PPI']: pooled += self.dict['PPI'][hub]      # Add with redundancy
             counts = {}     # Single-pass tally replaces repeated list.count()/list.remove() calls
             for spoke in pooled: counts[spoke] = counts.get(spoke,0) + 1
             dpi[dom] = [spoke for spoke in pooled if counts[spoke] > 1]   # Must have 2+ domain interactions
             ## ~ [2b] Remove the shared interactions from the PPI data, in both directions ~ ##
             for hub in self.dict['Domain'][dom]:
                 if hub not in self.dict['PPI']: continue
                 for spoke in self.dict['PPI'][hub][0:]:     # Copy: the list is edited inside the loop
                     if counts.get(spoke,0) > 1:
                         self.dict['PPI'][hub].remove(spoke)
                         if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub)
             ## ~ [2c] Output accession numbers ~ ##
             dpi[dom] = rje.sortUnique(dpi[dom],False,False)
             acc = []
             for name in dpi[dom]:
                 if not name: continue
                 if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum'])
                 elif name not in badname: badname.append(name) 
             ACCFILE = open('%s/%s.dpi.acc' % (outdir,dom),'w')
             try: ACCFILE.write('\n'.join(acc))
             finally: ACCFILE.close()    # Explicit close rather than relying on garbage collection
             self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc)))
         if badname:
             badname.sort()
             self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),'; '.join(badname)))
         ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         hx = len(self.dict['PPI'])
         for hub in rje.sortKeys(self.dict['PPI']):
             if hub and self.dict['PPI'][hub]: continue
             self.dict['PPI'].pop(hub)
             self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False)
         self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx)))
     except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
Esempio n. 28
0
 def seqBySeq(self):     ### Runs in SeqBySeq Mode                                                               #V1.0
     '''
     SeqBySeq mode expects seqin=FILE and basefile=X to be given, with farm naming the program to be run. The input
     sequences are worked through in turn, each being farmed out to the farm program. The outputs listed in OutList
     and the Log are then compiled into the given basefile=X. For *.csv and *.tdt files the header row is copied
     from the first file only and left out of all later ones; files with any other extension are copied whole.
     << True if the jobs were run; False if an error was logged.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ Strip any .py extension from the farm program ~ ##
         if self.getStr('Farm')[-3:] == '.py': self.str['Farm'] = self.str['Farm'][:-3]
         ## ~ [1b] ~ Load query sequences ~ ##
         seqcmd = self.cmd_list+['autoload=T','accnr=F','seqnr=F']
         self.list['Seq'] = rje_seq.SeqList(self.log,seqcmd).seq[0:]
         ## ~ [1c] ~ Drop leading sequences until the StartFrom sequence is first ~ ##
         while self.getStrLC('StartFrom') and self.list['Seq']:
             if self.list['Seq'][0].shortName() == self.getStr('StartFrom'): self.str['StartFrom'] = ''
             else: self.list['Seq'] = self.list['Seq'][1:]
         seqx = len(self.list['Seq'])
         self.printLog('#SEQ','%s query sequences to farm out' % rje.integerString(seqx))
         self.list['Pickup'] = self.pickupList()
         ### ~ [2] ~ Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.runJobs()
         return True
     except SystemExit: raise    # Child
     except:
         self.errorLog('JobFarmer.seqBySeq error')
         return False
Esempio n. 29
0
 def addToGeneCards(self,cards,addcards=True): ### Reconfigures and adds parsed HPRD data to GeneCards
     '''
     Reconfigures and adds parsed HPRD data to GeneCards.
     >> cards:rje_genecards.GeneCards object
     >> addcards:boolean [True] = whether to add genes from HPRD to the GeneCards dictionary
     Existing genes have the HPRD ID, OMIM and Entrez values merged into comma-separated fields (no duplicates).
     Unknown genes are added with Symbol '!FAILED!' when addcards is True.
     '''
     ### Add relevant headers for future output ###
     for header in ['HPRD','OMIM','EntrezCheck','Desc']:
         if header not in cards.list['Headers']:
             cards.list['Headers'].append(header)
         for known in cards.list['Genes']:
             cards.dict['GeneCard'][known].setdefault(header,'')    # Blank entry where the field is missing
     ### Add to GeneCards ###
     htot = len(self.dict['HPRD'])
     hx = 0.0
     for hprd in self.dict['HPRD']:
         self.log.printLog('\r#HPRD','Adding HPRD to GeneCards: %.1f%%' % (hx/htot),newline=False,log=False)
         hx += 100.0
         entry = self.dict['HPRD'][hprd]
         self.deBug(entry)
         gene = entry['gene']
         omim = entry['omim']
         entrez = entry['entrez']
         if gene in cards.list['Genes']:
             ## Merge each value into its comma-separated field ##
             card = cards.dict['GeneCard'][gene]
             for (field,value) in [('HPRD',hprd),('OMIM',omim),('EntrezCheck',entrez)]:
                 if card[field] == '':
                     card[field] = value
                     continue
                 current = string.split(card[field],',')
                 if value not in current: card[field] = string.join(current+[value],',')
         elif addcards:
             ## New gene: HPRD entries without a gene symbol are named after their HPRD ID ##
             if gene == '-': gene = 'HPRD' + hprd
             cards.list['Genes'].append(gene)
             cards.dict['GeneCard'][gene] = {'Symbol':'!FAILED!','HPRD':hprd,'OMIM':omim,'EntrezCheck':entrez,'Desc':entry['desc']} 
     self.log.printLog('\r#HPRD','Added %s HPRD genes to GeneCards.' % (rje.integerString(htot)))
Esempio n. 30
0
    def domainFasta(
        self
    ):  ### Outputs parsed domain and domain PPI datasets in Fasta format
        '''
        Outputs parsed domain tables and domain PPI datasets.
        Written to info['OutDir']:
        - HPRD.domains.tdt = Domain, HPRD ID and Gene for each domain-protein pair
        - HPRD.domsource.tdt = Domain and Source pairs
        - HPRD_Domain_Datasets/DOMAIN_hprd.fas = sequences of all PPI partners of proteins containing DOMAIN
        Errors are logged and re-raised.
        '''
        try:
            ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Domain', 'HPRD', 'Gene']
            dfile = self.info['OutDir'] + 'HPRD.domains.tdt'
            rje.delimitedFileOutput(self, dfile, headers, '\t')
            sfile = self.info['OutDir'] + 'HPRD.domsource.tdt'
            shead = ['Domain', 'Source']
            rje.delimitedFileOutput(self, sfile, shead, '\t')
            dx = 0.0
            dtot = len(self.dict['Domains'])
            for domain in rje.sortKeys(self.dict['Domains']):
                self.log.printLog('\r#DOM',
                                  'HPRD Domain output (%s): %.1f%%' %
                                  (dfile, dx / dtot),
                                  newline=False,
                                  log=False)
                dx += 100.0
                for hid in self.dict['Domains'][domain]:
                    datadict = {
                        'Domain': domain,
                        'HPRD': hid,
                        'Gene': self.dict['HPRD'][hid]['gene']
                    }
                    rje.delimitedFileOutput(self, dfile, headers, '\t',
                                            datadict)
                for source in self.dict['DomainSource'][domain]:
                    datadict = {'Domain': domain, 'Source': source}
                    rje.delimitedFileOutput(self, sfile, shead, '\t', datadict)
            self.log.printLog(
                '\r#DOM', 'HPRD Domain output (%s): %s domains.' %
                (dfile, rje.integerString(dtot)))

            ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datpath = self.info['OutDir'] + rje.makePath(
                'HPRD_Domain_Datasets/')
            rje.mkDir(self, datpath)
            for domain in rje.sortKeys(self.dict['Domains']):
                ## Generate a list of all interactors with domain-containing proteins ##
                partners = set()    # Set gives de-duplication without repeated list scans
                for p1 in self.dict['Domains'][domain]:
                    if p1 in self.dict['PPI']:
                        partners.update(self.dict['PPI'][p1])
                plist = sorted(partners)
                ## Generate Sequence list and output ##
                mylist = []
                for p in plist:
                    # With AllIso the 'Seq' entry is added as a list of sequences; otherwise as a single item
                    if self.opt['AllIso']:
                        mylist += self.dict['HPRD'][p]['Seq']
                    else:
                        mylist.append(self.dict['HPRD'][p]['Seq'])
                fasfile = '%s%s_hprd.fas' % (datpath, domain)
                if mylist:
                    self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=fasfile)
                else:
                    self.log.printLog(
                        '#DOM', 'No PPI partners for domain "%s"' % domain)
            self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.')
        except:
            # Message now names this method (it previously reported saveFasta)
            self.log.errorLog('Error in HPRD.domainFasta()',
                              printerror=True,
                              quitchoice=False)
            raise
Esempio n. 31
0
 def readHMMPFamSearch(
         self,
         resfile=None,
         readaln=False):  ### Reads HMM PFam Search Results into objects
     '''
     Reads HMM Search Results into objects.
     >> resfile:str = Results File (set as self.info['OutFile'])
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
     '''
     try:
         ### Setup ###
         if not resfile or not os.path.exists(resfile):
             self.log.errorLog('Results file "%s" missing!' % resfile,
                               printerror=False)
             return False
         ## Make RegExp for starting next alignment ##
         re_hit = string.join([
             '^(\S+):', 'domain', '(\d+)', 'of', '(\d+),', 'from', '(\d+)',
             'to', '(\d+):', 'score', '(\S+),', 'E', '=', '(\S+)'
         ], '\s+')
         ## Search dictionary as results come back per sequence, not per HMM! ##
         pfam = {}  # Dictionary of {PFam name:search}
         hitx = 0  # Total number of hits
         hitlist = [
         ]  # List of sequences processed from file (may or may not include zero hit sequences)
         ### Read in Search results ###
         if open(resfile, 'r').readline().find('hmmpfam') != 0:
             self.errorLog(
                 'File "%s" does not appear to be an hmmpfam results file' %
                 resfile,
                 printerror=False)
             if rje.yesNo(
                     'Delete incorrect results file? (Check that hmmpfam=T is right!)',
                     default='N'):
                 os.unlink(resfile)
                 self.printLog('#DEL',
                               'Dodgy results file "%s" deleted.' % resfile)
             return False
         hitname = None
         i = 0
         hx = 0
         seqx = 0
         RESFILE = open(resfile, 'r')
         #x#resline = self.loadFromFile(resfile,chomplines=True)
         #x#while i < len(resline):
         line = RESFILE.readline()
         newres = [rje.chomp(line)]
         newresout = True
         newresfile = '%s.partial' % resfile
         if os.path.exists(newresfile): os.unlink(newresfile)
         while line:
             self.progLog(
                 '\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' %
                 (resfile, rje.integerString(hx),
                  rje.integerString(len(pfam)), rje.integerString(hitx)))
             line = rje.chomp(line)
             #print line
             ## New Sequence ##
             if rje.matchExp('^Query sequence:\s+(\S+)', line):
                 if newres and newresout and self.opt['CleanRes']:
                     open(newresfile, 'a').write(string.join(newres, '\n'))
                 newres = ['', line]
                 newresout = False
                 hitname = rje.matchExp('^Query sequence:\s+(\S+)', line)[0]
                 hx += 1
                 #x#if hitname not in hitlist: hitlist.append(hitname)
             ## One Line Data for hits ##
             elif line.find('Parsed for domains:') == 0:
                 #x#i += 3      # Skip two complete lines
                 newres += [
                     line,
                     rje.chomp(RESFILE.readline()),
                     rje.chomp(RESFILE.readline())
                 ]
                 line = rje.chomp(RESFILE.readline())
                 newres.append(line)
                 #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                 #--------        ------- ----- -----    ----- -----      -----  -------
                 #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                 # ... else ...
                 #         [no hits above thresholds]
                 while rje.matchExp(
                         string.join([
                             '^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)',
                             '(\S+)\s*$'
                         ], '\s+'), line):
                     newresout = True
                     (dom, start, end, score, eval) = rje.matchExp(
                         string.join([
                             '^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)',
                             '(\S+)\s*$'
                         ], '\s+'), line)
                     if not pfam.has_key(dom):
                         pfam[dom] = self._addSearch()
                         pfam[dom].info['Name'] = dom
                     hit = pfam[dom]._addHit()
                     hit.info['Name'] = hitname
                     aln = hit._addAln()
                     aln.setStat({
                         'SbjStart': string.atoi(start),
                         'SbjEnd': string.atoi(end),
                         'Expect': string.atof(eval),
                         'BitScore': string.atof(score)
                     })
                     hitx += 1
                     self.progLog(
                         '\r#RES',
                         'Reading %s: %s Seqs; %s Domains; %s Hits' %
                         (resfile, rje.integerString(hx),
                          rje.integerString(
                              len(pfam)), rje.integerString(hitx)))
                     line = rje.chomp(RESFILE.readline())
                     newres.append(line)
             ## End of Protein ##
             elif line[:2] == '//':
                 hitname = None
                 newres.append(line)
             elif rje.matchExp(
                     'End of rje_hmm reduced results file: (%d) sequences in original',
                     line):
                 seqx = string.atoi(
                     rje.matchExp(
                         'End of rje_hmm reduced results file: (\d+) sequences in original',
                         line)[0])
             elif newres:
                 newres.append(line)
             #x#i += 1
             line = RESFILE.readline()
         if newres and newresout and self.opt['CleanRes']:
             open(newresfile, 'a').write(string.join(newres, '\n'))
         if not seqx: seqx = hx
         if self.opt['CleanRes']:
             open(newresfile, 'a').write(
                 string.join([
                     '',
                     'End of rje_hmm reduced results file: %d sequences in original'
                     % seqx
                 ], '\n'))
             os.unlink(resfile)
             os.rename(newresfile, resfile)
             self.printLog(
                 '\r#RED',
                 'Results file %s replaced with reduced version (%s Hits only)'
                 % (resfile, rje.integerString(hitx)))
         self.printLog(
             '\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' %
             (resfile, rje.integerString(seqx), rje.integerString(
                 len(pfam)), rje.integerString(hitx)))
         return True
     except:
         self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
         return False
Esempio n. 32
0
    def altPAM(self):  ### Alternative PAM matrix construction
        '''
        Alternative PAM matrix construction.

        Reads a substitution matrix from self.info['AltPam'], optionally replaces the amino acid frequencies with
        those counted from the sequences in self.info['SeqIn'], then scans scaling factors s (from just under 0.01
        downwards in steps of 0.000001) for the one that brings sum(freq[a] * P(a->a)) closest to 0.99. The matrix
        rescaled by the best s is written to self.info['PamOut'] (defaults to the AltPam name + '.pam') and
        self.info['Name'] is set to that file name.

        AltPam lines, as consumed here:
        - line 1 = amino acid letters (whitespace separated, upper-cased on reading)
        - line 2 = codes used to label the output rows
        - line 3 = amino acid frequencies, in the same order as line 1
        - lines 4-22 = substitution values; value i of row r is stored symmetrically for the pair
          aas[i]/aas[r-2] (presumably a lower-triangular matrix - confirm against the input format)

        Any exception is reported via self.log.errorLog() with quitchoice=True and is not re-raised here.
        '''
        try:
            ### Setup ##
            wlines = self.loadFromFile(self.info['AltPam'])
            if not wlines:
                raise IOError
            aas = string.split(wlines[0].upper())
            codes = string.split(wlines[1])
            rawfreqs = string.split(wlines[2])
            freq = {}
            for i in range(len(rawfreqs)):
                freq[aas[i]] = string.atof(rawfreqs[i])
            prob = {}
            for r in range(3, 22):
                subs = string.split(wlines[r])
                for i in range(len(subs)):
                    prob['%s%s' % (aas[i], aas[r - 2])] = string.atof(subs[i])
                    prob['%s%s' % (aas[r - 2], aas[i])] = string.atof(subs[i])
            ## Group the substitution keys by first residue once, rather than rescanning every key for every
            ## residue at every scaling step. Key order within each group matches prob.keys().
            rowkeys = {}
            for a in aas:
                rowkeys[a] = []
            for key in prob.keys():
                if key[0] in rowkeys:
                    rowkeys[key[0]].append(key)

            ### Alternative freqs ###
            if self.info['SeqIn'].lower() not in [
                    '', 'none'
            ] and os.path.exists(self.info['SeqIn']):
                ## Clear freq ##
                freq = {}
                for a in aas:
                    freq[a] = 0.0
                ## Count freq ##
                slines = self.loadFromFile(self.info['SeqIn'])
                for line in slines:
                    if line[:1] == '>':     # Skip fasta name lines
                        continue
                    for a in aas:
                        freq[a] += string.count(line.upper(), a)
                ## Convert to freq ##
                total = sum(freq.values())
                if total > 0:   # Leave the zero counts alone rather than dividing by zero
                    for a in aas:
                        freq[a] = freq[a] / total
                self.log.printLog(
                    '#AA', 'Rescaling matrix based on %s aa from %s.' %
                    (rje.integerString(total), self.info['SeqIn']))

            ### Calculate s ###
            s = 0.01
            step = 0.000001
            bests = 1.000000
            bestdif = -1    # Negative = no valid s found yet
            while s >= step:
                ## Scaler ##
                s = s - step
                self.log.printLog(
                    '\r#WAG',
                    'Considering s = %.6f; Best s = %.6f (Dif = %.6f)' %
                    (s, bests, bestdif),
                    log=False,
                    newline=False)
                ## Self Subs ##
                newprobs = rje.scaledict(dict=prob, scale=s)
                toobig = False
                for a in aas:
                    diag = '%s%s' % (a, a)
                    newprobs[diag] = 1.0    # P(a->a) = 1 - sum of scaled substitutions from a
                    for key in rowkeys[a]:
                        newprobs[diag] -= newprobs[key]
                        if newprobs[diag] < 0.0:    # Overshot possibility
                            toobig = True
                            break
                    if toobig:
                        break
                if toobig:
                    continue
                ## PAM1 ##
                dsum = 0.0
                for a in aas:
                    dsum += freq[a] * newprobs['%s%s' % (a, a)]
                dif = 0.99 - dsum
                if dif < 0:
                    dif = -dif
                if dif < bestdif or bestdif < 0:
                    bestdif = dif
                    bests = s

            ### Output best s ###
            self.log.printLog(
                '\r#WAG',
                'Considered all s <= 0.010000; Best s = %.6f (Dif = %.6f)' %
                (bests, bestdif))
            if self.info['PamOut'].lower() in ['', 'none']:
                self.info['PamOut'] = self.info['AltPam'] + '.pam'
            self.log.printLog(
                '#PAM',
                'Rescaled PAM matrix output to %s' % self.info['PamOut'])
            newprobs = rje.scaledict(dict=prob, scale=bests)
            for a in aas:
                diag = '%s%s' % (a, a)
                newprobs[diag] = 1.0
                for key in rowkeys[a]:
                    newprobs[diag] -= newprobs[key]
            PAM = open(self.info['PamOut'], 'w')
            try:    # Make sure the handle is closed even if a write fails
                rje.writeDelimit(PAM, aas, ' ')
                for i in range(len(aas)):
                    out = [codes[i]]
                    a = aas[i]
                    for b in aas:
                        out.append('%.6f' % newprobs['%s%s' % (a, b)])
                    rje.writeDelimit(PAM, out, ' ')
            finally:
                PAM.close()
            self.info['Name'] = self.info['PamOut']

        except:
            self.log.errorLog('Major Error with PamCtrl.altPAM().',
                              quitchoice=True)
Esempio n. 33
0
    def singleSeqAQ(self,seqlist,focus=[0,-1]):     ### Performs SAQ on seqlist, adding seq.info['SAQ']
        '''
        Performs SAQ on seqlist, adding seq.info['SAQ'].
        >> seqlist:rje_seq.SeqList Object
        - NB. This object will itself have sequences removed from it, so beware!
        - A new info key will be added: SAQX = SAQ sequences with individual Xs
        - A new info key will be added: SAQ = SAQ sequences with alignment Xs
        >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:].
        - Only used here to pass on to seqlist.mapX() when processing the query.
        - NB. The default list object is shared between calls; this method does not modify it itself.

        Stages:
        - <1> Setup: copies each sequence to info['SAQ'] and sets up per-residue flag lists.
        - <2> Repeatedly marks residues and blocks as well-aligned until the bad residue count stops changing.
        - <3> Replaces non-gap residues outside well-aligned blocks with X (stored in info['Sequence'] and info['SAQX']).
        - <4> Maps query Xs (unless opt['NoQuery']) and then removes or X-maps the worst sequence each cycle until
              neither the number of good residues nor the number of sequences changes.
        - <5> Swaps info['Sequence'] and info['SAQ'] so that 'Sequence' is the original and 'SAQ' the masked version.

        Any exception is reported through self.log.errorLog() (quitchoice=True) with the stage reached; it is not
        re-raised here.
        '''
        try:
            ### <SAQ1> ### Setup
            _stage = '<1> Setup'
            haqlist = seqlist   # SeqList Object to store individually Xd sequences
            query = haqlist.obj['QuerySeq']
            if self.opt['NoQuery']:
                query = None
            badres = [-1,0]     # List of how many bad residues in total dataset (seeded so first cycle always runs)
            block_align = {}    # Dictionary of whether residue in block of sequence that is well-aligned or not
            res_align = {}      # Dictionary of whether residue of sequence is well-aligned or not
            res_gap = {}        # Dictionary of whether residue of sequence is a gap or not
            gap_align = {}      # Dictionary of whether residue of sequence is a gap in a well-aligned block or not
            for seq in haqlist.seq:
                seq.info['SAQ'] = seq.info['Sequence'][0:]      # Note! Sequence is modified and SAQ not, then they are swapped at end!
                block_align[seq] = [False] * seq.seqLen()
                res_align[seq] = [False] * seq.seqLen()
                res_gap[seq] = [False] * seq.seqLen()
                gap_align[seq] = [False] * seq.seqLen()

            ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks
            _stage = '<2> BlockID'
            while badres[-1] != badres[-2]:     # Change in number of bad residues
                total_res = 0
                badres.append(0)    # badres[-1] is the current number of bad residues
                infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2)
                for sx, seq in enumerate(haqlist.seq):   # sx only feeds the progress percentage
                    myinfo = '%s %.1f%%' % (infotxt,(100.0 * sx / haqlist.seqNum()))
                    self.log.printLog('\r#SAQ',myinfo,log=False,newline=False)

                    ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped
                    _stage = '<2a> Mark Residues'
                    for r in range(seq.seqLen()):
                        gap_align[seq][r] = False
                        res_align[seq][r] = False
                        if block_align[seq][r] or len(badres) == 3:     # After first cycle, look only at well-aligned blocks (well-aligned for sequence not whole alignment)
                            a = seq.info['Sequence'][r]
                            res_gap[seq][r] = False
                            if a == '-':
                                res_gap[seq][r] = True
                                gap_align[seq][r] = True
                            else:   # 'X' handled by self._saqCon
                                conx = 0  # Matches with good regions of other sequences (self is skipped below)
                                for otherseq in haqlist.seq[0:]:
                                    if otherseq == seq:     # > so self not counted!
                                        continue
                                    if len(otherseq.info['Sequence']) != len(seq.info['Sequence']):
                                        self.log.errorLog('Sequence lengths do not match - should be aligned!',printerror=False)
                                        raise ValueError
                                    if (block_align[otherseq][r] or len(badres) == 3):
                                        conx += self._saqCon(a, otherseq.info['Sequence'][r])
                                if conx >= self.stat['SAQCon']:
                                    res_align[seq][r] = True

                    ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence
                    _stage = '<2b> Mark Regions'
                    ## <i> ## Clear first
                    _stage = '<2b-i> Mark Regions'
                    for r in range(seq.seqLen()):
                        block_align[seq][r] = False
                    ## <ii> ## Recalculate
                    _stage = '<2b-ii> Mark Regions'
                    for r in range(seq.seqLen()):
                        _stage = '<2b-ii> Blocks'
                        if res_align[seq][r]:   # Start of potential block
                            blen = 0    # Block length (SAQBlock) = AAs
                            win = 0     # Window length = all sequence
                            matchx = 1  # Good residues in window (first residue must be good!) (SAQMatch)
                            while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']:
                                win += 1
                                if (r + win) >= seq.seqLen() or seq.info['Sequence'][r+win] == 'X':     # Hit Bad Region: Abort
                                    break
                                else:   # Better region
                                    if gap_align[seq][r+win]:   # Decent gap: extends window but not block length
                                        continue
                                    else:
                                        blen += 1   # Increase Block
                                        if res_align[seq][r+win]:   # Good residue
                                            matchx += 1
                            # A break above always leaves matchx < SAQMatch, so r+win is in range whenever this is True
                            if matchx >= self.stat['SAQMatch']:
                                for w in range((win+1)):
                                    block_align[seq][r+w] = True
                    ## <iii> ## Update bad residue count
                    for r in range(seq.seqLen()):
                        _stage = '<2b-iii> Mark Regions'
                        if not block_align[seq][r] and not res_gap[seq][r]:   # Bad residue
                            badres[-1] += 1
                        if not res_gap[seq][r]:
                            total_res += 1
                myinfo = '%s 100.0%%' % infotxt
                myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]),rje.integerString(total_res))
                self.log.printLog('\r#SAQ',myinfo)
                if badres[-1] == total_res:
                    self.log.errorLog('All residues marked as bad in SAQ!',printerror=False,quitchoice=True)
                # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False)

            ### <SAQ3> ### X out badly-aligned blocks
            _stage = '<3> X-Out'
            self.log.printLog('#SAQ','SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
            for seq in haqlist.seq:
                newseq = ''
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or seq.info['Sequence'][r] == '-':   #!# Was backwards? res_gap[seq][r] == False:
                        newseq += seq.info['Sequence'][r]
                    else: # Bad residue
                        newseq += 'X'
                seq.info['Sequence'] = newseq[0:]
                seq.info['SAQX'] = newseq[0:]       # Stores Xd sequences for individuals for use in PAQ
            #!# Add saving of data in 'datafull' option

            ### <SAQ4> ### Remove sequences and/or badly-aligned regions
            _stage = '<4> Removal'
            self.log.printLog('\r#SAQ','SAQ%d-%d: Removing bad sequences and/or dodgy regions...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
            ## <SAQ4a> ## Process Query first - only interested in good regions within query
            _stage = '<4a> Query Removal'
            if self.opt['NoQuery'] or query == None:  # No preprocessing of Query
                self.verbose(0,4,'no Master Query processing...',0)
            else:
                haqlist.mapX(query, qtrim=True, focus=focus) # Replaces other sequence ends and query X columns with Xs
                self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
            self.verbose(0,3,'',1)
            if self.opt['ManSAQ']:
                haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile'])

            ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost
            _stage = '<4b> Seq Removal'
            goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
            goodseq = [0, haqlist.seqNum()]
            while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
                colgood = [0] * haqlist.seq[0].seqLen()    # Good residues per column
                for r in range(haqlist.seq[0].seqLen()):
                    for seq in haqlist.seq:
                        if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                            colgood[r] += 1
                ## <i> ## Compare relative loss of masking and losing each sequence
                keepx = {}  # Dictionary of seq:number of lost residues if seq kept
                losex = {}  # Dictionary of seq:number of lost residues if seq lost
                badkx = -1  # Biggest loss if kept
                badlx = -1  # Biggest loss if lost
                bads = None # Worst sequence
                for seq in haqlist.seq:
                    if seq == query and self.opt['NoQuery'] == False:
                        continue    # Next sequence
                    # Calculate keepx and losex
                    keepx[seq] = 0
                    for r in range(seq.seqLen()):
                        if seq.info['Sequence'][r] == 'X':
                            keepx[seq] += colgood[r]
                    losex[seq] = self._getGood([seq])
                    # Update bads if worse
                    if keepx[seq] > badkx:
                        badkx = keepx[seq]
                        badlx = losex[seq]
                        bads = seq
                    elif keepx[seq] == badkx and losex[seq] < badlx:
                        badlx = losex[seq]
                        bads = seq
                ## <ii> ## Remove bad sequences and/or regions
                if badkx > 0:
                    if self.opt['ManSAQ']:
                        default = 'N'
                        if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']:   # Lose sequence!
                            default = 'Y'
                        if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default):
                            seqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                        else:   # X out
                            haqlist.mapX(bads)
                    else:
                        self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1)
                        #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                        if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']:   # Lose sequence!
                            haqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                        else:   # X out
                            haqlist.mapX(bads)
                ### <iii> ### Recalculate goodres
                goodres.append(self._getGood(haqlist.seq))
                goodseq.append(haqlist.seqNum())

            ### <SAQ5> ### Reinstate UnX'd sequence:
            _stage = '<5> Reinstate'    # Was a copy of the <4b> label, which mislabelled errors raised from here on
            for seq in haqlist.seq:
                [seq.info['SAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['SAQ']]
            if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']):
                os.unlink('%s.mansaq.fas' % haqlist.info['Basefile'])

        except:
            self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
Esempio n. 34
0
class Price(rje.RJE_Object):     
    '''
    Price Class. Author: Rich Edwards (2009).

    Calculates fitness, phenotype and grouping vectors along a query sequence (see run()) and outputs the results
    returned by price() to a delimited results file.

    Info:str
    - Fitness = Fitness measurement [cons]
    - Phenotype = Phenotype measurement [cons]
    - ResFile = Results file [price.tdt]
    - SeqGroup = Sequence grouping method [triplets]
    - Special = Instigate special run, e.g. allbyall [None]

    Opt:boolean
    - NormFit = Normalise fitness to have mean of 1 [False]
    - QryGaps = Whether to include gaps in the query sequence as positions to score [False]
    - Weighted = Weight the mean covariance by size of group [False]
      (NB. _setAttributes() sets Weighted to True - TODO confirm which default is intended)

    Stat:numeric

    List:list
    - Batch = List of alignment files to use as input [*.fas,*.fasta]
    - Fitness = Fitness measurement vector (matches query sequence) 
    - Phenotype = Phenotype measurement (matches query sequence) 
    - SeqGroup = Sequence grouping method (matches query sequence)

    Dict:dictionary    

    Obj:RJE_Objects
    - SeqList = Sequence list object
    '''
#########################################################################################################################
    def qry(self):  ### Returns the query sequence of the SeqList object
        '''Returns the 'QuerySeq' object stored by the SeqList object.'''
        seqlist = self.obj['SeqList']
        return seqlist.obj['QuerySeq']
    def seqs(self):  ### Returns the sequences of the SeqList object
        '''Returns whatever the SeqList object's own seqs() method returns.'''
        seqlist = self.obj['SeqList']
        return seqlist.seqs()
    def dna(self):  ### Returns the dna() status of the SeqList object
        '''Returns whatever the SeqList object's own dna() method returns.'''
        seqlist = self.obj['SeqList']
        return seqlist.dna()
#########################################################################################################################
    ### <1> ### Class Initiation etc.: sets attributes                                                                  #
#########################################################################################################################
    def _setAttributes(self):   ### Sets Attributes of Object
        '''Sets the attribute name lists, applies the default values and makes the SeqList object.'''
        ### ~ Attribute names ~ ###
        self.infolist = ['Fitness','Phenotype','ResFile','SeqGroup','Special']
        self.optlist = ['QryGaps','NormFit','Weighted']
        self.statlist = []
        self.listlist = ['Batch','Fitness','Phenotype','SeqGroup']
        self.dictlist = []
        self.objlist = ['SeqList']
        ### ~ Blanket defaults, followed by the class-specific values ~ ###
        info_defaults = {'Fitness':'cons','Phenotype':'cons','SeqGroup':'triplets','ResFile':'price.tdt'}
        opt_defaults = {'Append':True,'Weighted':True}
        self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True)
        self.setInfo(info_defaults)
        self.setOpt(opt_defaults)
        self.list['Batch'] = ['*.fas','*.fasta']
        ### ~ Sequence list: query=1 goes before the commandline options and autoload=T after them ~ ###
        seqcmd = ['query=1'] + self.cmd_list + ['autoload=T']
        self.obj['SeqList'] = rje_seq.SeqList(self.log,seqcmd)
#########################################################################################################################
    def _cmdList(self):     ### Sets Attributes from commandline
        '''
        Sets attributes according to commandline parameters:
        - see .__doc__ or run with 'help' option
        '''
        for cmd in self.cmd_list:
            try:
                self._generalCmd(cmd)   ### General Options ### 
                ### Class Options ### 
                self._cmdReadList(cmd,'info',['Fitness','Phenotype','SeqGroup','Special'])
                self._cmdReadList(cmd,'file',['ResFile'])
                self._cmdReadList(cmd,'opt',['QryGaps','NormFit','Weighted'])
                self._cmdReadList(cmd,'list',['Batch'])
            except: self.errorLog('Problem with cmd:%s' % cmd)
#########################################################################################################################
    ### <2> ### Main Class Backbone                                                                                     #
#########################################################################################################################
    def run(self,batch=False):  ### Main run method
        '''
        Main run method.
        >> batch:bool [False] = True for the nested calls made by the batch and all-by-all loops in this method. These
           skip the results file setup and the search for batch input files.

        With no sequences loaded (and batch=False), each file matching list['Batch'] is loaded into a new SeqList
        and run in turn. With info['Special'] = 'allbyall', every sequence is used as query against each later
        sequence in turn. Otherwise the fitness, phenotype and grouping vectors are calculated, checked against the
        query length and the price() results are output to info['ResFile'].

        Raises ValueError if a vector length does not match the query sequence length. Any exception is logged and
        re-raised.
        '''
        try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not batch: self.setupResults()
            ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not batch and not self.obj['SeqList'].seqs():    ### Look for batch files and run for each
                batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0)
                self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles)))
                if not batchfiles: self.errorLog('No input files found!',printerror=False)
                else:
                    bx = 0
                    for infile in batchfiles:
                        bx += 1
                        self.printLog('#BATCH','Batch running %s' % infile)
                        bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile]
                        self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd)
                        self.run(batch=True)
                        self.opt['Append'] = True   # Later runs add to the same results file
                        self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False)
                if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32
                return
            ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.info['Special'].lower() == 'allbyall':
                self.printLog('#RUN','Performing special "all-by-all" pairwise run')
                self.info['Special'] = ''   # Cleared so that the nested run() calls do a normal run
                for i in range(len(self.seqs())-1):
                    self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i]
                    for j in range(i+1,len(self.seqs())):
                        self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1)  # Sequence numbers count from 1
                        self.run(batch=True)
                        self.opt['Append'] = True
                self.info['Special'] = 'allbyall'; return
            ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.setup()
            ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.fitness()
            self.phenotype()
            self.grouping()
            qry = self.qry()
            for vector in ['Fitness','Phenotype','SeqGroup']:
                if len(self.list[vector]) != qry.seqLen():
                    # The message has four fields: the query name was missing, so the formatting itself raised TypeError
                    self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),qry.shortName(),qry.seqLen()),printerror=False)
                    raise ValueError
            results = self.price()
            ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True)
            results['Query'] = self.qry().shortName()
            results['Fitness'] = self.info['Fmethod']
            results['Phenotype'] = self.info['Pmethod']
            results['SeqGroup'] = self.info['SeqGroup']
            rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results)
            self.printLog('#OUT','Results output to %s' % self.info['ResFile'])
        except:
            self.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
#########################################################################################################################
    def setupResults(self):    ### Main results setup method.
        '''
        Main results setup method. Stores the results field names in list['Headers'] and passes them with
        info['ResFile'] to rje.delimitedFileOutput() (rje_backup=True). Errors are logged and re-raised.
        '''
        try:
            headers = ['Dataset','Query','Fitness','Phenotype','SeqGroup','CovP','CovB','CovW','Price','Ratio']
            self.list['Headers'] = headers
            rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],rje_backup=True)
        except:
            self.errorLog('Problem during %s setupResults().' % self)
            raise
#########################################################################################################################
    def setup(self):    ### Main class setup method.
        '''Main class setup method.'''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            seqlist = self.obj['SeqList'] 
            seqlist._checkAln(aln=True,realign=True)
            if not seqlist.obj['QuerySeq']:
                seqlist.obj['QuerySeq'] = seqlist.seqs()[0]
                self.printLog('#QRY','No query sequence: will use %s' % seqlist.obj['QuerySeq'].shortName())
        except: self.errorLog('Problem during %s setup.' % self); raise
#########################################################################################################################
    ### <3> ### Price Equation Methods                                                                                  #
#########################################################################################################################
    def fitness(self):  ### Calculates fitness vector
        '''
        Calculates fitness vector and stores it in self.list['Fitness'].

        info['Fitness'] (case-insensitive for keywords) selects the method:
        - 'cons' = self.posPercID() with no comparison sequence
        - an integer N = self.posPercID(comp=...) using sequence N, counting from 1. info['Fmethod'] is set to
          that sequence's short name.
        - an existing file = vector read with rje.listFromCommand(). The path is used with its original case.
        - 'seqnumber' is accepted as a keyword but no vector is calculated for it here.

        Raises ValueError (after logging) if the value is none of the above. All errors are logged and re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            methodlist = ['cons','seqnumber']
            rawmethod = self.info['Fitness']
            self.info['Fmethod'] = method = rawmethod.lower()
            if method not in methodlist:
                try:
                    try: seqnum = int(method)
                    except ValueError:
                        # Not a sequence number: only acceptable if it names a file of fitness values. Previously
                        # this raised before the file branch below could ever be reached.
                        if not os.path.exists(rawmethod): raise
                        method = rawmethod
                    else:
                        try: method = self.seqs()[seqnum-1]
                        except: self.errorLog('Cannot use sequence "%s" for comparison!' % seqnum); raise
                        self.info['Fmethod'] = method.shortName()
                except:
                    self.errorLog('Fitness method "%s" not recognised!' % method,printerror=False)
                    self.errorLog('Check fitness=%s' % '/'.join(methodlist),printerror=False)
                    raise ValueError
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if method == 'cons': self.list['Fitness'] = self.posPercID()
            elif method in self.seqs(): self.list['Fitness'] = self.posPercID(comp=method)
            elif os.path.exists(method):
                self.list['Fitness'] = rje.listFromCommand(method,checkfile=True)
                self.printLog('#FIT','Vector of %s fitness values read from %s' % (len(self.list['Fitness']),method))
            return
        except: self.errorLog(rje_zen.Zen().wisdom()); raise
#########################################################################################################################
    def phenotype(self):  ### Calculates phenotype vector
        '''
        Calculates phenotype vector and stores it in self.list['Phenotype'].

        The method is taken from self.info['Phenotype'] and can be:
        - cons = values from self.posPercID() with no comparison sequence.
        - hyd = values from rje_sequence.eisenbergHydropathy() for the query sequence.
        - a sequence number N (1 to number of sequences) = values from self.posPercID(comp=X), where X is the Nth
          sequence returned by self.seqs().
        - the path to an existing file = values read using rje.listFromCommand().
        self.info['Pmethod'] is set to the method used (the short name of the sequence when a number is given).
        Raises ValueError if the method is none of the above.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # 'seqnumber' is listed for the help message. Given literally, nothing is calculated unless a file of
            # that name exists.
            methodlist = ['cons','seqnumber','hyd']
            rawmethod = self.info['Phenotype']  # Original case is kept for use as a file path
            self.info['Pmethod'] = method = rawmethod.lower()
            if method not in methodlist:
                try: seqnum = int(method)
                except ValueError: seqnum = None
                if seqnum is not None:      # Sequence number (1 to N)
                    seqs = self.seqs()
                    # Reject 0 and negatives explicitly: they would otherwise index from the end of the list
                    if seqnum < 1 or seqnum > len(seqs):
                        self.errorLog('Cannot use sequence "%s" for comparison!' % seqnum,printerror=False)
                        raise ValueError
                    method = seqs[seqnum-1]
                    self.info['Pmethod'] = method.shortName()
                elif os.path.exists(rawmethod):     # File of values: must be checked before rejecting the method
                    self.info['Pmethod'] = method = rawmethod
                else:
                    self.errorLog('Phenotype method "%s" not recognised!' % method,printerror=False)
                    self.errorLog('Check phenotype=%s' % '/'.join(methodlist),printerror=False)
                    raise ValueError
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if method == 'cons': self.list['Phenotype'] = self.posPercID()
            elif method in self.seqs(): self.list['Phenotype'] = self.posPercID(comp=method)
            elif method == 'hyd': self.list['Phenotype'] = rje_sequence.eisenbergHydropathy(self.qry().info['Sequence'],returnlist=True)
            elif os.path.exists(method):
                self.list['Phenotype'] = rje.listFromCommand(method,checkfile=True)
                self.printLog('#PHEN','Vector of %s phenotype values read from %s' % (len(self.list['Phenotype']),method))
        except: self.errorLog(rje_zen.Zen().wisdom()); raise   
#########################################################################################################################
    def grouping(self):  ### Calculates grouping vector
        '''
        Calculates grouping vector and stores it in self.list['SeqGroup'].

        The method is taken from self.info['SeqGroup'] and can be:
        - triplets / codons / casechange / case = vector returned by the method of that name.
        - disorder = gapped disorder scores of the query, converted to 'Dis' (score above the IUCut stat of the
          query Disorder object) or 'Ord'. Positions with no score (empty or zero value) are left unchanged.
        - the path to an existing file = values read using rje.listFromCommand().
        Raises ValueError if the method is none of the above.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            methodlist = ['triplets','codons','casechange','case','disorder']
            rawmethod = self.info['SeqGroup']   # Original case is kept for use as a file path
            method = rawmethod.lower()
            # A file path is never in methodlist, so file existence must be checked before rejecting the method
            if method not in methodlist and not os.path.exists(rawmethod):
                self.errorLog('SeqGroup method "%s" not recognised!' % method,printerror=False)
                self.errorLog('Check seqgroup=%s' % '/'.join(methodlist),printerror=False)
                raise ValueError
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if method == 'triplets': self.list['SeqGroup'] = self.triplets()
            elif method == 'codons': self.list['SeqGroup'] = self.codons()
            elif method == 'casechange': self.list['SeqGroup'] = self.caseChange()
            elif method == 'case': self.list['SeqGroup'] = self.case()
            elif method == 'disorder':
                qry = self.qry()
                if self.opt['QryGaps']: self.list['SeqGroup'] = qry.gappedDisorder()
                else: self.list['SeqGroup'] = qry.gappedDisorder(gap=None)
                scores = self.list['SeqGroup']      # Same list object: converted in place
                for i in range(qry.seqLen()):
                    if scores[i]:
                        if scores[i] > qry.obj['Disorder'].stat['IUCut']: scores[i] = 'Dis'
                        else: scores[i] = 'Ord'
            else:   # Not a keyword, so must be the existing file checked during setup
                self.list['SeqGroup'] = rje.listFromCommand(rawmethod,checkfile=True)
                self.printLog('#GRP','Vector of %s group values read from %s' % (len(self.list['SeqGroup']),rawmethod))
        except: self.errorLog(rje_zen.Zen().wisdom()); raise   
#########################################################################################################################
    def price(self):  ### Calculates price equation, using Fitness, Phenotype and SeqGroup vectors
        '''
        Calculates price equation, using Fitness, Phenotype and SeqGroup vectors.

        Positions with an empty/zero SeqGroup value are ignored. Fitness values are divided by their mean if
        self.opt['NormFit'] is set. The within-group covariance is the mean of the group covariances, weighted by
        group size if self.opt['Weighted'] is set.
        << dictionary of {'CovP','CovB','CovW','Price','Ratio'} values formatted with rje.expectString(). Ratio is
           CovB/CovW, or -1 if that division fails.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pop = {'z':[],'w':[]}       # Whole population: w = fitness, z = phenotype
            grp = {}                    # {group:{'z':[],'w':[]}} plus 'cov' once calculated
            grpmean = {'z':[],'w':[]}   # Mean z and w of each group
            grpcov = []                 # Covariance of each group
            ### ~ [2] Populate data vectors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for vector in ('SeqGroup','Fitness','Phenotype'): self.deBug(self.list[vector])
            for (pos,g) in enumerate(self.list['SeqGroup']):
                if not g: continue      # Position not assigned to a group
                w = self.list['Fitness'][pos]
                z = self.list['Phenotype'][pos]
                pop['z'].append(z)
                pop['w'].append(w)
                members = grp.setdefault(g,{'z':[],'w':[]})
                members['z'].append(z)
                members['w'].append(w)
            ## ~ [2a] Normalise fitness? ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['NormFit']:
                meanfit = float(rje.meansd(pop['w'])[0])
                pop['w'] = [fit / meanfit for fit in pop['w']]
                for g in grp: grp[g]['w'] = [fit / meanfit for fit in grp[g]['w']]
            ## ~ [2b] Group means ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            covw = 0.0                      # Mean covariance within groups
            popsize = len(pop['w'])
            grpnum = len(grp)
            for g in grp:
                gcov = self.covariance(grp[g]['z'],grp[g]['w'])
                grp[g]['cov'] = gcov
                grpcov.append(gcov)
                if self.opt['Weighted']: covw += gcov * len(grp[g]['w']) / popsize
                else: covw += gcov / grpnum
                grpmean['z'].append(rje.meansd(grp[g]['z'])[0])
                grpmean['w'].append(rje.meansd(grp[g]['w'])[0])
            ### ~ [3] Calculate within and between group covariance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            covp = self.covariance(pop['z'],pop['w'])           # Covariance of whole population
            covb = self.covariance(grpmean['z'],grpmean['w'])   # Covariance between groups
            price = covp / rje.meansd(pop['w'])[0]
            try: ratio = covb / covw
            except: ratio = -1
            ## ~ [3a] Perform checks of calculation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.printLog('#CHECK','CovP = %s; (CovB + CovW) = %s' % (rje.expectString(covp),rje.expectString(covb+covw)))
            self.printLog('#PRICE','Price value = %s; CovB/CovW ratio = %s' % (rje.expectString(price),rje.expectString(ratio)))
            results = {}
            for (key,value) in [('CovP',covp),('CovB',covb),('CovW',covw),('Price',price),('Ratio',ratio)]:
                results[key] = rje.expectString(value)
            return results
        except: self.errorLog(rje_zen.Zen().wisdom()); raise   
#########################################################################################################################
    def covariance(self,list1,list2):   ### Calculates the covariance of two lists and returns
        '''
        Calculates the covariance of two lists and returns.
        >> list1:list = first list of values
        >> list2:list = second list of values, which must be the same length as list1
        << covariance returned by the module-level covariance() function, or 0.0 if list1 is empty.
        Raises ValueError if the lists are of different lengths.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            size = len(list1)
            if size == 0:
                self.errorLog('Lists for covariance are empty!',printerror=False)
                return 0.0
            if size != len(list2):
                self.errorLog('Lists for covariance of different lengths!',printerror=False)
                raise ValueError
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            return covariance(list1,list2)  # Module-level function, not this method
        except: self.errorLog(rje_zen.Zen().wisdom()); raise   
#########################################################################################################################
    ### <4> ### Fitness/Phenotype Methods                                                                               #
#########################################################################################################################
    def posPercID(self,gaps=True,xval=0.0,default=1.0,comp=None):  ### Returns a list of absolute percentage conservation across each position
        '''
        Returns a list of absolute percentage conservation across each position.
        >> gaps:bool [True] = Whether to include gapped sequences in calculation [True]
        >> xval:num [0.0] = The value (0-1) to give undefined residues matching defined residues
        >> default:num [1.0] = Value to return if no homologues for position
        >> comp:Sequence object = sequence for pairwise comparison
        << list of values (0-1), one per position of the query: the proportion of compared sequences that match
           the query at that position (case-insensitive).
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            qry = self.qry()
            if comp: compseq = [comp]
            else: compseq = self.seqs()[0:]; compseq.remove(qry)    # All sequences except the query
            qlen = qry.seqLen()
            poslist = [default] * qlen    # List of percentage ID values
            xval = min(1.0,max(0.0,xval))
            # Upper-case each sequence once, rather than once per position per sequence
            qseq = qry.info['Sequence'].upper()
            compstr = [seq.info['Sequence'].upper() for seq in compseq]
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for r in range(qlen):
                q = qseq[r]
                i = 0.0; n = 0      # i = summed identity score; n = number of sequences compared
                for cseq in compstr:
                    s = cseq[r]
                    if s == q: i += 1; n += 1
                    elif 'X' in [s,q]: i += xval; n += 1
                    elif s == '-' and not gaps: continue    # Gapped sequence excluded from comparison
                    else: n += 1
                if n: poslist[r] = i / n
            return poslist
        except: self.errorLog(rje_zen.Zen().wisdom()); raise
#########################################################################################################################
    ### <5> ### Grouping Methods                                                                                        #
#########################################################################################################################
    def triplets(self): ### Returns grouping vector based on DNA triplets
        '''
        Returns grouping vector based on DNA triplets.
        << list with one group number per query position: each run of three grouped positions shares a number,
           starting at 1. Positions not grouped keep the value 0.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            qry = self.qry()
            qlen = qry.seqLen()
            grplist = [0] * qlen    # List of groups (0 = no group)
            gx = 0                  # Number of triplet groups
            qseq = qry.info['Sequence'].upper()     # Upper-cased once, rather than once per position
            # NOTE(review): gap positions are skipped when QryGaps is True - confirm this is the intended sense.
            skipgaps = self.opt['QryGaps']
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            px = 0                  # Number of positions assigned to groups so far
            for r in range(qlen):
                if skipgaps and qseq[r] == '-': continue
                if px % 3 == 0: gx += 1     # First position of a new triplet
                grplist[r] = gx
                px += 1
            self.printLog('#GRP','%s triplet groups from %s%s' % (rje.integerString(gx),rje.integerString(qlen),self.obj['SeqList'].units()))
            return grplist
        except: self.errorLog(rje_zen.Zen().wisdom()); raise           
#########################################################################################################################
    def codons(self): ### Returns grouping vector based on DNA codon positions (three groups)
        '''
        Returns grouping vector based on DNA codon positions (three groups).
        << list with one group number per query position, cycling 1,2,3 over the grouped positions. Positions not
           grouped keep the value 0.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            qry = self.qry()
            qlen = qry.seqLen()
            grplist = [0] * qlen    # List of groups (0 = no group)
            qseq = qry.info['Sequence'].upper()     # Upper-cased once, rather than once per position
            # NOTE(review): gap positions are skipped when QryGaps is True - confirm this is the intended sense.
            skipgaps = self.opt['QryGaps']
            ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            px = 0                  # Number of positions assigned to groups so far
            for r in range(qlen):
                if skipgaps and qseq[r] == '-': continue
                grplist[r] = (px % 3) + 1   # Codon position 1, 2 or 3
                px += 1
            self.printLog('#GRP','3 codon groups from %s%s' % (rje.integerString(qlen),self.obj['SeqList'].units()))
            return grplist
        except: self.errorLog(rje_zen.Zen().wisdom()); raise           
#########################################################################################################################
    def caseChange(self):     ### Returns groupings based on Case boundaries of query
        '''
        Returns groupings based on Case boundaries of query.
        << grplist:list = one group number per query position. Numbering starts at 1 and is incremented at each
           position whose case ('UC' or 'LC') differs from that of the previous position.
        '''
        # NOTE(review): no except clause for this try is visible in this excerpt - the method appears truncated.
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            qry = self.qry()
            self.deBug(qry.getSequence(case=True))
            grplist = ['UC'] * qry.seqLen()    # Case of each position: all 'UC' until lower case regions are mapped
            ### ~ [1] Map Case ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Each (start,end) pair is treated as 1-based and inclusive: positions start-1 to end-1 become 'LC'
            for (start,end) in self.qry().dict['Case']['Lower']:
                for i in range(start-1,end): grplist[i] = 'LC'
            caselist = grplist[0:]      # Copy of the case map; grplist is overwritten with group numbers below
            gx = 1                      # Current group number
            for r in range(qry.seqLen()):
                q = qry.info['Sequence'].upper()[r]
                # NOTE(review): the 0 assigned on the next line is immediately overwritten by the unconditional
                # grplist[r] = gx below, so its only effect is to skip the case-change test at that position.
                # Confirm whether gapped positions were meant to stay ungrouped (0).
                if not self.opt['QryGaps'] and q == '-': grplist[r] = 0
                elif r > 0 and caselist[r] != caselist[r-1]: gx += 1
                grplist[r] = gx
            self.printLog('#GRP','%s case groups from %s%s' % (rje.integerString(gx),rje.integerString(qry.seqLen()),self.obj['SeqList'].units()))
            self.deBug(grplist)
            return grplist
Esempio n. 35
0
    def convert(self, filelist=None, outfile=None):  ### Converts scansite output files in FileList to Outfile
        '''
        Converts scansite output files in FileList to Outfile.
        >> filelist:list [None] = scansite output files to convert. If empty, self.list['FileList'] is used.
        >> outfile:str [None] = output file name. If not given, self.info['Name'] is used.
        << True if the conversion ran; False if there were no files to convert.
        Only lines matching re_scansite are written to the output and counted as results.
        '''
        _stage = 'Setup'
        try:
            ### Setup ###
            if not filelist:
                filelist = self.list['FileList']
            if not outfile:
                outfile = self.info['Name']
            if len(filelist) < 1:
                self.log.errorLog(
                    'No scansite files to convert! %s unchanged/not made.' %
                    outfile,
                    printerror=False)
                return False
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            if ext != outfile[-3:]:  # Offer to match the file extension to the delimiter
                newfile = outfile[:-3] + ext
                if rje.yesNo('Change file name from %s to %s?' %
                             (outfile, newfile)):
                    outfile = newfile
            self.log.printLog(
                '#OUT', 'Converting %d file(s), output to %s.' %
                (len(filelist), outfile))

            ### Output File ###
            _stage = 'Output File'
            newout = not self.opt['Append'] or not os.path.exists(outfile)
            if newout:
                OUTFILE = open(outfile, 'w')
            else:
                OUTFILE = open(outfile, 'a')
            sx = 0  # Total number of results output
            try:  # Output file is closed even if conversion fails
                if newout:  # Create with header
                    headers = [
                        'seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score',
                        'percentile', 'matchseq', 'sa'
                    ]
                    rje.writeDelimit(OUTFILE, headers, delimit)

                ### Conversion ###
                _stage = 'Conversion'
                for infile in filelist:
                    if not os.path.exists(infile):
                        self.log.errorLog(
                            'Input file %s does not exist! :o(' % infile,
                            False, False)
                        continue
                    fx = 0  # Number of results from this file
                    INFILE = open(infile, 'r')
                    try:
                        inline = rje.nextLine(INFILE)
                        while inline is not None:
                            scanlist = rje.matchExp(re_scansite, inline)
                            if scanlist:  # Non-matching lines are not results
                                rje.writeDelimit(OUTFILE, scanlist, delimit)
                                sx += 1
                                fx += 1
                                rje.progressPrint(self, sx)
                            inline = rje.nextLine(INFILE)
                    finally:
                        INFILE.close()
                    self.log.printLog(
                        '#OUT', '%s scansite results from %s. (%s Total.)' %
                        (rje.integerString(fx), infile,
                         rje.integerString(sx)))
            finally:
                OUTFILE.close()

            ### End ###
            _stage = 'End'
            self.log.printLog(
                '#OUT', '%s scansite results output to %s.' %
                (rje.integerString(sx), outfile))
            return True
        except:
            self.log.errorLog('Error in convert(%s)' % _stage,
                              printerror=True,
                              quitchoice=False)
            raise
Esempio n. 36
0
 def tabulatePPIRegion(self):  ### Tabulates regions of known PPI from DAT file
     '''
     Tabulates regions of known PPI from DAT file.

     Runs grep on the (hard-coded) UniFake DAT file to extract ID lines and interaction REGION features, then
     writes one row per protein/region/interactor to ppi_region.tdt.
     << True once tabulated. If ppi_region.tdt exists and self.opt['Force'] is not set, nothing is done and the
        return value of self.printLog() is returned.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']:
             return self.printLog('#REGTAB',
                                  '%s found. (Force=F)' % tabfile)
         headers = ['Protein', 'Start', 'End', 'Interactor']
         rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
         id_regex = r'ID   (\S+)'
         region_regex = r'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)'
         ppi_regex = r'^([A-Z0-9][A-Z0-9]+)'
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP', gcmd)
         GREP = os.popen(gcmd)
         try:
             glines = GREP.readlines()
         finally:
             GREP.close()
         prot = None  # Protein of the most recent ID line
         rx = 0  # Number of rows output
         proteins = set()  # Sets used as only the counts of unique names are reported
         interactors = set()
         for gline in glines:
             idmatch = rje.matchExp(id_regex, gline)
             if idmatch:
                 prot = idmatch[0]
             region = rje.matchExp(region_regex, gline)
             if not region:
                 continue
             (rstart, rend, rint) = region
             for ppi in rint.split():
                 ppimatch = rje.matchExp(ppi_regex, ppi)
                 if not ppimatch:
                     continue  # Word does not look like an interactor ID
                 datadict = {
                     'Protein': prot,
                     'Start': rstart,
                     'End': rend,
                     'Interactor': ppimatch[0]
                 }
                 rje.delimitedFileOutput(self,
                                         tabfile,
                                         headers,
                                         datadict=datadict)
                 rx += 1
                 proteins.add(prot)
                 interactors.add(datadict['Interactor'])
                 self.progLog(
                     '\r#REGTAB',
                     'Tabulating regions: %s proteins; %s interactors; %s regions'
                     % (rje.integerString(len(proteins)),
                        rje.integerString(len(interactors)),
                        rje.integerString(rx)))
         self.printLog(
             '\r#REGTAB',
             'Tabulated regions (%s proteins; %s interactors; %s regions) => %s'
             % (rje.integerString(len(proteins)),
                rje.integerString(len(interactors)),
                rje.integerString(rx), tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 37
0
 def readHMMPFamSearch(self,resfile=None,readaln=False):  ### Reads HMM PFam Search Results into objects    
     '''
     Reads HMM Search Results into objects.
     >> resfile:str = Results File (set as self.info['OutFile'])
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
     << True if the results were read; False if the file is missing, is not an hmmpfam results file, or an error
        occurred (errors are logged, not raised).
     If self.opt['CleanRes'] is set, the results file is replaced with a reduced version that keeps only the
     sequences with hits, plus a final line recording the number of sequences in the original file.
     '''
     try:
         ### Setup ###
         if not resfile or not os.path.exists(resfile):
             self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
             return False
         ## RegExp for one line of the "Parsed for domains" table and for the reduced file footer ##
         re_domain = r'^(\S+)\s+\S+\s+(\d+)\s+(\d+)\D.+\s+(\S+)\s+(\S+)\s*$'
         re_reduced = r'End of rje_hmm reduced results file: (\d+) sequences in original'
         ## Search dictionary as results come back per sequence, not per HMM! ##
         pfam = {}   # Dictionary of {PFam name:search}
         hitx = 0    # Total number of hits
         ### Read in Search results ###
         HEADFILE = open(resfile,'r')
         try: firstline = HEADFILE.readline()
         finally: HEADFILE.close()
         if firstline.find('hmmpfam') != 0:
             self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
             if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                 os.unlink(resfile)
                 self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
             return False
         hitname = None      # Name of the query sequence currently being read
         hx = 0; seqx = 0    # Sequences read from file; sequences in original file (from reduced file footer)
         newresout = True; newresfile = '%s.partial' % resfile
         if os.path.exists(newresfile): os.unlink(newresfile)

         def appendReduced(lines):   ### Appends lines to the reduced results file and closes it
             PARTIAL = open(newresfile,'a')
             try: PARTIAL.write('\n'.join(lines))
             finally: PARTIAL.close()

         RESFILE = open(resfile,'r')
         try:    # Results file must be closed before it is deleted and replaced below
             line = RESFILE.readline()
             newres = [rje.chomp(line)]      # Lines of the section (header or sequence) being read
             while line:
                 self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                 line = rje.chomp(line)
                 newseq = rje.matchExp(r'^Query sequence:\s+(\S+)',line)
                 ## New Sequence ##
                 if newseq:
                     # Previous section is only kept if it was the header or had hits
                     if newres and newresout and self.opt['CleanRes']: appendReduced(newres)
                     newres = ['',line]; newresout = False
                     hitname = newseq[0]; hx += 1
                 ## One Line Data for hits ##
                 elif line.find('Parsed for domains:') == 0:
                     # Skip the two table header lines
                     newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                     line = rje.chomp(RESFILE.readline()); newres.append(line)
                     #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                     #--------        ------- ----- -----    ----- -----      -----  -------
                     #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                     # ... else ...
                     #         [no hits above thresholds]
                     domhit = rje.matchExp(re_domain,line)
                     while domhit:
                         newresout = True
                         (dom,start,end,score,expect) = domhit
                         if dom not in pfam:
                             pfam[dom] = self._addSearch()
                             pfam[dom].info['Name'] = dom
                         hit = pfam[dom]._addHit()
                         hit.info['Name'] = hitname
                         aln = hit._addAln()
                         aln.setStat({'SbjStart':int(start),'SbjEnd':int(end),'Expect':float(expect),'BitScore':float(score)})
                         hitx += 1
                         self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                         line = rje.chomp(RESFILE.readline()); newres.append(line)
                         domhit = rje.matchExp(re_domain,line)
                 ## End of Protein ##
                 elif line[:2] == '//': hitname = None; newres.append(line)
                 ## Footer of a previously reduced file: not copied, as a new footer is written below ##
                 elif rje.matchExp(re_reduced,line):
                     seqx = int(rje.matchExp(re_reduced,line)[0])
                 elif newres: newres.append(line)
                 line = RESFILE.readline()
         finally: RESFILE.close()
         if newres and newresout and self.opt['CleanRes']: appendReduced(newres)
         if not seqx: seqx = hx
         if self.opt['CleanRes']:
             appendReduced(['','End of rje_hmm reduced results file: %d sequences in original' % seqx])
             os.unlink(resfile)
             os.rename(newresfile,resfile)
             self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
         self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
         return True
     except:
         self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
         return False
Esempio n. 38
0
 def setup(self):    ### Loads data into attributes.
     '''
     Loads data into attributes.

     Populates self.dict['Entry'], ['Seq'] and ['Domain'] from UniProt entries (or ['Entry'] and ['Seq'] from a
     sequence list if no UniProt entries are read), then self.dict['PPI'] and ['Gene'] from the PPI table,
     self.dict['DDI'] from the DDI table and self.dict['Fam'] from the family table.
     Errors are logged (with the option to quit) rather than raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log,self.cmd_list)
         uniprot.readUniProt()
         if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F'])
             for entry in uniprot.list['Entry']:
                 seq = entry.obj['Sequence']
                 seqlist.seq.append(seq)
                 name = seq.shortName()
                 self.dict['Entry'][name] = entry
                 self.dict['Seq'][name] = seq
                 for ft in entry.list['Feature']:
                     if ft['Type'] in self.list['DomFT']:
                         try:    # Domain name is the first word of the feature description
                             dom = ft['Desc'].split()[0]
                             if dom not in self.dict['Domain']: self.dict['Domain'][dom] = []
                             if name not in self.dict['Domain'][dom]: self.dict['Domain'][dom].append(name)
                         except: self.errorLog('Trouble with %s feature %s' % (name,ft))
         ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             seqlist = rje_seq.SeqList(self.log,self.cmd_list)
             for seq in seqlist.seq:
                 name = seq.shortName()
                 self.dict['Entry'][name] = None
                 self.dict['Seq'][name] = seq
                 #!# Consider adding loading domains from a table #!#
         ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         # NOTE(review): this bare lookup only checks that 'PPI' exists (KeyError if not). Unlike 'DDI' and 'Fam'
         # it is not reset here - presumably it is initialised elsewhere; confirm.
         self.dict['PPI']    # Dictionary of ShortName-centred 
         ppi = rje.dataDict(self,self.info['PPI'])
         for hub in ppi:
             if ppi[hub]['EnsLoci'] == '-': continue
             ens = ppi[hub]['EnsLoci']
             if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
             self.dict['Gene'][ens] = hub
             for gene in ppi[hub]['PPI'].split(','):
                 # Interactors without their own row (including an empty PPI field) cannot be mapped: skip them
                 # rather than abandon the whole setup with a KeyError
                 if gene not in ppi: continue
                 if ppi[gene]['EnsLoci'] == '-': continue
                 if ppi[gene]['EnsLoci'] not in self.dict['PPI'][ens]: self.dict['PPI'][ens].append(ppi[gene]['EnsLoci'])
         ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['DDI'] = {}
         if self.info['DDI'].lower() not in ['','none']:                    
             data = rje.dataDict(self,self.info['DDI'],mainkeys=['Name1'],datakeys=['Name2'],
                                 headers=['Pfam1','Pfam2','Name1','Name2','Acc1','Acc2','Code1','Code2'],lists=True)
             ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
             (dx,dtot) = (0.0,len(data))
             self.deBug(data)
             for p1 in rje.sortKeys(data):
                 self.progLog('\r#DDI','Parsing DDI from iPFam: %.1f%%' % (dx/dtot))
                 dx += 100.0     # Progress is reported as a percentage of domains parsed
                 if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                 for p2 in data[p1]['Name2']:    # Interactions are stored in both directions
                     if p2 not in self.dict['DDI']: self.dict['DDI'][p2] = []
                     if p2 not in self.dict['DDI'][p1]: self.dict['DDI'][p1].append(p2)
                     if p1 not in self.dict['DDI'][p2]: self.dict['DDI'][p2].append(p1)
             self.printLog('\r#DDI','Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot)))
         ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['Fam'] = {}
         if self.info['Fam'].lower() not in ['','none']:                    
             data = rje.dataDict(self,self.info['Fam'],mainkeys=['Qry'],datakeys=['Hit'],lists=True)
             for qry in self.dict['Seq']:
                 self.dict['Fam'][qry] = []
                 # Family hits are looked up by short name first, then by accession number
                 if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                 elif self.dict['Seq'][qry].info['AccNum'] in data: self.dict['Fam'][qry] = data[self.dict['Seq'][qry].info['AccNum']]['Hit']
                 if qry not in self.dict['Fam'][qry]: self.dict['Fam'][qry].append(qry)  # Each sequence is in its own family
     except: self.errorLog('Problem with SLiMPID.setup()',quitchoice=True)
Esempio n. 39
0
    def parseOMIM(self):  ### Main parsing method
        '''
        Main parsing method. Reads the OMIM text file self.info['Name'] line by line and populates:
        - self.dict['Records'] = {gene:[record,...]}, where record is the line that follows '*FIELD* NO'.
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}.
        The gene is the last whitespace-separated word of the line that follows '*FIELD* TI'. Once '*FIELD* AV' has
        been read, each line starting '.<digits>' gives a subid; the next line is stored as the disease and the line
        after that must match '<gene>, <3 non-digits><digits><3 non-digits>' for the mutation to be kept. Both
        three-character codes must also be found in the upper-cased values of rje_sequence.aa_code_3.
        self.saveMutations() is called once parsing is complete. Any exception is logged and then re-raised.
        '''
        OMIM = None
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            aas = string.split(
                string.join(rje_sequence.aa_code_3.values()).upper())
            ## ~ [1a] Count lines for progress reporting ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            COUNT = open(self.info['Name'], 'r')  # Raises IOError if the file is missing
            try:
                olen = len(COUNT.readlines())
            finally:
                COUNT.close()  # This counting handle was previously left open
            olen = max(olen, 1)  # Empty file: avoid division by zero in the progress percentage
            (ox, mx) = (0.0, 0)
            OMIM = open(self.info['Name'], 'r')

            ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            record = gene = subid = disease = mutation = ''
            av = False  # Whether reading *FIELD* AV for mutation data
            oline = True  # The file is known to exist here: open() above would have raised otherwise
            while oline:
                oline = OMIM.readline()  # '' at end of file, which ends the loop after this pass
                self.log.printLog(
                    '\r#OMIM',
                    'Processing OMIM: %.2f%% (%s genes)' %
                    (ox / olen, rje.integerString(len(self.dict['Records']))),
                    newline=False,
                    log=False)
                ox += 100.0  # ox counts lines read x100, so ox / olen is a percentage
                if not av and oline[:1] != '*': continue  # Outside AV data only '*' tag lines matter
                line = rje.chomp(oline).rstrip(' ')
                ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if line == '*RECORD*': (record, av) = ('', False)
                elif line == '*FIELD* NO':  # New record
                    record = rje.chomp(OMIM.readline())
                    gene = ''
                    ox += 100.0
                ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line == '*FIELD* TI':  # New gene
                    gene = string.split(rje.chomp(OMIM.readline()))[-1]
                    subid = ''
                    av = False
                    ox += 100.0
                ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                elif line == '*FIELD* AV':
                    av = True  # Start of mutation records
                elif av and rje.matchExp('^(\.\d+)',
                                         line):  # New subid mutation record
                    subid = rje.matchExp('^(\.\d+)', line)[0]
                    disease = rje.chomp(OMIM.readline())
                    ox += 100.0
                    try:
                        mutation = rje.matchExp(
                            '^%s, (\D\D\D\d+\D\D\D)' % gene,
                            rje.chomp(OMIM.readline()))[0]
                    except Exception:  # Was a bare except, which also swallowed KeyboardInterrupt/SystemExit
                        continue  # No mutation or not coding change
                    ox += 100.0
                    subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)', mutation)
                    if subaa[0] not in aas or subaa[1] not in aas: continue  # Not an amino acid substitution
                    if gene not in self.dict['Records']:
                        self.dict['Records'][gene] = [record]
                    if record not in self.dict['Records'][gene]:
                        self.dict['Records'][gene] += [record]
                    if gene not in self.dict['Mutations']:
                        self.dict['Mutations'][gene] = {}
                    mx += 1
                    self.dict['Mutations'][gene][subid] = (disease, mutation)

            ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            OMIM.close()
            self.log.printLog(
                '\r#OMIM',
                'Processing OMIM complete! (%s genes; %s mutations)' %
                (rje.integerString(len(
                    self.dict['Records'])), rje.integerString(mx)))
            self.saveMutations()
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise  # Delete this if method error not terrible
        finally:
            if OMIM is not None: OMIM.close()  # Releases the handle on error; harmless if already closed above
Esempio n. 40
0
 def _setupMapped(self):     ### Sets up list of Previously Mapped Sequences
     '''
     Sets up list of Previously Mapped Sequences.
     Resets self.list['Mapped'] to an empty list. If self.bool['Append'] is True and the self.str['MapFas'] file
     exists, the shortName() of every sequence read from that file is added to self.list['Mapped'].
     '''
     ### Setup ###
     self.list['Mapped'] = []    # List of mapped sequence names
     if not self.bool['Append'] or not os.path.exists(self.str['MapFas']): return
     ### Previous Sequences ###
     seqlist = rje_seq.SeqList(None,['i=-1','v=-1','autoload=F','seqin=%s' % self.str['MapFas']])
     SEQFILE = open(self.str['MapFas'],'r')     # Was open(filename,'r'): "filename" is not defined in this method
     lastline = ''
     sx = 0
     ### Count ###
     try:
         while True:
             (nextseq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
             seqlist.seq = []    # Emptied after each read; only the names are wanted here
             if not nextseq: break
             sx += 1
             self.list['Mapped'].append(nextseq.shortName())
     finally: SEQFILE.close()    # Close the file even if reading fails part-way
     self.printLog('#MAP','Read names of %s previously mapped sequences for redundancy checking' % rje.integerString(sx))
Esempio n. 41
0
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method.
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     << {} if no sequences were loaded; the {searchname:mappedname} dictionary if returndict; otherwise None.
     Any exception (including the ValueError raised when self.setup() fails) is logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'    # seqlist.nt() true => search type switched to blastx
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         # NOTE(review): self.list['Mapped'] is only reset by _setupMapped() when returndict is False, yet it is read
         # in the loop below either way - presumably it is initialised elsewhere; confirm.
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0      # Number of query sequences read so far
         while seqlist.nextSeq() is not None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 # Log BEFORE clearing StartFrom: previously the name was blanked first, so the message never showed
                 # it. The current sequence is processed rather than skipped, hence sx-1.
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx-1))
                 self.str['StartFrom'] = ''
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
         # The BLAST results file is deleted unless debugging or testing
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
Esempio n. 42
0
    def pairwiseAQ(self,seqlist=None,query=None,focus=None):     ### Performs PAQ on seqlist, adding seq.info['PAQ']
        '''
        Performs PAQ on seqlist, adding seq.info['PAQ']
        >> seqlist:rje_seq.SeqList Object
        - NB. This object will itself have sequences removed from it, so beware!
        - A new info key will be added: PAQ = PAQ sequences with alignment Xs
        >> query:Sequence Object to use as query [seqlist.obj['QuerySeq']]. A random sequence from seqlist is used
           instead if self.opt['NoQuery'] is set or no query is available.
        >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. [None = [0,0]]
        << Nothing is returned. On completion seq.info['Sequence'] holds the sequence as it was on entry and
           seq.info['PAQ'] holds the X-masked version. Errors are logged (with the stage reached) and not re-raised.
        '''
        ### <PAQ0> ### Setup
        try:
            _stage = '<0> Setup'
            if focus is None: focus = [0,0]     # Fresh list per call rather than a shared mutable default
            haqlist = seqlist   # SeqList Object to store individually Xd sequences
            if not query:
                query = haqlist.obj['QuerySeq']
            if self.opt['NoQuery'] or not query:
                query = haqlist.seq[random.randint(0,haqlist.seqNum()-1)]
                self.log.printLog('#QRY','Temp (random) query %s assigned for PAQ' % query.shortName())
            #!# paqx = [False] * seqlist.seq[0].seqLen()    # List of whether a column of the alignment is bad (has an X) [True] or not [False]
            #!# - make this a method?!

            pwaq = {}    # Dictionary of lists of pairwise alignements
            block_align = {}    # Dictionary of whether residue in block of sequence that is well-aligned or not
            for seq in haqlist.seq:
                block_align[seq] = [False] * seq.seqLen()
                seq.info['PAQ'] = seq.info['Sequence'][0:]     # Keep a copy of the input sequence; swapped back in <PAQ5>
                if 'SAQX' in seq.info and len(seq.info['SAQX']) == seq.seqLen():   #!# Should no longer be issues due to length changes following realignment
                    seq.info['Sequence'] = seq.info['SAQX'][0:]
                elif 'SAQX' in seq.info:
                    self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(),printerror=False)
                for otherseq in haqlist.seq:
                    pwaq[(seq,otherseq)] = [False] * seq.seqLen()

        ### <PAQ1> ### Directional Pairwise Comparisons of sequences
            _stage = '<1> Pairwise Comparisons'
            infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc']
            #print self.stat
            for seq in haqlist.seq:
                for otherseq in haqlist.seq:
                    myinfo = '%s %.1f%% %.1f%%   ' % (infotxt,(100.0 * haqlist.seq.index(seq) / haqlist.seqNum()),(100.0 * haqlist.seq.index(otherseq) / haqlist.seqNum()))
                    self.log.printLog('\r#PAQ',myinfo,log=False,newline=False)
                    for r in range(seq.seqLen()):
                        ar = seq.info['Sequence'][r]
                        ## <i> ## Look for PW aligned block
                        _stage = '<1-i> Pairwise Comparisons'
                        if ar not in ['-','X']: # Start of test block
                            blen = 0    # Block length (PAQBlock) = AAs
                            win = 0     # Window length = all sequence
                            matchx = 0  # Score for residues in window 
                            while blen < self.stat['PAQBlock'] and (r+win) < seq.seqLen():     # This time we allow overshoots in both directions
                                ar = seq.info['Sequence'][r+win]
                                at = otherseq.info['Sequence'][r+win]
                                if 'X' in [ar,at]:     # Hit Bad Region: Abort
                                    break
                                else:   # Better region
                                    if ar != '-':   
                                        blen += 1   # Increase Block
                                        matchx += self._saqCon(ar,at)
                                win += 1
                        ## <ii> ## Update pwaq if block good
                            _stage = '<1-ii> Pairwise Comparisons'
                            if matchx >= self.stat['PAQMatch']:
                                for w in range(win):
                                    if seq.info['Sequence'][r+w] in ['-','X']:
                                        pwaq[(seq,otherseq)][r+w] = False
                                    else:
                                        pwaq[(seq,otherseq)][r+w] = True           
            # Literal percent signs must be doubled: the previous '100.0% 100.0%.' was read as a format specifier
            self.log.printLog('\r#PAQ','%s 100.0%% 100.0%%.   ' % infotxt,log=False)
                
        ### <PAQ2> ### Link back to Query
            _stage = '<2> Linking to Query'
            ### <PAQ2a> ### Network of Pairwise Quality alignments
            _stage = '<2a> Linking to Query'
            #self.verbose(1,3,'PAQ%d: Linking Residues to Query (%s)' % (self.stat['PAQCyc'],query.shortName()),0)
            infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'],query.shortName())
            for r in range(query.seqLen()):
                _stage = '<2a> Linking to Query'
                self.log.printLog('\r#PAQ','%s %.1f%%' % (infotxt,(100.0 * r / query.seqLen())),log=False,newline=False)
                qok = {}    # Dictionary of whether residue in seq OK, i.e. linked to query
                for seq in haqlist.seq:
                    qok[seq] = False
                qok[query] = True
                sok = [0,1] # List of OK sequence for residue
                while sok[-2] != sok[-1]:      # Repeat until the number of linked sequences stops changing
                    ## <i> ## Match pairs, starting with query
                    _stage = '<2a-i> Linking to Query'
                    for seq in haqlist.seq:
                        if qok[seq]:
                            for otherseq in haqlist.seq:
                                if pwaq[(seq,otherseq)][r] or pwaq[(otherseq,seq)][r]:
                                    qok[otherseq] = True
                    ## <ii> ## Update sok
                    _stage = '<2a-ii> Linking to Query'
                    sok.append(0)
                    for seq in haqlist.seq:
                        if qok[seq]:
                            sok[-1] += 1
                            block_align[seq][r] = True
                _stage = '<2a-iii> Linking to Query'
                if sok[-1] == 1:    # Only query OK!
                    block_align[query][r] = False
            self.log.printLog('\r#PAQ','%s 100.0%%' % infotxt,log=False)
            
            ### <PAQ2b> ### Allow for divergence (Conserved Anchors)
            _stage = '<2b> Anchors'
            if self.opt['Anchors']:
                infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc']
                ## <i> ## Setup gapped list
                gapped = [False] * query.seqLen()   # Whether column of alignment is gapped
                for seq in haqlist.seq:
                    self.log.printLog('\r#PAQ','%s %.1f%%  ' % (infotxt,(50.0 * haqlist.seq.index(seq) / haqlist.seqNum())),log=False,newline=False)
                    (start,end) = (0,seq.seqLen())
                    # Leading and trailing gaps are not counted as gapped columns
                    while seq.info['Sequence'][start] == '-':
                        start += 1
                    while seq.info['Sequence'][end-1] == '-':
                        end -=1
                    for r in range(start,end):
                        if seq.info['Sequence'][r] == '-':
                            gapped[r] = True
                ## <ii> ## Correction
                for seq in haqlist.seq:
                    self.log.printLog('\r#PAQ','%s %.1f%%  ' % (infotxt,(50 + (50.0 * haqlist.seq.index(seq) / haqlist.seqNum()))),log=False,newline=False)
                    for r in range(seq.seqLen()):
                        if block_align[seq][r] or gapped[r]:    # No need for correction
                            continue
                        # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate
                        # NOTE(review): fok and bok start False and are only ever re-assigned False in the loop below,
                        # so the "Reinstate" branch cannot run as written. Left unchanged here - confirm the intent.
                        winf = 0
                        fwd = True
                        fok = False
                        winb = 0
                        bwd = True
                        bok = False
                        while fwd or bwd:
                            # End of seqs
                            if (r + winf) >= seq.seqLen():
                                fwd = False
                            if (r - winb) < 0:
                                bwd = False
                            # Gaps/OK
                            if fwd:
                                if gapped[r+winf]:
                                    fok = False
                                    fwd = False
                                elif block_align[seq][r+winf]:
                                    fwd = False
                                else:
                                    winf += 1
                            if bwd:
                                if gapped[r-winb]:
                                    bok = False
                                    bwd = False
                                elif block_align[seq][r-winb]:
                                    bwd = False
                                else:
                                    winb += 1
                        if fok and bok: # Reinstate
                            for w in range(r-winb,r+winf+1):
                                block_align[seq][w] = True
                self.log.printLog('\r#PAQ','%s 100.0%%  ' % infotxt,log=False)

        ### <PAQ3> ### X out badly-aligned blocks
            _stage = '<3> Making bad sequence blocks'
            for seq in haqlist.seq:
                newseq = ''
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or seq.info['Sequence'][r] == '-':
                        newseq += seq.info['Sequence'][r]
                    else: # Bad residue
                        newseq += 'X'
                seq.info['Sequence'] = newseq[0:]
            #!# Add saving of data in 'datafull' option

        ### <PAQ4> ### Remove sequences and/or badly-aligned regions
            _stage = '<4> Removing sequences/regions'
            self.verbose(0,4,'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'],0)
            ## <PAQ4a> ## Process Query first - only interested in good regions within query
            if self.opt['NoQuery']:  # No preprocessing of Query
                self.verbose(0,4,'no Master Query processing...',0)
            else:
                haqlist.mapX(query, qtrim=True, focus=focus) # Replaces other sequence ends and query X columns with Xs
                self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
            self.verbose(0,3,'',1)
            if self.opt['ManPAQ']:
                haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile'])

            ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost
            goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
            goodseq = [0, haqlist.seqNum()]
            while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
                colgood = [0] * haqlist.seq[0].seqLen()    # Good residues per column
                for r in range(haqlist.seq[0].seqLen()):
                    for seq in haqlist.seq:
                        if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                            colgood[r] += 1
                ## <i> ## Compare relative loss of masking and losing each sequence
                keepx = {}  # Dictionary of seq:number of lost residues if seq kept
                losex = {}  # Dictionary of seq:number of lost residues if seq lost
                badkx = -1  # Biggest loss if kept
                badlx = -1  # Biggest loss if lost
                bads = None # Worst sequence
                for seq in haqlist.seq:
                    if seq == query and self.opt['NoQuery'] == False:
                        continue    # Next sequence
                    # Calculate keepx and losex
                    keepx[seq] = 0
                    for r in range(seq.seqLen()):
                        if seq.info['Sequence'][r] == 'X':
                            keepx[seq] += colgood[r]
                        #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate?
                    losex[seq] = self._getGood([seq])
                    # Update bads if worse
                    if keepx[seq] > badkx:
                        badkx = keepx[seq]
                        badlx = losex[seq]
                        bads = seq
                    elif keepx[seq] == badkx and losex[seq] < badlx:
                        badlx = losex[seq]
                        bads = seq
                ## <ii> ## Remove bad sequences and/or regions
                if badkx > 0:
                    if self.opt['ManPAQ']:
                        default = 'N'
                        if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']:   # Lose sequence!
                            default = 'Y'
                        if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default):
                            seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                        else:   # X out
                            haqlist.mapX(bads)
                    else:
                        self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1)
                        #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                        if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']:   # Lose sequence!
                            seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                        else:   # X out
                            haqlist.mapX(bads)
                ### <iii> ### Recalculate goodres
                goodres.append(self._getGood(haqlist.seq))
                goodseq.append(haqlist.seqNum())
                self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1)
                        
        ### <PAQ5> ### Reinstate UnX'd sequence:
            _stage = '<5> Replacing sequences'
            for seq in haqlist.seq:
                # Swap: 'Sequence' gets the copy saved in <PAQ0>, 'PAQ' gets the X-masked sequence
                [seq.info['PAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['PAQ']]
            if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']):
                os.unlink('%s.manpaq.fas' % haqlist.info['Basefile'])

        except:
            self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
Esempio n. 43
0
 def run(self,imenu=False,outputmap=True,returndict=False):      ### Main controlling run Method
     '''
     Main controlling run Method.
     >> imenu:boolean = Whether to initiate interactive menu if appropriate [False].
     >> outputmap:boolean = Whether to output mapping into a file [True]
     >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False]
     << {} if no sequences were loaded; the {searchname:mappedname} dictionary if returndict; otherwise None.
     Any exception (including the ValueError raised when self.setup() fails) is logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not self.setup(imenu): raise ValueError
         seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file'])
         if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {}
         ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1'])
         blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'),
                      'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))})
         blast.setStat({'HitAln':blast.getStat('OneLine')})
         blast.list['ResTab'] = ['Search','Hit','GABLAM']
         if seqlist.nt(): blast.str['Type'] = 'blastx'    # seqlist.nt() true => search type switched to blastx
         ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if outputmap: self._setupOutput()                           ## Output Files ##
         # NOTE(review): self.list['Mapped'] is only reset by _setupMapped() when returndict is False, yet it is read
         # in the loop below either way - presumably it is initialised elsewhere; confirm.
         if returndict: mapdict = {}
         else: self._setupMapped()                                   ## Previously Mapped Sequences ##
         seqx = seqlist.seqNum()             ## Number of sequences ##
         ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False)
         ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         blast.run(format=True)
         self.obj['DB'] = blast.obj['DB']
         ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.debug(self.getStr('MapDB'))
         self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']])
         self.obj['MapDB'].loadSeq(self.getStr('MapDB'))
         self.debug('%s' % self.obj['MapDB'].list['Seq'])
         sx = 0      # Number of query sequences read so far
         while seqlist.nextSeq() is not None:
             search = seqlist.getSeq(format='short')
             sx += 1
             ## Check StartFrom ##
             if self.str['StartFrom']:
                 if self.str['StartFrom'] != search:
                     self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx))
                     continue
                 # Log BEFORE clearing StartFrom: previously the name was blanked first, so the message never showed
                 # it. The current sequence is processed rather than skipped, hence sx-1.
                 self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx-1))
                 self.str['StartFrom'] = ''
             ## Check if in Mapped ##
             if search in self.list['Mapped']:
                 resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'}
                 self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas']))
                 rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict)
                 continue
             ### Map Sequence ###
             self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx)))
             mapname = self.mapSeq(seqlist,blast,search)
             if returndict: mapdict[search] = mapname
         ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MAP','Mapping of %s (%s seqs) complete.' % (self.str['SeqIn'],rje.integerString(seqx)))
         # The BLAST results file is deleted unless debugging or testing
         if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name'])     #!# Add option to keep BLAST! #!#
         if returndict: return mapdict
     except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
Esempio n. 44
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method.
     [1] Loads pairwise PPI data into {hub:{spoke:evidence}} plus gene<->sequence name dictionaries.
     [2] Fills self.dict['PPI'] = {hub:[spokes]}, keeping a spoke if its evidence matches a type listed in the
         "hybrid" file, or if no other partner of the hub is also a partner of that spoke.
     [3] Loads the sequence file into self.obj['SeqList'] and stores the lookup dictionaries in self.dict.
     << True if setup completed; False if any error was raised (the error is logged, not re-raised).
     '''
     try:  ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pairfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI', 'Loading pairwise data...')
         pairdata = rje.dataDict(self, pairfile, ['Hub', 'Spoke'],
                                 ['Spoke', 'SpokeSeq', 'Evidence'])
         (gene2seq, seq2gene, fullppi) = ({}, {}, {})
         (done, pairnum, ppix) = (0.0, len(pairdata), 0)
         for pairkey in rje.sortKeys(pairdata):
             self.progLog(
                 '\r#PPI',
                 'Processing full pairwise PPI: %.2f%%' % (done / pairnum))
             done += 100.0  # Counts pairs x100, so done / pairnum is a percentage
             [hub, spoke] = pairkey.split('\t')
             if spoke not in gene2seq:
                 spokeseq = pairdata[pairkey]['SpokeSeq']
                 gene2seq[spoke] = spokeseq
                 seq2gene[spokeseq.split('__')[0]] = spoke
             if hub not in fullppi: fullppi[hub] = {}
             if spoke in fullppi[hub]: continue
             # Entry is removed from pairdata as its evidence is transferred
             fullppi[hub][spoke] = pairdata.pop(pairkey)['Evidence']
             ppix += 1
         self.printLog(
             '\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' %
             (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         typefile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(typefile, chomplines=True)
         self.dict['PPI'] = {}
         (done, hubnum) = (0.0, len(fullppi))
         (fullx, ppix) = (ppix, 0)  # fullx = unfiltered count; ppix restarts for the filtered count
         for hub in fullppi:
             self.progLog(
                 '\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' %
                 (done / hubnum, rje.integerString(len(
                     self.dict['PPI'])), rje.integerString(ppix)))
             done += 100.0
             self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 ## ~ [2a] Keep outright if the evidence matches a listed type ~~~~~~~~~~~~~~~~~~~ ##
                 listed = False
                 for ptype in goodppi:
                     if rje.matchExp(':(%s)($|\|)' % ptype,
                                     fullppi[hub][spoke]):
                         listed = True
                         break
                 ## ~ [2b] Else reject if hub and spoke share another partner ~~~~~~~~~~~~~~~~~~~~ ##
                 if not listed:
                     shared = False
                     for other in fullppi[hub]:
                         # NOTE(review): fullppi[spoke] raises KeyError if spoke was never read as a hub;
                         # the handler below would then report setup as failed.
                         if other not in [hub, spoke] and other in fullppi[spoke]:
                             shared = True
                             break
                     if shared: continue
                 self.dict['PPI'][hub].append(spoke)
             ppix += len(self.dict['PPI'][hub])
             if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)  # Drop hubs left with no spokes
         self.printLog(
             '\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' %
             (rje.integerString(
                 len(fullppi)), rje.integerString(len(self.dict['PPI'])),
              rje.integerString(fullx / 2), rje.integerString(ppix / 2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile]
         scmd = scmd + self.cmd_list + ['autoload=T']
         seqlist = rje_seq.SeqList(self.log, scmd)
         self.obj['SeqList'] = seqlist
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
Esempio n. 45
0
 def _setupMapped(self):     ### Sets up list of Previously Mapped Sequences
     '''
     Sets up list of Previously Mapped Sequences.
     Resets self.list['Mapped'] to an empty list. If self.bool['Append'] is True and the self.str['MapFas'] file
     exists, the shortName() of every sequence read from that file is added to self.list['Mapped'].
     '''
     ### Setup ###
     self.list['Mapped'] = []    # List of mapped sequence names
     if not self.bool['Append'] or not os.path.exists(self.str['MapFas']): return
     ### Previous Sequences ###
     seqlist = rje_seq.SeqList(None,['i=-1','v=-1','autoload=F','seqin=%s' % self.str['MapFas']])
     SEQFILE = open(self.str['MapFas'],'r')     # Was open(filename,'r'): "filename" is not defined in this method
     lastline = ''
     sx = 0
     ### Count ###
     try:
         while True:
             (nextseq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
             seqlist.seq = []    # Emptied after each read; only the names are wanted here
             if not nextseq: break
             sx += 1
             self.list['Mapped'].append(nextseq.shortName())
     finally: SEQFILE.close()    # Close the file even if reading fails part-way
     self.printLog('#MAP','Read names of %s previously mapped sequences for redundancy checking' % rje.integerString(sx))
Esempio n. 46
0
 def readSLiMSearchOcc(self,motifs=[]):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch results into data dictionary.

     Loads '<self.info['ResFile']>.csv' and rebuilds self.dict['Occ'] as {motif:{type:{gene:[occurrence data]}}}, where
     type is 'Rev' for motif names ending 'rev', 'Scr' for names ending 'scram' and 'ELM' otherwise. Occurrences are
     skipped if their HomNum is below self.stat['MinHom'], their motif is not in motifs, no 'gene:X]' tag can be read
     from their description, or self.ensGO(gene) returns nothing.

     motifs : list of motif names (as found in the results file) to keep. If empty, a message is logged,
              self.dict['Occ'] is reset to {} and the results file is not read.

     Errors are logged through self.log.errorLog() and not re-raised.
     '''
     try:### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not motifs:
             # Nothing can pass the motif filter, so do not load and scan the whole results file for no result
             self.printLog('#OCC','Cannot process occurrences for No motifs!')
             self.dict['Occ'] = {}
             return
         occfile = '%s.csv' % self.info['ResFile']
         delimit = rje.delimitFromExt(filename=occfile)
         datakeys = ['Seq','Desc','Start_Pos','End_Pos','Cons','HomNum','GlobID','LocID','Hyd','SA']
         data = rje.dataDict(self,occfile,mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=datakeys)
         self.dict['Occ'] = {}
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,ox,otot) = (0,0.0,len(data))
         for occ in data:
             self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,ox/otot)); ox += 100.0
             if int(data[occ]['HomNum']) < self.stat['MinHom']: continue
             # Unpacking all four key fields also checks that the key has exactly the expected shape
             (motif,seq,start,end) = occ.split(delimit)
             if motif not in motifs: continue
             try:
                 gene = rje.matchExp(r'gene:(\S+)\]',data[occ]['Desc'])[0]
                 golinks = self.ensGO(gene)  # Looked up once and reused for both the debug output and the filter
                 self.deBug('%s:%s' % (gene,golinks))
                 if not golinks: continue
             except: continue    # Deliberate best-effort: occurrences without a usable gene/GO link are skipped
             # Strip the suffix plus the single character that precedes it
             if motif[-3:] == 'rev': (motif,otype) = (motif[:-4],'Rev')
             elif motif[-5:] == 'scram': (motif,otype) = (motif[:-6],'Scr')
             else: otype = 'ELM'
             if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
             if otype not in self.dict['Occ'][motif]: self.dict['Occ'][motif][otype] = {}
             if gene not in self.dict['Occ'][motif][otype]: self.dict['Occ'][motif][otype][gene] = []
             self.dict['Occ'][motif][otype][gene].append(data[occ])
         self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 47
0
    def parse(
        self,
        parsedom=True,
        parseseq=True,
        parsecomplex=True
    ):  ### HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences
        '''
        HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences.

        Reads the HPRD flat files found under self.info['HPRDPath'] and fills:
        - self.dict['HPRD']: {HPRD ID: {'gene', 'gb', 'entrez', 'omim', 'sp', 'desc'}} from HPRD_ID_MAPPINGS.txt,
          plus a 'Seq' entry per ID when sequences are parsed.
        - self.dict['Mapping']: {upper-case gene/gb/sp identifier: HPRD ID}.
        - self.dict['Domains'] and self.dict['DomainSource']: {domain: [HPRD IDs]} and {domain: [sources]}.
        - self.obj['SeqList']: SeqList loaded from PROTEIN_SEQUENCES.txt with reformatted names.
        - self.dict['PPI']: {HPRD ID: [interacting HPRD IDs]}; binary interactions are stored in both directions.
        - self.dict['Evidence']: {ID: {ID: [interaction types]}}. NOTE: this dictionary is not created here and
          must already exist in self.dict.
        - self.dict['Complex']: {complex ID: [HPRD IDs]} from PROTEIN_COMPLEXES.txt.
        Interactors missing from the ID mappings are written to HPRD.missing.txt in the working directory.

        parsedom : bool [True] = Whether to parse domain data from PROTEIN_Architecture.txt.
        parseseq : bool [True] = Whether to load and reformat sequences from PROTEIN_SEQUENCES.txt.
        parsecomplex : bool [True] = Not consulted by this method: PROTEIN_COMPLEXES.txt is always parsed.

        Errors while formatting an individual sequence are logged and that sequence is left as it was; any other
        error is logged with self.log.errorLog() and re-raised.
        '''
        try:
            ### ~ Parse HPRD Mappings onto other database IDs from HPRD_ID_MAPPINGS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['HPRD'] = {}
            self.dict['Mapping'] = {}
            hprd = self.loadFromFile('%sHPRD_ID_MAPPINGS.txt' %
                                     self.info['HPRDPath'],
                                     v=1,
                                     checkpath=True,
                                     chomplines=True)
            hx = float(len(hprd))
            # Lines are iterated in place: repeatedly popping from the front of the list is quadratic
            for (ex, entry) in enumerate(hprd):
                px = 100.0 * (ex + 1) / hx
                self.log.printLog('\r#HPRD',
                                  'Parsing HPRD_ID_MAPPINGS: %.1f%%' % px,
                                  newline=False,
                                  log=False)
                data = entry.split()
                ## Check ##
                if len(data) < 7: continue
                hid = data[0].upper()  # Entries are stored under the upper-case ID, so check duplicates the same way
                if hid in self.dict['HPRD']:
                    self.log.errorLog('HPRD ID %s duplicated! Aaargh!' %
                                      data[0],
                                      printerror=False)
                ## Update ##
                self.dict['HPRD'][hid] = {
                    'gene': data[1].upper(),
                    'gb': data[3],
                    'entrez': data[4],
                    'omim': data[5],
                    'sp': data[6].upper(),
                    'desc': ' '.join(data[7:])
                }
                for i in [1, 3, 6]:  # The columns stored above as gene, gb and sp
                    self.dict['Mapping'][data[i].upper()] = data[0]
            self.log.printLog('\r#HPRD', 'Parsing HPRD_ID_MAPPINGS complete!')

            ### ~ Parse HPRD Domain Mappings from PROTEIN_Architecture.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Domains'] = {}
            self.dict['DomainSource'] = {}
            if parsedom:
                hprd = self.loadFromFile('%sPROTEIN_Architecture.txt' %
                                         self.info['HPRDPath'],
                                         v=1,
                                         checkpath=True,
                                         chomplines=True)
                hx = float(len(hprd))
                for (ex, entry) in enumerate(hprd):
                    px = 100.0 * (ex + 1) / hx
                    self.log.printLog('\r#HPRD',
                                      'Parsing PROTEIN_Architecture: %.1f%%' %
                                      px,
                                      newline=False,
                                      log=False)
                    data = entry.split()
                    ## Check ##
                    if len(data) < 9: continue
                    (hid, domain, ftype, source) = (data[0], data[4], data[5],
                                                    data[8])
                    if ftype != 'Domain': continue
                    ## Update ##
                    if domain not in self.dict['Domains']:
                        self.dict['Domains'][domain] = [hid]
                    elif hid not in self.dict['Domains'][domain]:
                        self.dict['Domains'][domain].append(hid)
                    if domain not in self.dict['DomainSource']:
                        self.dict['DomainSource'][domain] = [source]
                    elif source not in self.dict['DomainSource'][domain]:
                        self.dict['DomainSource'][domain].append(source)
                self.log.printLog('\r#HPRD',
                                  'Parsing PROTEIN_Architecture complete!')

            ### ~ Make SeqList from PROTEIN_SEQUENCES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if parseseq:
                scmd = self.cmd_list + [
                    'autoload=T', 'gnspacc=F',
                    'seqin=%sPROTEIN_SEQUENCES.txt' % self.info['HPRDPath'],
                    'autofilter=F', 'accnr=F', 'seqnr=F'
                ]
                self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
                self.obj['SeqList'].info[
                    'Name'] = self.info['OutDir'] + 'hprd.fas'
                sx = 0.0
                # Iterate over a copy because rejected isoforms are removed from the list inside the loop
                for seq in self.obj['SeqList'].seq[
                        0:]:  # seq.info['ID'] should be the HPRD ID #
                    ## Initial processing of sequence. Only keep if AllIso or isoform 1 ##
                    self.log.printLog('\r#SEQ',
                                      'Processing HPRD Sequences: %.1f%%' %
                                      (sx / self.obj['SeqList'].seqNum()),
                                      newline=False,
                                      log=False)
                    iso = 'X'  # Isoform number; stays 'X' if it cannot be read from the name
                    h = seq.info['ID']
                    try:
                        iso = rje.matchExp(r'^\d+\|\d+_(\d+)\|',
                                           seq.info['Name'])[0]
                    except:
                        self.deBug(seq.info['Name'])
                    try:
                        if h not in self.dict['HPRD']:
                            self.printLog(
                                '\r#ERR',
                                'Missing from HPRD_ID_MAPPINGS?: %s' %
                                seq.info['Name'])
                            # Build a minimal entry from the '|'-delimited sequence name
                            data = seq.info['Name'].split('|')
                            self.dict['HPRD'][h] = {
                                'gene': '-',
                                'gb': data[2],
                                'entrez': '',
                                'omim': '',
                                'sp': '',
                                'desc': '|'.join(data[3:])
                            }
                        # Without AllIso, drop any isoform other than 1 once a sequence is stored for this ID
                        if not self.opt['AllIso'] and 'Seq' in self.dict[
                                'HPRD'][h] and iso != '1':
                            self.obj['SeqList'].seq.remove(seq)
                            continue
                        sx += 100.0
                        seq.setInfo({
                            'Gene':
                            self.dict['HPRD'][h]['gene'],
                            'Description':
                            self.dict['HPRD'][h]['desc'] +
                            ' [Gene:%s HPRD:%s; gb:%s; sp:%s]' %
                            (self.dict['HPRD'][h]['gene'], h, self.dict['HPRD']
                             [h]['gb'], self.dict['HPRD'][h]['sp']),
                            'AccNum':
                            self.dict['HPRD'][h]['sp']
                        })
                        ## AllIso options ##
                        if self.opt['AllIso']:
                            # 'Seq' is a list of isoform sequences in this mode (a single sequence otherwise)
                            if 'Seq' not in self.dict['HPRD'][h]:
                                self.dict['HPRD'][h]['Seq'] = [seq]
                            else:
                                self.dict['HPRD'][h]['Seq'].append(seq)
                            seq.setInfo({'AccNum': '%s-%s' % (h, iso)})
                        else:
                            self.dict['HPRD'][h]['Seq'] = seq
                        ## Finish formatting ##
                        if seq.info['Gene'] == '-':
                            self.dict['HPRD'][h]['gene'] = seq.info[
                                'Gene'] = 'HPRD' + h
                        if seq.info['AccNum'] == '-':
                            seq.info['AccNum'] = self.dict['HPRD'][h]['gb']
                        seq.info['ID'] = '%s_HUMAN' % seq.info['Gene']
                        seq.info['Name'] = '%s__%s %s' % (
                            seq.info['ID'], seq.info['AccNum'],
                            seq.info['Description'])
                    except:
                        self.errorLog('Protein Parse Error (%s)' %
                                      seq.info['Name'])
                self.log.printLog('\r#SEQ',
                                  'Processing HPRD Sequences complete!')

            ### ~ Make PPI Data from BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            missing = []  # Interactor IDs absent from self.dict['HPRD'] (each reported once)
            self.dict['PPI'] = {}
            ppi = self.loadFromFile(
                '%sBINARY_PROTEIN_PROTEIN_INTERACTIONS.txt' %
                self.info['HPRDPath'],
                v=1,
                checkpath=True,
                chomplines=True)
            hx = float(len(ppi))
            ix = 0  # Number of interaction lines passing the type filter
            for (ex, entry) in enumerate(ppi):
                px = 100.0 * (ex + 1) / hx
                self.log.printLog(
                    '\r#PPI',
                    'Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS: %.1f%%' % px,
                    newline=False,
                    log=False)
                data = entry.split('\t')
                ## Check ##
                if len(data) < 7: continue
                types = data[6].split(';')
                if not types: types = ['unknown']
                # Keep only types that are not in BadType and, if PPIType is given, are listed in it
                types = [
                    itype for itype in types
                    if not (itype in self.list['BadType'] or
                            (self.list['PPIType']
                             and itype not in self.list['PPIType']))
                ]
                if not types: continue
                ix += 1
                ## Update ##
                (p1, p2) = (data[1].upper(), data[4].upper())
                if p1 not in self.dict['HPRD']:
                    if p1 not in missing:
                        missing.append(p1)
                        self.log.printLog(
                            '#ERR',
                            'HPRD ID "%s" missing from HPRD_ID_MAPPINGS!' % p1,
                            screen=False)
                    continue
                if p2 not in self.dict['HPRD']:
                    if p2 not in missing:
                        missing.append(p2)
                        # Report the ID that is actually missing (p2, not p1)
                        self.log.printLog(
                            '#ERR',
                            'HPRD ID "%s" missing from HPRD_ID_MAPPINGS!' % p2,
                            screen=False)
                    continue
                if p1 not in self.dict['PPI']: self.dict['PPI'][p1] = []
                if p2 not in self.dict['PPI'][p1]:
                    self.dict['PPI'][p1].append(p2)
                if p2 not in self.dict['PPI']: self.dict['PPI'][p2] = []
                if p1 not in self.dict['PPI'][p2]:
                    self.dict['PPI'][p2].append(p1)
                # Evidence is only recorded in the p1 -> p2 direction
                if p1 not in self.dict['Evidence']:
                    self.dict['Evidence'][p1] = {}
                if p2 not in self.dict['Evidence'][p1]:
                    self.dict['Evidence'][p1][p2] = []
                for itype in types:
                    if itype not in self.dict['Evidence'][p1][p2]:
                        self.dict['Evidence'][p1][p2].append(itype)
            self.log.printLog(
                '\r#PPI',
                'Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS complete!')

            ### ~ Parse protein Complex data from PROTEIN_COMPLEXES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Complex'] = {}
            ppi = self.loadFromFile('%sPROTEIN_COMPLEXES.txt' %
                                    self.info['HPRDPath'],
                                    v=1,
                                    checkpath=True,
                                    chomplines=True)
            hx = float(len(ppi))
            for (ex, entry) in enumerate(ppi):
                px = 100.0 * (ex + 1) / hx
                self.log.printLog('\r#PPI',
                                  'Parsing PROTEIN_COMPLEXES: %.1f%%' % px,
                                  newline=False,
                                  log=False)
                data = entry.split()
                ## Check ##
                if len(data) < 5: continue
                ## Update ##
                (cid, hid) = (data[0], data[1])
                if hid == 'None': continue
                if cid not in self.dict['Complex']:
                    self.dict['Complex'][cid] = []
                if hid not in self.dict['Complex'][cid]:
                    self.dict['Complex'][cid].append(hid)
            self.log.printLog('\r#PPI', 'Parsing PROTEIN_COMPLEXES complete!')

            ### ~ Update PPI from protein Complex data if appropriate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ctype = 'complex'
            if ctype not in self.list['BadType'] and (
                    not self.list['PPIType'] or ctype in self.list['PPIType']):
                cx = 0.0
                for cid in self.dict['Complex']:
                    self.log.printLog(
                        '\r#PPI',
                        'Adding protein complex data to PPI: %.1f%%' %
                        (cx / len(self.dict['Complex'])),
                        newline=False,
                        log=False)
                    cx += 100.0
                    # All ordered pairs of complex members are linked, including each member with itself
                    for p1 in self.dict['Complex'][cid]:
                        for p2 in self.dict['Complex'][cid]:
                            if p1 not in self.dict['PPI']:
                                self.dict['PPI'][p1] = []
                            if p2 not in self.dict['PPI'][p1]:
                                self.dict['PPI'][p1].append(p2)
                            if p1 not in self.dict['Evidence']:
                                self.dict['Evidence'][p1] = {}
                            if p2 not in self.dict['Evidence'][p1]:
                                self.dict['Evidence'][p1][p2] = []
                            if ctype not in self.dict['Evidence'][p1][p2]:
                                self.dict['Evidence'][p1][p2].append(ctype)
                self.log.printLog(
                    '\r#PPI',
                    'Added protein complex data to PPI for %s complexes' %
                    rje.integerString(len(self.dict['Complex'])))
            ptxt = '%s proteins; %s interactions' % (rje.integerString(
                len(self.dict['PPI'])), rje.integerString(ix))
            self.log.printLog('\r#PPI',
                              'Parsing interactions complete: %s.' % ptxt)
            if missing:
                MISSING = open('HPRD.missing.txt', 'w')
                try:
                    MISSING.write('\n'.join(missing))
                finally:
                    MISSING.close()  # Do not leave the handle open until garbage collection
        except:
            self.log.errorLog('Error in HPRD.parse()',
                              printerror=True,
                              quitchoice=False)
            raise
Esempio n. 48
0
 def scap(self):     ### Full SCAP method
     '''
     Full SCAP method.

     Outputs one row per sequence to a delimited file named from self.info['Basefile'], with columns seq, type,
     sorted and one X<n> column for each n from the Markov object's MinXmer to MaxXmer. Each X<n> value is the
     formatted result of self.scapSeq(sequence,n). Sequences in self.obj['SeqList'] are output with type 'qry'
     and, when self.obj['ScapBack'] differs from it, those in self.obj['ScapBack'] with type 'bg'.
     Errors are logged through self.errorLog() and not re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         markov = self.obj['Markov']
         (minx,maxx) = (markov.stat['MinXmer'],markov.stat['MaxXmer'])
         xmers = range(minx,maxx+1)
         headers = ['seq','type','sorted'] + ['X%d' % x for x in xmers]
         delimit = rje.getDelimit(self.cmd_list,'\t')
         scapfile = '%s.%s' % (self.info['Basefile'],rje.delimitExt(delimit))
         rje.delimitedFileOutput(self,scapfile,headers,delimit,rje_backup=True)
         ## ~ [1a] Shared per-dataset output routine ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         def scapSet(seqobj,seqtype,progtxt,donetxt):  ### Outputs SCAP rows for every sequence of seqobj
             total = seqobj.seqNum()
             done = 0.0
             for seq in seqobj.seq:
                 self.progLog('\r#SCAP',progtxt % (scapfile,(done/total)))
                 done += 100.0
                 outdata = {'seq':seq.shortName(),'type':seqtype,'sorted':markov.opt['Sorted']}
                 for x in xmers:
                     score = self.scapSeq(seq.info['Sequence'],x)
                     # Fixed-point output above 0.001, exponent notation otherwise
                     if score > 0.001: outdata['X%d' % x] = '%.4f' % score
                     else: outdata['X%d' % x] = '%.3e' % score
                 rje.delimitedFileOutput(self,scapfile,headers,delimit,outdata)
             self.printLog('\r#SCAP',donetxt % (scapfile,rje.integerString(total)))
         ### ~ [2] SCAP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [2a] Query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         scapSet(self.obj['SeqList'],'qry','SCAP processing Query to %s: %.2f%%','SCAP processed Query to %s for %s sequences.')
         ## ~ [2b] Background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.obj['ScapBack'] != self.obj['SeqList']:
             scapSet(self.obj['ScapBack'],'bg','SCAP processing Background to %s: %.2f%%','SCAP processed Background to %s for %s sequences.')
         ### ~ [3] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         endtxt = 'UnSorted SCAP run complete'
         if markov.opt['Sorted']: endtxt = 'Sorted SCAP run complete'
         self.printLog('#SCAP',endtxt)
     except: self.errorLog(rje_zen.Zen().wisdom())
Esempio n. 49
0
 def uniFake(
     self,
     seqs=[],
     store=False
 ):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
     '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         unifake = string.split(string.join(self.list['UniFake']).lower())
         seqlist = self.obj['SeqList']
         if seqs: seqlist.seq = seqs
         else: seqs = seqlist.seq
         (sx, seqnum) = (0, seqlist.seqNum())
         ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = rje_uniprot.UniProt(
             self.log, self.cmd_list)  # UniProt object for saving data
         if self.info['DatOut'].lower() in ['', 'none']:
             self.info['DatOut'] = rje.baseFile(
                 seqlist.info['Name']) + '.dat'
         datfile = self.info['DatOut']
         if os.path.exists(datfile): rje.backup(self, datfile)
         if store: seqlist.obj['UniProt'] = uniprot
         ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'pfam' in unifake:
             hmm = rje_hmm.HMMRun(self.log, self.cmd_list + ['force=T'])
             hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
             if os.path.exists(hmmfile): rje.backup(self, hmmfile)
             hmm.list['HMM'] = [self.info['PFam']]
             hmm.opt['HMMPFam'] = True
         else:
             hmm = None
         ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if 'signalp' in unifake: tm = rje_tm.TM(self.log, self.cmd_list)
         else: tm = None
         ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for seq in seqs:
             sx += 1
             name = seq.shortName()
             self.printLog(
                 '#SEQ', 'Processing %s (%s aa) %s...' %
                 (seq.shortName(), rje.integerString(
                     seq.aaLen()), seq.info['Description'][:50]))
             try:
                 ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 utmp = 'tmp%s.%s' % (rje.randomString(5),
                                      seq.info['AccNum'])
                 open('%s.fas' % utmp, 'w').write(
                     '>%s\n%s\n' % (seq.shortName(), seq.info['Sequence']))
                 udata = {
                     'CC': ['-!- Features generated using unifake.py'],
                     'AC': []
                 }
                 if seq.info['SpecCode'] in ['Unknown', 'UNK']:
                     seq.info['SpecCode'] = self.info['SPCode']
                 #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]     #!# Check how well this works. Add spectable? #!#
                 ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['EnsDat'] and rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name']):
                     details = rje.matchExp(
                         '\[acc:(\S+) pep:(\S+) gene:(\S+)\]',
                         seq.info['Name'])
                     self.addAlias(seq.info['AccNum'], details[0])
                     self.addAlias(seq.info['AccNum'], details[1])
                     self.addAlias(seq.info['AccNum'], details[2])
                     udata['GN'] = [details[2]]
                 for id in [seq.shortName(), seq.info['AccNum']]:
                     if id in self.dict['Aliases']:
                         udata['AC'].append(
                             '%s;' %
                             string.join(self.dict['Aliases'][id], '; '))
                 ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ft = []  # List of features for sequence
                 for id in [
                         seq.shortName(), seq.info['AccNum'], seq.info['ID']
                 ]:
                     if id in self.dict['Features']:
                         ft += self.dict['Features'][id]
                 ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'disorder' in self.list['UniFake']:
                     try:
                         seq.disorder()
                         dis = seq.obj['Disorder']
                         for disorder in seq.obj['Disorder'].list[
                                 'RegionDisorder']:
                             ft.append({
                                 'Type':
                                 'DISORDER',
                                 'Desc':
                                 'Predicted disorder: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 disorder[0],
                                 'End':
                                 disorder[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s > %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                         for fold in seq.obj['Disorder'].list['RegionFold']:
                             ft.append({
                                 'Type':
                                 'ORDER',
                                 'Desc':
                                 'Predicted order: %s' %
                                 seq.obj['Disorder'].info['Disorder'],
                                 'Start':
                                 fold[0],
                                 'End':
                                 fold[1]
                             })
                             if dis.info['Disorder'].lower() == 'iupred':
                                 ft[-1]['Desc'] = '%s <= %.2f' % (
                                     ft[-1]['Desc'], dis.stat['IUCut'])
                     except:
                         self.log.errorLog(
                             'UniFake disorder problem for %s.' % name)
                 ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if hmm:
                     try:
                         hmm.setInfo({
                             'SearchDB': '%s.fas' % utmp,
                             'HMMOut': '%s.hmm.out' % utmp
                         })  # This will be made for each sequence
                         hmm.search = []
                         hmm.list['HMMRes'] = [
                             hmm.hmmSearch(self.info['PFam'],
                                           outfile=hmm.info['HMMOut'])
                         ]  # Used in hmmTable
                         hmm.hmmTable(outfile=hmmfile, append=True)
                         if 'disorder' in self.list['UniFake']:
                             disorder = seq.obj['Disorder'].list[
                                 'ResidueDisorder']  # individual (IUPRed) residue results
                         else:
                             disorder = []
                         if hmm.search:
                             udata['CC'].append(
                                 'PFam: HMMer PFam search vs %s (Modified %s)'
                                 %
                                 (self.info['PFam'],
                                  time.ctime(
                                      os.path.getmtime(self.info['PFam']))))
                         else:
                             udata['CC'].append(
                                 '-!- ERROR: PFam HMMer Search failure!')
                             out = {'Type': '!ERROR!', 'Name': name}
                             rje.delimitedFileOutput(
                                 self,
                                 hmmfile, [
                                     'Type', 'Name', 'Start', 'End', 'Eval',
                                     'Score'
                                 ],
                                 datadict=out)
                         for search in hmm.search:
                             for hit in search.hit:
                                 for aln in hit.aln:
                                     pfamft = {
                                         'Start':
                                         aln.stat['SbjStart'],
                                         'End':
                                         aln.stat['SbjEnd'],
                                         'Type':
                                         'PFAM',
                                         'Desc':
                                         '%s PFam HMM Eval: %.2e; Score: %.1f'
                                         % (search.info['Name'],
                                            aln.stat['Expect'],
                                            aln.stat['BitScore'])
                                     }
                                     if disorder:
                                         region = disorder[
                                             aln.stat['SbjStart'] -
                                             1:aln.stat['SbjEnd']]
                                         hmmdisorder = float(
                                             sum(region)) / len(region)
                                         pfamft[
                                             'Desc'] = '%s; IUPRed: %.2f' % (
                                                 pfamft['Desc'],
                                                 hmmdisorder)
                                         if hmmdisorder < self.stat[
                                                 'DisDom']:
                                             pfamft['Type'] = 'DOMAIN'
                                     ft.append(pfamft)
                     except:
                         self.log.errorLog(
                             'UniFake PFam HMM problem for %s.' % name)
                 ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'tmhmm' in unifake:
                     try:
                         tmdat = os.popen(
                             '%s %s.fas -short' %
                             (self.info['TMHMM'], utmp)).readlines()
                         domlist = rje_tm.domainList(
                             rje_tm.parseTMHMM(tmdat[0]))
                         for tmdom in domlist:
                             ft.append(tmdom)
                             ft[-1]['Desc'] = 'TMHMM topology prediction'
                             ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                             ft[-1]['End'] = string.atoi(ft[-1]['End'])
                         if len(domlist) > 1:
                             udata['CC'].append(
                                 'TMHMM: %d TM domains; N-Term %s' %
                                 ((len(domlist) - 1) / 2,
                                  domlist[0]['Type']))
                         else:
                             udata['CC'].append('TMHMM: 0 TM domains')
                     except:
                         self.log.errorLog('UniFake TMHMM problem for %s.' %
                                           name)
                 ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'signalp' in unifake:
                     try:
                         os.system(
                             '%s -f short -t euk %s.fas > %s.signalp' %
                             (self.info['SignalP'], utmp, utmp))
                         tm.signalp = {}
                         tm.parseSignalP('%s.signalp' % utmp)
                         sigp = tm.signalp.pop(seq.shortName())
                         cpos = 0
                         if sigp['nn_ymax?'] == 'Y':
                             cpos = string.atoi(sigp['nn_ymaxpos'])
                             desc = 'SignalP NN prediction'
                         if sigp['hmm_cmax?'] == 'Y':
                             hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                             if cpos == 0:
                                 cpos = hmm_c
                                 desc = 'SignalP HMM prediction'
                             else:
                                 if hmm_c < cpos:
                                     cpos = hmm_c
                                     desc = 'SignalP HMM prediction (NN also Y)'
                                 else:
                                     desc += ' (HMM also Y)'
                         if cpos > 0:
                             ft.append({
                                 'Type': 'SIGNALP',
                                 'Desc': desc,
                                 'Start': 1,
                                 'End': cpos
                             })
                     except:
                         self.log.errorLog(
                             'UniFake SignalP problem for %s.' % name)
                 ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 self.addRealUniProt(seq, udata, ft)
                 self.deBug(ft)
                 if not store: uniprot.list['Entry'] = []
                 if uniprot.addFromSeq(
                         seq, data=udata,
                         ft=ft):  ### Converts into UniProtEntry object
                     if not store: uniprot.saveUniProt(datfile, append=True)
                     #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
             ## ~ [2f] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             except:
                 self.log.errorLog('Problem during UniFake(%s)' % name)
             for tmp in glob.glob('%s*' % utmp):
                 os.unlink(tmp)
             self.printLog(
                 '#UNIFAKE',
                 '|---------- %s run <<<|>>> %s to go -----------|' %
                 (rje.integerString(sx), rje.integerString(seqnum - sx)),
                 log=False)
         if store: uniprot.saveUniProt(datfile, append=False)
         if self.opt['CleanUp']:
             for tmp in glob.glob('TMHMM*'):
                 if os.path.isdir(tmp): os.rmdir(tmp)
     except:
         self.errorLog(
             'Oh, the shame of it! Trouble during UniFake.uniFake()')
Esempio n. 50
0
 def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
     '''
     Tabulates regions of known PPI from DAT file.
     Greps the ID and interaction REGION feature lines out of a hard-coded UniFake DAT file and writes one row per
     (protein, region, interactor) combination to ppi_region.tdt.
     << True once the table has been written. If the table already exists and self.opt['Force'] is off, the result
        of the printLog() call reporting this is returned instead and nothing is written.
     Any exception is logged via errorLog() and then re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
         headers = ['Protein','Start','End','Interactor']
         rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP',gcmd)
         # Raw strings keep the regex escapes literal; the pattern text itself is unchanged.
         id_re = r'ID   (\S+)'
         region_re = r'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)'
         ppi_re = r'^([A-Z0-9][A-Z0-9]+)'
         prot = None; rx = 0
         proteins = set(); interactors = set()     # Sets: only the number of distinct entries is ever reported
         GREP = os.popen(gcmd)
         try: glines = GREP.readlines()
         finally: GREP.close()
         for gline in glines:
             # NOTE(review): each pattern is now matched once per line; assumes rje.matchExp() has no side effects.
             idmatch = rje.matchExp(id_re,gline)
             if idmatch: prot = idmatch[0]
             region = rje.matchExp(region_re,gline)
             if not region: continue
             (rstart,rend,rint) = region
             for ppi in rint.split():
                 ppimatch = rje.matchExp(ppi_re,ppi)
                 if not ppimatch: continue   # Word does not start with 2+ characters from [A-Z0-9]
                 datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':ppimatch[0]}
                 rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                 proteins.add(prot)
                 interactors.add(datadict['Interactor'])
                 self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(proteins)),rje.integerString(len(interactors)), rje.integerString(rx)))
         self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(proteins)),rje.integerString(len(interactors)),rje.integerString(rx),tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Esempio n. 51
0
 def run(self):  ### Main run method
     '''
     Main run method. Stages, in order:
     [1] Reformats each *.fasta file in the working directory into a *.fas file with 6rf_NEIME__/ref_NEIME__
         sequence names and formats it as a protein BLAST database. Existing *.fas files are left alone.
     [2] Reads MC58_6RF_CSV/*.CSV and collects, per 6RF hit, the 'File:hit_num' samples it was seen in. The hit
         accessions are written to MC58_6RF_Hits.acc.
     [3] Unless the hit summary table already exists, extracts the hit sequences and runs GABLAM vs MC58_1.fas.
     [4] Takes the hits with a HitNum of 0, runs them through GABLAM against embl_bacteria (unless that summary
         already exists) and writes MC58_6RF_Zeros.tdt with added Sample and BestHit columns.
     Exceptions are logged via errorLog() and not re-raised. Nothing is returned.
     '''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]    # *.fasta => *.fas
             if os.path.exists(fas): continue
             sx = 0
             # Output is buffered and written only once the whole file has parsed, so a failure part-way through
             # cannot leave a truncated *.fas behind that would make the next run skip this input.
             outlines = []
             for line in open(fasta,'r').readlines():
                 if line[:1] != '>': outlines.append(line); continue
                 header = rje.matchExp(r'^>(\S+) (\S.+)$',line)
                 if header: (name,desc) = header
                 else: (name,desc) = (rje.matchExp(r'^>(\S+)',line)[0],'')   # No description: never reuse the previous one
                 fields = name.split('|')
                 if len(fields) == 3: outlines.append('>6rf_NEIME__%s\n' % fields[2])
                 elif len(fields) == 5:
                     name = 'ref_NEIME__%s' % fields[3]
                     if desc: outlines.append('>%s %s\n' % (name,desc))
                     else: outlines.append('>%s\n' % name)
                 else:
                     print(fields)
                     raise ValueError('Unexpected sequence name format: %s' % name)
                 self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
             if outlines:    # As before, no output file is created for an empty input
                 FAS = open(fas,'w')
                 try: FAS.writelines(outlines)
                 finally: FAS.close()
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         ACC = open(acc,'w')     # One handle for the whole stage; closed before the file is read back in [3]
         try:
             for csvfile in glob.glob('MC58_6RF_CSV/*.CSV'):
                 cx += 1
                 sample = os.path.basename(csvfile)[:-4]     # File name without the .CSV extension
                 hits = False    # Set once the 'prot_hit_num,prot_acc' header line has been seen
                 for line in open(csvfile,'r').readlines():
                     if line.startswith('prot_hit_num,prot_acc'): hits = True
                     elif hits:
                         data = rje.readDelimit(line,',')
                         if len(data) < 2: continue
                         [num,name] = data[:2]
                         try: name = name.split('|')[2]
                         except: continue    # Accession without three |-separated fields is skipped
                         if name not in rfhits:
                             ACC.write('6rf_NEIME__%s\n' % name)
                             rfhits[name] = []
                         hitid = '%s:%s' % (sample,num)
                         if hitid not in rfhits[name]: rfhits[name].append(hitid)
                         self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         finally: ACC.close()
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if int(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         ZEROS = open('6rf_zeros.acc','w')
         try: ZEROS.write('\n'.join(zeros))
         finally: ZEROS.close()
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,ufile.replace('hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = rf.split('__')[1]   # Strip the '6rf_NEIME__' style prefix added in [2]
             gdata[rf]['Sample'] = '; '.join(rfhits[rfcut])
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'     # No GABLAM hit recorded for this query
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])

     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
Esempio n. 52
0
 def addToGeneCards(
     self,
     cards,
     addcards=True
 ):  ### Reconfigures and adds parsed HPRD data to GeneCards
     '''
     Reconfigures and adds parsed HPRD data to GeneCards.
     >> cards:rje_genecards.GeneCards object
     >> addcards:boolean [True] = whether to add genes from HPRD to the GeneCards dictionary
     For a gene already in cards.list['Genes'], the HPRD ID, OMIM and Entrez values are stored in the card's HPRD,
     OMIM and EntrezCheck fields: an empty field is filled, otherwise the value is appended to the comma-separated
     list unless already present. Other genes are only added (Symbol '!FAILED!') when addcards is True; a gene of
     '-' is stored under 'HPRD' + the HPRD ID. The cards object is modified in place and nothing is returned.
     '''
     ### Add relevant headers for future output ###
     for h in ['HPRD', 'OMIM', 'EntrezCheck', 'Desc']:
         if h not in cards.list['Headers']:
             cards.list['Headers'].append(h)
         for gene in cards.list['Genes']:
             if h not in cards.dict['GeneCard'][gene]:
                 cards.dict['GeneCard'][gene][h] = ''
     ### Add to GeneCards ###
     (hx, htot) = (0.0, len(self.dict['HPRD']))
     # Set mirror of cards.list['Genes'] so the per-entry membership test is not a linear list scan.
     # It is kept in step with every append made below.
     known = set(cards.list['Genes'])
     for hprd in self.dict['HPRD']:
         self.log.printLog('\r#HPRD',
                           'Adding HPRD to GeneCards: %.1f%%' % (hx / htot),
                           newline=False,
                           log=False)
         hx += 100.0
         entry = self.dict['HPRD'][hprd]
         self.deBug(entry)
         gene = entry['gene']
         omim = entry['omim']
         entrez = entry['entrez']
         if gene in known:
             card = cards.dict['GeneCard'][gene]
             # Same rule for all three fields: fill if empty, else append if not already listed.
             for (field, value) in [('HPRD', hprd), ('OMIM', omim),
                                    ('EntrezCheck', entrez)]:
                 if card[field] == '':
                     card[field] = value
                 else:
                     current = card[field].split(',')
                     if value not in current:
                         card[field] = ','.join(current + [value])
         elif addcards:
             if gene == '-': gene = 'HPRD' + hprd
             cards.list['Genes'].append(gene)
             known.add(gene)
             cards.dict['GeneCard'][gene] = {
                 'Symbol': '!FAILED!',
                 'HPRD': hprd,
                 'OMIM': omim,
                 'EntrezCheck': entrez,
                 'Desc': entry['desc']
             }
     self.log.printLog(
         '\r#HPRD',
         'Added %s HPRD genes to GeneCards.' % (rje.integerString(htot)))
Esempio n. 53
0
 def run(self):  ### Main run method
     '''
     Main run method. Loads the sequences (plus occurrence data if an OccFile is given), calculates the requested
     plot statistics for each sequence and hands each sequence to self.output(). Exceptions are logged via
     errorLog() and not re-raised. Nothing is returned.
     '''
     try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] Basefile is either blank or ends with a '.' ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         basefile = self.info['Basefile']
         if basefile.lower() in ['','none']:
             self.info['Basefile'] = ''
         elif basefile[-1] != '.':
             self.info['Basefile'] = basefile + '.'
         ## ~ [1b] Sequences and feature list ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         ## ~ [1c] Optional occurrence data, grouped as {seq:{dataset:[occurrence data]}} ~~~~~~~~~~~~~~~~~~~~ ##
         occfile = self.info['OccFile']
         if occfile.lower() not in ['','none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=occfile)
             occbyseq = self.dict['OccData'] = {}
             occfields = ['Seq','Dataset','Pattern','Start_Pos','End_Pos']
             occdata = rje.dataDict(self,occfile,occfields,occfields[:])
             for key in rje.sortKeys(occdata):
                 occ = occdata[key]
                 seqname = occ.pop('Seq')
                 seqocc = occbyseq.setdefault(seqname,{})
                 dataset = occ.pop('Dataset')
                 seqocc.setdefault(dataset,[]).append(occ)
             self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(occbyseq))))
             # Restrict the sequence list to sequences with occurrence data
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % ','.join(rje.sortKeys(occbyseq))])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         plotstats = self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
         if 'cons' in plotstats or 'rel' in plotstats:
             slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']:
                 continue
             seqobj = seqdict[name]
             sequence = seqobj.getSequence(gaps=False)
             seqobj.dict['PlotStat'] = {}
             if 'sa' in plotstats:
                 seqobj.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
             if 'hyd' in plotstats:
                 seqobj.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
             if 'dis' in plotstats:
                 seqobj.dict['PlotStat']['Disorder'] = seqobj.disorder(returnlist=True)
             if 'cons' in plotstats or 'rel' in plotstats:
                 slimcalc.relConListFromSeq(seqobj,slimcalc.stat['RelConWin'],store=True)
                 try:    # Move the conservation lists into the PlotStat dictionary
                     seqobj.dict['PlotStat']['Cons_Abs'] = seqobj.list.pop('Cons')
                     seqobj.dict['PlotStat']['Cons_Rel'] = seqobj.list.pop('RelCons')
                 except:
                     self.printLog('#CONS','No conservation stats for %s' % name)
             self.printLog('#STAT','PlotStats calculated for %s' % name)
             ## ~ [2a] Window (all but Cons_Rel, when PlotWin >= 0) and convert each stat ~~~~~~~~~~~~~~~~~~~~ ##
             for stat in seqobj.dict['PlotStat']:
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     seqobj.dict['PlotStat'][stat] = self.plotWin(seqobj.dict['PlotStat'][stat])
                 seqobj.dict['PlotStat'][stat] = self.convertStat(seqobj.dict['PlotStat'][stat])
             self.printLog('#STAT','PlotStats converted for %s' % name)
         ### ~ [3] Output Data (still inside the per-sequence loop) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name not in self.dict['OccData']:
                 self.output(seqobj,'%s%s.plot.txt' % (self.info['Basefile'],seqobj.info['AccNum']))
                 continue
             for dataset in self.dict['OccData'][name]:  # One plot file per dataset for this sequence
                 ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seqobj.info['AccNum'])
                 self.output(seqobj,ofile,self.dict['OccData'][name][dataset])
         return
     except: self.errorLog(rje_zen.Zen().wisdom())
Esempio n. 54
0
    def parseOMIM(self):    ### Main parsing method
        '''
        Main parsing method. Reads the OMIM text file named by self.info['Name'] and populates:
        - self.dict['Records'] = {gene:[record lines read after '*FIELD* NO']}
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
        Only '*FIELD* AV' entries whose third line reads 'GENE, XXXnnnYYY', with both XXX and YYY found in the
        upper-cased rje_sequence.aa_code_3 values, are kept. Calls self.saveMutations() when parsing is complete.
        Any exception is logged via errorLog() and then re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            import re   # Local import: only needed to escape the gene symbol below
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            aas = ' '.join(rje_sequence.aa_code_3.values()).upper().split()
            ## ~ [1a] Count lines for the progress percentage ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            COUNT = open(self.info['Name'],'r')    # Raises if the file is missing, as before
            try: olen = sum(1 for countline in COUNT)
            finally: COUNT.close()
            olen = max(olen,1)      # An empty file must not cause a division by zero in the progress log
            (ox,mx) = (0.0,0)
            oline = True            # Primes the read loop; readline() returns '' at the end of the file
            OMIM = open(self.info['Name'],'r')
            try:
                ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                record = gene = subid = disease = mutation = ''
                av = False      # Whether reading *FIELD* AV for mutation data
                while oline:
                    oline = OMIM.readline()
                    self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/olen,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
                    ox += 100.0
                    if not av and oline[:1] != '*': continue    # Outside AV fields only '*' marker lines matter
                    line = rje.chomp(oline).rstrip(' ')
                    ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if line == '*RECORD*': (record,av) = ('',False)
                    elif line == '*FIELD* NO':    # New record
                        record = rje.chomp(OMIM.readline())
                        gene = ''
                        ox += 100.0
                    ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* TI':      # New gene
                        gene = rje.chomp(OMIM.readline()).split()[-1]  # Last word of the line after the TI marker
                        subid = ''
                        av = False
                        ox += 100.0
                    ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* AV': av = True        # Start of mutation records
                    elif av and rje.matchExp(r'^(\.\d+)',line):  # New subid mutation record
                        subid = rje.matchExp(r'^(\.\d+)',line)[0]
                        disease = rje.chomp(OMIM.readline())
                        mutline = rje.chomp(OMIM.readline())
                        ox += 200.0     # Both extra lines are counted, whether or not the mutation line matches
                        # The gene symbol is escaped so any regex metacharacters in it are matched literally.
                        try: mutation = rje.matchExp(r'^%s, (\D\D\D\d+\D\D\D)' % re.escape(gene),mutline)[0]
                        except: continue    # No mutation or not coding change
                        subaa = rje.matchExp(r'(\D\D\D)\d+(\D\D\D)',mutation)
                        if subaa[0] not in aas or subaa[1] not in aas: continue
                        if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                        if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                        if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                        mx += 1
                        self.dict['Mutations'][gene][subid] = (disease,mutation)
            finally: OMIM.close()   # Closed even if parsing fails part-way through

            ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
            self.saveMutations()
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible