コード例 #1
0
ファイル: unifake.py プロジェクト: kwikwag/SLiMSuite
 def loadFeatures(self,ftfile):  ### Loads features from given file
     '''
     Loads features from a delimited text file into self.dict['Features'].
     >> ftfile:str = Features file name. '' or 'none' means there is no file and nothing is loaded.
     The first field of the file is used as the ID. A 'feature' field, 'start' and 'end' fields (or 'ft_start' and
     'ft_end') and a 'description' field are also required; field names are matched ignoring case.
     Each feature is appended to the self.dict['Features'][ID] list as a {'Type','Start','End','Desc'} dictionary.
     Problems are logged rather than raised.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['','none']: return
         # The message has to be formatted with the file name, otherwise a literal "%s" is logged
         if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         with open(ftfile,'r') as fthandle: headline = fthandle.readline()    # Handle closed once header is read
         headers = rje.readDelimit(headline,delimit)
         mainkeys = [headers[0]]
         hmap = {}   # Lower case field name -> field name as written in the file
         for h in headers: hmap[h.lower()] = h
         pos = ''    # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in ['feature','%sstart' % pos,'%send' % pos,'description']:
             if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile))
             mainkeys.append(hmap[h])
         mainkeys.remove(hmap['description'])    # Description is loaded as data, not as part of the key
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE(review): datakeys uses 'description' but the data is read back with hmap['description'] - confirm that
         # rje.dataDict copes with a description field that is not all lower case in the file.
         ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True)
         (mx,mtot,fx) = (0.0,len(ftdata),0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot))
             mx += 100.0
             (ftid,ft,start,end) = mainkey.split(delimit)
             if ftid == mainkeys[0]: continue    # ID is the name of the ID field: presumably the header line
             if ftid not in self.dict['Features']: self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][hmap['description']]:
                 fx += 1
                 self.dict['Features'][ftid].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc})
         self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile))
     except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
コード例 #2
0
ファイル: slimgoer.py プロジェクト: lyhniupi1/SLiMSuite
 def readSLiMSearchOcc(self,motifs=[]):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch occurrences from "<ResFile>.csv" into self.dict['Occ'].
     >> motifs:list = Names of the motifs to process. Occurrences of any other motif are skipped.
     Occurrences are also skipped if HomNum is below self.stat['MinHom'], if no "gene:XXX]" can be found in the
     description, or if self.ensGO(gene) returns nothing.
     Kept occurrences are stored as self.dict['Occ'][motif][type][gene] = [occurrence data]. The type is 'Rev' for
     motif names ending 'rev' (last 4 characters cut from the name), 'Scr' for names ending 'scram' (last 6
     characters cut) and 'ELM' otherwise.
     '''
     try:### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not motifs: self.printLog('#OCC','Cannot process occurrences for No motifs!')
         occfile = '%s.csv' % self.info['ResFile']
         delimit = rje.delimitFromExt(filename=occfile)
         occkeys = ['Motif','Seq','Start_Pos','End_Pos']
         occfields = 'Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA'.split(',')
         data = rje.dataDict(self,occfile,mainkeys=occkeys,datakeys=occfields)
         self.dict['Occ'] = {}
         occdict = self.dict['Occ']
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,otot) = (0,len(data))   # mx counts the motifs added to self.dict['Occ']
         for (ox,occ) in enumerate(data):
             self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,(100.0*ox)/otot))
             occdata = data[occ]
             if int(occdata['HomNum'],10) < self.stat['MinHom']: continue
             (motif,_seq,_start,_end) = occ.split(delimit)
             if motif not in motifs: continue
             try:    # Any failure to get a gene with GO data simply skips the occurrence
                 gene = rje.matchExp('gene:(\S+)\]',occdata['Desc'])[0]
                 self.deBug('%s:%s' % (gene,self.ensGO(gene)))
                 if not self.ensGO(gene): continue
             except: continue
             if motif.endswith('rev'): (motif,mtype) = (motif[:-4],'Rev')
             elif motif.endswith('scram'): (motif,mtype) = (motif[:-6],'Scr')
             else: mtype = 'ELM'
             if motif not in occdict:
                 occdict[motif] = {}
                 mx += 1
             occdict[motif].setdefault(mtype,{}).setdefault(gene,[]).append(occdata)
         self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #3
0
ファイル: rje_omim.py プロジェクト: slimsuite/SLiMSuite
    def loadMutations(self):    ### Inputs parsed mutations back into dictionaries
        '''
        Reads parsed mutations back in from "omim_mutations.tdt" (in the run directory).
        Both dictionaries are emptied first, then filled as:
        - self.dict['Records'] = {gene:[OMIM IDs]}
        - self.dict['Mutations'] = {gene:{SubID:(disease,mutation)}}, where mutation is WildAA+Pos+MutAA.
        << bool = True if mutations were loaded; False if the file is missing or an error was logged.
        '''
        try:### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            infile = 'omim_mutations.tdt'
            if not os.path.exists(infile): return False
            headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            datadict = rje.dataDict(self,infile,headers[:2],headers,'\t')
            mx = len(datadict)

            ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            records = self.dict['Records']
            mutations = self.dict['Mutations']
            for dkey in list(datadict):     # List of keys, as entries are popped during the loop
                data = datadict.pop(dkey)
                (record,subid,gene) = (data['OMIM_ID'],data['SubID'],data['Gene'])
                mutation = '%s%s%s' % (data['WildAA'],data['Pos'],data['MutAA'])
                disease = data['Disease']
                generecords = records.setdefault(gene,[])
                if record not in generecords: generecords.append(record)
                mutations.setdefault(gene,{})[subid] = (disease,mutation)
            self.log.printLog('\r#OMIM','Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx),rje.integerString(len(records))))
            return True
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            return False
コード例 #4
0
 def loadAlias(self, sourcefile):  ### Loads Alias data
     '''
     Loads alias data from file and adds each alias with self.addAlias().
     >> sourcefile:str = Source filename. '' or 'none' (any case) means there is nothing to load.
     Each 'Aliases' entry read for an ID is split on commas and every part is added as an alias of that ID.
     The alias list of each processed ID in self.dict['Aliases'] is then sorted.
     '''
     try:  ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if sourcefile.lower() in ('', 'none'):
             return
         if not os.path.exists(sourcefile):
             missing = 'Alias file "%s" not found' % (sourcefile)
             return self.log.errorLog(missing, printerror=False)
         data = rje.dataDict(self, sourcefile, datakeys=['Aliases'], lists=True)
         ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         htot = len(data)
         for (hx, id) in enumerate(data):
             progress = 'Processing %s: %.1f%%' % (sourcefile, (100.0 * hx) / htot)
             self.log.printLog('\r#ALIAS', progress, newline=False, log=False)
             ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for alist in data[id]['Aliases']:
                 for alias in alist.split(','):
                     self.addAlias(id, alias)
             if id in self.dict['Aliases']:
                 self.dict['Aliases'][id].sort()
         summary = 'Processed %s: %s IDs with aliases' % (
             sourcefile, rje.integerString(len(self.dict['Aliases'])))
         self.log.printLog('\r#ALIAS', summary)
     except:
         self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #5
0
 def loadFeatures(self, ftfile):  ### Loads features from given file
     '''
     Loads features from a delimited text file into self.dict['Features'].
     >> ftfile:str = Features file name. '' or 'none' means there is no file and nothing is loaded.
     The first field of the file is used as the ID. A 'feature' field, 'start' and 'end' fields (or 'ft_start' and
     'ft_end') and a 'description' field are also required; field names are matched ignoring case.
     Each feature is appended to the self.dict['Features'][ID] list as a {'Type','Start','End','Desc'} dictionary.
     Problems are logged rather than raised.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if ftfile in ['', 'none']: return
         if not os.path.exists(ftfile):
             # The message has to be formatted with the file name, otherwise a literal "%s" is logged
             return self.printLog('#ERR',
                                  'Features file "%s" missing' % ftfile)
         delimit = rje.delimitFromExt(filename=ftfile)
         ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         with open(ftfile, 'r') as fthandle:  # Handle closed once header is read
             headline = fthandle.readline()
         headers = rje.readDelimit(headline, delimit)
         mainkeys = [headers[0]]
         hmap = {}  # Lower case field name -> field name as written in the file
         for h in headers:
             hmap[h.lower()] = h
         pos = ''  # Leader for start/end positions
         if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_'
         for h in [
                 'feature',
                 '%sstart' % pos,
                 '%send' % pos, 'description'
         ]:
             if h not in hmap:
                 return self.printLog(
                     '#ERR', 'No %s field detected in "%s" features file' %
                     (h, ftfile))
             mainkeys.append(hmap[h])
         # Description is loaded as data, not as part of the key
         mainkeys.remove(hmap['description'])
         ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NOTE(review): datakeys uses 'description' but the data is read back with hmap['description'] - confirm that
         # rje.dataDict copes with a description field that is not all lower case in the file.
         ftdata = rje.dataDict(self,
                               ftfile,
                               mainkeys, ['description'],
                               delimit,
                               headers,
                               lists=True)
         (mx, mtot, fx) = (0.0, len(ftdata), 0)
         for mainkey in rje.sortKeys(ftdata):
             self.progLog(
                 '\r#FT',
                 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot))
             mx += 100.0
             (ftid, ft, start, end) = mainkey.split(delimit)
             # ID is the name of the ID field: presumably the header line
             if ftid == mainkeys[0]: continue
             if ftid not in self.dict['Features']:
                 self.dict['Features'][ftid] = []
             for desc in ftdata[mainkey][hmap['description']]:
                 fx += 1
                 self.dict['Features'][ftid].append({
                     'Type': ft,
                     'Start': int(start),
                     'End': int(end),
                     'Desc': desc
                 })
         self.printLog(
             '\r#FT', 'Loaded %s features for %s IDs from %s' %
             (rje.integerString(fx),
              rje.integerString(len(self.dict['Features'])), ftfile))
     except:
         self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
コード例 #6
0
ファイル: rje_hpc.py プロジェクト: slimsuite/SLiMSuite
 def pickupList(self):   ### Generates Pickup List from file(s)                                                  #V1.0
     '''
     Generates Pickup List from file(s).
     << list = Keys read with the PickHead field from each "<basefile>.<out>" file, for every out in
               self.list['OutList']. Empty list if no PickHead is set. A file that fails to load adds nothing.
     '''
     pickhead = self.getStrLC('PickHead').lower()
     if not pickhead: return []
     picklist = []
     for out in self.list['OutList']:
         resfile = '%s.%s' % (self.baseFile(),out)
         try: pickdat = rje.dataDict(self,resfile,[self.getStr('PickHead')])
         except: pickdat = {}    # Best effort: problems with one file are ignored
         picklist.extend(pickdat.keys())
     return picklist
コード例 #7
0
ファイル: rje_phos.py プロジェクト: kwikwag/SLiMSuite
    def readPELM(self): ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''
        Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        - self.dict['PhosphoSites'] is filled as {acc:{pos:{'aa':code}}} from the self.info['PELM'] file.
        - One sequence per accession number is added to a new SeqList, stored as self.obj['SeqList'].
        - The UniProt object used to look up the accession numbers is stored as self.obj['UniProt'].
        Data mismatches are logged as warnings and do not stop the method.
        '''
        try:### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            data = rje.dataDict(self,self.info['PELM'],mainkeys=['acc','position'])
            seqdict = {}    # Dictionary of Acc:Sequence

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pdict = self.dict['PhosphoSites']
            for dkey in data:
                entry = data[dkey]
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (acc,postxt) = dkey.split()
                pos = int(postxt,10)
                site = pdict.setdefault(acc,{}).setdefault(pos,{})
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if acc not in seqdict: seqdict[acc] = entry['sequence']
                elif seqdict[acc] != entry['sequence']:
                    self.log.printLog('#ERR','Warning. Sequence mismatch for %s' % acc)
                code = entry['code']
                if 'aa' not in site: site['aa'] = code
                elif site['aa'] != code:
                    self.log.printLog('#ERR','Warning. PhosphoSite mismatch for %s at pos %d: %s not %s' % (acc,pos,code,site['aa']))
                residue = seqdict[acc][pos-1:pos]   # Residue at (1-based) position pos of the stored sequence
                if code != residue:
                    self.log.printLog('#ERR','Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s' % (acc,pos,code,residue))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt entry
            unidict = pelmuni.accDict(acclist)        # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log,self.cmd_list+['seqin=None'])            # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:     #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:    # UniProt entry is looked up with the part of the accession number before any '-'
                    uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                    desc = uniseq.info['Description']
                    name = '%s__%s %s' % (uniseq.info['ID'],acc,desc)
                    if sequence != uniseq.info['Sequence']:
                        self.log.printLog('#WARNING','Sequence mismatch for UniProt entry %s' % acc)
                except:
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (acc,acc)             #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name,sequence)   # seqdict now holds what _addSeq() returns
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                for acc in acclist:
                    if seqdict[acc] in pelmseq.seq: continue
                    seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            savefas = not os.path.exists(self.info['PELMFas']) or self.stat['Interactive'] < 0 or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas'])
            if savefas: pelmseq.saveFasta(seqfile=self.info['PELMFas'])
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except: self.log.errorLog('Problem during PhosphoSeq.readPELM')
コード例 #8
0
ファイル: rje_genecards.py プロジェクト: kwikwag/SLiMSuite
 def setup(self):    ### Sets up headers and reads in existing data if present
     '''
     Sets up headers and reads in existing data if present.
     - self.readHGNC() is called first, then every file in self.list['AltSource'] is read in turn. With
       self.opt['Update'], an existing self.info['CardOut'] file is added to the end of that list.
     - Data is stored in self.dict['GeneCard'] under the upper case gene and also under its 'Symbol', if it has one.
     - self.list['Headers'] is set to 'Alias', 'Species', gc_headers and then any extra headers found in the files.
     Errors are logged and then raised again.
     '''
     try:
         ### ~ Setup Basic Headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         headers = ['Alias','Species'] + gc_headers  # All other headers added from altsource list
         ### ~ Read in data from existing files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.readHGNC()
         if self.opt['Update'] and os.path.exists(self.info['CardOut']): self.list['AltSource'].append(self.info['CardOut'])
         for altsource in self.list['AltSource']:
             sourcefile = rje.makePath(altsource,True)
             if not os.path.exists(sourcefile):
                 self.log.errorLog('Alternative source "%s" missing!' % sourcefile,printerror=False,quitchoice=True)
                 continue
             update = rje.dataDict(self,sourcefile,getheaders=True,ignore=['#'])
             for h in update.pop('Headers'):
                 if h in headers: continue
                 headers.append(h)
             self.log.printLog('#DATA','Read GeneCards data for %d genes.' % (len(update)))
             cards = self.dict['GeneCard']
             for gene in rje.sortKeys(update):     # Each source will overwrite data from the file before
                 ## ~ Convert to Upper Case for consistency ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 ugene = gene.upper()
                 if gene != ugene:
                     if ugene in update: continue    # Only use upper case one!
                     update[ugene] = update.pop(gene)
                     gene = ugene
                 if gene == '!FAILED!': continue
                 newdata = update[gene]
                 ## ~ Update main dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if self.opt['Update'] and altsource == self.info['CardOut'] and gene not in self.list['Genes']:
                     self.list['Genes'].append(gene)
                 if gene in cards: rje.combineDict(cards[gene],newdata)
                 else: cards[gene] = newdata
                 ## ~ Temp Debugging ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if gene in self.list['TestGenes']:
                     print(gene)
                     print(newdata)
                     self.deBug(cards[gene])
                 ## ~ Check Aliases etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                 if 'Symbol' in cards[gene]: cards[gene]['Symbol'] = cards[gene]['Symbol'].upper()
                 if 'Symbol' in newdata and newdata['Symbol'] != '!FAILED!':
                     symbol = newdata['Symbol']
                     if symbol in cards: rje.combineDict(cards[symbol],newdata,overwrite=False,replaceblanks=True)
                     else: cards[symbol] = newdata
                 self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(cards)),newline=False,log=False)
                 if len(gene.split()) > 1: print(' '.join(['!!!',gene,'!!!']))  # Flags genes containing whitespace
         ### ~ Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.log.printLog('\r#CARD','Extracted GeneCards data for %d genes.' % (len(self.dict['GeneCard'])))
         self.list['Headers'] = list(headers)
         if self.opt['Update']: self.opt['Append'] = False
     except:
         self.log.errorLog('Problem during GeneCards.setup()')
         raise
コード例 #9
0
ファイル: rje_glossary.py プロジェクト: slimsuite/SLiMSuite
 def setup(self,gtext=''):    ### Main class setup method. gtext will over-ride input file.
     '''
     Main class setup method. gtext will over-ride input file.
     >> gtext:str [''] = Glossary text with one "term<TermSplit>definition" entry per line. If empty, the InFile
                         file is read instead: as a table with rje.dataDict() if KeepOrder is off and the file
                         starts with 'Term', otherwise as glossary text.
     << bool = True if setup was successful. False if an error was logged.
     If self.list['Terms'] is given, only those terms are kept; otherwise it is set to the terms read (in input
     order with KeepOrder, else sorted). self.dict['Glossary'] is set to a nested dictionary keyed by the lower case
     words of each term in turn, with the definition stored under '=' at the end.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['HTML'] = rje_html.HTML(self.log,self.cmd_list)
         ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.basefile().lower() in ['','none']: self.basefile(rje.baseFile(self.getStr('InFile')))
         if self.getStr('OutFile').lower() in ['','none']: self.str['OutFile'] = '%s.html' % self.basefile()
         ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         interms = []
         glossary = None     # Stays None until the glossary has been read as a table or parsed from text
         if gtext: (glossary,interms) = self._glossaryFromText(gtext)
         else:
             try:
                 astable = False
                 if not self.getBool('KeepOrder'):
                     with open(self.getStr('InFile'),'r') as inhandle: astable = inhandle.readline()[:4] == 'Term'
                 if astable: glossary = rje.dataDict(self,self.getStr('InFile'),mainkeys=['Term'],datakeys=['Term','Definition'])
             except: self.errorLog('Problem reading input as dataDict(). Will try as text.')
             if glossary is None:
                 # The file text is parsed directly rather than by calling setup() again: an empty file gives empty
                 # text, which would send a recursive setup('') straight back to reading the file without end.
                 with open(self.getStr('InFile'),'r') as inhandle: filetext = inhandle.read()
                 (glossary,interms) = self._glossaryFromText(filetext)
         if self.list['Terms']:
             # Loop over a copy of the keys: removing entries while looping over the dictionary itself raises a
             # RuntimeError, which made setup fail whenever a term had to be filtered out.
             for term in list(glossary):
                 if term not in self.list['Terms']: glossary.pop(term)
         elif self.getBool('KeepOrder'): self.list['Terms'] = interms
         else: self.list['Terms'] = rje.sortKeys(glossary)
         for term in glossary: glossary[term] = glossary[term]['Definition']
         ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nested = {}
         for term in glossary:
             tdict = nested
             for word in term.lower().split():
                 if word not in tdict: tdict[word] = {}
                 tdict = tdict[word]
             tdict['='] = glossary[term]
         self.dict['Glossary'] = nested
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed

 def _glossaryFromText(self,gtext):    ### Parses glossary text into a glossary dictionary and ordered term list
     '''
     Parses glossary text into a glossary dictionary and ordered term list.
     >> gtext:str = Glossary text with one "term<TermSplit>definition" entry per line.
     << tuple = (glossary,interms), where glossary is {term:{'Definition':definition}} and interms is the list of
                terms in the order read. A first entry with the term 'Term' is skipped as a header.
     '''
     delimit = self.getStr('TermSplit')
     # TermSplit can be one of these names instead of the actual character(s)
     named = {'tab':'\t','space':' ','comma':',','period (.)':'.','colon':':'}
     delimit = named.get(delimit.lower(),delimit)
     glossary = {}
     interms = []
     for line in gtext.split('\n'):
         splitline = line.split(delimit)
         # With '.' as the split, an empty or single space final element (text after the last '.') is dropped
         if delimit == '.' and (splitline[-1] in ['',' ']): splitline = splitline[:-1]
         if not splitline: continue
         (term,definition) = (splitline[0],delimit.join(splitline[1:]))
         if term == 'Term' and not glossary: continue
         if term:
             glossary[term] = {'Definition':definition}
             interms.append(term)
     return (glossary,interms)
コード例 #10
0
    def ppiDisMatrix(self):  ### Converts PPI Table into distance matrix
        '''
        Converts PPI Table into distance matrix and saves it as "<Basefile>.ppi_dis.txt".
        The proteins are the table headers after the first. For each pair of proteins, every table entry with a
        non-zero/non-empty value for either protein is counted (ppi), as is every entry with a value for only one
        of the two (unique). The distance is unique, divided by ppi if self.opt['Scaled'].
        << False if the PPI table is missing or an error was logged. Nothing (None) is returned otherwise.
        '''
        try:
            ### Check File ###
            if not os.path.exists(self.info['PPITab']):
                self.log.errorLog('PPI Table file "%s" missing!' %
                                  self.info['PPITab'],
                                  printerror=False)
                return False

            ### Setup ###
            data = rje.dataDict(self, self.info['PPITab'], getheaders=True)
            headers = data.pop('Headers')
            ppidis = rje_dismatrix.DisMatrix(self.log, self.cmd_list)
            ppidis.opt['Symmetric'] = True
            ppidis.setInfo({
                'Name': '%s.ppi_dis.txt' % self.info['Basefile'],
                'Type': 'PPI'
            })

            ### Make DisMatrix ###
            for p1 in headers[1:]:
                ppidis.addDis(p1, p1, 0)
                for p2 in headers[headers.index(p1) + 1:]:
                    ppi = 0
                    unique = 0
                    for i in data.keys():
                        # Values are used as numbers where possible, so that a '0' counts as no interaction
                        try:
                            v1 = int(data[i][p1])
                        except (ValueError, TypeError):
                            v1 = data[i][p1]
                        try:
                            v2 = int(data[i][p2])
                        except (ValueError, TypeError):
                            v2 = data[i][p2]
                        if v1 or v2:
                            ppi += 1
                            if not (v1 and v2):
                                unique += 1
                    if not self.opt['Scaled']:
                        ppidis.addDis(p1, p2, unique)
                    elif ppi:
                        ppidis.addDis(p1, p2, float(unique) / float(ppi))
                    else:
                        # Neither protein has any interactors: avoid dividing by zero, which used to abort the
                        # whole matrix. unique is also 0 here, so 0.0 matches the unscaled distance.
                        # NOTE(review): confirm 0.0 is the wanted distance for two proteins without interactors.
                        ppidis.addDis(p1, p2, 0.0)

            ### Output ###
            delimit = rje.getDelimit(self.cmd_list, default=',')
            ppidis.saveMatrix(headers[1:], ppidis.info['Name'], delimit)

        except:
            self.log.errorLog('Major problem with rje_ppi.ppiDisMatrix')
            return False
コード例 #11
0
ファイル: rje_seqplot.py プロジェクト: kwikwag/SLiMSuite
 def run(self):  ### Main run method
     '''
     Main run method.
     Loads the sequences (and the occurrences from OccFile, if given), calculates the statistics listed in
     self.list['PlotStat'] for each sequence and calls self.output() to output the plot data for each sequence
     (once per dataset for sequences with occurrence data).
     '''
     try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         basefile = self.info['Basefile']
         if basefile.lower() in ['','none']: self.info['Basefile'] = ''
         elif basefile[-1] != '.': self.info['Basefile'] = basefile + '.'    # Basefile is used as a "name." prefix
         self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         occfile = self.info['OccFile']
         if occfile.lower() not in ['','none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=occfile)
             self.dict['OccData'] = {}       # {seq:{dataset:[occurrence data]}}
             occfields = ['Seq','Dataset','Pattern','Start_Pos','End_Pos']
             occdata = rje.dataDict(self,occfile,list(occfields),list(occfields))
             for key in rje.sortKeys(occdata):
                 occ = occdata[key]
                 seqocc = self.dict['OccData'].setdefault(occ.pop('Seq'),{})
                 seqocc.setdefault(occ.pop('Dataset'),[]).append(occ)
             self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(self.dict['OccData']))))
             goodseq = ','.join(rje.sortKeys(self.dict['OccData']))
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % goodseq])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
         plotstats = self.list['PlotStat']
         usecons = 'cons' in plotstats or 'rel' in plotstats
         if usecons: slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']: continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = {}
             if 'sa' in plotstats:
                 seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
             if 'hyd' in plotstats:
                 seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
             if 'dis' in plotstats:
                 seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
             if usecons:
                 slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                 try:
                     seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                     seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                 except: self.printLog('#CONS','No conservation stats for %s' % name)
             self.printLog('#STAT','PlotStats calculated for %s' % name)
             for stat in seq.dict['PlotStat']:
                 # Cons_Rel is never passed through plotWin(); a negative PlotWin skips plotWin() for all stats
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                 seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
             self.printLog('#STAT','PlotStats converted for %s' % name)
             ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name not in self.dict['OccData']:
                 self.output(seq,'%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum']))
                 continue
             for dataset in self.dict['OccData'][name]:
                 ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                 self.output(seq,ofile,self.dict['OccData'][name][dataset])
         return
     except: self.errorLog(rje_zen.Zen().wisdom())
コード例 #12
0
ファイル: rje_hpc.py プロジェクト: lyhniupi1/SLiMSuite
 def pickupList(self):  ### Generates Pickup List from file(s)                                                  #V1.0
     '''
     Generates Pickup List from file(s).
     << list = Keys read with the PickHead field from each "<basefile>.<out>" file, for every out in
               self.list['OutList']. Empty list if no PickHead is set. A file that fails to load adds nothing.
     '''
     pickhead = self.getStrLC('PickHead').lower()
     if not pickhead:
         return []
     picklist = []
     for out in self.list['OutList']:
         resfile = '%s.%s' % (self.baseFile(), out)
         try:
             pickdat = rje.dataDict(self, resfile, [self.getStr('PickHead')])
         except:  # Best effort: problems with one file are ignored
             pickdat = {}
         picklist.extend(pickdat.keys())
     return picklist
コード例 #13
0
ファイル: rje_hprd.py プロジェクト: kwikwag/SLiMSuite
    def ppiDisMatrix(self): ### Converts PPI Table into distance matrix
        '''
        Converts PPI Table into distance matrix and saves it as "<Basefile>.ppi_dis.txt".
        The proteins are the table headers after the first. For each pair of proteins, every table entry with a
        non-zero/non-empty value for either protein is counted (ppi), as is every entry with a value for only one
        of the two (unique). The distance is unique, divided by ppi if self.opt['Scaled'].
        << False if the PPI table is missing or an error was logged. Nothing (None) is returned otherwise.
        '''
        try:
            ### Check File ###
            if not os.path.exists(self.info['PPITab']):
                self.log.errorLog('PPI Table file "%s" missing!' % self.info['PPITab'],printerror=False)
                return False

            ### Setup ###
            data = rje.dataDict(self,self.info['PPITab'],getheaders=True)
            headers = data.pop('Headers')
            ppidis = rje_dismatrix.DisMatrix(self.log,self.cmd_list)
            ppidis.opt['Symmetric'] = True
            ppidis.setInfo({'Name':'%s.ppi_dis.txt' % self.info['Basefile'],'Type':'PPI'})

            ### Make DisMatrix ###
            for p1 in headers[1:]:
                ppidis.addDis(p1,p1,0)
                for p2 in headers[headers.index(p1)+1:]:
                    ppi = 0
                    unique = 0
                    for i in data.keys():
                        # Values are used as numbers where possible, so that a '0' counts as no interaction
                        try:
                            v1 = int(data[i][p1])
                        except (ValueError,TypeError):
                            v1 = data[i][p1]
                        try:
                            v2 = int(data[i][p2])
                        except (ValueError,TypeError):
                            v2 = data[i][p2]
                        if v1 or v2:
                            ppi += 1
                            if not (v1 and v2):
                                unique += 1
                    if not self.opt['Scaled']:
                        ppidis.addDis(p1,p2,unique)
                    elif ppi:
                        ppidis.addDis(p1,p2,float(unique)/float(ppi))
                    else:
                        # Neither protein has any interactors: avoid dividing by zero, which used to abort the
                        # whole matrix. unique is also 0 here, so 0.0 matches the unscaled distance.
                        # NOTE(review): confirm 0.0 is the wanted distance for two proteins without interactors.
                        ppidis.addDis(p1,p2,0.0)

            ### Output ###
            delimit = rje.getDelimit(self.cmd_list,default=',')
            ppidis.saveMatrix(headers[1:],ppidis.info['Name'],delimit)

        except:
            self.log.errorLog('Major problem with rje_ppi.ppiDisMatrix')
            return False
コード例 #14
0
ファイル: slimgoer.py プロジェクト: lyhniupi1/SLiMSuite
 def readSLiMSearch(self):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch results into data dictionary.
     The "<ResFile>.summary.csv" file is read to find the motifs with N_Occ of at least self.stat['MinOcc'] and
     these are given to self.readSLiMSearchOcc(), which reads the occurrences. "<ResFile>.csv" must also exist.
     << The self.errorLog() return value if either file is missing; otherwise nothing is returned.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         resfile = self.info['ResFile']
         sumfile = '%s.summary.csv' % resfile
         occfile = '%s.csv' % resfile
         if not os.path.exists(sumfile):
             return self.errorLog('No Summary file "%s"!' % sumfile,printerror=False)
         if not os.path.exists(occfile):
             return self.errorLog('No Occurrence file "%s"!' % occfile,printerror=False)
         ### ~ [2] Read Summary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         esum = rje.dataDict(self,sumfile,mainkeys=['Motif'],datakeys='All',getheaders=False)
         # List of motifs with enough occurrences
         occmotifs = [motif for motif in rje.sortKeys(esum) if int(esum[motif]['N_Occ'],10) >= self.stat['MinOcc']]
         ### ~ [3] Read Occurrences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#MOTIF','%d motifs with N_Occ >= MinOcc (%d)' % (len(occmotifs),self.stat['MinOcc']))
         self.readSLiMSearchOcc(occmotifs)
     except: self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #15
0
ファイル: qsf_analysis.py プロジェクト: slimsuite/SLiMSuite
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.
     [1] Loads the Pingu pairwise PPI table and builds gene<->sequence maps plus a {hub:{spoke:evidence}} dictionary.
     [2] Filters spokes into self.dict['PPI']: a spoke is kept if its evidence matches an entry loaded from
         hybrid.txt or, failing that, if none of the hub's other spokes is found among the spoke's own interactors.
     [3] Loads the EnsEMBL sequence file and stores the lookup dictionaries in self.dict.
     << True if setup completed; False if any exception was raised (it is logged via self.errorLog).
     '''
     try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         pairfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI','Loading pairwise data...')
         pairwise = rje.dataDict(self,pairfile,['Hub','Spoke'],['Spoke','SpokeSeq','Evidence'])
         gene2seq = {}
         seq2gene = {}
         fullppi = {}
         ppix = 0
         ptot = len(pairwise)    # Taken before the loop because entries are popped from pairwise as they are used
         px = 0.0
         for pair in rje.sortKeys(pairwise):
             self.progLog('\r#PPI','Processing full pairwise PPI: %.2f%%' % (px/ptot))
             px += 100.0
             (hub,spoke) = string.split(pair,'\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq
                 seq2gene[string.split(sseq,'__')[0]] = spoke
             partners = fullppi.setdefault(hub,{})
             if spoke not in partners:
                 partners[spoke] = pairwise.pop(pair)['Evidence']
                 ppix += 1
         self.printLog('\r#PPI','Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)),rje.integerString(ppix/2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(goodppifile,chomplines=True)
         self.dict['PPI'] = {}
         fppix = ppix            # Unfiltered interaction count, kept for the summary line
         ppix = 0
         ptot = len(fullppi)
         px = 0.0
         for hub in fullppi:
             self.progLog('\r#PPI','Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px/ptot,rje.integerString(len(self.dict['PPI'])),rje.integerString(ppix)))
             px += 100.0
             kept = []
             self.dict['PPI'][hub] = kept
             partners = fullppi[hub]
             for spoke in partners:
                 evidence = partners[spoke]
                 ## ~ [2a] Keep outright if the evidence matches one of the loaded types ~~~~~~~ ##
                 trusted = False
                 for ptype in goodppi:
                     if rje.matchExp(':(%s)($|\|)' % ptype, evidence):
                         trusted = True
                         break
                 ## ~ [2b] Otherwise keep only if no other spoke of this hub also pairs with it ~ ##
                 if not trusted:
                     trusted = True
                     for spoke2 in partners:
                         if spoke2 == hub or spoke2 == spoke: continue
                         # NOTE(review): fullppi[spoke] raises KeyError (caught below) if spoke is never a hub;
                         # presumably the pairwise table is symmetric - confirm.
                         if spoke2 in fullppi[spoke]:
                             trusted = False
                             break
                 if trusted: kept.append(spoke)
             ppix += len(kept)
             if not kept: self.dict['PPI'].pop(hub)     # Hubs left without spokes are dropped
         self.printLog('\r#PPI','Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)),rje.integerString(len(self.dict['PPI'])),rje.integerString(fppix/2),rje.integerString(ppix/2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F','seqnr=F','seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
         seqlist = rje_seq.SeqList(self.log,scmd)
         self.obj['SeqList'] = seqlist
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True     # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False    # Setup failed
コード例 #16
0
ファイル: unifake.py プロジェクト: kwikwag/SLiMSuite
 def loadAlias(self,sourcefile):  ### Loads Alias data
     '''
     Loads Alias data.
     >> sourcefile:str = Source filename
     Nothing is done if sourcefile is '' or 'none' (any case). Otherwise each 'Aliases' value read for an ID is split
     on commas and every alias passed to self.addAlias(); the ID's list in self.dict['Aliases'] is then sorted.
     '''
     try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if sourcefile.lower() in ('','none'):
             return
         if not os.path.exists(sourcefile):
             return self.log.errorLog('Alias file "%s" not found' % (sourcefile),printerror=False)
         aliasdata = rje.dataDict(self,sourcefile,datakeys=['Aliases'],lists=True)
         ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         total = len(aliasdata)
         done = 0.0
         for ident in aliasdata:
             self.log.printLog('\r#ALIAS','Processing %s: %.1f%%' % (sourcefile,done/total),newline=False,log=False)
             done += 100.0
             ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             for joined in aliasdata[ident]['Aliases']:
                 for alias in string.split(joined,','):
                     self.addAlias(ident,alias)
             known = self.dict['Aliases']
             if ident in known:
                 known[ident].sort()
         self.log.printLog('\r#ALIAS','Processed %s: %s IDs with aliases' % (sourcefile,rje.integerString(len(self.dict['Aliases']))))
     except:
         self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #17
0
 def mapEnsGO(self,spec='HUMAN',gokey='EnsGO',fixhead=True):   ### Extracts EnsEMBL GO mapping data from a BioMart download
     '''
     Extracts EnsEMBL GO mapping data from a BioMart download.
     >> spec:str ['HUMAN'] = Species code used in the mapping file names (ens_SPEC.GO*.tdt under info['EnsGOPath'])
     >> gokey:str ['EnsGO'] = Key passed to self.addGeneGO(); an empty self.dict[gokey] is created if missing
     >> fixhead:bool [True] = Whether to pick the gene/GO key columns from each file's own header line
     << False if no mapping file is found; otherwise there is no explicit return value.
     '''
     ### ~ [1] ~ Setup paths and files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     if gokey not in self.dict: self.dict[gokey] = {}
     ensmap = []
     for gtype in ['GO','GO.BP','GO.CC','GO.MF']:
         gfile = self.info['EnsGOPath'] + 'ens_%s.%s.tdt' % (spec,gtype)
         if os.path.exists(gfile): ensmap.append(gfile)
     if not ensmap:
         self.errorLog('EnsEMBL-GO mapping file (%s) missing' % self.info['EnsGOPath'],printerror=False)
         return False
     ### ~ [2] ~ Parse Gene-GO Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
     mainkeys = ['Ensembl Gene ID','GO ID']
     for gfile in ensmap:
         if fixhead:
             # Only the header line is needed: read just that line and close the handle, rather than loading the
             # whole mapping file with readlines() and leaving the file object open.
             hfile = open(gfile,'r')
             try: headline = hfile.readline()
             finally: hfile.close()
             headers = string.split(rje.chomp(headline),'\t')
             if 'Ensembl Gene ID' in headers: mainkeys = ['Ensembl Gene ID']
             else: mainkeys = headers[:1]
             # First recognised GO accession column wins; otherwise fall back on the third column
             for gohead in ['GO Term Accession','GO Term Accession (bp)','GO Term Accession (mf)','GO Term Accession (cc)','GO ID']:
                 if gohead in headers:
                     mainkeys.append(gohead)
                     break
             else: mainkeys.append(headers[2])
             self.printLog('#HEAD','%s' % (string.join(mainkeys,' / ')))
         self.progLog('\r#GO','Mapping EnsEMBL GO...')
         ensdata = rje.dataDict(self,gfile,mainkeys)
         (mx,mtot) = (0.0,len(ensdata))
         obselete_go = []    # GO IDs found in neither self.go() nor the AltID dictionary (collected but not reported here)
         for mapkey in ensdata:
             self.progLog('\r#GO','Mapping EnsEMBL GO: %.2f%%' % (mx/mtot)); mx += 100.0
             try: (gene,go) = string.split(mapkey)
             except ValueError: continue    # no GO! (key does not split into exactly gene + GO)
             ## Update dictionaries ##
             if go[:3] == 'GO:': go = go[3:]
             if go in self.go(): self.addGeneGO(gene,go,gokey)
             elif go in self.dict['AltID']:
                 for altid in self.dict['AltID'][go]: self.addGeneGO(gene,altid,gokey)
             elif go not in obselete_go: obselete_go.append(go)
         self.printLog('\r#GO','Mapping EnsEMBL GO from %s complete.' % os.path.basename(gfile))
コード例 #18
0
ファイル: rje_omim.py プロジェクト: lyhniupi1/SLiMSuite
    def loadMutations(self):  ### Inputs parsed mutations back into dictionaries
        '''
        Inputs parsed mutations back into dictionaries.
        Resets self.dict['Records'] ({gene:[OMIM IDs]}) and self.dict['Mutations'] ({gene:{SubID:(disease,mutation)}})
        and refills them from omim_mutations.tdt in the current directory.
        << True if the table was loaded; False if the file is missing or an error was logged.
        '''
        try:  ### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            headers = ['OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease']
            infile = 'omim_mutations.tdt'
            if not os.path.exists(infile):
                return False
            datadict = rje.dataDict(self, infile, headers[:2], headers, '\t')
            mx = len(datadict)      # Counted up front: entries are popped as they are processed

            ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for dkey in list(datadict.keys()):
                entry = datadict.pop(dkey)
                record = entry['OMIM_ID']
                subid = entry['SubID']
                gene = entry['Gene']
                mutation = '%s%s%s' % (entry['WildAA'], entry['Pos'], entry['MutAA'])
                disease = entry['Disease']
                generecs = self.dict['Records'].setdefault(gene, [])
                if record not in generecs:
                    generecs.append(record)
                self.dict['Mutations'].setdefault(gene, {})[subid] = (disease, mutation)
            self.log.printLog('\r#OMIM', 'Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx), rje.integerString(len(self.dict['Records']))))
            return True
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            return False
コード例 #19
0
ファイル: rje_omim.py プロジェクト: slimsuite/SLiMSuite
    def run(self):  ### Main Run Method
        '''
        Main Run Method.
        [1] Loads the parsed mutation table, or re-parses OMIM if forced or if loading fails.
        [2] Runs Pingu, maps OMIM genes to EnsLoci sequences via GeneCards and stores a fudge factor per sequence.
        [3] Reads each *.occ.csv in info['SlimDir'] and writes SLiM occurrences overlapping a (fudged) mutation
            position to rje_omim.slimfinder.tdt.
        [4] Logs SLiM coverage of the mapped sequences and a binomial probability for the observed overlap.
        << No explicit return value, except the errorLog() result when GeneCards or EnsLoci data are unavailable.
        '''
        try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log,pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']: return self.log.errorLog('Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict['GeneCard']      # GeneCards dictionary
            ensloci = ping.getEnsLoci()     # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()
            if not seqdict: return self.log.errorLog('Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}    # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try: seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog('#MAP','No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease,mutation) = self.dict['Mutations'][gene][subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {}   # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []   # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',',')
            headers = ['Gene','OMIM','SubID','Mutation','Disease'] + dataheaders
            rje.delimitedFileOutput(self,outfile,headers,delimit='\t',rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):      # Potential SLiM
                slimdata = rje.dataDict(self,file,['Pattern','Hit','Pos','Match'],dataheaders,delimit=',')
                for occ in slimdata:
                    if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start,end) = (int(slimdata[occ]['Pos']),int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            # Mutation position is shifted by the sequence's fudge factor before the overlap test
                            if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog('#OMIMSLIM','%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'],slimdata[occ]['Hit'],slimdata[occ]['Pattern'],start,end,mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                                rje.delimitedFileOutput(self,outfile,headers,'\t',slimdata[occ])
                                if (gene,mpos) not in slimomim: slimomim.append((gene,mpos))

            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim,resx,mutx) = (0,0,0)  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:      # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:    # Partially covered by SLiMs
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()    # Per-residue 0/1 coverage mask
                    for occ in allslims[gene]:
                        (start,end) = (int(allslims[gene][occ]['Pos'])-1,int(allslims[gene][occ]['EndPos']))
                        # Mark residues start..end-1 (0-based) as covered. The tail must resume at res[end]:
                        # resuming at res[end-1] lengthened the mask by one per occurrence and double-counted
                        # a covered residue whenever occurrences overlapped.
                        res = res[:start] + [1] * (end-start) + res[end:]
                    self.deBug('%s %d (%d)' % (gene,sum(res),seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog('#COV','SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0*inslim/resx))
            self.log.printLog('#MUT','%d mutations that could potentially occur in SLiMs' % mutx)
            self.log.printLog('#PROB','Probability of observed %d mutation overlap = %.4f' % (len(slimomim),rje.binomial(len(slimomim),mutx,float(inslim)/resx,callobj=self)))
        except: self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #20
0
ファイル: rje_omim.py プロジェクト: lyhniupi1/SLiMSuite
    def run(self):  ### Main Run Method
        '''
        Main Run Method.
        [1] Loads the parsed mutation table, or re-parses OMIM if forced or if loading fails.
        [2] Runs Pingu, maps OMIM genes to EnsLoci sequences via GeneCards and stores a fudge factor per sequence.
        [3] Reads each *.occ.csv in info['SlimDir'] and writes SLiM occurrences overlapping a (fudged) mutation
            position to rje_omim.slimfinder.tdt.
        [4] Logs SLiM coverage of the mapped sequences and a binomial probability for the observed overlap.
        << No explicit return value, except the errorLog() result when GeneCards or EnsLoci data are unavailable.
        '''
        try:  ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']:
                return self.log.errorLog(
                    'Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
            ensloci = ping.getEnsLoci()  # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()
            if not seqdict:
                return self.log.errorLog(
                    'Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}  # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try:
                    seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog(
                        '#MAP', 'No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}  # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease, mutation) = self.dict['Mutations'][gene][subid]
                    (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',
                                                    mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {}  # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []  # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split(
                'Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',
                ',')
            headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'] + dataheaders
            rje.delimitedFileOutput(self,
                                    outfile,
                                    headers,
                                    delimit='\t',
                                    rje_backup=True)
            for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):  # Potential SLiM
                slimdata = rje.dataDict(self,
                                        file,
                                        ['Pattern', 'Hit', 'Pos', 'Match'],
                                        dataheaders,
                                        delimit=',')
                for occ in slimdata:
                    if slimdata[occ]['Hit'] in ensback:  # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start, end) = (int(slimdata[occ]['Pos']),
                                        int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        for mpos in mutations[gene]:
                            # Mutation position is shifted by the sequence's fudge factor before the overlap test
                            if start <= (mpos + self.dict['Fudge'][seqdict[
                                    genecards[gene]['EnsLoci']]]) <= end:
                                self.log.printLog(
                                    '#OMIMSLIM', '%s %s %s (%d-%d) = %s' %
                                    (slimdata[occ]['Dataset'],
                                     slimdata[occ]['Hit'],
                                     slimdata[occ]['Pattern'], start, end,
                                     mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(
                                    self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                                rje.delimitedFileOutput(
                                    self, outfile, headers, '\t',
                                    slimdata[occ])
                                if (gene, mpos) not in slimomim:
                                    slimomim.append((gene, mpos))

            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            (inslim, resx, mutx) = (0, 0, 0)
            for gene in mutations:  # These are just the genes that mapped to sequences
                mutx += len(mutations[gene])
                resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
                if gene in allslims:  # Partially covered by SLiMs
                    # Per-residue 0/1 coverage mask
                    res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()
                    for occ in allslims[gene]:
                        (start, end) = (int(allslims[gene][occ]['Pos']) - 1,
                                        int(allslims[gene][occ]['EndPos']))
                        # Mark residues start..end-1 (0-based) as covered. The tail must resume at res[end]:
                        # resuming at res[end-1] lengthened the mask by one per occurrence and double-counted
                        # a covered residue whenever occurrences overlapped.
                        res = res[:start] + [1] * (end - start) + res[end:]
                    self.deBug('%s %d (%d)' %
                               (gene, sum(res),
                                seqdict[genecards[gene]['EnsLoci']].aaLen()))
                    inslim += sum(res)
            self.log.printLog(
                '#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' %
                (100.0 * inslim / resx))
            self.log.printLog(
                '#MUT',
                '%d mutations that could potentially occur in SLiMs' % mutx)
            self.log.printLog(
                '#PROB', 'Probability of observed %d mutation overlap = %.4f' %
                (len(slimomim),
                 rje.binomial(
                     len(slimomim), mutx, float(inslim) / resx, callobj=self)))
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #21
0
    def mapRegionsToSequences(self):  ### Maps tabulated PPI regions onto sequence datasets
        '''
        Maps tabulated PPI regions onto sequence datasets.
        Reads ppi_region.tdt (keyed on Interactor + Protein, with Start/End lists). For each hub-spoke pair it writes
        a fasta file holding the spoke (query) sequence in lower case with the listed regions in upper case, followed
        by the sequences of the hub's other spokes. Files go to RegPPI/ when the spoke was already in the hub's
        interactome and to RegPPIAdd/ when it had to be added. Pairs are skipped (with a log message) when the
        spoke or hub cannot be mapped or the hub has fewer than three spokes.
        Any exception is logged and re-raised.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            minseq = 3  # Minimum number of spokes for a hub interactome to be output
            outdir = 'RegPPI/'
            adddir = 'RegPPIAdd/'
            rje.mkDir(self, outdir)
            rje.mkDir(self, adddir)
            tabfile = 'ppi_region.tdt'
            region = rje.dataDict(self,
                                  tabfile, ['Interactor', 'Protein'],
                                  ['Start', 'End'],
                                  lists=True)
            ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            px = 0.0
            ptot = len(region)
            for pair in rje.sortKeys(region):
                self.progLog('\r#FAS',
                             'Generating fasta files: %.2f%%' % (px / ptot))
                px += 100.0
                ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                [hub, spoke] = string.split(pair, '\t')
                try:
                    qryseq = self.dict['SeqObj'][spoke]
                except:
                    self.printLog(
                        '\n#QRY',
                        'Spoke gene "%s" missing from Sequence file' % spoke)
                    continue
                try:
                    spoke = self.dict['Seq2Gene'][spoke]  # Sequence name -> gene from here on
                except:
                    self.printLog(
                        '\n#QRY',
                        'Spoke protein "%s" missing from PPI dictionary' %
                        spoke)
                    continue
                if hub not in self.dict['PPI']:
                    self.printLog(
                        '\n#HUB',
                        'Hub gene "%s" missing from PPI dictionary' % hub)
                    continue
                addspoke = spoke not in self.dict['PPI'][hub]
                if addspoke:
                    self.dict['PPI'][hub].append(spoke)
                    self.printLog(
                        '\n#PPI',
                        'Added spoke gene "%s" to hub "%s" interactome' %
                        (spoke, hub))
                if len(self.dict['PPI'][hub]) < minseq:
                    self.printLog(
                        '\n#HUB',
                        'Hub "%s" interactome too small (<%s spokes)' %
                        (hub, minseq))
                    continue
                ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # All start and end positions are pooled and sorted, then consumed in (start,end) pairs
                reglist = [int(pos) for pos in region[pair]['Start'] + region[pair]['End']]
                reglist.sort()
                qsequence = qryseq.info['Sequence'].lower()
                self.deBug(len(qsequence))
                self.deBug(qsequence)
                prelen = len(qsequence)
                while reglist:
                    self.deBug(reglist)
                    try:
                        startx = reglist.pop(0) - 1
                        endx = reglist.pop(0)
                    except IndexError:  # Odd number of positions: no end for the final start
                        self.errorLog('%s PPI Region problem: %s' %
                                      (pair, region[pair]))
                        continue
                    # Clamp the flank so a region starting at residue 1 does not give a negative slice index
                    self.deBug(qsequence[max(0, startx - 1):endx + 1].upper())
                    qsequence = qsequence[:startx] + qsequence[
                        startx:endx].upper() + qsequence[endx:]
                self.deBug(qsequence)
                if len(qsequence) != prelen:  # Sanity check: case conversion must not change the length
                    self.printLog('#F**K', '%s' % region[pair])
                    self.printLog('#F**K', qryseq.info['Sequence'].lower())
                    self.printLog('#F**K', qsequence)
                    raise ValueError
                ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if addspoke:
                    outfile = '%s%s.%s.fas' % (adddir, hub, spoke)
                else:
                    outfile = '%s%s.%s.fas' % (outdir, hub, spoke)
                ox = 1  # Sequences written, counting the query
                # One handle for the whole file, always closed: the query and spoke records are written in order
                # without relying on garbage collection to flush a 'w' handle before later 'a' handles append.
                fasout = open(outfile, 'w')
                try:
                    fasout.write('>%s\n%s\n' % (qryseq.info['Name'], qsequence))
                    for spoke2 in self.dict['PPI'][hub]:
                        if spoke2 == spoke: continue
                        try:
                            sseq = self.dict['SeqObj'][self.dict['Gene2Seq']
                                                       [spoke2]]
                        except KeyError:
                            continue  # No sequence mapped for this spoke: best-effort output skips it
                        fasout.write(
                            '>%s\n%s\n' %
                            (sseq.info['Name'], sseq.info['Sequence']))
                        ox += 1
                finally:
                    fasout.close()
                self.printLog('\n#FAS',
                              '%s sequences output to %s' % (ox, outfile))

        except:
            self.errorLog(rje_zen.Zen().wisdom())
            raise  # Delete this if method error not terrible
コード例 #22
0
 def setup(self):  ### Loads data into attributes.
     '''Loads data into attributes.'''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(
             self.log, self.cmd_list)
         uniprot.readUniProt()
         if uniprot.entryNum(
         ) > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
             seqlist = rje_seq.SeqList(self.log,
                                       self.cmd_list + ['autoload=F'])
             for entry in uniprot.list['Entry']:
                 seq = entry.obj['Sequence']
                 seqlist.seq.append(entry.obj['Sequence'])
                 name = seq.shortName()
                 self.dict['Entry'][name] = entry
                 self.dict['Seq'][name] = seq
                 for ft in entry.list['Feature']:
                     if ft['Type'] in self.list['DomFT']:
                         try:
                             dom = string.split(ft['Desc'])[0]
                             if dom not in self.dict['Domain']:
                                 self.dict['Domain'][dom] = []
                             if name not in self.dict['Domain'][dom]:
                                 self.dict['Domain'][dom].append(name)
                         except:
                             self.errorLog('Trouble with %s feature %s' %
                                           (name, ft))
         ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             seqlist = rje_seq.SeqList(self.log, self.cmd_list)
             for seq in seqlist.seq:
                 name = seq.shortName()
                 self.dict['Entry'][name] = None
                 self.dict['Seq'][name] = seq
                 #!# Consider adding loading domains from a table #!#
         ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['PPI']  # Dictionary of ShortName-centred
         ppi = rje.dataDict(self, self.info['PPI'])
         for hub in ppi:
             if ppi[hub]['EnsLoci'] == '-': continue
             ens = ppi[hub]['EnsLoci']
             if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
             self.dict['Gene'][ens] = hub
             for gene in string.split(ppi[hub]['PPI'], ','):
                 if ppi[gene]['EnsLoci'] == '-': continue
                 if ppi[gene]['EnsLoci'] not in self.dict['PPI'][ens]:
                     self.dict['PPI'][ens].append(ppi[gene]['EnsLoci'])
         ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['DDI'] = {}
         if self.info['DDI'].lower() not in ['', 'none']:
             data = rje.dataDict(self,
                                 self.info['DDI'],
                                 mainkeys=['Name1'],
                                 datakeys=['Name2'],
                                 headers=[
                                     'Pfam1', 'Pfam2', 'Name1', 'Name2',
                                     'Acc1', 'Acc2', 'Code1', 'Code2'
                                 ],
                                 lists=True)
             ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
             (dx, dtot) = (0.0, len(data))
             self.deBug(data)
             try:
                 rje.sortKeys(data)
             except:
                 self.errorLog('F**k', quitchoice=True)
             for p1 in rje.sortKeys(data):
                 self.progLog(
                     '\r#DDI',
                     'Parsing DDI from iPFam: %.1f%%' % (dx / dtot))
                 if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                 for p2 in data[p1]['Name2']:
                     if p2 not in self.dict['DDI']:
                         self.dict['DDI'][p2] = []
                     if p2 not in self.dict['DDI'][p1]:
                         self.dict['DDI'][p1].append(p2)
                     if p1 not in self.dict['DDI'][p2]:
                         self.dict['DDI'][p2].append(p1)
             self.printLog(
                 '\r#DDI', 'Parsing DDI from iPFam: %s domains' %
                 (rje.integerString(dtot)))
         ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['Fam'] = {}
         if self.info['Fam'].lower() not in ['', 'none']:
             data = rje.dataDict(self,
                                 self.info['Fam'],
                                 mainkeys=['Qry'],
                                 datakeys=['Hit'],
                                 lists=True)
             for qry in self.dict['Seq']:
                 self.dict['Fam'][qry] = []
                 if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                 elif self.dict['Seq'][qry].info['AccNum'] in data:
                     self.dict['Fam'][qry] = data[
                         self.dict['Seq'][qry].info['AccNum']]['Hit']
                 if qry not in self.dict['Fam'][qry]:
                     self.dict['Fam'][qry].append(qry)
     except:
         self.errorLog('Problem with SLiMPID.setup()', quitchoice=True)
コード例 #23
0
ファイル: qsf_analysis.py プロジェクト: slimsuite/SLiMSuite
 def mapRegionsToSequences(self):    ### Maps tabulated PPI regions onto sequence datasets
     '''
     Maps tabulated PPI regions onto sequence datasets.

     Reads 'ppi_region.tdt' (Interactor, Protein, Start and End fields) and, for each Interactor/Protein (hub/spoke)
     pair, writes a fasta file containing the spoke (query) sequence followed by the sequences of the other members
     of the hub interactome. The query sequence is output in lower case with the tabulated regions in upper case.
     Files go to 'RegPPIAdd/' when the spoke had to be added to the hub interactome and to 'RegPPI/' otherwise.

     Uses self.dict['SeqObj'], self.dict['Seq2Gene'], self.dict['Gene2Seq'] and self.dict['PPI']; spokes missing from
     a hub interactome are appended to self.dict['PPI'][hub]. Unexpected errors are logged and then re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         minseq = 3      # Minimum hub interactome size for output
         outdir = 'RegPPI/'
         adddir = 'RegPPIAdd/'
         rje.mkDir(self,outdir)
         rje.mkDir(self,adddir)
         tabfile = 'ppi_region.tdt'
         region = rje.dataDict(self,tabfile,['Interactor','Protein'],['Start','End'],lists=True)
         ### ~ [2] Work through each pair in turn ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         px = 0.0; ptot = len(region)
         for pair in rje.sortKeys(region):
             self.progLog('\r#FAS','Generating fasta files: %.2f%%' % (px/ptot)); px += 100.0
             ## ~ [2a] Map sequences to PPI dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             [hub, spoke] = pair.split('\t')
             try: qryseq = self.dict['SeqObj'][spoke]
             except KeyError: self.printLog('\n#QRY','Spoke gene "%s" missing from Sequence file' % spoke); continue
             try: spoke = self.dict['Seq2Gene'][spoke]
             except KeyError: self.printLog('\n#QRY','Spoke protein "%s" missing from PPI dictionary' % spoke); continue
             if hub not in self.dict['PPI']:
                 self.printLog('\n#HUB','Hub gene "%s" missing from PPI dictionary' % hub); continue
             addspoke = spoke not in self.dict['PPI'][hub]
             if addspoke:
                 self.dict['PPI'][hub].append(spoke)
                 self.printLog('\n#PPI','Added spoke gene "%s" to hub "%s" interactome' % (spoke,hub))
             if len(self.dict['PPI'][hub]) < minseq:
                 self.printLog('\n#HUB','Hub "%s" interactome too small (<%s spokes)' % (hub,minseq)); continue
             ## ~ [2b] Identify query sequence ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             # All start and end positions are pooled and sorted, then consumed two at a time as (start,end)
             reglist = sorted(int(pos) for pos in region[pair]['Start'] + region[pair]['End'])
             qsequence = qryseq.info['Sequence'].lower()
             self.deBug(len(qsequence))
             self.deBug(qsequence)
             prelen = len(qsequence)
             while reglist:
                 self.deBug(reglist)
                 try: startx = reglist.pop(0) - 1; endx = reglist.pop(0)
                 except IndexError:  # Odd number of positions: no end for the final start
                     self.errorLog('%s PPI Region problem: %s' % (pair,region[pair])); continue
                 # max() stops a region starting at residue 1 from wrapping round to the end of the sequence
                 self.deBug(qsequence[max(0,startx-1):endx+1].upper())
                 qsequence = qsequence[:startx] + qsequence[startx:endx].upper() + qsequence[endx:]
             self.deBug(qsequence)
             if len(qsequence) != prelen:    # Case conversion must never change the sequence length
                 self.printLog('#F**K','%s' % region[pair])
                 self.printLog('#F**K',qryseq.info['Sequence'].lower())
                 self.printLog('#F**K',qsequence)
                 raise ValueError
             ## ~ [2c] Output sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
             if addspoke: outfile = '%s%s.%s.fas' % (adddir,hub,spoke)
             else: outfile = '%s%s.%s.fas' % (outdir,hub,spoke)
             ox = 1  # Number of sequences output: the query is always written
             with open(outfile,'w') as fas:     # One handle for the whole file, closed even if a write fails
                 fas.write('>%s\n%s\n' % (qryseq.info['Name'],qsequence))
                 for spoke2 in self.dict['PPI'][hub]:
                     if spoke2 == spoke: continue
                     try:
                         sseq = self.dict['SeqObj'][self.dict['Gene2Seq'][spoke2]]
                         fasta = '>%s\n%s\n' % (sseq.info['Name'],sseq.info['Sequence'])
                     except KeyError: continue   # Interactors without sequence data are skipped
                     fas.write(fasta)
                     ox += 1
             self.printLog('\n#FAS','%s sequences output to %s' % (ox,outfile))
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
コード例 #24
0
ファイル: picsi.py プロジェクト: lyhniupi1/SLiMSuite
    def picsi(self):    ### Cleans up cross-species search results
        '''
        Cleans up cross-species search results.

        Reads the search summary table named by self.info['SumFile'], merges peptides that are substrings of longer
        peptides (after converting I to L and Q to K), removes peptides from non-query-species hits when they also
        match a self.info['QrySpec'] hit, and classifies each hit as UNIQUE, NR, REDUNDANT or REJECT. Results are
        written to "<SumFile basename>.clean.tdt". Errors are logged via self.errorLog(); nothing is returned.
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            datafile = self.info['SumFile']
            delimit = rje.delimitFromExt(filename=datafile)
            data = {}           # search:{hit:{field:value}}
            pep2prot = {}       # search:{peptide:[hits]}
            fullpeplist = {}    # search:[peptides]
            pepcon = {}         # search:{peptide:longer peptide that contains it}
            speclist = []       # List of species codes
            ### ~ [1] Read Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            indata = rje.dataDict(self,datafile,['search','prot_hit_num'],'All',lists=True)
            for ikey in rje.sortKeys(indata):
                (search,hitnum) = ikey.split(delimit)
                prot = indata[ikey]['prot_acc'][0]
                desc = indata[ikey]['prot_desc'][0].replace('Full=','')
                if desc[3:7] == 'Name': desc = desc[9:]
                self.printLog('#DESC','%s = %s' % (prot,desc))
                rawpep = '|'.join(indata[ikey]['pep_seq'])
                # Converted peptides have I->L and Q->K substitutions applied before comparison
                peplist = rje.sortUnique(rawpep.replace('I','L').replace('Q','K').split('|'))
                indata[ikey]['pep_seq'] = '|'.join(rje.sortUnique(rawpep.split('|')))
                if search not in data:
                    data[search] = {}
                    pep2prot[search] = {}
                    fullpeplist[search] = []
                    pepcon[search] = {}
                fullpeplist[search] += peplist
                spec = prot.split('_')[1]
                if spec not in speclist: speclist.append(spec)
                data[search][prot] = {'search':search,'pepcount':len(peplist),'hit':hitnum,'desc':desc,'spec':spec,
                                      'pep_uniq':0,'peplist':indata[ikey]['pep_seq'],'conpep':peplist[0:],
                                      'pep_rem':0}
                try: data[search][prot]['accnum'] = self.dict['Acc2Seq'][prot].info['AccNum']
                except: data[search][prot]['accnum'] = prot.split('__')[-1]    # Fall back on the hit name itself
                for pep in peplist:
                    if pep not in pep2prot[search]: pep2prot[search][pep] = []
                    pep2prot[search][pep].append(prot)
            ## ~ [1a] Convert peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            for search in fullpeplist:
                fullpeplist[search] = rje.sortUnique(fullpeplist[search])
                for pep in fullpeplist[search][0:]:     # Iterate over a copy: the list is edited below
                    for pep2 in fullpeplist[search]:
                        if pep != pep2 and pep in pep2:
                            pepcon[search][pep] = pep2
                            fullpeplist[search].remove(pep)
                            break
                # Follow conversion chains (a -> b -> c) so every peptide maps straight to its longest form.
                # The lookup must stay within this search: pepcon itself is keyed by search, not by peptide.
                for pep in pepcon[search]:
                    while pepcon[search][pep] in pepcon[search]:
                        pepcon[search][pep] = pepcon[search][pepcon[search][pep]]
                self.printLog('#PEP','%s %s peptide conversions' % (len(pepcon[search]),search))
                pp = 0; pm = 0  # Converted peptides added / original peptides removed
                for prot in data[search]:
                    conpep = data[search][prot]['conpep']
                    for pep in conpep[0:]:
                        if pep not in pepcon[search]: continue
                        newpep = pepcon[search][pep]
                        if newpep not in conpep: conpep.append(newpep); pp += 1
                        conpep.remove(pep); pm += 1
                        if prot not in pep2prot[search][newpep]: pep2prot[search][newpep].append(prot)
                        if pep in pep2prot[search]: pep2prot[search].pop(pep)
                    data[search][prot]['pep_con'] = len(conpep)
                self.printLog('#PEP','%s %s converted peptides added; %s removed' % (pp,search,pm))
            ### ~ [2] Calculate Unique/Redundancy status ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            qryspec = self.info['QrySpec']
            for search in pep2prot:
                ## ~ [2a] Species Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                remx = 0
                for prot in data[search]:
                    if data[search][prot]['spec'] != qryspec: continue
                    for pep in data[search][prot]['conpep']:
                        for prot2 in pep2prot[search][pep][0:]:
                            if data[search][prot2]['spec'] == qryspec: continue
                            pep2prot[search][pep].remove(prot2)
                            data[search][prot2]['conpep'].remove(pep)
                            data[search][prot2]['pep_rem'] += 1; remx += 1
                self.printLog('#REM','%s %s peptides removed from non-%s hits' % (rje.integerString(remx),search,qryspec))
                ## ~ [2b] One-hit wonders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for prot in data[search]:
                    if len(data[search][prot]['conpep']) < 2:
                        for pep in data[search][prot]['conpep']:
                            pep2prot[search][pep].remove(prot)
                ## ~ [2c] Unique peptides ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ux = 0
                for pep in pep2prot[search]:
                    if len(pep2prot[search][pep]) == 1:
                        data[search][pep2prot[search][pep][0]]['pep_uniq'] += 1; ux += 1
                self.printLog('#UNIQ','%s unique %s peptides' % (rje.integerString(ux),search))
                ## ~ [2d] Total Redundancy ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                summary = {'HITS':len(data[search]),'REJECT':0,'UNIQUE':0,'NR':0,'REDUNDANT':0}
                rx = 0
                for prot in data[search]:
                    hit = data[search][prot]
                    hit['pep_red'] = 0  # Redundant peptides found in proteins with unique peptides
                    hit['pep_nr'] = 0   # Redundant peptides found only in proteins without unique peptides
                    for pep in hit['conpep']:
                        if pep2prot[search][pep] == [prot]: continue
                        upep = False
                        for prot2 in pep2prot[search][pep]:
                            if data[search][prot2]['pep_uniq']: upep = True; break
                        if upep: hit['pep_red'] += 1    # Redundant peptide found in unique protein
                        else: hit['pep_nr'] += 1        # Redundant peptide NOT found in unique protein
                    if len(hit['conpep']) < 2: hit['class'] = 'REJECT'; rx += 1
                    elif hit['pep_uniq']: hit['class'] = 'UNIQUE'
                    elif hit['pep_nr']: hit['class'] = 'NR'
                    else: hit['class'] = 'REDUNDANT'; rx += 1
                    summary[hit['class']] += 1
                self.printLog('#REJ','%s rejected %s hits' % (rje.integerString(rx),search))
                for x in rje.sortKeys(summary): self.printLog('#%s' % search,'%s %s' % (summary[x],x))

            ### ~ [3] Species ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            speclist.sort()
            species = {}
            for spec in speclist:
                try:
                    # NOTE: the species code and file name are interpolated into a shell command unescaped
                    grep = os.popen('grep %s %s' % (spec,self.info['SpecTDT'])).read()
                    species[spec] = grep.split(':')[-4]
                    self.printLog('#SPEC','%s = %s' % (spec,species[spec]))
                except: species[spec] = '?'

            ### ~ [END] Output data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = '%s.clean.tdt' % rje.baseFile(self.info['SumFile'])
            headers = ['search','hit','class','accnum','spec','species','desc','pepcount','pep_con','pep_rem','pep_uniq','pep_nr','pep_red','peplist','conpep']
            if self.dict['Acc2Seq']: headers.insert(3,'cluster')
            rje.delimitedFileOutput(self,outfile,headers,datadict={},rje_backup=True)
            for search in rje.sortKeys(data):
                if self.dict['Acc2Seq']: self.clusterGoodSeq(search,data[search])
                for prot in rje.sortKeys(data[search]):
                    # Descriptions of the form "gi:<number>...[<species>]" carry their own species name
                    gimatch = rje.matchExp(r'^gi:(\d+).+\[(\S.+\S)\]$',data[search][prot]['desc'])
                    if gimatch: data[search][prot]['species'] = gimatch[1]
                    else: data[search][prot]['species'] = species[data[search][prot]['spec']]
                    rje.delimitedFileOutput(self,outfile,headers,datadict=data[search][prot])

        except: self.errorLog('Errg')
コード例 #25
0
    def run(self):  ### Main run method
        '''
        Main run method.

        Loads GO terms, a gene-to-GO mapping (self.info['GOMap']) and motif occurrence data (self.info['OccData']),
        assigns each occurrence to the GO terms returned by mygo.parents() for its gene (or 'NoGo' if the gene has no
        mapping) and writes the occurrences to "<OccData basename>.slimfungo.tdt". For each GO term, only motifs that
        keep 'fwd' occurrences plus at least one of 'rev'/'scram' after the self.stat['MinOcc'] filter are output.
        Errors are logged and re-raised.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mygo = rje_go.GO(self.log,self.cmd_list)
            mygo.readGO()
            gomap = rje.dataDict(self,self.info['GOMap'],mainkeys=['Ensembl Gene ID'],datakeys=['GO ID'],lists=True)
            self.deBug(rje.sortKeys(gomap)[:100])
            #!# Replace 'Ensembl Gene ID' with commandline parameter at some point #!#
            self.printLog('#GOMAP','Loaded GO mappings for %s sequence IDs' % (rje.integerString(len(gomap))))
            slimocc = rje.dataDict(self,self.info['OccData'],mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=['Motif','Seq','Start_Pos','End_Pos','Cons','HomNum'])
            self.printLog('#OCC','Loaded Data for %s motif occurrences.' % (rje.integerString(len(slimocc))))
            ## ~ [1a] ~ Sequence mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            seqlist = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list)
            seqmap = {}     # Sequence short name -> gene ID parsed from the sequence name
            (sx,stot) = (0.0,seqlist.seqNum())
            for seqobj in seqlist.seq:
                self.progLog('#SEQMAP','Mappings sequence IDs: %.1f%%' % (sx/stot)); sx += 100.0
                genematch = rje.matchExp(r'gene:(\S+)\]',seqobj.info['Name'])
                if genematch: seqmap[seqobj.shortName()] = genematch[0]
            self.printLog('\r#SEQMAP','Mappings %s sequence IDs complete: %s mapped' % (rje.integerString(stot),rje.integerString(len(seqmap))))
            self.deBug(rje.sortKeys(seqmap)[:100])

            ### ~ [2] ~ Output new data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            goocc = {}      # GO ID -> motif -> occurrence type -> list of occurrence dictionaries
            outfile = '.'.join(self.info['OccData'].split('.')[:-1] + ['slimfungo','tdt'])
            headers = ['GO','Motif','Type','Seq','Start_Pos','End_Pos','Cons','HomNum']
            for okey in list(slimocc.keys()):   # Copy of keys: entries are popped as they are processed
                self.progLog('#NEW','Making new GO occurrences: %s    ' % (rje.integerString(len(slimocc))))
                data = slimocc.pop(okey)
                gene = seqname = data['Seq']
                occtype = 'fwd'
                motifparts = data['Motif'].split('_')
                if motifparts[-1] in ['rev','scram']:   # Strip the suffix so all types share one motif key
                    occtype = motifparts[-1]
                    data['Motif'] = '_'.join(motifparts[:-1])
                motif = data['Motif']
                if gene not in gomap and gene in seqmap: gene = seqmap[gene]
                golist = []
                if gene in gomap:
                    # gomap was loaded with lists=True: each entry holds its GO IDs as a list under 'GO ID'
                    for goid in gomap[gene]['GO ID']: golist += mygo.parents(goid)
                else: golist = ['NoGo']
                self.deBug('%s:%s::%s' % (seqname,gene,golist))
                for goid in rje.sortUnique(golist,False,False):
                    if goid not in goocc: goocc[goid] = {}
                    if motif not in goocc[goid]: goocc[goid][motif] = {'fwd':[],'rev':[],'scram':[]}
                    goocc[goid][motif][occtype].append(rje.combineDict({'GO':goid,'Type':occtype},data))
            self.printLog('\r#NEW','Making new GO occurrences complete.    ')

            rje.delimitedFileOutput(self,outfile,headers,rje_backup=True)
            (mx,ox,ix,itot) = (0,0,0.0,len(goocc))
            for goid in rje.sortKeys(goocc):
                for motif in rje.sortKeys(goocc[goid]):
                    for occtype in rje.sortKeys(goocc[goid][motif]):
                        if len(goocc[goid][motif][occtype]) < self.stat['MinOcc']: goocc[goid][motif].pop(occtype)
                    # Need the real motif plus at least one control type to be worth outputting
                    if len(goocc[goid][motif]) < 2 or 'fwd' not in goocc[goid][motif]: continue
                    mx += 1
                    for occtype in goocc[goid][motif]:
                        for occ in goocc[goid][motif][occtype]:
                            rje.delimitedFileOutput(self,outfile,headers,datadict=occ); ox += 1
                ix += 100.0
                self.progLog('#OUT','Output to %s: %.2f%% :: %s motifs; %s occ.' % (outfile,ix/itot,rje.integerString(mx),rje.integerString(ox)))
            self.printLog('\r#OUT','Output of occurrences to %s is now complete: %s motifs; %s occ.' % (outfile,rje.integerString(mx),rje.integerString(ox)))

        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
コード例 #26
0
    def readPELM(self):  ### Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.
        '''
        Reads phosphoELM into classes. Extracts UniProt data if available for Species etc.

        Populates self.dict['PhosphoSites'] as {acc:{pos:{'aa':code}}} from the self.info['PELM'] table, builds one
        sequence object per accession (named from the UniProt entry where one is found), optionally filters them and
        saves them to self.info['PELMFas']. Stores the results in self.obj['SeqList'] and self.obj['UniProt'].
        Errors are logged; nothing is returned.
        '''
        try:  ### ~ [1] Setup & Read File into Data Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            pelmdata = rje.dataDict(self, self.info['PELM'], mainkeys=['acc', 'position'])
            seqdict = {}  # Dictionary of Acc:Sequence (sequence objects replace the strings in [3b])

            ### ~ [2] Generate PhosphoSites dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            sites = self.dict['PhosphoSites']
            for rowkey in pelmdata:
                row = pelmdata[rowkey]
                ## ~ [2a] Basic acc, seq and pos ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                (acc, pos) = rowkey.split()
                pos = int(pos)
                site = sites.setdefault(acc, {}).setdefault(pos, {})
                ## ~ [2b] PhosphoELM data with checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # First sequence seen for an accession is kept; later rows are only compared against it
                rowseq = row['sequence']
                if seqdict.setdefault(acc, rowseq) != rowseq:
                    self.log.printLog('#ERR', 'Warning. Sequence mismatch for %s' % acc)
                code = row['code']
                storedaa = site.setdefault('aa', code)
                if storedaa != code:
                    self.log.printLog('#ERR', 'Warning. PhosphoSite mismatch for %s at pos %d: %s not %s'
                                      % (acc, pos, code, storedaa))
                seqaa = seqdict[acc][pos - 1:pos]   # Residue at the 1-based site position
                if code != seqaa:
                    self.log.printLog('#ERR', 'Warning. PhosphoSeq mismatch for %s at pos %d: %s not %s'
                                      % (acc, pos, code, seqaa))

            ### ~ [3] Make sequence objects and update PhosphoSites keys ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [3a] Setup objects ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            acclist = rje.sortKeys(seqdict)
            pelmuni = rje_uniprot.UniProt(self.log, self.cmd_list)  # UniProt entry
            unidict = pelmuni.accDict(acclist)  # Dictionary of {acc:UniProtEntry}
            pelmseq = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=None'])  # SeqList object
            ## ~ [3b] Add one sequence for each AccNum and update seqdict  ~~~~~~~~~~~~~~~~~~~~~~~~ ##
            #!# Look out for splice variants! (There are some!) - Copy UniProt and change sequence & AccNum #!#
            for acc in acclist:  #!# Make accdict of {acc:Seq} using unidict and seqlist #!#
                sequence = seqdict[acc]
                try:
                    # The part of the accession before any '-' is used as the UniProt lookup key
                    uniseq = unidict[acc.split('-')[0]].obj['Sequence']
                    desc = uniseq.info['Description']
                    name = '%s__%s %s' % (uniseq.info['ID'], acc, desc)
                    if sequence != uniseq.info['Sequence']:
                        self.log.printLog('#WARNING', 'Sequence mismatch for UniProt entry %s' % acc)
                except:
                    self.log.errorLog('Problem with %s' % acc)
                    name = '%s_UNK__%s' % (acc, acc)  #!# Add sequences where UniProt missing #!#
                seqdict[acc] = pelmseq._addSeq(name, sequence)
            ## ~ [3c] Filtering of sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if self.opt['FilterSeq']:
                pelmseq.autoFilter()
                dropped = [acc for acc in acclist if seqdict[acc] not in pelmseq.seq]
                for acc in dropped: seqdict.pop(acc)
                acclist = rje.sortKeys(seqdict)
            ## ~ [3d] Save sequences for BLASTing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Save unless the file exists and an interactive user declines to overwrite it
            savefas = (not os.path.exists(self.info['PELMFas'])
                       or self.stat['Interactive'] < 0
                       or rje.yesNo('%s exists: overwrite?' % self.info['PELMFas']))
            if savefas:
                pelmseq.saveFasta(seqfile=self.info['PELMFas'])
            self.obj['SeqList'] = pelmseq
            self.obj['UniProt'] = pelmuni
        except:
            self.log.errorLog('Problem during PhosphoSeq.readPELM')
コード例 #27
0
ファイル: slimpid.py プロジェクト: slimsuite/SLiMSuite
 def setup(self):    ### Loads data into attributes.
     '''
     Loads data into attributes.

     Populates self.dict['Entry'] and self.dict['Seq'] (keyed by sequence short name) from UniProt entries where any
     are read, else from a plain sequence list; self.dict['Domain'] from UniProt features whose type is in
     self.list['DomFT']; self.dict['PPI'] and self.dict['Gene'] from the self.info['PPI'] table; self.dict['DDI']
     from the self.info['DDI'] table; and self.dict['Fam'] from the self.info['Fam'] table. Errors are logged with
     the option to quit; nothing is returned.
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log,self.cmd_list)
         uniprot.readUniProt()
         if uniprot.entryNum() > 0:  ### UniProt data loaded. Populate seqlist and domain dictionary.
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F'])
             for entry in uniprot.list['Entry']:
                 seq = entry.obj['Sequence']
                 seqlist.seq.append(seq)
                 name = seq.shortName()
                 self.dict['Entry'][name] = entry
                 self.dict['Seq'][name] = seq
                 for ft in entry.list['Feature']:
                     if ft['Type'] not in self.list['DomFT']: continue
                     try:
                         dom = ft['Desc'].split()[0]    # First word of the feature description is the domain name
                         if dom not in self.dict['Domain']: self.dict['Domain'][dom] = []
                         if name not in self.dict['Domain'][dom]: self.dict['Domain'][dom].append(name)
                     except: self.errorLog('Trouble with %s feature %s' % (name,ft))
         ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         else:
             seqlist = rje_seq.SeqList(self.log,self.cmd_list)
             for seq in seqlist.seq:
                 name = seq.shortName()
                 self.dict['Entry'][name] = None
                 self.dict['Seq'][name] = seq
                 #!# Consider adding loading domains from a table #!#
         ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['PPI']    # Dictionary of ShortName-centred (bare lookup: KeyError here if never initialised)
         ppi = rje.dataDict(self,self.info['PPI'])
         for hub in ppi:
             ens = ppi[hub]['EnsLoci']
             if ens == '-': continue     # Hub has no EnsLoci mapping
             if ens not in self.dict['PPI']: self.dict['PPI'][ens] = []
             self.dict['Gene'][ens] = hub
             for gene in ppi[hub]['PPI'].split(','):
                 spoke = ppi[gene]['EnsLoci']
                 if spoke == '-': continue
                 if spoke not in self.dict['PPI'][ens]: self.dict['PPI'][ens].append(spoke)
         ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['DDI'] = {}
         if self.info['DDI'].lower() not in ['','none']:
             data = rje.dataDict(self,self.info['DDI'],mainkeys=['Name1'],datakeys=['Name2'],
                                 headers=['Pfam1','Pfam2','Name1','Name2','Acc1','Acc2','Code1','Code2'],lists=True)
             ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ #
             (dx,dtot) = (0.0,len(data))
             self.deBug(data)
             try: rje.sortKeys(data)
             except: self.errorLog('F**k',quitchoice=True)
             for p1 in rje.sortKeys(data):
                 # dx advances by 100 per domain so that dx/dtot is a percentage
                 self.progLog('\r#DDI','Parsing DDI from iPFam: %.1f%%' % (dx/dtot)); dx += 100.0
                 if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = []
                 for p2 in data[p1]['Name2']:
                     # Interactions are stored symmetrically: p1 -> p2 and p2 -> p1
                     if p2 not in self.dict['DDI']: self.dict['DDI'][p2] = []
                     if p2 not in self.dict['DDI'][p1]: self.dict['DDI'][p1].append(p2)
                     if p1 not in self.dict['DDI'][p2]: self.dict['DDI'][p2].append(p1)
             self.printLog('\r#DDI','Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot)))
         ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         self.dict['Fam'] = {}
         if self.info['Fam'].lower() not in ['','none']:
             data = rje.dataDict(self,self.info['Fam'],mainkeys=['Qry'],datakeys=['Hit'],lists=True)
             for qry in self.dict['Seq']:
                 accnum = self.dict['Seq'][qry].info['AccNum']
                 self.dict['Fam'][qry] = []
                 if qry in data: self.dict['Fam'][qry] = data[qry]['Hit']
                 elif accnum in data: self.dict['Fam'][qry] = data[accnum]['Hit']
                 # Every sequence is a member of its own family
                 if qry not in self.dict['Fam'][qry]: self.dict['Fam'][qry].append(qry)
     except: self.errorLog('Problem with SLiMPID.setup()',quitchoice=True)
コード例 #28
0
ファイル: rje_glossary.py プロジェクト: lyhniupi1/SLiMSuite
 def setup(self,gtext=''):  ### Main class setup method. gtext will over-ride input file.
     '''
     Main class setup method. gtext will over-ride input file.
     >> gtext:str [''] = Glossary text, one term per line, with the term and definition separated by the
         self.getStr('TermSplit') delimiter. If empty, the glossary is read from self.getStr('InFile').
     << True if setup was successful, or False if an error was caught and logged.

     Sets self.obj['HTML'], fills in the basefile and self.str['OutFile'] defaults, sets self.list['Terms'] if it is
     empty, and stores the glossary in self.dict['Glossary'] as nested dictionaries keyed by the lower-case words of
     each term, with the definition held under the '=' key.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.obj['HTML'] = rje_html.HTML(self.log, self.cmd_list)
         ## ~ [1a] File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if self.basefile().lower() in ['', 'none']:
             self.basefile(rje.baseFile(self.getStr('InFile')))
         if self.getStr('OutFile').lower() in ['', 'none']:
             self.str['OutFile'] = '%s.html' % self.basefile()
         ## ~ [1b] Read in Glossary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         interms = []        # Terms in input order (text parsing only)
         glossary = None     # None until the glossary has been loaded from a table or parsed from text
         if not gtext:
             try:
                 # A file with a 'Term' header is loaded as a table, unless input order must be kept
                 if not self.getBool('KeepOrder'):
                     with open(self.getStr('InFile'), 'r') as infile: firstline = infile.readline()
                     if firstline[:4] == 'Term':
                         glossary = rje.dataDict(self, self.getStr('InFile'), mainkeys=['Term'],
                                                 datakeys=['Term', 'Definition'])
             except:
                 self.errorLog('Problem reading input as dataDict(). Will try as text.')
                 glossary = None
             if glossary is None:
                 # Parsed below in this same call, so an empty file gives an empty glossary
                 # instead of the method calling itself again without end
                 with open(self.getStr('InFile'), 'r') as infile: gtext = infile.read()
         if glossary is None:
             delimit = self.getStr('TermSplit')
             named = {'tab': '\t', 'space': ' ', 'comma': ',', 'period (.)': '.', 'colon': ':'}
             delimit = named.get(delimit.lower(), delimit)
             glossary = {}
             for line in gtext.split('\n'):
                 splitline = line.split(delimit)
                 # A definition ending in a full stop leaves an empty final element when splitting on '.'
                 if delimit == '.' and (splitline[-1] in ['', ' ']):
                     splitline = splitline[:-1]
                 if not splitline: continue
                 (term, definition) = (splitline[0], delimit.join(splitline[1:]))
                 if term == 'Term' and not glossary: continue    # Header line
                 if term:
                     glossary[term] = {'Definition': definition}
                     interms.append(term)
         if self.list['Terms']:
             # Iterate over a copy of the keys: entries are removed from the dictionary in the loop
             for term in list(glossary):
                 if term not in self.list['Terms']: glossary.pop(term)
         elif self.getBool('KeepOrder'): self.list['Terms'] = interms
         else: self.list['Terms'] = rje.sortKeys(glossary)
         for term in glossary:
             glossary[term] = glossary[term]['Definition']
         ### ~ [2] Create Full Glossary Dictionary ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         nested = {}
         for term in glossary:
             tdict = nested
             for word in term.lower().split():
                 if word not in tdict: tdict[word] = {}
                 tdict = tdict[word]
             tdict['='] = glossary[term]
         self.dict['Glossary'] = nested
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
コード例 #29
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method.

     [1] Loads the (hard-coded) pairwise PPI table into a {hub:{spoke:evidence}} dictionary and builds
         gene<->sequence name mappings from the SpokeSeq field.
     [2] Filters each hub's spokes into self.dict['PPI']: a spoke is kept if its evidence matches one of the
         types read from the hybrid.txt file, or if no other spoke of the same hub is also listed as a partner
         of that spoke. Hubs left with no spokes are removed.
     [3] Loads the EnsEMBL sequence file and stores self.dict['SeqObj'], ['Gene2Seq'] and ['Seq2Gene'].
     Returns True on success; on any exception reports it via self.errorLog() and returns False.
     '''
     try:  ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI', 'Loading pairwise data...')
         pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'],
                                 ['Spoke', 'SpokeSeq', 'Evidence'])
         gene2seq, seq2gene, fullppi = {}, {}, {}
         pairtotal = len(pairwise)  # Taken before the loop: entries are popped from pairwise below
         ppix = 0
         for pairnum, pair in enumerate(rje.sortKeys(pairwise)):
             self.progLog('\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (100.0 * pairnum / pairtotal))
             hub, spoke = pair.split('\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq
                 seq2gene[sseq.split('__')[0]] = spoke
             partners = fullppi.setdefault(hub, {})
             if spoke not in partners:
                 partners[spoke] = pairwise.pop(pair)['Evidence']
                 ppix += 1
         self.printLog(
             '\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' %
             (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(goodppifile, chomplines=True)
         self.dict['PPI'] = {}
         hubtotal = len(fullppi)
         fppix = ppix  # Unfiltered count, kept for the summary line
         ppix = 0
         for hubnum, hub in enumerate(fullppi):
             self.progLog(
                 '\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' %
                 (100.0 * hubnum / hubtotal, rje.integerString(len(self.dict['PPI'])),
                  rje.integerString(ppix)))
             kept = self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 # First test: evidence contains one of the accepted types
                 keep = False
                 for ptype in goodppi:
                     if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]):
                         keep = True
                         break
                 if not keep:
                     # Second test: reject if any other spoke of this hub is also a partner of spoke
                     keep = True
                     for spoke2 in fullppi[hub]:
                         if spoke2 in [hub, spoke]: continue
                         if spoke2 in fullppi[spoke]:
                             keep = False
                             break
                 if keep: kept.append(spoke)
             ppix += len(self.dict['PPI'][hub])
             if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)
         self.printLog(
             '\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' %
             (rje.integerString(len(fullppi)), rje.integerString(len(self.dict['PPI'])),
              rje.integerString(fppix / 2), rje.integerString(ppix / 2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile]
         scmd = scmd + self.cmd_list + ['autoload=T']
         seqlist = rje_seq.SeqList(self.log, scmd)
         self.obj['SeqList'] = seqlist
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
コード例 #30
0
ファイル: rje_mc58.py プロジェクト: slimsuite/SLiMSuite
 def run(self):  ### Main run method
     '''
     Main run method.

     [1] Rewrites each *.fasta file in the working directory as a *.fas file with 6rf_NEIME__/ref_NEIME__
         sequence names (skipped when the *.fas file already exists) and formats it as a protein BLAST database.
     [2] Reads MC58_6RF_CSV/*.CSV, recording for each 6RF hit the 'file:hit_num' entries that reported it and
         writing the hit names to MC58_6RF_Hits.acc.
     [3] Extracts the hit sequences and runs GABLAM against MC58_1.fas, unless the hitsum file already exists.
     [4] Takes hits with a HitNum of 0, runs them against embl_bacteria (unless that hitsum file exists) and
         writes the combined table to MC58_6RF_Zeros.tdt.
     Nothing is returned; any exception is reported through self.errorLog().
     '''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]    # Trim the last two characters: x.fasta -> x.fas
             if os.path.exists(fas): continue
             sx = 0
             fasout = open(fas,'a')  # One handle per output file rather than re-opening it for every line
             try:
                 for line in open(fasta,'r').readlines():
                     if line[:1] != '>':
                         fasout.write(line)
                         continue
                     # Reset per header: otherwise a header without a description would silently reuse the
                     # description of the previous sequence (or raise NameError on the first header).
                     desc = ''
                     try: (name,desc) = rje.matchExp(r'^>(\S+) (\S.+)$',line)
                     except (TypeError,ValueError): name = rje.matchExp(r'^>(\S+)',line)[0]
                     parts = name.split('|')
                     if len(parts) == 3:
                         name = '6rf_NEIME__%s' % parts[2]
                         fasout.write('>%s\n' % name)
                     elif len(parts) == 5:
                         name = 'ref_NEIME__%s' % parts[3]
                         if desc: fasout.write('>%s %s\n' % (name,desc))
                         else: fasout.write('>%s\n' % name)
                     else:
                         print(parts)
                         raise ValueError('Unexpected sequence name format: %s' % name)
                     self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx)))
                     sx += 1
             finally: fasout.close()     # Must be flushed to disk before formatDB reads it
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         accout = open(acc,'w')  # (Re)created on every run
         try:
             for csvfile in glob.glob('MC58_6RF_CSV/*.CSV'):
                 cx += 1
                 csvname = os.path.basename(csvfile)[:-4]
                 hits = False    # Becomes True once the hit table header line has been seen
                 for line in open(csvfile,'r').readlines():
                     if line.find('prot_hit_num,prot_acc') == 0: hits = True
                     elif hits:
                         data = rje.readDelimit(line,',')
                         if len(data) < 2: continue
                         [num,name] = data[:2]
                         parts = name.split('|')
                         if len(parts) < 3: continue     # Not an x|y|name accession
                         name = parts[2]
                         if name not in rfhits:
                             accout.write('6rf_NEIME__%s\n' % name)
                             rfhits[name] = []
                         hitid = '%s:%s' % (csvname,num)
                         if hitid not in rfhits[name]: rfhits[name].append(hitid)
                         self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         finally: accout.close()     # Closed before SeqList reads the accession list below
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = []
         for hit in gdata:
             if int(gdata[hit]['HitNum']) == 0: zeros.append(hit)
         zeros = rje.sortUnique(zeros,False)
         zout = open('6rf_zeros.acc','w')
         try: zout.write('\n'.join(zeros))
         finally: zout.close()
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,ufile.replace('hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = rf.split('__')[1]
             gdata[rf]['Sample'] = '; '.join(rfhits[rfcut])
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except (KeyError,IndexError): gdata[rf]['BestHit'] = '-'     # No GABLAM hit recorded for this query
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])

     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
コード例 #31
0
 def run(self):  ### Main run method
     '''
     Main run method.

     [1] Normalises the Basefile prefix, loads the sequences and, if an OccFile is given, loads its occurrences
         into self.dict['OccData'] as {seq:{dataset:[occurrence dictionaries]}} and filters the sequence list
         down to the sequences named there.
     [2] For each sequence, calculates the statistics requested in self.list['PlotStat'], then passes each one
         through self.plotWin() (except Cons_Rel, and only when PlotWin >= 0) and self.convertStat().
     [3] Calls self.output() once per dataset for sequences with occurrences, otherwise once per sequence.
     Returns None; any exception is reported through self.errorLog().
     '''
     try:  ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         basefile = self.info['Basefile']
         if basefile.lower() in ['', 'none']: self.info['Basefile'] = ''
         elif basefile[-1] != '.': self.info['Basefile'] = basefile + '.'
         self.obj['SeqList'] = rje_seq.SeqList(self.log, self.cmd_list + ['autoload=T'])
         self.list['PlotFT'] = ' '.join(self.list['PlotFT']).upper().split()
         occfile = self.info['OccFile']
         if occfile.lower() not in ['', 'none']:
             self.info['Delimit'] = rje.delimitFromExt(filename=occfile)
             self.dict['OccData'] = {}
             occfields = ['Seq', 'Dataset', 'Pattern', 'Start_Pos', 'End_Pos']
             # Separate copies are passed for the key and data field lists
             occdata = rje.dataDict(self, occfile, occfields[:], occfields[:])
             for key in rje.sortKeys(occdata):
                 occ = occdata[key]
                 seqname = occ.pop('Seq')
                 bydataset = self.dict['OccData'].setdefault(seqname, {})
                 dataset = occ.pop('Dataset')
                 bydataset.setdefault(dataset, []).append(occ)
             occx = rje.integerString(len(occdata))
             seqx = rje.integerString(len(self.dict['OccData']))
             self.printLog('#OCC', 'Loaded data for %s occurrences in %s sequences' % (occx, seqx))
             goodseq = ','.join(rje.sortKeys(self.dict['OccData']))
             self.obj['SeqList'].autoFilter(['GoodSeq=%s' % goodseq])
         ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['PlotStat'] = ' '.join(self.list['PlotStat']).lower().split()
         if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
             slimcalc = rje_slimcalc.SLiMCalc(self.log, self.cmd_list)
         seqdict = self.obj['SeqList'].seqNameDic()
         for name in rje.sortKeys(seqdict):
             if self.opt['OccOnly'] and name not in self.dict['OccData']:
                 continue
             seq = seqdict[name]
             sequence = seq.getSequence(gaps=False)
             seq.dict['PlotStat'] = {}
             wanted = self.list['PlotStat']
             if 'sa' in wanted:
                 seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence, returnlist=True)
             if 'hyd' in wanted:
                 seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence, returnlist=True)
             if 'dis' in wanted:
                 seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
             if 'cons' in wanted or 'rel' in wanted:
                 slimcalc.relConListFromSeq(seq, slimcalc.stat['RelConWin'], store=True)
                 try:
                     # Move the lists stored on the sequence object into its PlotStat dictionary
                     seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                     seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                 except:
                     self.printLog('#CONS', 'No conservation stats for %s' % name)
             self.printLog('#STAT', 'PlotStats calculated for %s' % name)
             for stat in seq.dict['PlotStat']:
                 if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0:
                     windowed = self.plotWin(seq.dict['PlotStat'][stat])
                     seq.dict['PlotStat'][stat] = windowed
                 converted = self.convertStat(seq.dict['PlotStat'][stat])
                 seq.dict['PlotStat'][stat] = converted
             self.printLog('#STAT', 'PlotStats converted for %s' % name)
             ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
             if name not in self.dict['OccData']:
                 # No occurrences for this sequence: single plot file named from the accession number
                 plotfile = '%s%s.plot.txt' % (self.info['Basefile'], seq.info['AccNum'])
                 self.output(seq, plotfile)
                 continue
             for dataset in self.dict['OccData'][name]:
                 ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'], dataset, seq.info['AccNum'])
                 self.output(seq, ofile, self.dict['OccData'][name][dataset])
         return
     except:
         self.errorLog(rje_zen.Zen().wisdom())
コード例 #32
0
ファイル: rje_genecards.py プロジェクト: kwikwag/SLiMSuite
    def readHGNC(self):     ### Read links from HGNC into data structure
        '''
        Read links from HGNC into data structure.

        Reads the delimited file named by self.info['HGNCData'] (keyed on 'HGNC ID') and, for every approved
        symbol, stores a dictionary with Symbol, Desc, Entrez, OMIM, UniProt, EnsEMBL and HGNC entries in
        self.dict['GeneCard'] under the upper-case symbol (or the Ensembl ID if there is no symbol). Previous
        symbols, aliases and withdrawn symbols are collected as aliases; aliases that map to a single gene are
        added to self.dict['GeneCard'] pointing at that gene's dictionary, and aliases that map to more than one
        gene are written to hgnc.ambiguities.txt instead. self.list['Genes'] is extended according to the
        FullHGNC and FullEns options.
        Returns None (or the result of errorLog() if the file is missing); exceptions are reported via errorLog().
        '''
        try:### ~ [1] Read into dictionary with HGNC ID as key ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.info['HGNCData'].lower() in ['','none']: return
            if not os.path.exists(self.info['HGNCData']): return self.log.errorLog('HGNC file "%s" not found' % (self.info['HGNCData']),printerror=False)
            hgncdata = rje.dataDict(self,self.info['HGNCData'],['HGNC ID'])
            aliaii = {}     # Dictionary of withdrawn symbols to map
            genecard = self.dict['GeneCard']    # Same dictionary object; local name for brevity

            ### ~ [2] Parse out information ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (hx,htot) = (0.0,len(hgncdata))
            for hgnc in rje.sortKeys(hgncdata):
                self.log.printLog('\r#HGNC','Processing HGNC: %.1f%%' % (hx/htot),newline=False,log=False)
                hx += 100.0
                ## ~ [2a] Adjust headers for new vs old HGNC compatibility ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                data = hgncdata[hgnc]
                for hkey in rje.sortKeys(data):
                    mapped = rje.matchExp(r'^(\S.+\S)\s*\(mapped data supplied by \S+\)',hkey)
                    supplied = rje.matchExp(r'^(\S.+\S)\s*\(supplied by \S+\)',hkey)
                    if mapped: data['%s (mapped data)' % mapped[0]] = data.pop(hkey)
                    elif supplied: data['%s (mapped data)' % supplied[0]] = data.pop(hkey)
                ## ~ [2b] Make dictionary of Genecards data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                gdict = {}
                gdict['Symbol'] = gene = data['Approved Symbol'].upper()
                gdict['Desc'] = data['Approved Name']
                ## ~ [2c] Special treatment of obsolete symbol ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # The symbol has been upper-cased above, so the marker must be matched in upper case too;
                # searching for '~withdrawn' could never succeed and withdrawn entries were stored as genes.
                if gene.find('~WITHDRAWN') > 0:     ### Obsolete symbol
                    gene = gene[:gene.find('~WITHDRAWN')]
                    replacement = rje.matchExp(r', see (\S+)',gdict['Desc'])
                    if replacement:
                        alias = replacement[0].upper()      # Gene keys are upper case, so the target must be too
                        if gene in aliaii and aliaii[gene] != alias: aliaii[gene] = 'AMBIGUOUS'
                        else: aliaii[gene] = alias
                    continue
                ## ~ [2d] Add additional aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'Synonyms' in data and 'Aliases' not in data: data['Aliases'] = data.pop('Synonyms')
                for alias in data['Previous Symbols'].upper().split(', ') + data['Aliases'].upper().split(', '):
                    if not alias: continue      # Empty field splits to [''], which is not a symbol
                    if alias in genecard: aliaii[alias] = 'AMBIGUOUS'
                    if alias in aliaii and aliaii[alias] != gene: aliaii[alias] = 'AMBIGUOUS'
                    else: aliaii[alias] = gene
                if gene in aliaii: aliaii[gene] = 'AMBIGUOUS'
                gdict['Entrez'] = data['Entrez Gene ID']
                if not gdict['Entrez']: gdict['Entrez'] = data['Entrez Gene ID (mapped data)']
                gdict['OMIM'] = data['OMIM ID (mapped data)']
                gdict['UniProt'] = data['UniProt ID (mapped data)']
                gdict['EnsEMBL'] = ensgene = data['Ensembl ID (mapped data)']
                gdict['HGNC'] = hgnc.replace('HGNC:','')
                if not gene: gene = ensgene
                if not gene:
                    self.log.errorLog('HGNC has no gene for %s: %s' % (gdict['HGNC'],data),printerror=False)
                    continue
                genecard[gene] = gdict
                if self.opt['FullHGNC'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
                ## ~ [2e] Deal with EnsGene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if self.opt['FullEns'] and ensgene:
                    if ensgene not in self.list['Genes']: self.list['Genes'].append(ensgene)
                    if ensgene not in genecard: genecard[ensgene] = {}
                    rje.combineDict(genecard[ensgene],gdict,overwrite=False,replaceblanks=True)
            self.log.printLog('\r#HGNC','Processed HGNC: %s genes & %s aliases' % (rje.integerString(len(genecard)),rje.integerString(len(aliaii))))

            ### ~ [3] Deal with aliaii ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ambig = []
            (hx,htot) = (0.0,len(aliaii))
            for alias in aliaii:    # Only values are changed inside the loop, never the set of keys
                self.log.printLog('\r#HGNC','Processing HGNC aliases: %.1f%%' % (hx/htot),newline=False,log=False)
                hx += 100.0
                if aliaii[alias] == 'AMBIGUOUS':
                    ambig.append(alias)
                    continue       # Alias mapped to multiple genes
                # Map through several aliases if needed. Symbols already visited are tracked so that a
                # circular chain of aliases ends the walk instead of looping forever.
                seen = [alias]
                while aliaii[alias] not in genecard and aliaii[alias] in aliaii and aliaii[alias] not in seen:
                    seen.append(aliaii[alias])
                    aliaii[alias] = aliaii[aliaii[alias]]
                if aliaii[alias] not in genecard: continue     # Alias is not a valid Gene, so ignore
                if alias not in genecard: genecard[alias] = genecard[aliaii[alias]]
                if self.opt['FullHGNC'] and alias not in self.list['Genes']: self.list['Genes'].append(alias)
            self.log.printLog('\r#HGNC','Processed HGNC: %s genes & aliases' % (rje.integerString(len(genecard))))
            if ambig:
                self.log.printLog('#AMB','%s ambiguous aliases were not mapped' % rje.integerString(len(ambig)))
                ambout = open('hgnc.ambiguities.txt','w')
                try: ambout.write('\n'.join(ambig))
                finally: ambout.close()
        except: self.log.errorLog(rje_zen.Zen().wisdom())
コード例 #33
0
ファイル: rje_mc58.py プロジェクト: lyhniupi1/SLiMSuite
    def run(self):  ### Main run method
        '''
        Main run method.

        [1] Rewrites each *.fasta file in the working directory as a *.fas file with 6rf_NEIME__/ref_NEIME__
            sequence names (skipped when the *.fas file already exists) and formats it as a protein BLAST
            database.
        [2] Reads MC58_6RF_CSV/*.CSV, recording for each 6RF hit the 'file:hit_num' entries that reported it
            and writing the hit names to MC58_6RF_Hits.acc.
        [3] Extracts the hit sequences and runs GABLAM against MC58_1.fas, unless the hitsum file exists.
        [4] Takes hits with a HitNum of 0, runs them against embl_bacteria (unless that hitsum file exists)
            and writes the combined table to MC58_6RF_Zeros.tdt.
        Nothing is returned; any exception is reported through self.errorLog().
        '''
        try:  ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for fasta in glob.glob('*.fasta'):
                fas = fasta[:-2]  # Trim the last two characters: x.fasta -> x.fas
                if os.path.exists(fas): continue
                sx = 0
                fasout = open(fas, 'a')  # One handle per output file rather than re-opening it for every line
                try:
                    for line in open(fasta, 'r').readlines():
                        if line[:1] != '>':
                            fasout.write(line)
                            continue
                        # Reset per header: otherwise a header without a description would silently reuse
                        # the description of the previous sequence (or raise NameError on the first header).
                        desc = ''
                        try:
                            (name, desc) = rje.matchExp(r'^>(\S+) (\S.+)$', line)
                        except (TypeError, ValueError):
                            name = rje.matchExp(r'^>(\S+)', line)[0]
                        parts = name.split('|')
                        if len(parts) == 3:
                            name = '6rf_NEIME__%s' % parts[2]
                            fasout.write('>%s\n' % name)
                        elif len(parts) == 5:
                            name = 'ref_NEIME__%s' % parts[3]
                            if desc: fasout.write('>%s %s\n' % (name, desc))
                            else: fasout.write('>%s\n' % name)
                        else:
                            print(parts)
                            raise ValueError('Unexpected sequence name format: %s' % name)
                        self.progLog(
                            '\r#FAS', 'Processing %s: %s seqs' %
                            (fas, rje.integerString(sx)))
                        sx += 1
                finally:
                    fasout.close()  # Must be flushed to disk before formatDB reads it
                self.printLog(
                    '\r#FAS', 'Processed %s: %s seqs from %s' %
                    (fas, rje.integerString(sx), fasta))
                rje_blast.BLASTRun(self.log, self.cmd_list).formatDB(
                    fas, protein=True, force=True)
            ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            rfhits = {}  # Dictionary of {hit:['File:hit_num']}
            acc = 'MC58_6RF_Hits.acc'
            gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
            cx = 0
            accout = open(acc, 'w')  # (Re)created on every run
            try:
                for csvfile in glob.glob('MC58_6RF_CSV/*.CSV'):
                    cx += 1
                    csvname = os.path.basename(csvfile)[:-4]
                    hits = False  # Becomes True once the hit table header line has been seen
                    for line in open(csvfile, 'r').readlines():
                        if line.find('prot_hit_num,prot_acc') == 0: hits = True
                        elif hits:
                            data = rje.readDelimit(line, ',')
                            if len(data) < 2: continue
                            [num, name] = data[:2]
                            parts = name.split('|')
                            if len(parts) < 3: continue  # Not an x|y|name accession
                            name = parts[2]
                            if name not in rfhits:
                                accout.write('6rf_NEIME__%s\n' % name)
                                rfhits[name] = []
                            hitid = '%s:%s' % (csvname, num)
                            if hitid not in rfhits[name]: rfhits[name].append(hitid)
                            self.progLog(
                                '\r#CSV', 'Reading %d CSV files: %s 6RF Hits' %
                                (cx, rje.integerString(len(rfhits))))
            finally:
                accout.close()  # Closed before SeqList reads the accession list below
            self.printLog(
                '\r#CSV', 'Read %d CSV files: %s 6RF Hits output to %s' %
                (cx, rje.integerString(len(rfhits)), acc))
            ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(gfile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=%s' % acc, 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Hits.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Hits.fas', 'searchdb=MC58_1.fas',
                        'qryacc=F'
                    ]).gablam()
            ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            gdata = rje.dataDict(self, gfile, ['Qry'], ['HitNum'])
            zeros = []
            for hit in gdata:
                if int(gdata[hit]['HitNum']) == 0: zeros.append(hit)
            zeros = rje.sortUnique(zeros, False)
            zout = open('6rf_zeros.acc', 'w')
            try:
                zout.write('\n'.join(zeros))
            finally:
                zout.close()
            self.printLog(
                '#ZERO',
                '%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
            ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
            if not os.path.exists(ufile):
                seqlist = rje_seq.SeqList(
                    self.log, self.cmd_list + [
                        'seqin=6rf_zeros.acc', 'fasdb=MC58_6RF.fas',
                        'seqout=MC58_6RF_Zeros.fas', 'autoload=T', 'accnr=F',
                        'seqnr=F'
                    ])
                seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
                seqlist.saveFasta()
                gablam.GABLAM(
                    self.log, self.cmd_list + [
                        'seqin=MC58_6RF_Zeros.fas',
                        'searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas',
                        'qryacc=F'
                    ]).gablam()
            gdata = rje.dataDict(self, ufile, ['Qry'], getheaders=True)
            fdata = rje.dataDict(self,
                                 ufile.replace('hitsum', 'gablam'),
                                 ['Qry'], ['Hit'],
                                 lists=True)
            headers = gdata.pop('Headers')
            headers.insert(1, 'Sample')
            headers.append('BestHit')
            rje.delimitedFileOutput(self,
                                    'MC58_6RF_Zeros.tdt',
                                    headers,
                                    rje_backup=True)
            for rf in rje.sortKeys(gdata):
                rfcut = rf.split('__')[1]
                gdata[rf]['Sample'] = '; '.join(rfhits[rfcut])
                gdata[rf]['Qry'] = rfcut
                try:
                    gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
                except (KeyError, IndexError):
                    gdata[rf]['BestHit'] = '-'  # No GABLAM hit recorded for this query
                rje.delimitedFileOutput(self,
                                        'MC58_6RF_Zeros.tdt',
                                        headers,
                                        datadict=gdata[rf])

        except:
            self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN', rje_zen.Zen().wisdom())