Esempio n. 1
0
 def ensLoci(self):  ### Reads from EnsLoci file if it exists and parses into dictionaries.
     '''
     Reads from EnsLoci file if it exists and parses into dictionaries.
     Only header lines (starting '>') are used; headers without a " gene:X]" tag are logged as problems.
     Populates:
     - self.dict['EnsLoci'] = {EnsGene:sequence name (first word of the header)}
     - self.dict['EnsDesc'] = {EnsGene:header text after the name, up to any ' [acc:' tag}
     - self.dict['UniEns'] = {accession from '[acc:X' tag:EnsGene}
     If self.opt['FullEns'] is set, unseen genes are also appended to self.list['Genes'] and given a
     placeholder entry in self.dict['GeneCard'].
     If the file named by self.info['EnsLoci'] does not exist, the three dictionaries are left empty.
     '''
     self.dict['EnsLoci'] = {}    # Dictionary of {EnsGene:shortName()}
     self.dict['EnsDesc'] = {}    # Dictionary of {EnsGene:Description}
     self.dict['UniEns'] = {}     # Dictionary of {UniProt?:EnsGene}
     if not os.path.exists(self.info['EnsLoci']): return
     elines = self.loadFromFile(self.info['EnsLoci'])
     (ex,etot) = (0.0,len(elines))
     # Plain iteration: popping the head of the list for every line is O(n) each time (O(n^2) overall).
     for line in elines:
         ex += 100.0
         if line[:1] != '>': continue     # Sequence data, not a header
         header = rje.matchExp(r'^>(\S+).+ gene:(\S+)\]',line)
         if not header:
             self.log.errorLog('Problem with EnsLoci line: %s' % line,printerror=False)
             continue
         (name,gene) = header
         accmatch = rje.matchExp(r'\[acc:(\S+)',line)    # Accession tag is optional
         if accmatch: self.dict['UniEns'][accmatch[0]] = gene
         self.dict['EnsLoci'][gene] = name
         # Description = header minus leading '>', minus the name, minus anything from ' [acc:' onwards
         self.dict['EnsDesc'][gene] = ' '.join(line.split(' [acc:')[0][1:].split()[1:])
         if self.opt['FullEns']:
             if gene not in self.list['Genes']: self.list['Genes'].append(gene)
             if gene not in self.dict['GeneCard']:
                 self.dict['GeneCard'][gene] = {'EnsEMBL':gene,'Symbol':'!FAILED!'}
         self.log.printLog('\r#ENS','Parsing EnsLoci %.1f%%: %s genes' % (ex/etot,rje.integerString(len(self.dict['EnsLoci']))),newline=False,log=False)
     self.log.printLog('\r#ENS','Parsing EnsLoci complete: %s genes' % (rje.integerString(len(self.dict['EnsLoci']))))
Esempio n. 2
0
 def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
     '''
     Tabulates regions of known PPI from DAT file.
     Runs a grep pipeline over the (hard-coded) UniFake DAT file to pull out ID lines and "FT   REGION" lines
     that mention an interaction, then writes one row per (Protein, Start, End, Interactor) to ppi_region.tdt.
     An existing table is kept unless self.opt['Force'] is set.
     << True if the table was generated; the return value of printLog() if an existing table was kept.
     Any exception is logged and re-raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         tabfile = 'ppi_region.tdt'
         unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
         headers = ['Protein','Start','End','Interactor']
         rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
         self.printLog('#GREP',gcmd)
         prot = None     # Most recent ID line seen; regions are assigned to this protein
         rx = 0          # Number of rows written
         proteins = set(); interactors = set()   # Sets: only the distinct counts are reported
         for gline in os.popen(gcmd).readlines():
             idmatch = rje.matchExp(r'ID   (\S+)',gline)
             if idmatch: prot = idmatch[0]
             region = rje.matchExp(r'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline)
             if not region: continue
             (rstart,rend,rint) = region
             # Each word of the "with ..." text that starts with 2+ capitals/digits is taken as an interactor
             for ppi in rint.split():
                 ppimatch = rje.matchExp(r'^([A-Z0-9][A-Z0-9]+)',ppi)
                 if not ppimatch: continue
                 datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':ppimatch[0]}
                 rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                 proteins.add(prot)
                 interactors.add(datadict['Interactor'])
                 self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(proteins)),rje.integerString(len(interactors)), rje.integerString(rx)))
         self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(proteins)),rje.integerString(len(interactors)),rje.integerString(rx),tabfile))
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise   # Delete this if method error not terrible
Esempio n. 3
0
    def convert(self,filelist=[],outfile=None):      ### Converts scansite output files in FileList to Outfile
        '''
        Converts scansite output files in FileList to Outfile.
        >> filelist:list = scansite output files to convert. If empty, self.list['FileList'] is used. (Never modified here.)
        >> outfile:str = delimited output file. If None/empty, self.info['Name'] is used.
        << True if conversion ran; False if there were no files to convert.
        Only input lines matching re_scansite are written and counted. Missing input files are logged and skipped.
        Any exception is logged with the current stage and re-raised; open files are closed either way.
        '''
        _stage = 'Setup'
        OUTFILE = None
        try:
            ### Setup ###
            if len(filelist) < 1:
                filelist = self.list['FileList']
            if not outfile:
                outfile = self.info['Name']
            if len(filelist) < 1:
                self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False)
                return False
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            if ext != outfile[-3:]:     # Offer to make the extension agree with the delimiter
                newfile = outfile[:-3] + ext
                if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)):
                    outfile = newfile
            self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile))

            ### Output File ###
            _stage = 'Output File'
            if not self.opt['Append'] or not os.path.exists(outfile):   # Create with header
                OUTFILE = open(outfile,'w')
                headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa']
                rje.writeDelimit(OUTFILE,headers,delimit)
            else:
                OUTFILE = open(outfile,'a')

            ### Conversion ###
            _stage = 'Conversion'
            sx = 0      # Total results written
            for infile in filelist:
                if not os.path.exists(infile):
                    self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False)
                    continue
                fx = 0  # Results written from this file
                INFILE = open(infile,'r')
                try:
                    inline = rje.nextLine(INFILE)
                    while inline is not None:
                        scanlist = rje.matchExp(re_scansite,inline)
                        # Write/count only matching lines. Previously a non-matching line re-wrote the
                        # previous result (or raised NameError if no line had matched yet).
                        if scanlist:
                            rje.writeDelimit(OUTFILE,scanlist,delimit)
                            sx += 1
                            fx += 1
                            rje.progressPrint(self,sx)
                        inline = rje.nextLine(INFILE)
                finally:
                    INFILE.close()
                self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx)))

            ### End ###
            _stage = 'End'
            self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile))
            return True
        except:
            self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False)
            raise
        finally:
            if OUTFILE is not None: OUTFILE.close()
Esempio n. 4
0
    def parseDisorder(self):    ### Parses disordered regions from sequence name (e.g. DisProt download)
        '''
        Parses disordered regions from sequence name (e.g. DisProt download).
        #X-Y = disordered region [1.0]; &X-Y = ordered region [0.0]; All else neutral [0.5];
        The first word of self.info['Name'] is skipped; each later word of the form #X-Y or &X-Y sets the
        per-residue scores for positions X..Y (1-based, inclusive) in self.list['ResidueDisorder'].
        Disordered (#) regions are also stored in self.list['RegionDisorder'] as (X-1,Y) tuples.
        self.minRegion() is then called.
        << True if parsing completed; False if any error occurred (error is logged).
        '''
        try:
            ### Setup sequence and name ###
            sequence = self.info['Sequence']
            name = self.info['Name']
            self.list['ResidueDisorder'] = [0.5] * len(sequence)
            self.list['RegionDisorder'] = []
            scoredict = {'#':1.0,'&':0.0}

            ### Process ###
            for region in name.split()[1:]:
                parsed = rje.matchExp(r'^([#&])(\d+)-(\d+)',region)
                if not parsed: continue
                (i,x,y) = parsed
                score = scoredict[i]
                start = int(x) - 1      # Convert 1-based inclusive start to 0-based slice start
                end = int(y)
                for r in range(start,end): self.list['ResidueDisorder'][r] = score
                if i == '#': self.list['RegionDisorder'].append((start,end))
            self.minRegion()
            if self.opt['PrintLog']: self.log.printLog('\r#DIS','DisProt Disorder parsing complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),self.list['ResidueDisorder'].count(1.0)))
            return True
        except:
            # Message previously named foldIndex (copy-paste), which misdirected debugging.
            self.log.errorLog('Error in Disorder.parseDisorder(%s)' % self.info['Name'],quitchoice=True)
            return False
Esempio n. 5
0
 def readAAProp(self, filename=None):  ### Reads AA Property Matrix from file
     '''
     Loads the amino acid property matrix from file into self.alphabet and self.prop.
     >> filename:str = File to read. If given it is stored in self.info['Name']; if None, self.info['Name'] is read.
     The 'PROP' header line supplies the alphabet (single characters); each later non-indented line is a
     property name followed by one single-character value per alphabet member. Lines starting '#' are ignored.
     IOError is logged and re-raised; any other error is logged and the method returns without further setup.
     On success, 'X' and '-' are added to the alphabet if absent (via useAlphabet) and makePropDif() is called.
     '''
     try:
         ### <a> ### Resolve filename and load lines
         if not filename:
             filename = self.info['Name']
         else:
             self.info['Name'] = filename
         progress = 'Reading AA Properties from %s...' % filename
         self.progLog('\r#AAPROP', progress)
         rows = self.loadFromFile(filename, v=2)
         ### <b> ### Parse alphabet and properties
         self.alphabet = []
         self.prop = {}
         for row in rows:
             row = rje.chomp(row)
             if row.startswith('#'):  # Comment line
                 continue
             if row.startswith('PROP'):  # Header line - has amino acids
                 rest = rje.matchExp(r'^\S+(\s.+)', row)[0]
                 # Peel off one single-character symbol at a time
                 while re.search(r'^\s+\S.*', rest):
                     (symbol, rest) = rje.matchExp(r'^\s+(\S)(.*)', rest)
                     self.alphabet.append(symbol)
                 progress += ' ...%s' % ' '.join(self.alphabet)
                 self.progLog('\r#AAPROP', progress)
                 continue
             if re.search(r'^\S', row) and self.alphabet:  # Property line (only once alphabet is known)
                 (aaproperty, rest) = rje.matchExp(r'^(\S+)(\s.+)', row)
                 progress += ' ...%s' % aaproperty
                 self.progLog('\r#AAPROP', progress)
                 values = {}
                 self.prop[aaproperty] = values
                 for symbol in self.alphabet:
                     (p, rest) = rje.matchExp(r'^\s+(\S)(.*)', rest)
                     values[symbol] = p
         progress += ' ...Done!'
         self.printLog('\r#AAPROP', progress)
     except IOError:
         self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'], True)
         raise
     except:
         self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'], True)
         return
     # Make sure unknown ('X') and gap ('-') symbols are part of the alphabet
     missing = []
     for symbol in ('X', '-'):
         if symbol not in self.alphabet:
             missing.append(symbol)
     if missing:
         self.useAlphabet(alphabet=self.alphabet + missing)
     self.makePropDif()
Esempio n. 6
0
    def parseOMIM(self):    ### Main parsing method
        '''
        Main parsing method. Reads the OMIM flat file named by self.info['Name'] and populates:
        - self.dict['Records'] = {gene:[record IDs]}
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
        Only allelic variants (*FIELD* AV) of the form "GENE, Xxx123Yyy" whose three-letter codes are both in
        rje_sequence.aa_code_3 are kept. self.saveMutations() is called at the end.
        Any exception (including a missing file) is logged and re-raised. File handles are always closed.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.dict['Records'] = {}
            self.dict['Mutations'] = {}
            aas = ' '.join(rje_sequence.aa_code_3.values()).upper().split()
            omimfile = self.info['Name']
            COUNT = open(omimfile,'r')      # Raises IOError here if the file is missing
            try: olen = len(COUNT.readlines())
            finally: COUNT.close()
            ptot = max(olen,1)              # Progress denominator; avoids ZeroDivisionError on an empty file
            (ox,mx) = (0.0,0)
            OMIM = open(omimfile,'r')
            try:
                ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                record = gene = subid = disease = mutation = ''
                av = False      # Whether reading *FIELD* AV for mutation data
                oline = True    # Loop ends once readline() returns '' at end of file
                while oline:
                    oline = OMIM.readline()
                    self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/ptot,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
                    ox += 100.0
                    if not av and oline[:1] != '*': continue    # Outside AV sections only '*' marker lines matter
                    line = rje.chomp(oline).rstrip(' ')
                    ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if line == '*RECORD*': (record,av) = ('',False)
                    elif line == '*FIELD* NO':    # New record: ID is on the following line
                        record = rje.chomp(OMIM.readline())
                        gene = ''
                        ox += 100.0
                    ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* TI':      # New gene: last word of the following line
                        gene = rje.chomp(OMIM.readline()).split()[-1]
                        subid = ''
                        av = False
                        ox += 100.0
                    ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* AV': av = True        # Start of mutation records
                    elif av and rje.matchExp(r'^(\.\d+)',line):  # New subid mutation record
                        subid = rje.matchExp(r'^(\.\d+)',line)[0]
                        disease = rje.chomp(OMIM.readline())
                        ox += 100.0
                        # Next line should read "GENE, Xxx123Yyy"; anything else is skipped
                        try: mutation = rje.matchExp(r'^%s, (\D\D\D\d+\D\D\D)' % gene,rje.chomp(OMIM.readline()))[0]
                        except Exception: continue    # No mutation or not coding change
                        ox += 100.0
                        subaa = rje.matchExp(r'(\D\D\D)\d+(\D\D\D)',mutation)
                        if subaa[0] not in aas or subaa[1] not in aas: continue
                        if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                        if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                        if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                        mx += 1
                        self.dict['Mutations'][gene][subid] = (disease,mutation)
            finally:
                OMIM.close()

            ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
            self.saveMutations()
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
            raise   # Delete this if method error not terrible
Esempio n. 7
0
 def readAAProp(self,filename=None): ### Reads AA Property Matrix from file
     '''
     Reads the AA property matrix file and fills self.alphabet (list) and self.prop ({property:{aa:value}}).
     >> filename:str = Filename. If None, will use self.info['Name']; otherwise self.info['Name'] is updated.
     Comment lines start with '#'. The line starting 'PROP' lists the alphabet; subsequent lines that start
     with a non-space character give a property name and one single-character value per alphabet symbol.
     IOError is logged and re-raised. Other errors are logged and the method returns early.
     Afterwards 'X' and '-' are appended to the alphabet if missing and makePropDif() is called.
     '''
     try:
         ### <a> ### Load and read
         if filename: self.info['Name'] = filename
         filename = self.info['Name']
         readtxt = 'Reading AA Properties from %s...' % filename
         self.progLog('\r#AAPROP',readtxt)
         proplines = self.loadFromFile(filename,v=2)
         ### <b> ### Process
         self.alphabet = []
         self.prop = {}
         ## <i> ## Properties and alphabet
         for rawline in proplines:
             text = rje.chomp(rawline)
             if text[:1] == '#': pass     # Comment line
             elif text[:4] == 'PROP':     # Header line - has amino acids
                 text = rje.matchExp(r'^\S+(\s.+)',text)[0]
                 while True:
                     if not re.search(r'^\s+\S.*',text): break
                     (aa,text) = rje.matchExp(r'^\s+(\S)(.*)',text)
                     self.alphabet.append(aa)
                 readtxt = '%s ...%s' % (readtxt,' '.join(self.alphabet))
                 self.progLog('\r#AAPROP',readtxt)
             elif re.search(r'^\S',text) and self.alphabet:   # Property line
                 (aaproperty,text) = rje.matchExp(r'^(\S+)(\s.+)',text)
                 readtxt = '%s ...%s' % (readtxt,aaproperty)
                 self.progLog('\r#AAPROP',readtxt)
                 self.prop[aaproperty] = {}
                 for aa in self.alphabet:     # One single-character value per alphabet symbol, in order
                     (p,text) = rje.matchExp(r'^\s+(\S)(.*)',text)
                     self.prop[aaproperty][aa] = p
         readtxt = '%s ...Done!' % readtxt
         self.printLog('\r#AAPROP',readtxt)
     except IOError:
         self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'],True)
         raise
     except:
         self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'],True)
         return
     extras = []
     for aa in ['X','-']:
         if aa not in self.alphabet: extras.append(aa)
     if extras: self.useAlphabet(alphabet=self.alphabet + extras)
     self.makePropDif()
Esempio n. 8
0
    def foldIndex(self):  ### Runs FoldIndex disorder prediction
        '''
        Runs FoldIndex disorder prediction.
        Posts self.info['Sequence'] to the FoldIndex server, retrying up to self.stat['FILoop'] times with
        self.stat['FISleep'] seconds between attempts, then parses <segment start= end= len=> lines.
        Sets self.list['ResidueDisorder'] (1.0 for residues inside a segment, else 0.0) and
        self.list['RegionDisorder'] (list of (start,end) as reported by the server), calls self.minRegion()
        and sets self.opt['Flat'] = True.
        << True if prediction was parsed; False if the server gave no output (both lists are emptied) or an
           error occurred (error is logged).
        NOTE(review): regions are stored with the server's start value unchanged, whereas residues are
        indexed from start-1 - confirm which convention minRegion() expects.
        '''
        try:
            ### Setup sequence and name ###
            sequence = self.info['Sequence']

            ### Run Disorder ###
            retry = self.stat['FILoop']
            url = "http://bioportal.weizmann.ac.il/fldbin/findex"
            params = "m=xml&sq=" + sequence + "  "
            flines = []     # Must be defined even when FILoop is 0 and the loop never runs
            while retry:
                try:
                    flines = urllib2.urlopen(url, params).readlines()
                except Exception:   # Any failed request counts as "no output" and is retried
                    flines = []
                if flines:
                    break
                retry -= 1
                if retry:   # No point sleeping after the final attempt
                    time.sleep(self.stat['FISleep'])
            if not flines:
                self.log.errorLog('FoldIndex run for "%s" failed.' %
                                  self.info['Name'],
                                  printerror=False)
                self.list['ResidueDisorder'] = []
                self.list['RegionDisorder'] = []
                return False
            ### Process ###
            self.list['ResidueDisorder'] = [0.0] * len(sequence)
            self.list['RegionDisorder'] = []
            for f in flines:
                fm = rje.matchExp(
                    r'<segment start="(\d+)" end="(\d+)" len="(\d+)"', f)
                if not fm:
                    continue
                (start, end) = (int(fm[0]), int(fm[1]))
                self.list['RegionDisorder'].append((start, end))
                for i in range(start - 1, end):
                    self.list['ResidueDisorder'][i] = 1.0
            self.minRegion()
            if self.opt['PrintLog']:
                self.log.printLog(
                    '\r#DIS',
                    'FoldIndex Disorder prediction complete: %d disorder regions, %d disordered aa'
                    % (len(self.list['RegionDisorder']),
                       sum(self.list['ResidueDisorder'])))
            self.opt['Flat'] = True
            return True
        except:
            self.log.errorLog('Error in Disorder.foldIndex(%s)' %
                              self.info['Name'],
                              quitchoice=True)
            return False
Esempio n. 9
0
 def setup(self):    ### Main class setup method. Makes sumfile if necessary.
     '''
     Main class setup method. Makes sumfile if necessary.
     Reads each MASCOT results file in self.list['ResFiles'] with a Budapest object, stores its hits in
     self.dict['Searches'], writes the sorted hit accessions to <basefile>.<mfile base>.protacc, then writes
     one row per peptide (search,prot_hit_num,prot_acc,prot_desc,pep_seq) to self.info['SumFile'].
     Hits whose accession contains gi|N are renamed/described from self.dict['Acc2Seq'] where possible.
     << Result of printLog() if an existing summary file is reused; False if setup failed (error logged).
        NOTE(review): the normal success path has no explicit return (returns None) - confirm callers do
        not test the return value for truth.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
         if self.getStrLC('Basefile') in ['','none']: self.baseFile(rje.baseFile(self.info['SumFile']))
         if self.getStrLC('SumFile') in ['','none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
         self.printLog('#SUM','Summary file: %s' % self.getStr('SumFile'))
         if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
             if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                 return self.printLog('#SUM','Summary results file found. No MASCOT processing.')
         ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for mfile in self.list['ResFiles']:
             bud = budapest.Budapest(self.log,self.cmd_list+['mascot=%s' % mfile])
             bud.info['Name'] = mfile
             bud.readMascot()
             self.dict['Searches'][mfile] = bud.dict['Hits']
             protacclist = rje.sortKeys(bud.dict['Hits'])
             accfile = '%s.%s.protacc' % (self.baseFile(),rje.baseFile(mfile))
             self.debug(accfile)
             ACCFILE = open(accfile,'w')
             try: ACCFILE.write('\n'.join(protacclist))
             finally: ACCFILE.close()     # Ensure the accession list is flushed and the handle released
             self.printLog('#MFILE','%s: %s proteins.' % (mfile,rje.iLen(protacclist)))
         # (A separate NCBI gi-mapping step was disabled in the original; gi numbers are resolved via Acc2Seq below.)
         ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqlist = rje_seq.SeqList(self.log,['gnspacc=T']+self.cmd_list)
         self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
         ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         sumhead = 'search,prot_hit_num,prot_acc,prot_desc,pep_seq'.split(',')
         rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,rje_backup=True)
         for mfile in rje.sortKeys(self.dict['Searches']):
             hits = self.dict['Searches'][mfile]
             for protacc in rje.sortKeys(hits):
                 hit = hits[protacc]
                 protname = hit['prot_acc']
                 protdesc = hit['prot_desc']
                 gimatch = rje.matchExp(r'gi\|(\d+)',protacc)
                 if gimatch:
                     gi = gimatch[0]
                     try:
                         protname = self.dict['Acc2Seq'][gi].shortName()
                         protdesc = self.dict['Acc2Seq'][gi].info['Description']
                     except Exception: protname = 'gi_UNK__%s' % gi    # gi not found in sequence dictionary
                 for pep in hit['Peptides']:
                     data = {'search':rje.baseFile(mfile,True),'prot_desc':protdesc,'prot_acc':protname,
                             'pep_seq':pep,'prot_hit_num':hit['prot_hit_num']}
                     rje.delimitedFileOutput(self,self.info['SumFile'],sumhead,datadict=data)
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Esempio n. 10
0
 def readSLiMSearchOcc(self,motifs=[]):   ### Reads SLiMSearch results into data dictionary
     '''
     Reads SLiMSearch results into data dictionary.
     >> motifs:list = motif names to keep. (Not modified.) If empty, nothing can be kept: a message is logged,
        self.dict['Occ'] is reset to {} and the results file is not read.
     Loads <self.info['ResFile']>.csv and fills self.dict['Occ'] = {motif:{type:{gene:[occurrence data]}}},
     where type is 'Rev' (motif name ends 'rev'), 'Scr' (ends 'scram') or 'ELM', with the suffix stripped
     from the motif name. Occurrences are skipped if HomNum < self.stat['MinHom'], the motif is not in
     motifs, no "gene:X]" tag is found in Desc, or self.ensGO(gene) is empty/fails.
     Errors are logged, not raised.
     '''
     try:### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not motifs:
             # Every occurrence would be filtered out below, so stop here with the same (empty) result.
             self.printLog('#OCC','Cannot process occurrences for No motifs!')
             self.dict['Occ'] = {}
             return
         occfile = '%s.csv' % self.info['ResFile']
         delimit = rje.delimitFromExt(filename=occfile)
         data = rje.dataDict(self,occfile,mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys='Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA'.split(','))
         self.dict['Occ'] = {}
         ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (mx,ox,otot) = (0,0.0,len(data))
         for occ in data:
             self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,ox/otot)); ox += 100.0
             if int(data[occ]['HomNum']) < self.stat['MinHom']: continue
             (motif,seq,start,end) = occ.split(delimit)     # Key is the four mainkeys joined by the delimiter
             if motif not in motifs: continue
             try:
                 gene = rje.matchExp(r'gene:(\S+)\]',data[occ]['Desc'])[0]
                 golinks = self.ensGO(gene)      # Looked up once; used for both debug output and the filter
                 self.deBug('%s:%s' % (gene,golinks))
                 if not golinks: continue
             except Exception: continue      # No gene tag or GO lookup failed: skip occurrence
             # Strip the '_rev' / '_scram' suffix and record which kind of motif this was
             if motif[-3:] == 'rev': (motif,occtype) = (motif[:-4],'Rev')
             elif motif[-5:] == 'scram': (motif,occtype) = (motif[:-6],'Scr')
             else: occtype = 'ELM'
             if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
             if occtype not in self.dict['Occ'][motif]: self.dict['Occ'][motif][occtype] = {}
             if gene not in self.dict['Occ'][motif][occtype]: self.dict['Occ'][motif][occtype][gene] = []
             self.dict['Occ'][motif][occtype][gene].append(data[occ])
         self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx))
     except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 11
0
 def report(self):  ### Run qstat to get job list then showstart on each job
     '''
     Lists queued jobs and their predicted start information.
     Runs 'qstat', takes the numeric job ID and the following field from each matching line, logs the
     number of jobs found, then logs every non-empty line of 'showstart <ID>' for each job in qstat order.
     Any error is logged as 'QSub.report problem'; nothing is returned.
     '''
     try:  ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (qidlist, qidjob) = ([], {})   # Job IDs in qstat order; {job ID: job name}
         ### ~ [2] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for qline in os.popen('qstat'):
             parsed = rje.matchExp(r'^(\d+)\.\S+\s+(\S+)', qline)
             if not parsed:     # Header/separator lines do not start with a numeric job ID
                 continue
             qidlist.append(parsed[0])
             qidjob[parsed[0]] = parsed[1]
         ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#QSTAT', '%d jobs in queue.' % len(qidlist))
         for qid in qidlist:
             self.printLog('#JOB', '%s = %s' % (qid, qidjob[qid]), timeout=False)
             startinfo = os.popen('showstart %s' % qid)
             for qline in startinfo:
                 if not rje.chomp(qline):   # Skip blank lines
                     continue
                 self.printLog('#INFO', qline, timeout=False)
         self.printLog('#ZEN', rje_zen.Zen().wisdom())
     except:
         self.errorLog('QSub.report problem')
Esempio n. 12
0
def setupStatFilter(
    callobj,
    statlist=[],
    filterlist=[]
):  ### Makes StatFilter dictionary from statlist and filterlist
    '''
    Makes StatFilter dictionary from statlist and filterlist (from cmd_list) !!! Changes case of statfilter keys. !!!
    >> callobj:RJE_Object [None] = calling object for Error Messages etc. (only used when a filter is rejected)
    >> statlist:list of stats that are allowed for filtering. Generally column headers for output. (Not modified.)
    >> filterlist:list of StatFilters read in from commandline consisting of StatOperatorValue (Not modified.)
    << statfilter:dictionary of StatFilter {Stat:(Operator,String,Numeric)}
    Operators are normalised: '<>' -> '!=', '=' -> '==', '=>' -> '>=', '=<' -> '<='.
    Numeric is float(String), or None if the cut-off is not a number.
    Stat names are matched to statlist exactly first, then case-insensitively (taking the statlist spelling).
    Unrecognised filters, operators or stats are logged via callobj.log.errorLog() and skipped.
    If the same stat appears more than once, the last filter wins.
    '''
    import re   # Local import so the function does not rely on module-level imports
    statfilter = {}     # Defined before try so the error handler can always return it
    try:
        for statcut in filterlist:
            ## Extract details ##
            parsed = re.search(
                r'^(\S*[A-Za-z0-9])(>|>=|=<|=>|<=|==|=|<|!=|<>)(-*[A-Za-z0-9]\S*)$',
                statcut)
            if not parsed:
                callobj.log.errorLog('Filter "%s" not recognised.' % statcut,
                                     printerror=False)
                continue
            (stat, op, cutoff) = parsed.groups()
            ## Normalise operator ##
            if op == '<>':
                op = '!='
            if op == '=':
                op = '=='
            if op in ['=>', '=<']:
                op = op[::-1]
            # Whitelist holds the *normalised* forms. It previously listed '=>'/'=<', which can no longer
            # occur at this point, so every '>=' and '<=' filter was rejected as unknown.
            if op not in ['>=', '<=', '!=', '==', '>', '<']:
                callobj.log.errorLog('Filter "%s" operator "%s" not known!' %
                                     (statcut, op),
                                     printerror=False)
                continue
            ## Check for numeric value ##
            try:
                numcut = float(cutoff)
            except ValueError:
                numcut = None
            ## Check stat ##
            if stat not in statlist:
                for h in statlist:
                    if h.lower() == stat.lower():
                        stat = h
                        break
            if stat not in statlist:
                callobj.log.errorLog('Stat "%s" in filter "%s" not found.' %
                                     (stat, statcut),
                                     printerror=False)
                continue
            ## Update dictionary ##
            statfilter[stat] = (op, cutoff, numcut)
        ### Finish ###
        return statfilter
    except:
        callobj.log.errorLog('Error in rje_scoring.setupStatFilter()',
                             quitchoice=True)
        return statfilter
Esempio n. 13
0
 def run(self,iterate=None,log=True):  ### Main run method
     '''
     Main run method. Builds a SLiM from self.list['Peptides'] and reports how many peptides it matches.
     >> iterate:bool [None] = whether to re-run on the matched peptides until all remaining peptides match.
        If None, the 'Iterate' setting is used; if True, the 'Iterate' setting is switched on.
     >> log:bool [True] = whether to write progress to the log (now also honoured by iterative re-runs).
     << (slim,message) tuple: the SLiM string ('' if none could be made) and a match summary or failure
        reason. On an unexpected error the message is whatever self.errorLog() returns.
     Results are also stored in self.dict['Output'] ('slim', 'iterate', 'match', 'matches', 'unmatched').
     '''
     slim = ''   # Defined before try so the final error handler can always return it
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if iterate is None: iterate = self.getBool('Iterate')
         elif iterate: self.setBool({'Iterate':True})
         self.setup(log=log)
         self.setInt({'MinSeq':max(1,self.getInt('MinSeq'))})    # MinSeq >= 1 also guards the % calculation below
         if len(self.list['Peptides']) < self.getInt('MinSeq'):
             toofew = 'Too few peptides (%d) for minseq=%d' % (len(self.list['Peptides']),self.getInt('MinSeq'))
             if log: self.printLog('#MIN',toofew)
             return ('',toofew)
         if not self.list['Input']: self.list['Input'] = self.list['Peptides'][0:]   # Keep a copy of the original input
         equiv = []
         if self.getBool('ExtendAA'):
             self.printLog('#EQUIV','[%s]' % '] ['.join(self.list['Equiv']))
             equiv = self.list['Equiv'][0:]
         ### ~ [2] ~ Make SLiM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         slim = rje_slim.makeSlim(self.list['Peptides'],self.getInt('MinSeq'),self.getNum('MinFreq'),self.getInt('MaxAA'),self,self.getStr('Ignore'),self.getBool('VarLength'),equiv)
         self.dict['Output']['slim'] = slim
         if log: self.printLog('#SLIM','SLiM generated: "%s"' % slim)
         if not slim: return (slim,'Unable to make a SLiM with these settings and peptides')
         ## ~ [2a] ~ Assess matches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         matched = []
         # Wildcard positions (N for DNA, X for protein) become regex '.'
         if self.getBool('DNA'): regexp = slim.replace('N','.')
         else: regexp = slim.replace('X','.')
         for peptide in self.list['Peptides']:
             # Flank with X so wildcards can match past the ends, except next to ^/$ terminus markers; drop gaps
             searchpep = ('X%sX' % peptide).replace('$X','')
             searchpep = searchpep.replace('X^','')
             searchpep = searchpep.replace('-','')
             try:
                 if rje.matchExp('(%s)' % regexp,searchpep): matched.append(peptide)
             except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp,searchpep))
         sx = len(matched)
         matchstr = 'SLiM matches %d of %d sequences (%.1f%%).' % (sx,len(self.list['Peptides']),(100.0*sx)/len(self.list['Peptides']))
         if log: self.printLog('#FREQ',matchstr)
         if iterate:
             self.dict['Output']['iterate'] += '%s: %s\n' % (slim,matchstr)
             self.dict['Output']['iterate'] += '-> %s\n' % ','.join(matched)
         if iterate and (len(matched) != len(self.list['Peptides'])):
             if not matched: return (slim,'Unable to make an interative SLiM with these settings and peptides')
             if self.getStrLC('PeptAlign'):
                 self.list['Peptides'] = ' '.join(matched).replace('-','').split()   # De-gap before re-alignment
             else: self.list['Peptides'] = matched
             # Pass log through: previously the re-run always used log=True, ignoring the caller's choice.
             return self.run(iterate=True,log=log)
         ### ~ [3] REST Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if iterate:
             (matchstr,matched) = self.inputMatches(regexp)
             if log: self.printLog('#FREQ',matchstr)
         self.dict['Output']['match'] = matchstr
         self.dict['Output']['matches'] = '\n'.join(matched)
         try:
             unmatched = self.list['Input'][0:]
             for pep in matched: unmatched.remove(pep)
             self.dict['Output']['unmatched'] = '\n'.join(unmatched)
         except: self.dict['Output']['unmatched'] = self.errorLog('SLiMMaker Umatched Error')
         return (slim,matchstr)
     except: return (slim,self.errorLog('SLiMMaker Error'))
Esempio n. 14
0
 def addLinks(self,nested): ### Adds href aname links to definitions.
     '''
     Adds href aname links to definitions.
     >> nested:dict = (sub)dictionary of glossary terms. The raw definition text is read from the '=' key and the
         linked version is stored under the '+' key. Every other key is treated as a nested dictionary and is
         processed recursively.
     Errors are reported through self.errorLog() rather than raised.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Characters that may trail a word and are ignored when looking that word up in the glossary
         endstrip = [')','.',',',':',';','!']
         if self.getBool('Plurals'): endstrip.append('s')
         for term in rje.sortKeys(nested):
             if term == '=':
                 linkdef = []    # Output words/links; joined with spaces once the definition is consumed
                 # '(' is padded to '( ' so that an opening bracket becomes a word of its own (undone when '+' is stored)
                 rawdef = string.split(string.replace(nested['='],'(','( '))
                 while rawdef:
                     glossary = self.dict['Glossary']    # Each lookup restarts from the top level of the glossary
                     ## ~ External links: <url>, <url>text or <url>[link text] (HRef mode only) ~ ##
                     if self.getBool('HRef') and rje.matchExp('<(\S+)>',rawdef[0]):
                         safetynet = rawdef[0:]  # Copy of the remaining words, restored if the URL cannot be parsed
                         url = rje.matchExp('<(\S+)>',rawdef[0])[0]
                         # Rewrite the first word into "[link text]extra" form so one regular expression can parse it
                         if rje.matchExp('<(\S+)>\[(\S+)',rawdef[0]): rawdef[0] = '[%s' % rje.matchExp('<(\S+)>\[(\S+)',rawdef[0])[1]
                         elif rje.matchExp('<(\S+)>(\S+)',rawdef[0]): rawdef[0] = '[%s]%s' % (url,rje.matchExp('<(\S+)>(\S+)',rawdef[0])[1])
                         else: rawdef[0] = '[%s]' % url
                         try:
                             # Link text may span several words: keep appending words until the closing ']' is found.
                             # Running out of words raises an exception, which is handled below.
                             while ']' not in rawdef[0]: rawdef[0] = '%s %s' % (rawdef[0],rawdef.pop(1))
                             (linktext,linkextra) = rje.matchExp('\[(.+)\](\S*)',rawdef.pop(0))
                             # URLs that do not start with 'htt' or 'ftp' are given an http:// prefix
                             if url[:3] not in ['htt','ftp']: url = 'http://%s' % url
                             linkdef.append('<a href="%s">%s</a>%s' % (url,linktext,linkextra))
                             continue
                         except:
                             self.errorLog('Problem parsing URL from "%s"' % nested['='])
                             rawdef = safetynet
                     ## ~ Words that do not start a glossary term are copied through unchanged ~ ##
                     if rawdef[0].lower() not in glossary:
                         if rawdef[0].lower()[:-1] not in glossary or rawdef[0].lower()[-1] not in endstrip:
                             linkdef.append(rawdef.pop(0)); continue
                     ## ~ Walk down the nested glossary, consuming words while they extend a known term ~ ##
                     # akey collects the (lowercase) glossary keys followed; alink the original words consumed
                     akey = []; alink = []
                     while rawdef and (rawdef[0].lower() in glossary or rawdef[0].lower()[:-1] in glossary):
                         if rawdef[0].lower() in glossary and '=' in glossary[rawdef[0].lower()]: rterm = rawdef[0].lower()
                         elif len(rawdef) > 1 and rawdef[0].lower() in glossary and (rawdef[1].lower() in glossary[rawdef[0].lower()] or rawdef[1].lower()[:-1] in glossary[rawdef[0].lower()]): rterm = rawdef[0].lower()
                         elif rawdef[0].lower()[-1] in endstrip and rawdef[0].lower()[:-1] in glossary: rterm = rawdef[0].lower()[:-1]
                         elif rawdef[0].lower() in glossary: rterm = rawdef[0].lower()
                         else: break
                         glossary = glossary[rterm]
                         akey.append(rterm)
                         alink.append(rawdef.pop(0))
                     akey = string.join(akey,'_')
                     if '=' in glossary:     # The words consumed correspond to a term that has a definition
                         alink = string.join(alink)
                         # A term appearing in its own definition is left as plain text
                         if nested == glossary: linkdef.append(alink)
                         elif self.getStr('HTMLStyle') != 'tab':
                             # Trailing punctuation (but not a trailing 's') is kept outside the link
                             if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<a href="#%s">%s</a>%s' % (akey,alink[:-1],alink[-1]))
                             else: linkdef.append('<a href="#%s">%s</a>' % (akey,alink))
                         else:
                             if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<scaps>%s</scaps>%s' % (alink[:-1],alink[-1]))
                             else: linkdef.append('<scaps>%s</scaps>' % (alink))
                     else:
                         # No definition at this depth: output the first word as text and re-queue the other words
                         linkdef.append(alink[0])
                         rawdef = alink[1:] + rawdef
                 nested['+'] = string.replace(string.join(linkdef),'( ','(')
                 # Convert each " _text_" into " <i>text</i>"
                 while rje.matchExp(' _([^_]+)_',nested['+']):
                     italics = rje.matchExp(' _([^_]+)_',nested['+'])[0]
                     nested['+'] = string.replace(nested['+'],' _%s_' % italics,' <i>%s</i>' % italics)
                 #self.deBug(nested)
             elif term != '+': self.addLinks(nested[term])
     except: self.errorLog('%s.addLinks error' % self)
Esempio n. 15
0
def stripTags(html,keeptags=[]):    ### Strips all HTML tag text from html code, except listed keeptags
    '''
    Strips all HTML tag text from html code, except listed keeptags.
    >> html:str = HTML text to process.
    >> keeptags:list [] = Names of tags to retain (case-insensitive). Not modified by this function.
    << str = Text in which each removed tag is replaced by a single space, with double spaces collapsed once.
    Text following a '<' that has no closing '>' is dropped unless its tag name is in keeptags.
    '''
    import re   # Function-scope import so the function does not depend on module-level imports
    keep = ' '.join(keeptags).lower().split()
    tagsplit = html.split('<')
    newhtml = tagsplit.pop(0)   # Text before the first '<' can never be part of a tag
    for tagtxt in tagsplit:
        # Tag name, optionally preceded by '/' (closing tag) or by the backslash that was previously accepted.
        # Only the backslash used to be recognised, so closing tags of kept elements (e.g. </b>) were stripped.
        tag = re.match(r'[/\\]?([A-Za-z0-9]+)',tagtxt)
        if tag and tag.group(1).lower() in keep: newhtml += '<%s' % tagtxt
        elif tagtxt.find('>') >= 0: newhtml += ' %s' % tagtxt[tagtxt.find('>')+1:]
    return newhtml.replace('  ',' ')
Esempio n. 16
0
def stripTags(html,keeptags=[]):    ### Strips all HTML tag text from html code, except listed keeptags
    '''
    Strips all HTML tag text from html code, except listed keeptags.
    >> html:str = HTML text to process.
    >> keeptags:list [] = Names of tags to retain (case-insensitive). Not modified by this function.
    << str = Text in which each removed tag is replaced by a single space, with double spaces collapsed once.
    Text following a '<' that has no closing '>' is dropped unless its tag name is in keeptags.
    '''
    import re   # Function-scope import so the function does not depend on module-level imports
    keep = ' '.join(keeptags).lower().split()
    tagsplit = html.split('<')
    newhtml = tagsplit.pop(0)   # Text before the first '<' can never be part of a tag
    for tagtxt in tagsplit:
        # Tag name, optionally preceded by '/' (closing tag) or by the backslash that was previously accepted.
        # Only the backslash used to be recognised, so closing tags of kept elements (e.g. </b>) were stripped.
        tag = re.match(r'[/\\]?([A-Za-z0-9]+)',tagtxt)
        if tag and tag.group(1).lower() in keep: newhtml += '<%s' % tagtxt
        elif tagtxt.find('>') >= 0: newhtml += ' %s' % tagtxt[tagtxt.find('>')+1:]
    return newhtml.replace('  ',' ')
Esempio n. 17
0
 def run(self,iterate=None,log=True):  ### Main run method
     '''
     Main run method. Generates a SLiM from self.list['Peptides'] and reports how many peptides it matches.
     >> iterate:bool [None] = Whether to re-run on the matched peptides until the SLiM matches all of them.
         If None, the 'Iterate' setting is used; if True, the 'Iterate' setting is switched on.
     >> log:bool [True] = Whether to print progress to the log. Now also applied to iterative re-runs.
     << tuple (slim,message) = the SLiM generated ('' if none) and a match summary or error message.
     Results are also stored in self.dict['Output'] ('slim', 'iterate', 'match', 'matches' and 'unmatched').
     '''
     try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         slim = ''   # Set first so that the final except clause can always return it
         if iterate is None: iterate = self.getBool('Iterate')
         elif iterate: self.setBool({'Iterate':True})
         if not self.setup(log=log): return ('','SLiMMaker setup failed. Check log.')
         # Keep a copy of the starting peptides: iteration replaces self.list['Peptides'] with the matched subset
         if not self.list['Input']: self.list['Input'] = self.list['Peptides'][0:]
         equiv = []
         if self.getBool('ExtendAA'):
             #self.warnLog('Equivalence mode (extendaa=T) not yet implemented! Please contact author.')
             self.printLog('#EQUIV','[%s]' % string.join(self.list['Equiv'],'] ['))
             equiv = self.list['Equiv'][0:]
         ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         slim = rje_slim.makeSlim(self.list['Peptides'],self.getInt('MinSeq'),self.getNum('MinFreq'),self.getInt('MaxAA'),self,self.getStr('Ignore'),self.getBool('VarLength'),equiv)
         self.dict['Output']['slim'] = slim
         if log: self.printLog('#SLIM','SLiM generated: "%s"' % slim)
         if not slim: return (slim,'Unable to make a SLiM with these settings and peptides')
         ## ~ [2a] ~ Assess matches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         matched = []
         # Wildcard positions (N for DNA, otherwise X) become '.' in the regular expression
         if self.getBool('DNA'): regexp = string.replace(slim,'N','.')
         else: regexp = string.replace(slim,'X','.')
         for peptide in self.list['Peptides']:
             # Flank with X, except next to a '^' or '$' terminus marker, and remove alignment gaps
             searchpep = string.replace('X%sX' % peptide,'$X','')
             searchpep = string.replace(searchpep,'X^','')
             searchpep = string.replace(searchpep,'-','')
             try:
                 if rje.matchExp('(%s)' % regexp,searchpep): matched.append(peptide)
             except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp,searchpep))
         sx = len(matched)
         matchstr = 'SLiM matches %d of %d sequences (%.1f%%).' % (sx,len(self.list['Peptides']),(100.0*sx)/len(self.list['Peptides']))
         if log: self.printLog('#FREQ',matchstr)
         if iterate:
             self.dict['Output']['iterate'] += '%s: %s\n' % (slim,matchstr)
             self.dict['Output']['iterate'] += '-> %s\n' % string.join(matched,',')
         if iterate and (len(matched) != len(self.list['Peptides'])):
             if not matched: return (slim,'Unable to make an interative SLiM with these settings and peptides')
             if self.getStrLC('PeptAlign'):
                 self.list['Peptides'] = string.split(string.replace(string.join(matched),'-',''))
             else: self.list['Peptides'] = matched
             # Pass log through: previously the re-run fell back to log=True and ignored the caller's choice
             return self.run(iterate=True,log=log)
         ### ~ [3] REST Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if iterate:
             # After iterating, report matches against the original input rather than the reduced peptide list
             (matchstr,matched) = self.inputMatches(regexp)
             if log: self.printLog('#FREQ',matchstr)
         self.dict['Output']['match'] = matchstr
         self.dict['Output']['matches'] = string.join(matched,'\n')
         try:
             unmatched = self.list['Input'][0:]
             for pep in matched: unmatched.remove(pep)
             self.dict['Output']['unmatched'] = string.join(unmatched,'\n')
         except: self.dict['Output']['unmatched'] = self.errorLog('SLiMMaker Umatched Error')
         return (slim,matchstr)
     except: return (slim,self.errorLog('SLiMMaker Error'))
Esempio n. 18
0
 def parse(self):    ### Parse REST file into dictionaries
     '''
     Parse REST file into dictionaries.
     The REST text is read from the 'RestIn' file if it exists, or retrieved from 'RestURL' when 'RestIn' is a
     numeric job ID. It is split into sections: the first populates the 'intro', 'prog' and 'jobid' entries of
     self.dict['Output']; each later section is stored in self.dict['Output'] under its key, with an output file
     name in self.dict['Outfile'] and the key appended to self.list['RestKeys'].
     << True if parsing completed; False if an error was logged. NB. When 'PureAPI' is set and the data is
         retrieved by job ID, the raw text retrieved is returned instead and nothing is parsed.
     '''
     try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RestKeys'] = []
         rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
         if rje.exists(self.getStr('RestIn')):
             # Close the handle explicitly rather than leaving it to garbage collection
             resthandle = open(self.getStr('RestIn'),'r')
             try: restin = resthandle.read()
             finally: resthandle.close()
         elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
             url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
             if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
             else: url += '&rest=full'
             restin = urllib2.urlopen(url).read()
             if self.getBool('PureAPI'): return restin
         else: raise IOError('%s not found!' % self.getStr('RestIn'))
         jobid = None
         ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
             if not jobid:   # First section = introduction containing the program name and job ID
                 self.dict['Output']['intro'] = restdata
                 prog = rje.matchExp('Output for (\S+)',restdata)[0]
                 self.dict['Output']['prog'] = prog
                 jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                 self.dict['Output']['jobid'] = jobid
                 if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                 self.dict['Outfile']['jobid'] =  '%s.jobid' % (rbase)
                 continue
             restlines = string.split(restdata,'\n')
             # Section header line is expected to look like "# key: [filename]"
             rparse = string.split(restlines.pop(0))
             # The length check stops an empty or one-word header from raising IndexError and aborting the whole parse
             if len(rparse) < 2 or rparse[0] != '#' or rparse[1][-1] != ':':
                 self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
             rkey = rparse[1][:-1]
             try:
                 rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
             except: rfile = ''  # No file name in the header (or it could not be processed): name file by key instead
             if not rfile: rfile = '%s.%s' % (rbase,rkey)
             rfile = string.replace(rfile,'%s.%s.' % (jobid,jobid),'%s.' % jobid)    # Avoid repeating the job ID
             self.dict['Output'][rkey] = string.join(restlines,'\n')
             self.dict['Outfile'][rkey] = rfile
             self.list['RestKeys'].append(rkey)
         self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
         return True
     except: self.errorLog('%s.parse error' % self); return False
Esempio n. 19
0
    def foldIndex(self):     ### Runs FoldIndex disorder prediction
        '''
        Runs FoldIndex disorder prediction.
        Submits self.info['Sequence'] to the FoldIndex server, making up to self.stat['FILoop'] attempts with
        self.stat['FISleep'] seconds between them, then parses the disordered segments returned.
        Populates self.list['RegionDisorder'] with (start,end) tuples as reported by the server and
        self.list['ResidueDisorder'] with 1.0 for residues inside a segment and 0.0 otherwise.
        << True if the prediction was parsed; False if the server gave no output or an error was logged.
        '''
        try:
            ### Setup sequence and name ###
            sequence = self.info['Sequence']

            ### Run Disorder ###
            retry = self.stat['FILoop']
            url = "http://bioportal.weizmann.ac.il/fldbin/findex"
            params = "m=xml&sq=" + sequence  + "  " 
            flines = []     # Must be defined even if FILoop is zero and no attempt is made
            while retry:
                try:
                    flines = urllib2.urlopen(url, params).readlines()
                except:
                    flines = []
                if flines:
                    break
                retry -= 1
                if retry:   # No need to wait once the final attempt has failed
                    time.sleep(self.stat['FISleep'])
            if not flines:
                self.log.errorLog('FoldIndex run for "%s" failed.' % self.info['Name'],printerror=False)
                self.list['ResidueDisorder'] = []
                self.list['RegionDisorder'] = []
                return False
            ### Process ###
            self.list['ResidueDisorder'] = [0.0] * len(sequence)
            self.list['RegionDisorder'] = []
            for f in flines:
                fm = rje.matchExp('<segment start="(\d+)" end="(\d+)" len="(\d+)"',f)
                if not fm:
                    continue
                (start,end) = (string.atoi(fm[0]),string.atoi(fm[1]))
                self.list['RegionDisorder'].append((start,end))
                # Reported positions are used as 1-based inclusive, so convert to 0-based list indices
                for i in range(start-1,end):
                    self.list['ResidueDisorder'][i] = 1.0
            self.minRegion()
            if self.opt['PrintLog']: self.log.printLog('\r#DIS','FoldIndex Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),sum(self.list['ResidueDisorder'])))
            self.opt['Flat'] = True
            return True
        except:
            self.log.errorLog('Error in Disorder.foldIndex(%s)' % self.info['Name'],quitchoice=True)
            return False
Esempio n. 20
0
    def parseDisorder(
        self
    ):  ### Parses disordered regions from sequence name (e.g. DisProt download)
        '''
        Parses disordered regions from sequence name (e.g. DisProt download).
        #X-Y = disordered region [1.0]; &X-Y = ordered region [0.0]; All else neutral [0.5];
        Region definitions are read from the whitespace-separated words that follow the first word of
        self.info['Name']. Populates self.list['ResidueDisorder'] with one score per residue and
        self.list['RegionDisorder'] with a (start,end) tuple for each disordered (#) region.
        << True if parsing completed; False if an error was logged.
        '''
        try:
            ### Setup sequence and name ###
            sequence = self.info['Sequence']
            name = self.info['Name']
            self.list['ResidueDisorder'] = [0.5] * len(sequence)
            self.list['RegionDisorder'] = []
            scoredict = {'#': 1.0, '&': 0.0}

            ### Process ###
            for region in string.split(name)[1:]:
                parsed = rje.matchExp('^([#&])(\d+)-(\d+)', region)
                if not parsed:
                    continue
                (i, x, y) = parsed
                score = scoredict[i]
                start = string.atoi(x) - 1  # X-Y is used as 1-based inclusive; convert to a 0-based range
                end = string.atoi(y)
                for r in range(start, end):
                    self.list['ResidueDisorder'][r] = score
                if i == '#':
                    # NOTE(review): start is stored 0-based here - confirm this is what minRegion() and other
                    # users of RegionDisorder expect.
                    self.list['RegionDisorder'].append((start, end))
            self.minRegion()
            if self.opt['PrintLog']:
                self.log.printLog(
                    '\r#DIS',
                    'DisProt Disorder parsing complete: %d disorder regions, %d disordered aa'
                    % (len(self.list['RegionDisorder']),
                       self.list['ResidueDisorder'].count(1.0)))
            return True
        except:
            # Message previously named foldIndex(), which sent anyone debugging to the wrong method
            self.log.errorLog('Error in Disorder.parseDisorder(%s)' %
                              self.info['Name'],
                              quitchoice=True)
            return False
Esempio n. 21
0
 def taxDict(self,taxid,store=False,skipuni=False):    ### Extracts taxonomy details from SpecFile for taxid
     '''
     Extracts taxonomy details from SpecFile for taxid. If taxid is a list, will process each element.
     >> taxid:str/int or list = Taxon ID to look up, or a list of them. NB. A list is sorted in place.
     >> store:bool [False] = Passed on when processing a list but otherwise not used by this method.
         NOTE(review): presumably meant to add the result to self.dict['TaxDict'] - confirm before relying on it.
     >> skipuni:bool [False] = Whether to skip the SpecFile search and only use the NameMap file.
     << dict = {'spcode':str,'name':str,'common':str} for a single taxid (keys present only if found; empty
         if nothing was found), or {taxid:dict} when taxid is a list.
     Errors are logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxdict = {}
         ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # An object with a working sort() method is treated as a list of taxa
         tlist = True
         try: taxid.sort()
         except: tlist = False
         if tlist:
             tx = 0.0; ttot = len(taxid); mx = 0
             for t in taxid:
                 self.progLog('\r#SPEC','Extracting Uniprot species details: %.1f%%' % (tx/ttot)); tx += 100.0
                 # skipuni is now passed on too: it used to be dropped, so list lookups always searched SpecFile
                 taxdict[t] = self.taxDict(t,store,skipuni)
                 if not taxdict[t]: mx += 1
             self.printLog('\r#SPEC','Extracted Uniprot/NCBI species details for %s TaxID: %s missing' % (rje.iStr(ttot),rje.iStr(mx)))
             return taxdict
         ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxid = '%s' % taxid
         if taxid in self.dict['TaxDict']: return self.dict['TaxDict'][taxid]
         if not skipuni:
             # grep returns the matching line plus the one after it, which may hold the common name (C=)
             greplines = os.popen('grep -A 1 " %s:" %s' % (taxid, self.getStr('SpecFile'))).readlines()
             for entry in greplines:
                 nmatch = rje.matchExp('^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',entry)
                 if nmatch and nmatch[1] != taxid: break # Next taxon
                 if nmatch: taxdict['spcode'] = nmatch[0]; taxdict['name'] = nmatch[2]
                 elif rje.matchExp('C=(\S.+)\s*$',entry): taxdict['common'] = rje.matchExp('C=(\S.+)\s*$',entry)[0]
         #if not taxdict and taxid in self.list['RankID']: self.warnLog('Cannot find TaxID "%s" in %s!' % (taxid,self.getStr('SpecFile')),'Missing_TaxID',suppress=True)
         ## ~ [2b] ~ Adding missing scientific names from NameMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         if not taxdict:
             for entry in os.popen('grep -i -e "^%s\t" %s' % (taxid, self.getStr('NameMap'))).readlines():
                 tdata = string.split(entry,'\t|\t')
                 if not tdata[3].startswith('scientific name'): continue
                 tname = tdata[1]
                 # Fixed: the warning used the undefined variable `name` and %d formatting on the string taxid,
                 # so a second scientific name raised an exception instead of producing the warning.
                 if 'name' in taxdict: self.warnLog('TaxID %s hits "%s" and "%s"!' % (taxid, taxdict['name'],tname))
                 else: taxdict['name'] = tname
         return taxdict
     except: self.errorLog('%s.taxDict() error' % (self)); raise
Esempio n. 22
0
 def parseTMHMM(self,file=None):     ### Parses TMHMM into dictionary
     '''
     Parses TMHMM into dictionary self.tmhmm.
     >> file:str = will read from file if given, else self.info['TMHMM']
     Each line of the form "<name> len=... key=value ..." adds self.tmhmm[acc] = {key:value}, with all values
     kept as strings. Where the name contains a double underscore, acc is the part that follows it.
     Returns None. Errors are logged with the processing stage rather than raised.
     '''
     try:
         ### <a> ### Setup
         _stage = '<a> Setup'
         tmhmm_pattern = '^(\S+)\s+(len.+)$'
         if file is None:
             file = self.info['TMHMM']
         if file == 'None':
             self.verbose(0,1,'No TMHMM file given.',2)
             return
         self.verbose(0,3,'Parsing TMHMM file %s...' % file,0)
         TMRES = open(file, 'r')
         try:    # Make sure the file is closed even if a line cannot be parsed
             ### <b> ### Read in
             _stage = '<b> Read in'
             for rawline in TMRES:
                 tmline = re.sub('\t',' ',rawline)
                 tmres = rje.matchExp(tmhmm_pattern,tmline)
                 if not tmres:
                     continue
                 acc = tmres[0]
                 if rje.matchExp('^\S+__(\S+)',acc):
                     acc = rje.matchExp('^\S+__(\S+)',acc)[0]
                 self.tmhmm[acc] = {}
                 for res in string.split(tmres[1]):
                     split = string.split(res,'=')
                     self.tmhmm[acc][split[0]] = split[1]
         finally:
             TMRES.close()
         self.verbose(0,1,'Done!',2)
     except:
         self.log.errorLog('Problem with parseTMHMM() %s.' % _stage)
Esempio n. 23
0
 def inputMatches(self,regexp):  ### Returns the matches for the original peptides
     '''
     Returns the matches for the original peptides.
     >> regexp:str = Regular expression to search for in each peptide of self.list['Input'].
     << tuple (matchstr,matched) = summary text and the list of input peptides that matched. If the method
         itself fails, the error is logged and ('Error with inputMatches()',[]) is returned.
     '''
     hits = []
     try:### ~ [1] ~ Search each input peptide ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for pep in self.list['Input']:
             # Flank with X, except next to a '^' or '$' terminus marker, and remove alignment gaps
             query = ('X%sX' % pep).replace('$X','').replace('X^','').replace('-','')
             try:
                 found = rje.matchExp('(%s)' % regexp,query)
                 if found: hits.append(pep)
             except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp,query))
         total = len(self.list['Input'])
         summary = 'SLiM matches %d of %d input sequences (%.1f%%).' % (len(hits),total,(100.0*len(hits))/total)
     except:
         self.errorLog('Error with inputMatches()')
         return ('Error with inputMatches()',[])
     return (summary,hits)
Esempio n. 24
0
 def inputMatches(self,regexp):  ### Returns the matches for the original peptides
     '''
     Returns the matches for the original peptides.
     >> regexp:str = Regular expression to search for in each peptide of self.list['Input'].
     << tuple (matchstr,matched) = summary text and the list of input peptides that matched. If the method
         itself fails, the error is logged and ('Error with inputMatches()',[]) is returned.
     '''
     hits = []
     try:### ~ [1] ~ Search each input peptide ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for pep in self.list['Input']:
             # Flank with X, except next to a '^' or '$' terminus marker, and remove alignment gaps
             query = ('X%sX' % pep).replace('$X','').replace('X^','').replace('-','')
             try:
                 found = rje.matchExp('(%s)' % regexp,query)
                 if found: hits.append(pep)
             except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp,query))
         total = len(self.list['Input'])
         summary = 'SLiM matches %d of %d input sequences (%.1f%%).' % (len(hits),total,(100.0*len(hits))/total)
     except:
         self.errorLog('Error with inputMatches()')
         return ('Error with inputMatches()',[])
     return (summary,hits)
Esempio n. 25
0
def setupStatFilter(callobj, statlist=[], filterlist=[]):  ### Makes StatFilter dictionary from statlist and filterlist
    """
    Makes StatFilter dictionary from statlist and filterlist (from cmd_list) !!! Changes case of statfilter keys. !!!
    >> callobj:RJE_Object = calling object; its log.errorLog() is used for error messages.
    >> statlist:list of stats that are allowed for filtering. Generally column headers for output.
    >> filterlist:list of StatFilters read in from commandline consisting of StatOperatorValue 
    << statfilter:dictionary of StatFilter {Stat:(Operator,String,Numeric)}
    Operator is one of >=, <=, !=, ==, > and <; the alternative forms =>, =<, <> and = are converted to these.
    Numeric is the value as a float, or None if it is not a number. Stat names are matched to statlist exactly
    where possible, else case-insensitively (the statlist spelling is the key used). Filters that cannot be
    interpreted, or that name a stat not in statlist, are reported and skipped. If the same stat is given
    more than once, the last filter wins.
    """
    import re   # Function-scope import so the function does not depend on module-level imports
    statfilter = {}
    try:
        # Alternative spellings and the standard operator each one stands for
        synonyms = {"<>": "!=", "=": "==", "=>": ">=", "=<": "<="}
        for statfilt in filterlist:
            ## Extract details ##
            found = re.search(r"^(\S*[A-Za-z0-9])(>|>=|=<|=>|<=|==|=|<|!=|<>)(-*[A-Za-z0-9]\S*)$", statfilt)
            if not found:
                callobj.log.errorLog('Filter "%s" not recognised.' % statfilt, printerror=False)
                continue
            (stat, op, cutoff) = found.groups()
            op = synonyms.get(op, op)
            # Fixed: this list used to hold "=>" and "=<" (which cannot occur after conversion) in place of
            # ">=" and "<=", so every greater/less-than-or-equal filter was rejected as unknown.
            if op not in [">=", "<=", "!=", "==", ">", "<"]:
                callobj.log.errorLog('Filter "%s" operator "%s" not known!' % (statfilt, op), printerror=False)
                continue
            ## Check for numeric value ##
            try:
                numcut = float(cutoff)
            except ValueError:
                numcut = None
            ## Check stat ##
            if stat not in statlist:
                for h in statlist:
                    if h.lower() == stat.lower():
                        stat = h
                        break
            if stat not in statlist:
                callobj.log.errorLog('Stat "%s" in filter "%s" not found.' % (stat, statfilt), printerror=False)
                continue
            ## Update dictionary ##
            statfilter[stat] = (op, cutoff, numcut)
        ### Finish ###
        return statfilter
    except:
        callobj.log.errorLog("Error in rje_scoring.setupStatFilter()", quitchoice=True)
        return statfilter
Esempio n. 26
0
    def saveMutations(self):    ### Outputs parsed mutations into a delimited file
        '''
        Outputs parsed mutations into a delimited file.
        Writes one tab-delimited row to omim_mutations.tdt for each gene/SubID entry of self.dict['Mutations'],
        splitting each mutation into three non-digit characters, a position and three non-digit characters.
        Any error stops the output and is passed to the error log.
        '''
        try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            tdtfile = 'omim_mutations.tdt'
            fields = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease']
            rje.delimitedFileOutput(self,tdtfile,fields,'\t',rje_backup=True)    # Header row

            ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mutations = self.dict['Mutations']
            for gene in rje.sortKeys(mutations):
                genemut = mutations[gene]
                for subid in rje.sortKeys(genemut):
                    (disease,mutation) = genemut[subid]
                    (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation)
                    row = {'OMIM_ID':'; '.join(self.dict['Records'][gene])}
                    row['SubID'] = subid
                    row['Gene'] = gene
                    row['Pos'] = pos
                    row['WildAA'] = wild
                    row['MutAA'] = mut
                    row['Disease'] = disease
                    rje.delimitedFileOutput(self,tdtfile,fields,'\t',row)
            self.log.printLog('#OUT','OMIM Mutation output to %s complete' % tdtfile)
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 27
0
 def setup(self):    ### Main class setup method.
     '''
     Main class setup method.
     Loads pairwise PPI data, filters the interactions to populate self.dict['PPI'] as {hub:[spokes]}, and loads
     the sequence file into self.obj['SeqList']. Also sets self.dict['SeqObj'], self.dict['Gene2Seq'] and
     self.dict['Seq2Gene']. NB. All three input file paths are hard-coded in this method.
     << True if setup completed; False if an error was logged.
     '''
     try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI','Loading pairwise data...')
         pairwise = rje.dataDict(self,ppipairwise,['Hub','Spoke'],['Spoke','SpokeSeq','Evidence'])
         # gene2seq = {spoke gene:SpokeSeq}; seq2gene = {part of SpokeSeq before '__':spoke gene}
         gene2seq = {}; seq2gene = {}
         # fullppi = {hub:{spoke:evidence}}; px and ptot drive the percentage shown in the progress log
         fullppi = {}; px = 0.0; ptot = len(pairwise); ppix = 0
         for pair in rje.sortKeys(pairwise):
             self.progLog('\r#PPI','Processing full pairwise PPI: %.2f%%' % (px/ptot)); px += 100.0
             # Keys of pairwise are split on a tab into the hub and the spoke
             [hub,spoke] = string.split(pair,'\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq; seq2gene[string.split(sseq,'__')[0]] = spoke
             if hub not in fullppi: fullppi[hub] = {}
             # Entries are popped from pairwise as they are transferred to fullppi
             if spoke not in fullppi[hub]: fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']; ppix += 1
         # NOTE(review): counts are halved in the log, presumably because each PPI is listed in both directions - confirm
         self.printLog('\r#PPI','Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)),rje.integerString(ppix/2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         # Each line of goodppifile is used below as an evidence type that is accepted without further checks
         goodppi = self.loadFromFile(goodppifile,chomplines=True)
         self.dict['PPI'] = {}
         px = 0.0; ptot = len(fullppi); fppix = ppix; ppix = 0
         for hub in fullppi:
             self.progLog('\r#PPI','Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px/ptot,rje.integerString(len(self.dict['PPI'])),rje.integerString(ppix))); px +=100.0
             self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 # (1) Keep the spoke if its evidence text contains ":<type>" followed by '|' or the end of the text
                 goodspoke = False
                 for ptype in goodppi:
                     if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]): goodspoke = True; break
                 if goodspoke: self.dict['PPI'][hub].append(spoke); continue
                 # (2) Otherwise keep the spoke only if none of the hub's other spokes also interacts with it
                 goodspoke = True
                 for spoke2 in fullppi[hub]:
                     if spoke2 in [hub,spoke]: continue
                     # NOTE(review): fullppi[spoke] raises KeyError if spoke never occurs as a hub; the except
                     # clause below would then report the whole setup as failed.
                     if spoke2 in fullppi[spoke]: goodspoke = False; break
                 if goodspoke: self.dict['PPI'][hub].append(spoke)
             ppix += len(self.dict['PPI'][hub])
             # Hubs left with no spokes are removed
             if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)
         self.printLog('\r#PPI','Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)),rje.integerString(len(self.dict['PPI'])),rje.integerString(fppix/2),rje.integerString(ppix/2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         # Commands are listed as defaults, then self.cmd_list, then autoload=T
         scmd = ['accnr=F','seqnr=F','seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
         seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,scmd)
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq; self.dict['Seq2Gene'] = seq2gene
         return True     # Setup successful
     except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
Esempio n. 28
0
 def taxaChildren(self,taxid):   ### Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.
     '''
     Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.
     >> taxid:str = Taxon ID whose direct children are wanted.
     << list = child TaxIDs (strings) of taxid. This is the list cached in self.dict['TaxMap'][taxid], not a copy.
     Children of rank species or subspecies found in the grep output are also appended to self.list['RankID'].
     Errors are logged and then re-raised.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # NB. This is very slow and so reading the while.
         self.debug(taxid)
         taxmap = self.dict['TaxMap']
         if taxid in taxmap: return taxmap[taxid]    # Cached from an earlier call
         ### ~ [1] Parse from TaxMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         taxmap[taxid] = []
         for tline in os.popen('grep -e "\s%s\s" %s' % (taxid,self.getStr('TaxMap'))).readlines():
             try: (child,parent,taxtype) = rje.matchExp('^(\d+)\s+\|\s+(\d+)\s+\|\s+(\S+)\s+',tline)
             except: continue    # Line is not in "child | parent | rank" format
             # The grep pattern matches taxid in any whitespace-flanked column, not only the parent column.
             # Rows with a different parent used to be filed under that parent, creating partial child lists
             # that the cache check above would later return as if complete. Only rows whose parent is taxid
             # are now stored.
             if parent == '%s' % taxid: taxmap[taxid].append(child)
             # The rank is that of the child row itself, so RankID is still updated for every parsed row
             if taxtype in ['species','subspecies']: self.list['RankID'].append(child)
             self.progLog('\r#TAXID','Reading %s: %s TaxID' % (self.getStr('TaxMap'),rje.iLen(taxmap)))
         return taxmap[taxid]
     except: self.errorLog('%s.taxaChildren(%s) error' % (self,taxid)); raise
Esempio n. 29
0
 def report(self):   ### Run qstat to get job list then showstart on each job
     '''
     Run qstat to get job list then showstart on each job.
     Each job ID and name parsed from the qstat output is printed to the log, followed by every non-empty
     line of the showstart output for that job. Errors are logged rather than raised.
     '''
     try:### ~ [1] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         jobnames = {}   # {job ID:job name}
         jobids = []     # Job IDs in the order that qstat reported them
         for statline in os.popen('qstat'):
             # Lines that do not start with "<digits>.<text> <name>" cannot be unpacked and are skipped
             try: (jobid,jobname) = rje.matchExp('^(\d+)\.\S+\s+(\S+)',statline)
             except: continue
             jobids.append(jobid)
             jobnames[jobid] = jobname
         ### ~ [2] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.printLog('#QSTAT','%d jobs in queue.' % len(jobids))
         for jobid in jobids:
             self.printLog('#JOB', '%s = %s' % (jobid,jobnames[jobid]), timeout=False)
             for startline in os.popen('showstart %s' % jobid):
                 if not rje.chomp(startline): continue
                 self.printLog('#INFO', startline, timeout=False)
         self.printLog('#ZEN',rje_zen.Zen().wisdom())
     except: self.errorLog('QSub.report problem')            
Esempio n. 30
0
def absCons(callobj,Occ,hithom,seqfrag,seqwt):    ### Absolute conservation score.
    '''
    Absolute conservation score: 1.0 if the motif pattern is found in a homologue's fragment, else 0.0.
    >> callobj:Calling object supplying opt['ConsAmb'] and the log
    >> Occ:Occurrence object supplying obj['Motif'] and getData('Variant')
    >> hithom:Homologue sequences to score (used as keys of seqfrag)
    >> seqfrag:Dictionary of {seq:text searched for the motif}
    >> seqwt:Passed on unchanged to consWeight()
    << Whatever consWeight() returns for the {seq:score} dictionary; None if an error was logged
    '''
    try:
        ### Score each homologue by presence/absence of the motif ###
        motif = Occ.obj['Motif']
        scores = {}     # Dictionary of {seq:conservation}
        for hom in hithom:
            scores[hom] = 0.0
            # ConsAmb: try every variant of the degenerate motif; otherwise only the matched variant
            if callobj.opt['ConsAmb']: patterns = motif.dict['Search'][0]
            else: patterns = [Occ.getData('Variant')]
            for pattern in patterns:
                # 'X' in a variant stands for any upper-case letter
                regex = '(%s)' % string.replace(pattern,'X','[A-Z]')
                if not rje.matchExp(regex,seqfrag[hom]): continue
                scores[hom] = 1.0
                break   # First matching variant is enough

        ### Hand the raw scores over for weighting ###
        return consWeight(callobj,scores,seqwt)
    except:
        callobj.log.errorLog('Error in rje_motif_cons.absCons()',quitchoice=True)
        return
Esempio n. 31
0
    def saveMutations(self):  ### Outputs parsed mutations into a delimited file
        '''
        Outputs parsed mutations into a delimited file.

        Writes omim_mutations.tdt (tab-delimited): a header row first, then one row per gene/sub-ID pair of
        self.dict['Mutations'], both levels visited in rje.sortKeys() order. Each stored value is a
        (disease, mutation) pair and the mutation text is split into three non-digit characters, a number and
        three more non-digit characters. Any error is logged and ends the output early.
        '''
        try:  ### ~ [1] Create output file with its header row ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            outfile = 'omim_mutations.tdt'
            headers = ['OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease']
            rje.delimitedFileOutput(self, outfile, headers, '\t', rje_backup=True)

            ### ~ [2] One row per mutation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            mutdict = self.dict['Mutations']
            for gene in rje.sortKeys(mutdict):
                genemuts = mutdict[gene]
                for subid in rje.sortKeys(genemuts):
                    (disease, mutation) = genemuts[subid]
                    # Unpacking fails (and is caught below) if the mutation text does not fit the pattern
                    (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)', mutation)
                    omim_ids = string.join(self.dict['Records'][gene], '; ')
                    row = {'OMIM_ID': omim_ids, 'SubID': subid, 'Gene': gene, 'Pos': pos}
                    row['WildAA'] = wild
                    row['MutAA'] = mut
                    row['Disease'] = disease
                    rje.delimitedFileOutput(self, outfile, headers, '\t', row)
            self.log.printLog('#OUT', 'OMIM Mutation output to %s complete' % outfile)
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 32
0
    def codons(self):  ### Main codons analysis method
        '''
        Main codons analysis method.

        Steps, as performed below:
        [0] Load the CDS fasta file dmel-all-CDS-r5.5.fasta from a hard-coded FlyBase directory.
        [1] Build per-amino-acid codon frequency tables: observed in the CDS (obs_cfreq) and predicted from
            single-nucleotide frequencies (nts_cfreq). Sequences whose exon coordinates cannot be parsed, whose
            length is not a multiple of three, or that contain 'N' are removed from cds.seq.
        [2] Build triplet frequency tables over all three reading frames, observed (obs_tfreq) and predicted
            from each codon table (ocd_tfreq, ncd_tfreq), using the per-sequence exon lengths for boundary checks.
        [3] Write one row per triplet to quad_triplet.tdt.

        Nothing is returned. Every exception ends up in the outer handler, which logs it and returns.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Input location and file name are hard-coded
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
            cds = rje_seq.SeqList(
                self.log, self.cmd_list +
                ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
            # Looked up below with T replaced by U, i.e. keyed by RNA codons
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A', 'C', 'G', 'T']
            # presumably {nucleotide: frequency} across the loaded sequences - TODO confirm against aaFreq()
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []  # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {
            }  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {
            }  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Codon tables are two-level {aa: {codon: value}}; triplet tables are flat {triplet: value}
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1, n2, n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod, 'T', 'U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][
                            cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,
                                     total=False)  # Normalise triplet freq.
            for aa in nts_cfreq:
                nts_cfreq[aa] = rje.dictFreq(
                    nts_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # sx grows by 100 per sequence, so sx / stot is the percentage done
            (sx, stot) = (0.0, cds.seqNum())
            # Loop over a copy because rejected sequences are removed from cds.seq inside the loop
            for seq in cds.seq[0:]:
                self.log.printLog(
                    '\r#OBS',
                    'Calculating observed codon frequencies: %.1f%%' %
                    (sx / stot),
                    newline=False,
                    log=False)
                sx += 100.0
                # Only id and pos are used below; a name that does not fit the pattern is logged and re-raised,
                # which ends the whole analysis via the outer handler
                try:
                    (id, scaffold, pos, name, glen, parent) = rje.matchExp(
                        '^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',
                        seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                # Location formats tried in turn: complement(...), join(...), plain start..end
                # NOTE(review): the last attempt is unguarded; a location matching none of the three presumably
                # raises here and aborts the run - confirm how rje.matchExp reports a failed match
                try:
                    exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',
                                         pos)[0]
                except:
                    try:
                        exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                    except:
                        exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
                self.deBug(exons)
                exons = string.split(exons, ',')
                elen = []
                # Exon length = end - start + 1 for each comma-separated start..end pair
                try:
                    for exon in exons:
                        (start, end) = string.split(exon, '..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue

                # Reverse-strand locations: flip the exon order
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                # NOTE(review): a length mismatch is only logged; the sequence stays in cds.seq and is still
                # processed in [2], where elen[0] is read without checking that elen is non-empty - confirm
                if sum(elen) != seq.aaLen():
                    self.log.errorLog('%s exon length error' % id,
                                      printerror=False)
                # NOTE(review): this multiple-of-three test relies on integer division for "/ 3" (Python 2)
                if seq.aaLen() / 3 != seq.aaLen() / 3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,
                                      printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence, 'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                    cds.seq.remove(seq)
                    continue
                # Count in-frame codons; any codon missing from the tables raises and ends the run
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq:
                obs_cfreq[aa] = rje.dictFreq(
                    obs_cfreq[aa], total=False)  # Normalise codon freq.
            self.log.printLog(
                '\r#OBS', 'Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx, stot) = (0.0, cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP',
                                  'Calculating triplet frequencies: %.1f%%' %
                                  (sx / stot),
                                  newline=False,
                                  log=False)
                sx += 100.0
                # Same list object as seq.list['ExonLen']: the pop(0) calls below consume the stored list
                elen = seq.list['ExonLen']
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0  # Measure sequence length processed for exon boundary checks
                while sequence:
                    # Keep the previous codon and amino acid so the two out-of-frame triplets that straddle
                    # the codon junction can be built
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Each codon of the current amino acid contributes its codon frequency in frame; every
                    # (previous codon, current codon) pair contributes the product of the two codon
                    # frequencies to each junction triplet
                    for cod2 in obs_cfreq[aa]:
                        # NOTE(review): strict ">" excludes a codon that ends exactly on the exon boundary
                        # (elen[0] == ax + 3) - confirm this is intended
                        if elen[0] > ax + 3:  # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:  # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1, 3):
                                    if elen[0] > ax + i:  # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (
                                            obs_cfreq[prevaa][cod1] *
                                            obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (
                                            nts_cfreq[prevaa][cod1] *
                                            nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Same boundary tests as [2a], but counting the triplets actually present
                    if elen[0] > ax + 3:  # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:  # Look at overlap with previous codon
                        for i in range(1, 3):
                            if elen[0] > ax + i:  # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    # Once the current exon is used up, drop it and carry the remainder into the next one
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
            self.log.printLog('\r#TRIP',
                              'Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = [
                'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
                'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
            ]
            tfile = 'quad_triplet.tdt'
            # First call passes no data row (header set-up with rje_backup=True); rows follow one per triplet
            rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod, 'T', 'U')]
                # Degen = number of codons sharing this triplet's amino acid
                datadict = {
                    'Triplet': cod,
                    'AA': aa,
                    'Degen': len(obs_cfreq[aa]),
                    'Obs_Codon': obs_cfreq[aa][cod],
                    'NT_Codon': nts_cfreq[aa][cod],
                    'Obs_Trip': obs_tfreq[cod],
                    'NT_Trip': nts_tfreq[cod],
                    'ObCod_Trip': ocd_tfreq[cod],
                    'NTCod_Trip': ncd_tfreq[cod]
                }
                rje.delimitedFileOutput(self,
                                        tfile,
                                        headers,
                                        datadict=datadict)
            self.log.printLog('#OUT',
                              'Triplet & codon data output to %s' % tfile)
        except:
            # The text passed to errorLog is a Zen().wisdom() string, not a description of the failure
            self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 33
0
 def addLinks(self, nested):  ### Adds href aname links to definitions.
     '''
     Adds href aname links to definitions.
     >> nested:Dictionary in which key '=' holds raw definition text and every key other than '=' and '+'
        holds another dictionary of the same kind (processed recursively)
     For each '=' entry the text is split into words and rebuilt into nested['+'], with:
     - <url> or <url>[link text] words turned into external <a href> links (only if the HRef option is set);
     - runs of words that walk down self.dict['Glossary'] to a node holding '=' turned into "#anchor" links,
       or <scaps> tags when HTMLStyle is 'tab';
     - " _text_" turned into " <i>text</i>".
     Errors are logged via self.errorLog() and not raised.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # Trailing characters that may be stripped from a word when looking it up in the glossary
         endstrip = [')', '.', ',', ':', ';', '!']
         if self.getBool('Plurals'): endstrip.append('s')
         for term in rje.sortKeys(nested):
             if term == '=':
                 linkdef = []
                 # "(" becomes "( " so an opening bracket splits off as its own word; undone when joining below
                 rawdef = string.split(
                     string.replace(nested['='], '(', '( '))
                 # Words are consumed from the front of rawdef until none are left
                 while rawdef:
                     # Each pass starts again from the top of the glossary
                     glossary = self.dict['Glossary']
                     ## External link: word contains <url>, optionally followed by [link text] ##
                     if self.getBool('HRef') and rje.matchExp(
                             '<(\S+)>', rawdef[0]):
                         # Copy of the remaining words, restored if URL parsing fails
                         safetynet = rawdef[0:]
                         url = rje.matchExp('<(\S+)>', rawdef[0])[0]
                         # Rewrite the word as "[text..." so the code below can pull out the link text;
                         # with no [text] given, the URL itself is used as the text
                         if rje.matchExp('<(\S+)>\[(\S+)', rawdef[0]):
                             rawdef[0] = '[%s' % rje.matchExp(
                                 '<(\S+)>\[(\S+)', rawdef[0])[1]
                         elif rje.matchExp('<(\S+)>(\S+)', rawdef[0]):
                             rawdef[0] = '[%s]%s' % (
                                 url, rje.matchExp('<(\S+)>(\S+)',
                                                   rawdef[0])[1])
                         else:
                             rawdef[0] = '[%s]' % url
                         try:
                             # Link text may span several words: append following words until "]" appears
                             while ']' not in rawdef[0]:
                                 rawdef[0] = '%s %s' % (rawdef[0],
                                                        rawdef.pop(1))
                             (linktext, linkextra) = rje.matchExp(
                                 '\[(.+)\](\S*)', rawdef.pop(0))
                             # URLs not starting with 'htt' or 'ftp' get an http:// prefix
                             if url[:3] not in ['htt', 'ftp']:
                                 url = 'http://%s' % url
                             linkdef.append('<a href="%s">%s</a>%s' %
                                            (url, linktext, linkextra))
                             continue
                         except:
                             self.errorLog('Problem parsing URL from "%s"' %
                                           nested['='])
                             # Put the words back; the original word then goes through the glossary checks below
                             rawdef = safetynet
                     ## Plain word: neither the word nor the word minus a strippable last character is a term ##
                     if rawdef[0].lower() not in glossary:
                         if rawdef[0].lower(
                         )[:-1] not in glossary or rawdef[0].lower(
                         )[-1] not in endstrip:
                             linkdef.append(rawdef.pop(0))
                             continue
                     ## Glossary term: walk down the nested glossary one word at a time ##
                     akey = []    # Glossary keys followed (joined with '_' for the anchor name)
                     alink = []   # Words as written in the definition
                     while rawdef and (rawdef[0].lower() in glossary or
                                       rawdef[0].lower()[:-1] in glossary):
                         # Order of preference: exact word whose node has a definition; exact word whose node
                         # contains the next word; word minus a strippable last character; exact word
                         if rawdef[0].lower(
                         ) in glossary and '=' in glossary[
                                 rawdef[0].lower()]:
                             rterm = rawdef[0].lower()
                         elif len(rawdef) > 1 and rawdef[0].lower(
                         ) in glossary and (rawdef[1].lower()
                                            in glossary[rawdef[0].lower()]
                                            or rawdef[1].lower()[:-1]
                                            in glossary[rawdef[0].lower()]):
                             rterm = rawdef[0].lower()
                         elif rawdef[0].lower()[-1] in endstrip and rawdef[
                                 0].lower()[:-1] in glossary:
                             rterm = rawdef[0].lower()[:-1]
                         elif rawdef[0].lower() in glossary:
                             rterm = rawdef[0].lower()
                         else:
                             break
                         # Descend one level and consume the word
                         glossary = glossary[rterm]
                         akey.append(rterm)
                         alink.append(rawdef.pop(0))
                     akey = string.join(akey, '_')
                     if '=' in glossary:
                         # The walk ended on a node with its own definition: emit a link (or tag)
                         alink = string.join(alink)
                         # No link when the node reached equals the entry being processed
                         if nested == glossary: linkdef.append(alink)
                         elif self.getStr('HTMLStyle') != 'tab':
                             # Trailing punctuation (but not a plural 's') is kept outside the link
                             if alink[-1] in endstrip and alink[-1] != 's':
                                 linkdef.append(
                                     '<a href="#%s">%s</a>%s' %
                                     (akey, alink[:-1], alink[-1]))
                             else:
                                 linkdef.append('<a href="#%s">%s</a>' %
                                                (akey, alink))
                         else:
                             # 'tab' style: <scaps> tags instead of anchors
                             if alink[-1] in endstrip and alink[-1] != 's':
                                 linkdef.append('<scaps>%s</scaps>%s' %
                                                (alink[:-1], alink[-1]))
                             else:
                                 linkdef.append('<scaps>%s</scaps>' %
                                                (alink))
                     else:
                         # The walk ended on a node without a definition: emit the first word as plain text
                         # and push the remaining consumed words back for re-examination
                         linkdef.append(alink[0])
                         rawdef = alink[1:] + rawdef
                 nested['+'] = string.replace(string.join(linkdef), '( ',
                                              '(')
                 # Replace " _text_" with " <i>text</i>", one occurrence pattern at a time
                 while rje.matchExp(' _([^_]+)_', nested['+']):
                     italics = rje.matchExp(' _([^_]+)_', nested['+'])[0]
                     nested['+'] = string.replace(nested['+'],
                                                  ' _%s_' % italics,
                                                  ' <i>%s</i>' % italics)
                 #self.deBug(nested)
             elif term != '+':
                 # Any other key holds a nested dictionary; '+' is this method's own output and is skipped
                 self.addLinks(nested[term])
     except:
         self.errorLog('%s.addLinks error' % self)
Esempio n. 34
0
    def makeHistory(self):  ### Extracts history information from docstrings.
        '''
        Extracts history information from docstrings.

        For every entry of the 'Module' table, the docstring of that module's history() method (found in the
        'Method' table via its 'File' index) is scanned for lines of the form "# X.Y[.Z] - text". Versions
        without an entry in the 'history' table are logged, added to that table and collected as HTML.
        Depending on self.list['Output'], the 'history' table is saved ('history') and/or the collected HTML is
        written to <basefile>.updates.html ('updates').

        Nothing is returned. Errors are logged via self.errorLog() and not raised.
        '''
        try:  ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            hdb = self.db('history')
            mdb = self.db('Module')
            pydoc = self.obj['PyDoc']
            uhtml = []  # Update HTML text
            udir = ''  # SourceDir of the last <h2> heading added to uhtml
            # Version line pattern: "# <version> - <text>". A single pattern is used for both testing and
            # extraction so that multi-digit major versions (e.g. 10.1) cannot pass the test and then fail
            # to unpack.
            vregex = r'# (\d+\.\d+\.?\d*)\s?-\s(\S.+)$'

            ### ~ [1] Work through python modules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            for mkey in mdb.dataKeys():
                entry = mdb.data(mkey)
                pyfile = entry['File']
                mod = entry['Module']
                prev = '-'  # Last version found in the history table ('-' = none)
                lastv = ''  # Last version read from the docstring ('' = none)
                ## ~ [1a] Parse out history text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                history = []  # History lines
                updates = []  # Version number entries
                for mentry in self.db('Method').indexEntries('File', pyfile):
                    if mentry['Method'] == 'history':
                        try:
                            history = string.split(mentry['DocString'], '\n')
                            break
                        except:
                            history = []
                            self.errorLog('History parsing problem: %s' %
                                          mentry,
                                          printerror=False)
                for dline in history:
                    vmatch = rje.matchExp(vregex, dline)
                    if not vmatch: continue  # Not a version line
                    (v, text) = vmatch
                    if v == lastv:  # Continuation of update
                        if prev == lastv: continue  # In previous release
                        updates[-1]['Update'] += ' %s' % text
                        continue
                    lastv = v
                    ventry = {
                        'Dir': entry['SourceDir'],
                        'Module': mod,
                        'Version': v,
                        'Update': text,
                        'Release': rje.dateTime(dateonly=True)
                    }
                    vkey = hdb.makeKey(ventry)
                    # Already in the history table: remember as previous release. Otherwise it is new.
                    if hdb.data(vkey): prev = v
                    else: updates.append(ventry)
                ## ~ [1b] Assess/report updates ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if prev != lastv:
                    # Something changed for this module: add a directory heading when the directory changes
                    if entry['SourceDir'] != udir:
                        uhtml.append('<h2>Updates in %s/:</h2>\n' %
                                     entry['SourceDir'])
                        udir = entry['SourceDir']
                if prev == '-' and lastv:
                    self.printLog(
                        '#VNUM',
                        '%s: Creation -> Version %s' % (mod, entry['Version']))
                    uhtml.append(
                        '<p><b>&bull; %s:</b> <i>Created/Renamed/moved.</i>' %
                        mod)
                elif prev not in ['-', lastv]:
                    self.printLog(
                        '#VNUM', '%s: Version %s -> Version %s' %
                        (mod, prev, entry['Version']))
                    uhtml.append(
                        '<p><b>&bull; %s:</b> <i>Updated from Version %s.</i>'
                        % (mod, prev))
                for ventry in updates:
                    self.printLog('#V%s' % ventry['Version'], ventry['Update'])
                    uhtml.append('<br>&rarr; Version %s: %s' %
                                 (ventry['Version'], ventry['Update']))
                    hdb.addEntry(ventry)
                if uhtml and uhtml[-1] != '</p>': uhtml.append('</p>')
                if lastv != entry['Version']:
                    self.warnLog(
                        'Module %s Version %s but history() ends at %s' %
                        (mod, entry['Version'], lastv))
                self.deBug('>>>')
            if 'history' in self.list['Output']: hdb.saveToFile(backup=False)

            ### ~ [2] Make updates.html file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if 'updates' in self.list['Output']:
                htmlfile = '%s.updates.html' % self.basefile()
                title = 'SLiMSuite updates'
                stylesheets = []
                for css in pydoc.list['StyleSheets']:
                    stylesheets.append(pydoc.getStr('StylePath') + css)
                htmlhead = rje_html.htmlHead(
                    title,
                    stylesheets,
                    tabber=True,
                    frontpage=False,
                    nobots=False,
                    keywords=pydoc.list['Keywords'],
                    javascript=pydoc.getStr('StylePath'))
                htmlbody = string.join(['<h1>SLiMSuite updates</h1>'] + uhtml,
                                       '\n')
                # Tail text: author plus the last field of time.asctime() (the year)
                htmltail = rje_html.htmlTail(
                    '%s %s' %
                    (pydoc.getStr('Author'),
                     string.split(time.asctime(time.localtime(
                         time.time())))[-1]))
                # Close the file explicitly so the output is flushed even if writing fails part-way
                hout = open(htmlfile, 'w')
                try:
                    hout.write(htmlhead + htmlbody + htmltail)
                finally:
                    hout.close()
                self.printLog('#HTML',
                              'HTML update summary output to %s' % (htmlfile))
        except:
            self.errorLog('Error in %s.makeHistory()' % self.prog())
Esempio n. 35
0
    def codons(self):  ### Main codons analysis method
        '''
        Main codons analysis method.

        Steps, as performed below:
        [0] Load the CDS fasta file dmel-all-CDS-r5.5.fasta from a hard-coded FlyBase directory.
        [1] Build per-amino-acid codon frequency tables: observed in the CDS (obs_cfreq) and predicted from
            single-nucleotide frequencies (nts_cfreq). Sequences whose exon coordinates cannot be parsed, whose
            length is not a multiple of three, or that contain 'N' are removed from cds.seq.
        [2] Build triplet frequency tables over all three reading frames, observed (obs_tfreq) and predicted
            from each codon table (ocd_tfreq, ncd_tfreq), using the per-sequence exon lengths for boundary checks.
        [3] Write one row per triplet to quad_triplet.tdt.

        Nothing is returned. Every exception ends up in the outer handler, which logs it and returns.
        '''
        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            # Input location and file name are hard-coded
            flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
            # Looked up below with T replaced by U, i.e. keyed by RNA codons
            gcode = rje_sequence.genetic_code

            ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            nts = ['A','C','G','T']
            # presumably {nucleotide: frequency} across the loaded sequences - TODO confirm against aaFreq()
            ntfreq = cds.aaFreq(alphabet=nts)
            codons = []     # List of codons
            obs_cfreq = {}  # Observed codon frequencies
            nts_cfreq = {}  # Codon frequencies from NT frequencies
            obs_tfreq = {}  # Observed triplet frequencies
            nts_tfreq = {}  # Predicted triplet frequencies from NT frequencies
            ocd_tfreq = {}  # Predicted triplet frequencies from observed codon frequencies
            ncd_tfreq = {}  # Predicted triplet frequencies from nt-predicted codon frequencies
            ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # Codon tables are two-level {aa: {codon: value}}; triplet tables are flat {triplet: value}
            for n1 in nts:
                for n2 in nts:
                    for n3 in nts:
                        cod = '%s%s%s' % (n1,n2,n3)
                        codons.append(cod)
                        aa = gcode[string.replace(cod,'T','U')]
                        if aa not in obs_cfreq: obs_cfreq[aa] = {}
                        if aa not in nts_cfreq: nts_cfreq[aa] = {}
                        obs_cfreq[aa][cod] = 0.0
                        nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        obs_tfreq[cod] = 0.0
                        nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                        ocd_tfreq[cod] = 0.0
                        ncd_tfreq[cod] = 0.0
            nts_tfreq = rje.dictFreq(nts_tfreq,total=False)                                 # Normalise triplet freq.
            for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('#FREQ','Frequency dictionaries set up.')
            ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # sx grows by 100 per sequence, so sx/stot is the percentage done
            (sx,stot) = (0.0,cds.seqNum())
            # Loop over a copy because rejected sequences are removed from cds.seq inside the loop
            for seq in cds.seq[0:]:
                self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                # Only id and pos are used below; a name that does not fit the pattern is logged and re-raised,
                # which ends the whole analysis via the outer handler
                try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])
                    raise
                # Location formats tried in turn: complement(...), join(...), plain start..end
                # NOTE(review): the last attempt is unguarded; a location matching none of the three presumably
                # raises here and aborts the run - confirm how rje.matchExp reports a failed match
                try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
                except:
                    try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                    except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
                self.deBug(exons)
                exons = string.split(exons,',')
                elen = []
                # Exon length = end - start + 1 for each comma-separated start..end pair
                try:
                    for exon in exons:
                        (start,end) = string.split(exon,'..')
                        elen.append(string.atoi(end) - string.atoi(start) + 1)
                except:
                    self.log.errorLog(id)
                    cds.seq.remove(seq)
                    continue

                # Reverse-strand locations: flip the exon order
                if pos[:4] == 'comp': elen.reverse()
                seq.list['ExonLen'] = elen
                self.deBug(elen)
                # NOTE(review): a length mismatch is only logged; the sequence stays in cds.seq and is still
                # processed in [2], where elen[0] is read without checking that elen is non-empty - confirm
                if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
                # NOTE(review): this multiple-of-three test relies on integer division for "/3" (Python 2)
                if seq.aaLen()/3 != seq.aaLen()/3.0:
                    self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
                sequence = seq.info['Sequence'][0:]
                if string.count(sequence,'N') > 0:
                    self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                    cds.seq.remove(seq)
                    continue
                # Count in-frame codons; any codon missing from the tables raises and ends the run
                while sequence:
                    cod = sequence[:3]
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    obs_cfreq[aa][cod] += 1
            for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
            self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')

            ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (sx,stot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
                sx += 100.0
                # Same list object as seq.list['ExonLen']: the pop(0) calls below consume the stored list
                elen = seq.list['ExonLen']
                sequence = seq.info['Sequence'][0:]
                aa = ''
                cod = ''
                ax = 0      # Measure sequence length processed for exon boundary checks
                while sequence:
                    # Keep the previous codon and amino acid so the two out-of-frame triplets that straddle
                    # the codon junction can be built
                    prevcod = cod
                    cod = sequence[:3]
                    prevaa = aa
                    sequence = sequence[3:]
                    aa = gcode[string.replace(cod,'T','U')]
                    ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Each codon of the current amino acid contributes its codon frequency in frame; every
                    # (previous codon, current codon) pair contributes the product of the two codon
                    # frequencies to each junction triplet
                    for cod2 in obs_cfreq[aa]:
                        # NOTE(review): strict ">" excludes a codon that ends exactly on the exon boundary
                        # (elen[0] == ax + 3) - confirm this is intended
                        if elen[0] > ax + 3:    # Exon boundary beyond this codon
                            ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                            ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                        if prevaa:              # Look at overlap with previous codon
                            for cod1 in obs_cfreq[prevaa]:
                                for i in range(1,3):
                                    if elen[0] > ax + i:    # Exon boundary beyond overlap
                                        acod = cod1[i:] + cod2[:i]
                                        ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                        ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                    ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    # Same boundary tests as [2a], but counting the triplets actually present
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        obs_tfreq[cod] += 1
                    if prevcod:              # Look at overlap with previous codon
                        for i in range(1,3):
                            if elen[0] > ax + i:    # Exon boundary beyond overlap
                                acod = prevcod[i:] + cod[:i]
                                obs_tfreq[acod] += 1
                    # Check exons #
                    # Once the current exon is used up, drop it and carry the remainder into the next one
                    ax += 3
                    if ax >= elen[0]: ax -= elen.pop(0)
            obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
            ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
            ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)
            self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')

            ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
            tfile = 'quad_triplet.tdt'
            # First call passes no data row (header set-up with rje_backup=True); rows follow one per triplet
            rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
            for cod in codons:
                aa = gcode[string.replace(cod,'T','U')]
                # Degen = number of codons sharing this triplet's amino acid
                datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                            'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                            'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
                rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
            self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
        # The text passed to errorLog is a Zen().wisdom() string, not a description of the failure
        except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 36
0
 def mapHit(self,seq,hits,hitdict,method):     ### Tries to map seq onto hitseq and returns hit if successful
     '''
     Tries to map seq onto hitseq and returns hit if successful.
     >> seq:Query Sequence Object
     >> hits:List of hits in rough order of goodness
     >> hitdict:Dictionary of {hitname:stats}
     >> method:Mapping method to use
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (name,sequence) = seq
         data = rje_sequence.extractNameDetails(name,self)
         data['Sequence'] = seq[1]
         data['ShortName'] = string.split(seq[0])[0]
         for hit in hitdict:
             hitdict[hit]['Data'] = rje_sequence.extractNameDetails(hitdict[hit]['Seq'][0],self)
             hitdict[hit]['Data']['Sequence'] = hitdict[hit]['Seq'][1]
             hitdict[hit]['Data']['ShortName'] = string.split(hitdict[hit]['Seq'][0])[0]
         ### SkipGene ###
         if method == 'id' and rje.matchExp('^(\S+)_\S+',data['ID']):
             gene = rje.matchExp('^(\S+)_\S+',data['ID'])
             if gene in self.list['SkipGene']:
                 return None
         ### Name, AccNum, Sequence and ID ###
         if method_info[method] in ['Name', 'AccNum', 'Sequence', 'ID']:
             for hit in hits:
                 hitdata = hitdict[hit['Hit']]['Data']
                 if hitdata[method_info[method]] == data[method_info[method]]:
                     if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])):
                         return hit
         ### DescAcc ###
         if method == 'descacc':
             for hit in hits:
                 hitdata = hitdict[hit['Hit']]['Data']
                 if rje.matchExp('\W(%s)\W' % data['AccNum'],hitdata['Name']):
                     if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])):
                         return hit
         ### GABLAM ###
         if method != 'gablam': return None
         focus = self.str['MapFocus'][:1].upper() + self.str['MapFocus'][1:].lower()
         gstat = gstat_type[self.str['MapStat'].lower()]
         possibles = []  # List of Hits that meet MinMap criterion
         for hit in hits:
             hitname = hit['Hit']
             hitdata = hitdict[hit['Hit']]['Data']
             if self.getNum('AutoMap') > 0.0 and hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('AutoMap'):
                 if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])):
                     return hit
             elif hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('MinMap'):
                 possibles.append(hit)
         ### Manual GABLAM Choice ###
         if self.i() < 0 or not possibles: return None
         possibles.reverse()
         print '\nMapping options for %s:\n' % data['ShortName']
         for p in range(len(possibles)):
             hit = possibles[p]
             hitname = hit['Hit']
             hitdata = hitdict[hit['Hit']]['Data']
             print '<%d> %s (%d aa) =\t' % (len(possibles)-p,hitdata['Name'],hit['Length']),
             print '%.1f%% Qry Len,' % (100.0 * hit['Length'] / len(seq[1])),
             print '%.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Hit_ID'],hitdict[hitname]['Hit_Sim'],hitdict[hitname]['Hit_Len']),
             print '(Qry: %.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Query_ID'],hitdict[hitname]['Query_Sim'],hitdict[hitname]['Query_Len'])
         choice = -1
         print '<0> No mapping.\n'
         ## Choice ##
         while 1:
             choice = rje.getInt('Select sequence to replace %s?' % data['ShortName'],default=1,confirm=True)
             i = len(possibles) - choice
             if choice == 0: # No mapping
                 if self.i() < 2 or rje.yesNo('No GABLAM mapping for %s?' % (data['ShortName'])): return None
             elif choice > 0 and choice <= len(possibles):    
                 hit = possibles[i]
                 hitdata = hitdict[hit['Hit']]['Data']
                 if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit
     except:
         self.errorLog('Problem during SeqMapper.mapHit(%s)' % method,quitchoice=True)
         return None
Esempio n. 37
0
    def iuPred(self, retry=2):  ### Runs IUPred disorder prediction
        '''
        Runs IUPred disorder prediction on self.info['Sequence'].
        >> retry:int [2] = Number of further attempts to make if an attempt fails for any reason.
        << Returns True if the prediction was read successfully, else False.
        On success, self.list['ResidueDisorder'] holds one score per residue, while self.list['RegionDisorder'] and
        self.list['RegionFold'] hold (start,end) tuples (1-based, inclusive) of residues scoring above, or at/below,
        self.stat['IUCut'] respectively. self.minRegion() is then called. On final failure both region lists are
        emptied.
        '''
        mydir = os.path.abspath(os.curdir)
        name = 'None'   # Bound before the try so that the error handler can always report a name
        try:
            ### Setup sequence and temp file ###
            sequence = self.info['Sequence'].upper()
            name = self.info['Name'][:4] + rje.randomString(8)
            tmp = name + '.tmp'

            ### Run Disorder ###
            pathsplit = string.split(self.info['IUPath'], os.sep)
            iupath = string.join(pathsplit[:-1], os.sep)    # Directory containing the IUPred program
            iupred = pathsplit[-1]                          # Program file name
            if self.opt['IUChDir']:
                os.chdir(iupath)
            TMP = open(tmp, 'w')
            try:
                TMP.write('>%s\n%s\n' % (name, sequence))
            finally:
                TMP.close()     # Make sure the sequence is flushed to disk before IUPred reads it
            if self.opt['IUChDir'] and self.opt['Win32']:
                iucmd = '%s %s %s' % (iupred, tmp,
                                      self.info['IUMethod'].lower())
            elif self.opt['IUChDir']:
                iucmd = './%s %s %s' % (iupred, tmp,
                                        self.info['IUMethod'].lower())
            else:
                iucmd = '%s %s %s' % (self.info['IUPath'], tmp,
                                      self.info['IUMethod'].lower())
            try:
                IUPRED = os.popen(iucmd)
                try:
                    dlines = IUPRED.readlines()
                finally:
                    IUPRED.close()
            finally:
                # Always remove the temp file, even if running IUPred failed, so retries do not litter the disk
                try:
                    os.unlink(tmp)
                except:
                    self.errorLog('Cannot delete %s!' % tmp)
            if self.opt['IUChDir']: os.chdir(mydir)
            if self.info['Name'] not in ['', 'None']: name = self.info['Name']
            self.list['ResidueDisorder'] = []
            for d in dlines:
                dm = rje.matchExp(r'^\s*(\d+)\s+(\S)\s+(\S+)', d)
                if not dm: continue     # Not a residue line
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])   # 0-based index of the residue this line should describe
                if sequence[i] != aa:
                    self.log.errorLog(
                        '%s: Position %d is %s in sequence but %s in IUPred output!'
                        % (name, pos, sequence[i], aa),
                        printerror=False)
                    raise ValueError
                if pos != (i + 1):
                    self.log.errorLog(
                        '%s: Position %d reached in IUPred output but previous results missing!'
                        % (name, pos),
                        printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
            if len(self.list['ResidueDisorder']) != len(sequence):
                self.log.errorLog(
                    '%s: Sequence = %d aa but IUPred results stop at %s!' %
                    (name, len(sequence), len(self.list['ResidueDisorder'])),
                    printerror=False)
                raise ValueError

            ### Make Regions ###
            self.list['RegionDisorder'] = []
            self.list['RegionFold'] = []
            cutoff = self.stat['IUCut']
            start = 0   # 1-based start of the current disordered region (0 = not in one)
            fstart = 0  # 1-based start of the current folded region (0 = not in one)
            i = 0
            dx = 0      # Running total of disordered residues
            while i < len(sequence):
                score = self.list['ResidueDisorder'][i]
                i += 1  # i is now the 1-based position of the residue being considered
                if not start and score > cutoff:  ### Start new disorder ###
                    start = i
                elif start and score <= cutoff:  ### End!
                    self.list['RegionDisorder'].append((start, i - 1))
                    dx += i - start
                    start = 0
                if not fstart and score <= cutoff:  ### Start new fold ###
                    fstart = i
                elif fstart and score > cutoff:  ### End!
                    self.list['RegionFold'].append((fstart, i - 1))
                    fstart = 0
            if start:   # Disordered region running to the end of the sequence
                self.list['RegionDisorder'].append((start, len(sequence)))
                dx += len(sequence) + 1 - start
            if fstart: self.list['RegionFold'].append((fstart, len(sequence)))
            self.minRegion()
            if self.opt['PrintLog']:
                self.log.printLog(
                    '\r#DIS',
                    'IUPred (%s) Disorder prediction complete: %d disorder regions, %d disordered aa'
                    % (self.info['IUMethod'].lower(),
                       len(self.list['RegionDisorder']), dx))
            return True
        except:
            if self.opt['IUChDir']: os.chdir(mydir)    # Never leave the process in the IUPred directory
            if retry:
                self.printLog('#RETRY', 'Trying %s again...' % name)
                return self.iuPred(retry - 1)
            self.log.errorLog(
                'Error in Disorder.iuPred(%s). Disorder prediction failed. Check (setenv?) IUPred_PATH environment variable.'
                % name)
            self.list['RegionDisorder'] = []
            self.list['RegionFold'] = []
            return False
Esempio n. 38
0
    def convert(self,
                filelist=[],
                outfile=None
                ):  ### Converts scansite output files in FileList to Outfile
        '''
        Converts scansite output files in FileList to Outfile.
        >> filelist:list = Scansite result files to convert. If empty, self.list['FileList'] is used. (The default
           list is only ever read or rebound here, never modified in place.)
        >> outfile:str = Delimited output file name. If not given, self.info['Name'] is used.
        << Returns True if conversion completed, or False if there were no files to convert.
        Any exception is logged along with the stage reached and then re-raised.
        '''
        try:
            ### Setup ###
            _stage = 'Setup'
            if len(filelist) < 1:
                filelist = self.list['FileList']
            if not outfile:
                outfile = self.info['Name']
            if len(filelist) < 1:
                self.log.errorLog(
                    'No scansite files to convert! %s unchanged/not made.' %
                    outfile,
                    printerror=False)
                return False
            delimit = rje.getDelimit(self.cmd_list)
            ext = rje.delimitExt(delimit)
            if ext != outfile[-3:]:     # Offer to make the file extension match the delimiter
                newfile = outfile[:-3] + ext
                if rje.yesNo('Change file name from %s to %s?' %
                             (outfile, newfile)):
                    outfile = newfile
            self.log.printLog(
                '#OUT', 'Converting %d file(s), output to %s.' %
                (len(filelist), outfile))

            ### Output File ###
            _stage = 'Output File'
            addhead = not self.opt['Append'] or not os.path.exists(outfile)  # Create with header
            if addhead:
                OUTFILE = open(outfile, 'w')
            else:
                OUTFILE = open(outfile, 'a')
            try:    # The output file is closed however the conversion ends
                if addhead:
                    headers = [
                        'seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score',
                        'percentile', 'matchseq', 'sa'
                    ]
                    rje.writeDelimit(OUTFILE, headers, delimit)

                ### Conversion ###
                _stage = 'Conversion'
                sx = 0  # Total number of results written
                for infile in filelist:
                    if not os.path.exists(infile):
                        self.log.errorLog(
                            'Input file %s does not exist! :o(' % infile, False,
                            False)
                        continue
                    fx = 0  # Number of results written from this file
                    INFILE = open(infile, 'r')
                    try:
                        inline = rje.nextLine(INFILE)
                        while inline is not None:
                            scanlist = rje.matchExp(re_scansite, inline)
                            # Only lines matching the scansite pattern are results. Writing for every line would
                            # repeat the previous result for each non-matching line (or fail on the first one).
                            if scanlist:
                                rje.writeDelimit(OUTFILE, scanlist, delimit)
                                sx += 1
                                fx += 1
                                rje.progressPrint(self, sx)
                            inline = rje.nextLine(INFILE)
                    finally:
                        INFILE.close()
                    self.log.printLog(
                        '#OUT', '%s scansite results from %s. (%s Total.)' %
                        (rje.integerString(fx), infile, rje.integerString(sx)))
            finally:
                OUTFILE.close()

            ### End ###
            _stage = 'End'
            self.log.printLog(
                '#OUT', '%s scansite results output to %s.' %
                (rje.integerString(sx), outfile))
            return True
        except:
            self.log.errorLog('Error in convert(%s)' % _stage,
                              printerror=True,
                              quitchoice=False)
            raise
Esempio n. 39
0
 def tabulatePPIRegion(
         self):  ### Tabulates regions of known PPI from DAT file
     '''
     Tabulates regions of known PPI from DAT file.
     ID and interaction REGION lines are pulled out of the UniFake DAT file with grep and one row is written to
     ppi_region.tdt for each (Protein, Start, End, Interactor) combination found.
     << Returns True when the table has been written. If the table already exists and Force is off, the result of
        the printLog() call is returned instead and nothing is regenerated.
     '''
     try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         outname = 'ppi_region.tdt'
         datfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
         if os.path.exists(outname):
             if not self.opt['Force']:
                 return self.printLog('#REGTAB',
                                      '%s found. (Force=F)' % outname)
         fields = ['Protein', 'Start', 'End', 'Interactor']
         rje.delimitedFileOutput(self, outname, fields, rje_backup=True)
         ## ~ [1a] Patterns used to pick apart the grep output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         re_id = 'ID   (\S+)'
         re_region = 'FT   REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)'
         re_partner = '^([A-Z0-9][A-Z0-9]+)'
         ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         grepcmd = "grep -P '(ID   |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % datfile
         self.printLog('#GREP', grepcmd)
         current = None      # Protein ID from the most recent ID line
         regx = 0            # Number of rows written
         proteins = []       # Distinct proteins with at least one row
         partners = []       # Distinct interactors with at least one row
         greplines = os.popen(grepcmd).readlines()
         for gline in greplines:
             idmatch = rje.matchExp(re_id, gline)
             if idmatch:
                 current = idmatch[0]
             regmatch = rje.matchExp(re_region, gline)
             if not regmatch:
                 continue
             (first, last, described) = regmatch
             # Each whitespace-separated word of the description that starts with 2+ capitals/digits is tabulated
             for word in string.split(described):
                 wordmatch = rje.matchExp(re_partner, word)
                 if not wordmatch:
                     continue
                 row = {}
                 row['Protein'] = current
                 row['Start'] = first
                 row['End'] = last
                 row['Interactor'] = wordmatch[0]
                 rje.delimitedFileOutput(self,
                                         outname,
                                         fields,
                                         datadict=row)
                 regx += 1
                 if current not in proteins:
                     proteins.append(current)
                 if row['Interactor'] not in partners:
                     partners.append(row['Interactor'])
                 counts = (rje.integerString(len(proteins)),
                           rje.integerString(len(partners)),
                           rje.integerString(regx))
                 self.progLog(
                     '\r#REGTAB',
                     'Tabulating regions: %s proteins; %s interactors; %s regions'
                     % counts)
         summary = (rje.integerString(len(proteins)),
                    rje.integerString(len(partners)),
                    rje.integerString(regx), outname)
         self.printLog(
             '\r#REGTAB',
             'Tabulated regions (%s proteins; %s interactors; %s regions) => %s'
             % summary)
         return True
     except:
         self.errorLog(rje_zen.Zen().wisdom())
         raise  # Delete this if method error not terrible
Esempio n. 40
0
    def run(self):  ### Main Run Method
        '''
        Main Run Method.
        [1] Loads mutation data (parsing OMIM if Force is set or loading fails).
        [2] Runs Pingu, maps each mutated gene onto its EnsLoci sequence via GeneCards and stores the value of
            seq.fudgeFactor() for each mapped sequence in self.dict['Fudge'].
        [3] Reads SLiMFinder *.occ.csv files from self.info['SlimDir'] and writes every occurrence that overlaps a
            (fudge-adjusted) mutation position to rje_omim.slimfinder.tdt.
        [4] Logs SLiM residue coverage of the mapped sequences and the binomial probability of the observed overlap.
        << Returns None, or the result of the errorLog() call where GeneCards or EnsLoci data are missing.
        Any exception is logged and swallowed.
        '''
        try:  ### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()

            ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
            ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            import pingu
            pcmd = self.cmd_list + ['fulloutput=F']
            ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
            ping.run()
            ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if not ping.obj['GeneCards']:
                return self.log.errorLog(
                    'Cannot map EnsLoci without GeneCards.', printerror=False)
            genecards = ping.obj['GeneCards'].dict[
                'GeneCard']  # GeneCards dictionary
            ensloci = ping.getEnsLoci(
            )  # EnsLoci SeqList object (ping.obj['EnsLoci'])
            seqdict = ensloci.seqNameDic()
            if not seqdict:
                return self.log.errorLog(
                    'Failed to read in EnsLoci sequences.', printerror=False)
            ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            self.dict['Fudge'] = {}
            ensback = {}  # Dictionary of {EnsLoci name:OMIM gene}
            mutations = {}  # Reorganised dictionary of {gene:{pos:Mutation}}
            for gene in rje.sortKeys(self.dict['Mutations']):
                try:
                    seq = seqdict[genecards[gene]['EnsLoci']]
                except:
                    self.log.printLog(
                        '#MAP', 'No EnsLoci protein mapped for %s' % gene)
                    continue
                mutations[gene] = {}
                ensback[genecards[gene]['EnsLoci']] = gene
                mutpos = {}  # Dictionary of {pos:AA} to map onto sequence
                for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                    (disease, mutation) = self.dict['Mutations'][gene][subid]
                    (wild, pos, mut) = rje.matchExp(r'(\D\D\D)(\d+)(\D\D\D)',
                                                    mutation)
                    mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                    mutations[gene][int(
                        pos)] = self.dict['Mutations'][gene][subid]
                self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)
            self.deBug(self.dict['Fudge'])

            ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            allslims = {
            }  # Full dictionary of SLiMFinder results matching OMIM genes
            slimomim = []  # List of (gene,pos) overlapping with SLiMs
            outfile = 'rje_omim.slimfinder.tdt'
            dataheaders = string.split(
                'Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign',
                ',')
            headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'
                       ] + dataheaders
            rje.delimitedFileOutput(self,
                                    outfile,
                                    headers,
                                    delimit='\t',
                                    rje_backup=True)
            for occfile in glob.glob(self.info['SlimDir'] +
                                     '*.occ.csv'):  # Potential SLiM
                slimdata = rje.dataDict(self,
                                        occfile,
                                        ['Pattern', 'Hit', 'Pos', 'Match'],
                                        dataheaders,
                                        delimit=',')
                for occ in slimdata:
                    if slimdata[occ][
                            'Hit'] in ensback:  # OMIM gene - possible overlap
                        gene = ensback[slimdata[occ]['Hit']]
                        (start, end) = (int(slimdata[occ]['Pos']),
                                        int(slimdata[occ]['EndPos']))
                        if gene not in allslims: allslims[gene] = {}
                        allslims[gene][occ] = slimdata[occ]
                        fudge = self.dict['Fudge'][seqdict[
                            genecards[gene]['EnsLoci']]]
                        for mpos in mutations[gene]:
                            if start <= (mpos + fudge) <= end:
                                self.log.printLog(
                                    '#OMIMSLIM', '%s %s %s (%d-%d) = %s' %
                                    (slimdata[occ]['Dataset'],
                                     slimdata[occ]['Hit'],
                                     slimdata[occ]['Pattern'], start, end,
                                     mutations[gene][mpos]))
                                slimdata[occ]['Gene'] = gene
                                slimdata[occ]['OMIM'] = string.join(
                                    self.dict['Records'][gene])
                                slimdata[occ]['Mutation'] = mutations[gene][
                                    mpos][1]
                                slimdata[occ]['Disease'] = mutations[gene][
                                    mpos][0]
                                rje.delimitedFileOutput(
                                    self, outfile, headers, '\t',
                                    slimdata[occ])
                                if (gene, mpos) not in slimomim:
                                    slimomim.append((gene, mpos))

            ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (inslim, resx, mutx) = (
                0, 0, 0
            )  # No. of residues in SLiMs, total residue count + no. mutations that may overlap
            for gene in mutations:  # These are just the genes that mapped to sequences
                seqlen = seqdict[genecards[gene]['EnsLoci']].aaLen()
                mutx += len(mutations[gene])
                resx += seqlen
                if gene in allslims:  # Partially covered by SLiMs
                    res = [0] * seqlen  # Per-residue mask: 1 = residue lies within a SLiM occurrence
                    for occ in allslims[gene]:
                        (start, end) = (int(allslims[gene][occ]['Pos']) - 1,
                                        int(allslims[gene][occ]['EndPos']))
                        # Replace res[start:end] with 1s. The tail must begin at end (not end - 1), otherwise
                        # one residue is duplicated per occurrence and the mask outgrows the sequence.
                        res = res[:start] + [1] * (end - start) + res[end:]
                    self.deBug('%s %d (%d)' % (gene, sum(res), seqlen))
                    inslim += sum(res)
            if not resx:    # Nothing mapped: coverage and probability would both divide by zero
                self.log.printLog(
                    '#COV', 'No OMIM gene sequences mapped: SLiM coverage not assessed.')
                return
            self.log.printLog(
                '#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' %
                (100.0 * inslim / resx))
            self.log.printLog(
                '#MUT',
                '%d mutations that could potentially occur in SLiMs' % mutx)
            self.log.printLog(
                '#PROB', 'Probability of observed %d mutation overlap = %.4f' %
                (len(slimomim),
                 rje.binomial(
                     len(slimomim), mutx, float(inslim) / resx, callobj=self)))
        except:
            self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 41
0
    def readHMMSearch(
        self,
        resfile=None,
        readaln=False
    ):  ### Reads HMM Search Results into objects    #!# Needs tidying! #!#
        '''
        Reads HMM Search Results into objects.
        >> resfile:str = Results File (set as self.info['OutFile'])
        >> readaln:boolean = whether to bother reading Alignments into objects [False]. The value given is currently
           ignored: it is reset to True for every alignment hit, so alignments are always read.
        << Returns True if the file was read, or False if it was missing, not an hmmsearch file or failed to parse.
        A search object is added (self._addSearch()) for each 'HMM file:' line, with a hit added for each one-line
        hit and an alignment added to the matching hit for each domain alignment.
        '''
        try:
            ### <a> ### Setup
            _stage = '<a> Setup'
            if not resfile or not os.path.exists(resfile):
                # NB. '%s' is required here: the previous '(%)' was an invalid format and raised ValueError
                self.log.errorLog('Results file (%s) missing!' % resfile, False,
                                  False)
                raise IOError
            _hit_elements = [
                r'^(\S+):', 'domain', r'(\d+)', 'of', r'(\d+),', 'from', r'(\d+)',
                'to', r'(\d+):', 'score', r'(\S+),', 'E', '=', r'(\S+)'
            ]
            _hit_re = string.join(_hit_elements, r'\s+')

            ### <b> ### Read in Search results
            _stage = '<b> Read Results'
            self.verbose(0, 4, 'Reading %s HMMer search results' % resfile, 0)
            RESFILE = open(resfile, 'r')
            try:
                lines = RESFILE.readlines()
            finally:
                RESFILE.close()
            resline = [re.sub('\n', '', line) for line in lines]
            search = None
            i = 0
            hitaln = 0
            if not resline or resline[0].find('hmmsearch') != 0:   # Also covers an empty file
                self.log.errorLog(
                    "File %s does not appear to be an hmmsearch results file" %
                    resfile)
                # A bare raise has nothing to re-raise here, so raise an explicit error for the handler below
                raise ValueError('%s is not an hmmsearch results file' % resfile)

            while i < len(resline):
                line = resline[i]
                ## <i> ## Basic Search Info
                _stage = '<b-i> Basic Search Info'
                if line.find('HMM file:') == 0:
                    search = self._addSearch()
                    search.info['Name'] = rje.matchExp(r'HMM file:\s+(\S+)',
                                                       line)[0]
                    self.verbose(0, 4, '.', 0)
                    self.verbose(1, 3, '\n%s' % search.info['Name'], 0)
                elif line.find('Sequence database:') == 0:
                    search.info['SearchDB'] = rje.matchExp(
                        r'Sequence database:\s+(\S+)', line)[0]
                elif line.find('Total sequences searched:') == 0:
                    dbnum = rje.matchExp(r'Total sequences searched:\s+(\d\S*)',
                                         line)[0]
                    dbnum = re.sub(r'\D', '', dbnum)    # Strip any non-digit characters from the count
                    search.stat['DBNum'] = string.atoi(dbnum)
                ## <ii> ## One-line hit data (BLASTHit)
                elif line.find(
                        'Scores for complete sequences') == 0:  # One-line hits
                    _stage = '<b-ii> One-line hits'
                    i += 3  # Jump from the section title to the first hit line
                    while re.search(
                            r'^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+(\d+)\s*$',
                            resline[i]):
                        match = rje.matchExp(
                            r'^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+\d+\s*$',
                            resline[i])
                        self.verbose(2, 3, '\n - %s (%s, %s)' % match, 0)
                        hit = search._addHit()
                        hit.info['Name'] = match[0]
                        hit.stat['BitScore'] = string.atof(match[1])
                        evalue = match[2]
                        if evalue.find('e') == 0:   # E-value given as a bare exponent, e.g. 'e-10'
                            evalue = '1' + evalue
                        hit.stat['E-Value'] = string.atof(evalue)
                        i += 1
                    # resline[i] is now the first line after the one-line hits
                    self.verbose(1, 3, '=> %d Hits' % search.hitNum(), 1)
                    hitaln = 0
                #!# Make new No hits pattern match
                elif line.find('***** No hits found ******') >= 0:  # No Hits
                    search.hit = []
                    self.verbose(1, 3, '=> %d Hits' % search.hitNum(), 1)
                    hitaln = 0
                ## <iii> ## Aln Hit data (PWAln)
                #!# Consider reading in the 'parsed for domains' section instead/as well
                elif re.search(_hit_re, line):  # New aln hit
                    _stage = '<b-iii> Aln Hit Info'
                    # Identify hit object
                    _hit_detail = rje.matchExp(_hit_re, line)
                    hitname = _hit_detail[0]
                    try:
                        # Find the one-line hit with this name, else fall back on the current hitaln index
                        for hit in search.hit:
                            if hit.info['Name'] == hitname:
                                hitaln = search.hit.index(hit)
                        if hitname != search.hit[hitaln].info['Name']:
                            self.log.errorLog(
                                'Problem with HMM results %s - %s single-line hits and alignments do not match'
                                % (hitname, search.info['Name']),
                                printerror=False,
                                quitchoice=True)
                            i += 1
                            continue
                    except:
                        self.log.errorLog(
                            'Problem with HMM results reconciling %s - %s single-line hits and alignments.'
                            % (hitname, search.info['Name']), True, True)
                        i += 1
                        continue
                    hit = search.hit[hitaln]
                    hitaln += 1
                    # Add details
                    _stage = '<b-iii> Add Aln Hit Info'
                    aln = hit._addAln()
                    aln.stat['SbjStart'] = string.atoi(_hit_detail[3])
                    aln.stat['SbjEnd'] = string.atoi(_hit_detail[4])
                    aln.stat['BitScore'] = string.atof(_hit_detail[5])
                    aln.stat['Expect'] = string.atof(_hit_detail[6])
                    ## <iv> ## Alignments
                    readaln = True
                    i += 1
                    while readaln:
                        _stage = '<b-iv> Read alignments'
                        line = resline[i]
                        block = rje.matchExp(r'^(\s+)(\S+)', line)
                        if block:
                            # Query Line
                            leadlen = len(block[0])
                            seqblock = block[1]
                            if block[1][:3] == '*->':  # Start
                                leadlen += 3
                                seqblock = seqblock[3:]
                            if block[1][-3:] == '<-*':  # End
                                seqblock = seqblock[:-3]
                                readaln = False
                            aln.info['QrySeq'] += seqblock
                            # Alignment Line
                            i += 1
                            aln.info['AlnSeq'] += resline[i][leadlen:(
                                leadlen + len(seqblock))]
                            # Subject Line
                            i += 1
                            aln.info['SbjSeq'] += resline[i][leadlen:(
                                leadlen + len(seqblock))]
                            # Skip Blank line
                            i += 2
                        else:
                            i += 1

                i += 1
            self.verbose(
                0, 1, 'Reading of %s HMM results complete! (%d Searches)' %
                (resfile, len(self.search)), 2)
            return True
        except:
            self.log.errorLog('Calamity during readHMMSearch(%s) %s.' %
                              (resfile, _stage))
            return False
Esempio n. 42
0
 def readHMMPFamSearch(
         self,
         resfile=None,
         readaln=False):  ### Reads HMM PFam Search Results into objects
     '''
     Reads hmmpfam search results into search, hit and alignment objects.
     >> resfile:str = Results File to read. Nothing is read unless this file exists.
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently ignored: alignments are never read !!!
     << returns True if the file was read to the end, else False (missing file, not an hmmpfam file, or any error).

     hmmpfam reports results per query sequence, so hits are regrouped here: one search object (self._addSearch())
     is made per domain name and each "Parsed for domains" line adds one hit to it, named after the query sequence
     and carrying one alignment with SbjStart, SbjEnd, BitScore and Expect.

     If self.opt['CleanRes'] is set, resfile is replaced by a reduced copy holding the file header, the sequences
     that had at least one domain line, and a trailer line recording the number of sequences in the original.
     '''
     RESFILE = None  # Main results handle: closed by the finally clause whatever happens
     try:
         try:
             ### Setup ###
             if not resfile or not os.path.exists(resfile):
                 self.log.errorLog('Results file "%s" missing!' % resfile,
                                   printerror=False)
                 return False
             ## Regular expressions, built once rather than for every line ##
             re_query = r'^Query sequence:\s+(\S+)'
             #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
             #--------        ------- ----- -----    ----- -----      -----  -------
             #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
             # ... else ...
             #         [no hits above thresholds]
             re_domain = r'\s+'.join([
                 r'^(\S+)', r'\S+', r'(\d+)', r'(\d+)\D.+', r'(\S+)',
                 r'(\S+)\s*$'
             ])
             # NB. This used to read "(%d)", which can never match the trailer written below
             re_end = r'End of rje_hmm reduced results file: (\d+) sequences in original'
             ## Search dictionary as results come back per sequence, not per HMM! ##
             pfam = {}  # Dictionary of {PFam name:search}
             hitx = 0  # Total number of hits
             ### Check file type ###
             CHECK = open(resfile, 'r')
             try:
                 firstline = CHECK.readline()
             finally:
                 CHECK.close()
             if firstline.find('hmmpfam') != 0:
                 self.errorLog(
                     'File "%s" does not appear to be an hmmpfam results file' %
                     resfile,
                     printerror=False)
                 if rje.yesNo(
                         'Delete incorrect results file? (Check that hmmpfam=T is right!)',
                         default='N'):
                     os.unlink(resfile)
                     self.printLog('#DEL',
                                   'Dodgy results file "%s" deleted.' % resfile)
                 return False
             ### Read in Search results ###
             hitname = None
             hx = 0  # Number of query sequences read
             seqx = 0  # Number of sequences in the original file, if resfile is already a reduced file
             newresfile = '%s.partial' % resfile
             if os.path.exists(newresfile): os.unlink(newresfile)

             def saveReduced(lines):
                 '''Appends lines to the reduced results file and closes it again.'''
                 PARTIAL = open(newresfile, 'a')
                 try:
                     PARTIAL.write('\n'.join(lines))
                 finally:
                     PARTIAL.close()

             RESFILE = open(resfile, 'r')
             # The first line only seeds the header block. It is not fed to the loop as well, or it would be
             # stored (and written to the reduced file) twice.
             newres = [rje.chomp(RESFILE.readline())]
             newresout = True  # Whether the current block is worth keeping: header, or sequence with hits
             line = RESFILE.readline()
             while line:
                 self.progLog(
                     '\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' %
                     (resfile, rje.integerString(hx),
                      rje.integerString(len(pfam)), rje.integerString(hitx)))
                 line = rje.chomp(line)
                 seqmatch = rje.matchExp(re_query, line)
                 ## New Sequence ##
                 if seqmatch:
                     if newres and newresout and self.opt['CleanRes']:
                         saveReduced(newres)
                     newres = ['', line]
                     newresout = False
                     hitname = seqmatch[0]
                     hx += 1
                 ## One Line Data for hits ##
                 elif line.find('Parsed for domains:') == 0:
                     # Keep this line and the two table header lines, then move on to the first data line
                     newres += [
                         line,
                         rje.chomp(RESFILE.readline()),
                         rje.chomp(RESFILE.readline())
                     ]
                     line = rje.chomp(RESFILE.readline())
                     newres.append(line)
                     domdata = rje.matchExp(re_domain, line)
                     while domdata:
                         newresout = True
                         (dom, start, end, score, evalue) = domdata
                         if dom not in pfam:
                             pfam[dom] = self._addSearch()
                             pfam[dom].info['Name'] = dom
                         hit = pfam[dom]._addHit()
                         hit.info['Name'] = hitname
                         aln = hit._addAln()
                         aln.setStat({
                             'SbjStart': int(start),
                             'SbjEnd': int(end),
                             'Expect': float(evalue),
                             'BitScore': float(score)
                         })
                         hitx += 1
                         self.progLog(
                             '\r#RES',
                             'Reading %s: %s Seqs; %s Domains; %s Hits' %
                             (resfile, rje.integerString(hx),
                              rje.integerString(
                                  len(pfam)), rje.integerString(hitx)))
                         line = rje.chomp(RESFILE.readline())
                         newres.append(line)
                         domdata = rje.matchExp(re_domain, line)
                 ## End of Protein ##
                 elif line[:2] == '//':
                     hitname = None
                     newres.append(line)
                 else:
                     ## Trailer of a previously reduced file, else a line to keep ##
                     endmatch = rje.matchExp(re_end, line)
                     if endmatch:
                         seqx = int(endmatch[0])
                     elif newres:
                         newres.append(line)
                 line = RESFILE.readline()
             RESFILE.close()  # Closed before the file is deleted and replaced below
             if newres and newresout and self.opt['CleanRes']:
                 saveReduced(newres)
             if not seqx: seqx = hx
             if self.opt['CleanRes']:
                 saveReduced([
                     '',
                     'End of rje_hmm reduced results file: %d sequences in original'
                     % seqx
                 ])
                 os.unlink(resfile)
                 os.rename(newresfile, resfile)
                 self.printLog(
                     '\r#RED',
                     'Results file %s replaced with reduced version (%s Hits only)'
                     % (resfile, rje.integerString(hitx)))
             self.printLog(
                 '\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' %
                 (resfile, rje.integerString(seqx), rje.integerString(
                     len(pfam)), rje.integerString(hitx)))
             return True
         except:
             self.log.errorLog('Calamity during readHMMPFamSearch(%s)' % (resfile))
             return False
     finally:
         if RESFILE is not None: RESFILE.close()
Esempio n. 43
0
    def parseOMIM(self):  ### Main parsing method
        '''
        Main parsing method: reads the OMIM file self.info['Name'] and saves the mutations found.
        Populates:
        - self.dict['Records'] = {gene:[record numbers]}
        - self.dict['Mutations'] = {gene:{subid:(disease,mutation)}}
        Within a *FIELD* AV section, each ".NNNN" line starts an entry: the next line is taken as the disease and
        the one after that must read "GENE, XXXnnnYYY", where XXX and YYY are both (upper case) values of
        rje_sequence.aa_code_3. Other entries are skipped. self.saveMutations() is called once parsing is complete.
        Any error is logged and then re-raised.
        '''
        OMIM = None  # Main file handle: closed by the finally clause should parsing fail part way
        try:
            try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                self.dict['Records'] = {}
                self.dict['Mutations'] = {}
                aas = ' '.join(rje_sequence.aa_code_3.values()).upper().split()
                ## Count lines for the progress percentage, closing the handle afterwards ##
                COUNT = open(self.info['Name'], 'r')
                try:
                    olen = len(COUNT.readlines())
                finally:
                    COUNT.close()
                if not olen: olen = 1  # Empty file: avoids a ZeroDivisionError in the progress percentage
                (ox, mx) = (0.0, 0)
                OMIM = open(self.info['Name'], 'r')

                ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                record = gene = subid = disease = mutation = ''
                av = False  # Whether reading *FIELD* AV for mutation data
                oline = True  # Primes the loop, which ends after the empty string returned at end of file
                while oline:
                    oline = OMIM.readline()
                    self.log.printLog(
                        '\r#OMIM',
                        'Processing OMIM: %.2f%% (%s genes)' %
                        (ox / olen, rje.integerString(len(self.dict['Records']))),
                        newline=False,
                        log=False)
                    ox += 100.0
                    # Outside the AV section only the "*..." marker lines matter
                    if not av and oline[:1] != '*': continue
                    line = rje.chomp(oline).rstrip(' ')
                    ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    if line == '*RECORD*': (record, av) = ('', False)
                    elif line == '*FIELD* NO':  # New record: number is on the next line
                        record = rje.chomp(OMIM.readline())
                        gene = ''
                        ox += 100.0
                    ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* TI':  # New gene: last word of the next line
                        gene = rje.chomp(OMIM.readline()).split()[-1]
                        subid = ''
                        av = False
                        ox += 100.0
                    ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                    elif line == '*FIELD* AV':
                        av = True  # Start of mutation records
                    elif av:
                        submatch = rje.matchExp(r'^(\.\d+)', line)
                        if not submatch: continue
                        # New subid mutation record
                        subid = submatch[0]
                        disease = rje.chomp(OMIM.readline())
                        ox += 100.0
                        try:
                            # NB. gene goes into the pattern as is: anything that stops a match skips the entry
                            mutation = rje.matchExp(
                                r'^%s, (\D\D\D\d+\D\D\D)' % gene,
                                rje.chomp(OMIM.readline()))[0]
                        except:
                            continue  # No mutation or not coding change
                        ox += 100.0
                        subaa = rje.matchExp(r'(\D\D\D)\d+(\D\D\D)', mutation)
                        if subaa[0] not in aas or subaa[1] not in aas: continue
                        if gene not in self.dict['Records']:
                            self.dict['Records'][gene] = [record]
                        if record not in self.dict['Records'][gene]:
                            self.dict['Records'][gene] += [record]
                        if gene not in self.dict['Mutations']:
                            self.dict['Mutations'][gene] = {}
                        mx += 1
                        self.dict['Mutations'][gene][subid] = (disease, mutation)

                ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
                OMIM.close()
                self.log.printLog(
                    '\r#OMIM',
                    'Processing OMIM complete! (%s genes; %s mutations)' %
                    (rje.integerString(len(
                        self.dict['Records'])), rje.integerString(mx)))
                self.saveMutations()
            except:
                self.log.errorLog(rje_zen.Zen().wisdom())
                raise  # Delete this if method error not terrible
        finally:
            if OMIM is not None: OMIM.close()
Esempio n. 44
0
    def mySQLOut(self,tmfile='tm.tdt',domfile='domains.tdt',sigfile='signalp.tdt',makenew=False):     ### Output to tdt files
        '''
        Output to tdt files.
        >> tmfile:str = File to save TM numbers in (no save if None)
        >> domfile:str = File to save domain data in (no save if None)
        >> sigfile:str = File to save choice SignalP data in (no save if None)
        >> makenew:boolean [False] = whether to make new files (True) or append (False)

        Data come from self.tmhmm ({acc:{'PredHel','Topology','len'}}) and self.signalp ({acc:{SignalP fields}}),
        all read as strings. A file is only opened if there are data to go in it, and a header line is written
        when the file is created. A file given as None is simply skipped and the other tables are still written.
        Errors are logged with the stage reached rather than raised; all opened files are closed regardless.
        '''
        _stage = '<a> Setup'
        opened = []     # Every handle opened below, so that the finally clause can close them all

        def openTable(filename,header):
            '''Opens filename to append, or starts it afresh with header if makenew or it does not exist yet.'''
            if makenew or not os.access(filename, os.F_OK):
                TABLE = open(filename, 'w')
                opened.append(TABLE)
                TABLE.write(header)
            else:
                TABLE = open(filename, 'a')
                opened.append(TABLE)
            return TABLE

        try:
            try:
                ### <a> ### Setup
                siglist = ['nn_cleavemax','nn_cleavepos','nn_cleave','nn_sig_mean','nn_sig','nn_dscore','nn_d',
                           'hmm_cmax','hmm_cpos','hmm_cleave','hmm_sigprob','hmm_sig']
                # Output column name -> key of the self.signalp[acc] dictionary
                sigdic = {'nn_cleavemax':'nn_ymax','nn_cleavepos':'nn_ymaxpos','nn_cleave':'nn_ymax?','nn_sig_mean':'nn_smean','nn_sig':'nn_smean?',
                          'nn_dscore':'nn_d','nn_d':'nn_d?','hmm_cmax':'hmm_cmax','hmm_cpos':'hmm_cmaxpos','hmm_cleave':'hmm_cmax?',
                          'hmm_sigprob':'hmm_sprob','hmm_sig':'hmm_sprob?'}
                TMFILE = DOMFILE = SIGFILE = None   # None = this table is not being saved

                self.deBug(self.tmhmm)
                if tmfile and self.tmhmm:
                    _stage = '<a-i> TM File'
                    TMFILE = openTable(tmfile,'acc_num\ttm\tnterm\tcterm\n')
                if domfile and (self.tmhmm or self.signalp):
                    _stage = '<a-ii> Dom File'
                    DOMFILE = openTable(domfile,'acc_num\tdomain\tdom_start\tdom_end\tsource\n')
                if sigfile and self.signalp:
                    _stage = '<a-iii> Sig File'
                    SIGFILE = openTable(sigfile,'acc_num\t%s\n' % '\t'.join(siglist))

                ### <b> ### TMHMM
                for acc in self.tmhmm:
                    _stage = '<b> TMHMM'
                    tmdata = self.tmhmm[acc]
                    topology = tmdata['Topology']
                    if TMFILE is not None:
                        TMFILE.write('%s\t%s\t%s\t%s\n' % (acc,tmdata['PredHel'],topology[0],topology[-1]))
                    tm = False
                    dom = 'CYTOPLASMIC'
                    if topology[0] == 'o':
                        dom = 'EXTRACELLULAR'
                    # e.g. 'i7-29o' with len '50' -> ['1','7','29','50']: alternating loop and TM boundaries
                    bounds = ('1' + topology.replace('o','-').replace('i','-') + tmdata['len']).split('-')
                    started = False
                    while len(bounds) > 1:
                        _stage = '<b-i> TM Dom Write'
                        start = bounds.pop(0)
                        end = bounds[0]
                        if tm:
                            domtype = 'TRANSMEMBRANE'
                        else:
                            domtype = dom
                            # Loops start after the previous TM (except the first) and end before the next (except the last)
                            if started:
                                start = '%d' % (int(start) + 1)
                            else:
                                started = True
                            if len(bounds) > 1:
                                end = '%d' % (int(end) - 1)
                        if DOMFILE is not None:
                            DOMFILE.write('%s\n' % '\t'.join([acc,domtype,start,end,self.info['Source']]))
                        if tm:
                            tm = False
                            if dom == 'CYTOPLASMIC':
                                dom = 'EXTRACELLULAR'
                            else:
                                dom = 'CYTOPLASMIC'
                        else:
                            tm = True
                    _stage = '<b-ii> TM Dom Check'
                    if not tm:      # Should always finish on a loop region, which sets tm to True
                        self.log.errorLog('Problem with %s TM domains - wrong number of domains!' % acc)

                ### <c> ### SignalP
                for acc in self.signalp:
                    _stage = '<c> SignalP'
                    sigdata = self.signalp[acc]
                    accout = acc
                    humanacc = re.search(r'_HUMAN_(\S+)$', acc)
                    if humanacc:
                        accout = humanacc.group(1)
                    if SIGFILE is not None:
                        SIGFILE.write('%s\n' % '\t'.join([accout] + [sigdata[sigdic[stat]] for stat in siglist]))
                    _stage = '<c-ii> SingalP domains'
                    if DOMFILE is None:
                        continue
                    nn_y = int(sigdata['nn_ymaxpos']) - 1
                    hmm_c = int(sigdata['hmm_cmaxpos']) - 1
                    if sigdata['nn_d?'] == 'Y':
                        DOMFILE.write('%s\n' % '\t'.join([accout,'SIGNALP','1','%d' % nn_y,'signalp-NN']))
                    if sigdata['nn_ymax?'] == 'Y':
                        DOMFILE.write('%s\n' % '\t'.join([accout,'CLEAVAGE','%d' % nn_y,'%d' % (nn_y+1),'signalp-NN']))
                    if sigdata['hmm_sprob?'] == 'Y':
                        DOMFILE.write('%s\n' % '\t'.join([accout,'SIGNALP','1','%d' % hmm_c,'signalp-HMM']))
                    if sigdata['hmm_cmax?'] == 'Y':
                        DOMFILE.write('%s\n' % '\t'.join([accout,'CLEAVAGE','%d' % hmm_c,'%d' % (hmm_c+1),'signalp-HMM']))

                ### <d> ### Finish
                _stage = '<d> Finish'
                return
            except:
                self.log.errorLog('Problem with mySQLOut() %s.' % _stage)
        finally:
            for TABLE in opened:
                TABLE.close()
Esempio n. 45
0
 def mapHit(self,seq,hits,hitdict,method):     ### Tries to map seq onto hitseq and returns hit if successful
     '''
     Tries to map seq onto hitseq and returns hit if successful.
     >> seq:Query Sequence as a (name,sequence) tuple
     >> hits:List of hits in rough order of goodness. Each is a dictionary with a 'Hit' name (and 'Length' for GABLAM)
     >> hitdict:Dictionary of {hitname:stats}. stats['Seq'] is the hit (name,sequence); stats['Data'] is added here
     >> method:Mapping method to use
     << returns the mapped hit (an element of hits), or None if there is no mapping or an error occurred.

     Mappings are confirmed with the user when self.i() >= 2. For the 'gablam' method, hits reaching AutoMap are
     offered first; failing that, hits reaching MinMap are listed for manual choice unless self.i() < 0.
     '''
     try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         (name,sequence) = seq
         data = rje_sequence.extractNameDetails(name,self)
         data['Sequence'] = sequence
         data['ShortName'] = name.split()[0]
         for hitname in hitdict:
             hitseq = hitdict[hitname]['Seq']
             hitdict[hitname]['Data'] = rje_sequence.extractNameDetails(hitseq[0],self)
             hitdict[hitname]['Data']['Sequence'] = hitseq[1]
             hitdict[hitname]['Data']['ShortName'] = hitseq[0].split()[0]

         def confirmed(hitdata):
             '''Whether to accept the mapping to hitdata: automatic if self.i() < 2, else the user is asked.'''
             return self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName']))

         ### SkipGene ###
         if method == 'id':
             genematch = rje.matchExp(r'^(\S+)_\S+',data['ID'])
             # NB. The gene itself ([0]) must be looked up: the tuple of matched groups is never in the list
             if genematch and genematch[0] in self.list['SkipGene']:
                 return None
         ### Name, AccNum, Sequence and ID ###
         if method_info[method] in ['Name', 'AccNum', 'Sequence', 'ID']:
             field = method_info[method]
             for hit in hits:
                 hitdata = hitdict[hit['Hit']]['Data']
                 if hitdata[field] == data[field] and confirmed(hitdata):
                     return hit
         ### DescAcc ###
         if method == 'descacc':
             for hit in hits:
                 hitdata = hitdict[hit['Hit']]['Data']
                 if rje.matchExp(r'\W(%s)\W' % data['AccNum'],hitdata['Name']) and confirmed(hitdata):
                     return hit
         ### GABLAM ###
         if method != 'gablam': return None
         focus = self.str['MapFocus'][:1].upper() + self.str['MapFocus'][1:].lower()
         gstat = gstat_type[self.str['MapStat'].lower()]
         statkey = '%s_%s' % (focus,gstat)
         possibles = []  # List of Hits that meet MinMap criterion
         for hit in hits:
             hitname = hit['Hit']
             hitdata = hitdict[hitname]['Data']
             if self.getNum('AutoMap') > 0.0 and hitdict[hitname][statkey] >= self.getNum('AutoMap'):
                 # A rejected AutoMap hit is dropped: it is not offered again as a manual choice
                 if confirmed(hitdata):
                     return hit
             elif hitdict[hitname][statkey] >= self.getNum('MinMap'):
                 possibles.append(hit)
         ### Manual GABLAM Choice ###
         if self.i() < 0 or not possibles: return None
         possibles.reverse()     # Listed worst first, so that option <1> (the best hit) ends up next to the prompt
         print('\nMapping options for %s:\n' % data['ShortName'])
         for (p,hit) in enumerate(possibles):
             hitname = hit['Hit']
             hitdata = hitdict[hitname]['Data']
             # One call per line (valid as print statement or function) with the same spacing as before
             option = '<%d> %s (%d aa) =\t' % (len(possibles)-p,hitdata['Name'],hit['Length'])
             option += '%.1f%% Qry Len, ' % (100.0 * hit['Length'] / len(seq[1]))
             option += '%.1f%% ID (%.1f%% Sim, %.1f%% Cov.) ' % (hitdict[hitname]['Hit_ID'],hitdict[hitname]['Hit_Sim'],hitdict[hitname]['Hit_Len'])
             option += '(Qry: %.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Query_ID'],hitdict[hitname]['Query_Sim'],hitdict[hitname]['Query_Len'])
             print(option)
         print('<0> No mapping.\n')
         ## Choice ##
         while True:     # Asks again until a valid choice is made (and confirmed, if interactive enough)
             choice = rje.getInt('Select sequence to replace %s?' % data['ShortName'],default=1,confirm=True)
             if choice == 0: # No mapping
                 if self.i() < 2 or rje.yesNo('No GABLAM mapping for %s?' % (data['ShortName'])): return None
             elif choice > 0 and choice <= len(possibles):
                 hit = possibles[len(possibles) - choice]
                 if confirmed(hitdict[hit['Hit']]['Data']): return hit
     except:
         self.errorLog('Problem during SeqMapper.mapHit(%s)' % method,quitchoice=True)
         return None
Esempio n. 46
0
    def alignmentToLocal(self,alignment=[],protqry=False):    ### Converts alignment into local hits table
        '''
        Converts alignment into local hits table.
        >> alignment:list of alignment text strings parsed from exonerate output.
        >> protqry:bool[False] = Whether query is protein
        << returns local database table.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
            vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])

            ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            '''
                     Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
                    Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
                     Model: protein2genome:local
                 Raw score: 1170
               Query range: 19 -> 295
              Target range: 12312786 -> 12307250
            
                   20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :       37
                        ..!...|||   ||||||||||||||||||||||||||||||||||||||||||
                        CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
             12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735
            
                  264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :      270
                        {|}           1304 bp           {||}|||||||||||||||!!!
                        {G}++                         ++{ly}GluIleAspIleSerSer
             12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328
            
                  289 : ValProProAsnTyrTyrTyr :      295
                        |||||| !!!..||| !!|||
                        ValProAlaThrTyrAspTyr
             12307273 : GTTCCTGCCACGTATGACTAT : 12307251
            '''
            qry = None
            hit = None
            alnx = {}
            ventry = {}
            parsing = alignment[0:]
            rank = 1

            while parsing:
                line = parsing.pop(0)
                #self.bugPrint(line)
                # Query
                if rje.matchExp('Query: (\S+)',line):
                    if ventry: vdb.addEntry(ventry)
                    ventry = {'Qry':rje.matchExp('Query: (\S+)',line)[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                    rank += 1
                # Hit
                if rje.matchExp('Target: (\S+)',line):
                    ventry['Hit'] = rje.matchExp('Target: (\S+)',line)[0]
                    qh = (ventry['Qry'],ventry['Hit'])
                    if qh in alnx: alnx[qh] += 1
                    else: alnx[qh] = 1
                    ventry['AlnID'] = alnx[qh]
                # Score
                if rje.matchExp('core: (\S+)',line):
                    ventry['Score'] = int(rje.matchExp('core: (\S+)',line)[0])
                # Alignment
                if rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line):
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line)
                    #self.bugPrint('= new aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    x = line.find(aln)
                    if 'QryStart' not in ventry: ventry['QryStart'] = start
                    ventry['QryEnd'] = end
                    ventry['QrySeq'] += aln
                    #self.bugPrint('^%s$' % ventry['QrySeq'])

                    line = parsing.pop(0)
                    #self.bugPrint(line)
                    #self.bugPrint(']%s[' % aln)
                    #self.bugPrint(']%s[' % line[x:x+len(aln)])
                    ventry['AlnSeq'] += line[x:x+len(aln)]
                    #self.debug('^%s$' % ventry['AlnSeq'])

                    #self.bugPrint(parsing[0])
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                    if not adata:
                        #self.deBug(parsing[0])
                        adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                    if not adata: raise ValueError('Partial alignment! Truncated output?')
                    #self.bugPrint('+ hit aln: %s ->  %s' % (adata[0],adata[2]))
                    start = int(adata[0])
                    end = int(adata[2])
                    aln = adata[1]
                    if 'HitStart' not in ventry: ventry['HitStart'] = start
                    ventry['HitEnd'] = end
                    ventry['HitSeq'] += aln
            if ventry: vdb.addEntry(ventry)
            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
            self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])

            #i# Protein Position Conversion
            if protqry:
                for ventry in vdb.entries():
                    # 1->1, 2->4, 3->7 = 1+3*(n-1)
                    ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                    if ventry['QrySeq'].startswith('{'):
                        codend = ventry['QrySeq'].find('}')
                        # {X} = phase 2, find = 2
                        if codend == 2: ventry['QryStart'] += 2
                        # {XX} = phase 1, find = 3
                        elif codend == 3: ventry['QryStart'] += 1
                        else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                    ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1

            vdb.newKey(['Qry','Rank','Hit','AlnID'])
            for vkey in vdb.dataKeys():
                ventry = vdb.data(vkey)
                #i# Make a combined hitseq to output to fasta
                #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
                hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
                hitseq = ''
                phase = (ventry['QryStart'] + 2) % 3
                alnx = 1
                vkeyentries = [ventry]
                dirn = 1
                if ventry['HitEnd'] < ventry['HitStart']:
                    dirn = -1
                    ventry['HitStrand'] = '-'
                else: ventry['HitStrand'] = '+'
                for seq in ['HitSeq','QrySeq','AlnSeq']:
                    ventry[seq] = string.replace(ventry[seq],'}','')
                    ventry[seq] = string.replace(ventry[seq],'{','')
                while rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq']):
                    intron = rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])[0]
                    x = ventry['QrySeq'].find(intron)
                    y = x + len(intron)
                    intronlen = int(rje.matchExp('(\d+) bp',ventry['AlnSeq'][x:y])[0])
                    #i# Create a new entry of the first exon
                    newentry = rje.combineDict({},ventry)
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        newentry[seq] = newentry[seq][:x]
                    newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                    newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - string.count(newentry['QrySeq'],'-') - 1
                    newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - string.count(newentry['HitSeq'],'-') - 1) * dirn
                    newentry['Length'] = x
                    newentry['Identity'] = string.count(newentry['AlnSeq'],'|')
                    vkeyentries.append(vdb.addEntry(newentry))
                    hitseq += newentry['HitSeq']
                    #i# Update ventry to be the rest of the hit
                    for seq in ['HitSeq','QrySeq','AlnSeq']:
                        ventry[seq] = ventry[seq][y:]
                    ventry['QryStart'] = newentry['QryEnd'] + 1
                    if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
                    ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
                #i# Calculate length and identity of final exon
                ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
                ventry['Length'] = len(ventry['AlnSeq'])
                ventry['Identity'] = string.count(ventry['AlnSeq'],'|')
                #i# Add sequence hits
                hitname += ' (%d alignment blocks)' % alnx
                hitseq += ventry['HitSeq']
                hitseq = string.replace(hitseq,'-','')
                protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
                self.obj['ProtHits']._addSeq(hitname,protseq)
                if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
                self.obj['DNAHits']._addSeq(hitname,hitseq)

                #i# Update AlnID for proper float sorting
                for ventry in vkeyentries:
                    (vcore,vx) = string.split(ventry['AlnID'],'.')
                    ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
                    #self.debug(ventry)
            vdb.dataFormat({'AlnID':'string'})
            vdb.remakeKeys()
            self.debug(vdb.dataKeys())

            ## Seq Check
            for ventry in vdb.entries():
                #self.bugPrint('^%s$' % ventry['QrySeq'])
                #self.bugPrint('^%s$' % ventry['AlnSeq'])
                #self.bugPrint('^%s$\n' % ventry['HitSeq'])
                if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                    self.debug(ventry)
                    raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))

            udb = self.reduceLocal(byqry=True)
            udb.rename('unique')
            udb.newKey(['Qry','Rank','Hit','AlnID'])
            self.debug(vdb.dataKeys())

            #i# Calculate exon phase
            for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3

            #i# Protein Position Conversion
            if protqry:
                for ventry in vdb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)/3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)/3
                for ventry in udb.entries():
                    ventry['QryStart'] = (ventry['QryStart']+2)/3
                    ventry['QryEnd'] = (ventry['QryEnd']+2)/3

            #vdb.remakeKeys()
            return vdb

        except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
Esempio n. 47
0
 def run(self):  ### Main run method
     '''
     Main run method: reformats sequences, collates 6RF hits from CSV files and runs GABLAM searches.

     1. Each *.fasta file in the working directory that has no matching *.fas file is rewritten as *.fas with
        6rf_NEIME__X (3-part names, description dropped) or ref_NEIME__X (5-part names) sequence names and then
        passed to rje_blast formatDB (protein=True).
     2. Hit tables in MC58_6RF_CSV/*.CSV are read and each distinct hit name is written to MC58_6RF_Hits.acc.
     3. Unless the GABLAM hit summary file already exists, the hit sequences are saved and searched against MC58_1.fas.
     4. Hits with a HitNum of zero are written to 6rf_zeros.acc, searched against embl_bacteria (unless that hit
        summary already exists) and tabulated in MC58_6RF_Zeros.tdt.

     Nothing is returned. Any error is logged through self.errorLog() rather than raised.
     '''
     try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         for fasta in glob.glob('*.fasta'):
             fas = fasta[:-2]    # X.fasta -> X.fas
             if os.path.exists(fas): continue
             sx = 0
             #i# Both files are opened once and closed before formatDB reads the new file
             with open(fasta,'r') as FASTA:
                 with open(fas,'a') as FAS:
                     for line in FASTA:
                         if line[:1] != '>': FAS.write(line); continue
                         #i# The description is optional: use an empty one rather than an undefined/stale value
                         header = rje.matchExp(r'^>(\S+) (\S.+)$',line)
                         if header: (name,desc) = header
                         else: (name,desc) = (rje.matchExp(r'^>(\S+)',line)[0],'')
                         namebits = name.split('|')
                         if len(namebits) == 3: FAS.write('>6rf_NEIME__%s\n' % namebits[2])
                         elif len(namebits) == 5:
                             name = 'ref_NEIME__%s' % namebits[3]
                             if desc: FAS.write('>%s %s\n' % (name,desc))
                             else: FAS.write('>%s\n' % name)
                         else: print(namebits); raise ValueError
                         self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
             self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
             rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
         ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         rfhits = {}     # Dictionary of {hit:['File:hit_num']}
         acc = 'MC58_6RF_Hits.acc'
         gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
         cx = 0
         with open(acc,'w') as ACC:      # Always (re)created, even if there are no CSV files
             for csvfile in glob.glob('MC58_6RF_CSV/*.CSV'):
                 cx += 1
                 sample = os.path.basename(csvfile)[:-4]
                 hits = False    # Switched on once the hit table header line has been read
                 with open(csvfile,'r') as CSV:
                     for line in CSV:
                         if line.find('prot_hit_num,prot_acc') == 0: hits = True
                         elif hits:
                             data = rje.readDelimit(line,',')
                             if len(data) < 2: continue
                             [num,name] = data[:2]
                             try: name = name.split('|')[2]
                             except (IndexError,AttributeError): continue    # Not an X|Y|Z style accession
                             if name not in rfhits:
                                 ACC.write('6rf_NEIME__%s\n' % name)
                                 rfhits[name] = []
                             hitid = '%s:%s' % (sample,num)
                             if hitid not in rfhits[name]: rfhits[name].append(hitid)
                             self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
         self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
         ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if not os.path.exists(gfile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
         ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
         zeros = [hit for hit in gdata if int(gdata[hit]['HitNum']) == 0]
         zeros = rje.sortUnique(zeros,False)
         with open('6rf_zeros.acc','w') as ZEROS: ZEROS.write('\n'.join(zeros))
         self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
         ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
         if not os.path.exists(ufile):
             seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
             seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
             seqlist.saveFasta()
             gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
         gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
         fdata = rje.dataDict(self,ufile.replace('hitsum','gablam'),['Qry'],['Hit'],lists=True)
         headers = gdata.pop('Headers')
         headers.insert(1,'Sample')
         headers.append('BestHit')
         rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
         for rf in rje.sortKeys(gdata):
             rfcut = rf.split('__')[1]   # Strip the 6rf_NEIME__ style prefix
             gdata[rf]['Sample'] = '; '.join(rfhits[rfcut])
             gdata[rf]['Qry'] = rfcut
             try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
             except: gdata[rf]['BestHit']  = '-'     # No hit recorded for this query
             rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])

     except: self.errorLog(rje_zen.Zen().wisdom())
     self.printLog('#ZEN',rje_zen.Zen().wisdom())
Esempio n. 48
0
 def setup(self):  ### Main class setup method.
     '''
     Main class setup method. Loads pairwise PPI data, filters out likely complexes and loads sequences.

     1. Pairwise PPI are read into a {hub:{spoke:evidence}} dictionary, along with gene<->sequence mappings.
     2. A spoke is kept for a hub if its evidence matches one of the types listed in hybrid.txt. Otherwise it is
        kept only if no third gene interacts with both the hub and the spoke. Hubs left without spokes are dropped.
        The result is stored in self.dict['PPI'] as {hub:[spokes]}.
     3. The EnsEMBL human loci sequences are loaded into self.obj['SeqList'], and self.dict['SeqObj'],
        self.dict['Gene2Seq'] and self.dict['Seq2Gene'] are populated.

     << True if setup was successful, else False (the error is logged).
     '''
     try:  ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
         self.progLog('\r#PPI', 'Loading pairwise data...')
         pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'],
                                 ['Spoke', 'SpokeSeq', 'Evidence'])
         gene2seq = {}   # {spoke gene: SpokeSeq value}
         seq2gene = {}   # {SpokeSeq text preceding '__': spoke gene}
         fullppi = {}    # {hub: {spoke: evidence}}
         px = 0.0
         ptot = len(pairwise)
         ppix = 0
         for pair in rje.sortKeys(pairwise):
             self.progLog('\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (px / ptot))
             px += 100.0
             [hub, spoke] = pair.split('\t')
             if spoke not in gene2seq:
                 sseq = pairwise[pair]['SpokeSeq']
                 gene2seq[spoke] = sseq
                 seq2gene[sseq.split('__')[0]] = spoke
             if hub not in fullppi: fullppi[hub] = {}
             if spoke not in fullppi[hub]:
                 # Entries are popped as they are stored so the data is not held twice
                 fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']
                 ppix += 1
         # PPI counts are halved for reporting (presumably each pair is listed in both directions - confirm)
         self.printLog('\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' %
                       (rje.integerString(len(fullppi)), rje.integerString(ppix // 2)))
         ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
         goodppi = self.loadFromFile(goodppifile, chomplines=True)
         self.dict['PPI'] = {}
         px = 0.0
         ptot = len(fullppi)
         fppix = ppix
         ppix = 0
         for hub in fullppi:
             self.progLog('\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' %
                          (px / ptot, rje.integerString(len(self.dict['PPI'])), rje.integerString(ppix)))
             px += 100.0
             self.dict['PPI'][hub] = []
             for spoke in fullppi[hub]:
                 ## Keep any spoke supported by one of the accepted evidence types ##
                 evidence = fullppi[hub][spoke]
                 goodspoke = False
                 for ptype in goodppi:
                     if rje.matchExp(r':(%s)($|\|)' % ptype, evidence):
                         goodspoke = True
                         break
                 if goodspoke:
                     self.dict['PPI'][hub].append(spoke)
                     continue
                 ## Otherwise reject the spoke if a third gene interacts with both hub and spoke ##
                 # A spoke that never appears as a hub has no recorded interactors of its own: treating that
                 # as an empty set avoids a KeyError that would otherwise abort the whole setup.
                 spokeppi = fullppi.get(spoke, {})
                 goodspoke = True
                 for spoke2 in fullppi[hub]:
                     if spoke2 in [hub, spoke]: continue
                     if spoke2 in spokeppi:
                         goodspoke = False
                         break
                 if goodspoke: self.dict['PPI'][hub].append(spoke)
             ppix += len(self.dict['PPI'][hub])
             if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)
         self.printLog('\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' %
                       (rje.integerString(len(fullppi)), rje.integerString(len(self.dict['PPI'])),
                        rje.integerString(fppix // 2), rje.integerString(ppix // 2)))
         ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
         scmd = ['accnr=F', 'seqnr=F',
                 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
         seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
         self.dict['SeqObj'] = seqlist.seqNameDic('Max')
         self.dict['Gene2Seq'] = gene2seq
         self.dict['Seq2Gene'] = seq2gene
         return True  # Setup successful
     except:
         self.errorLog('Problem during %s setup.' % self)
         return False  # Setup failed
Esempio n. 49
0
 def ANCHOR(self, retry=2):  ### Runs ANCHOR disorder prediction
     '''
     Runs ANCHOR disorder prediction on self.info['Sequence'].

     The sequence is written to a temporary fasta file, the 'anchor' program found via self.info['ANCHOR'] (the
     executable itself or its directory) is run on it, and the per-residue scores are parsed from its output.
     On success, self.list['ResidueDisorder'] holds one score per residue and self.list['RegionDisorder'] /
     self.list['RegionFold'] hold (start,end) tuples (1-based, inclusive) of residues scoring above / not above
     self.stat['IUCut']. self.minRegion() is then called.

     >> retry:int [2] = Number of further attempts to make if the prediction fails.
     << True if the prediction succeeded. False if it failed, in which case both region lists are emptied.
     '''
     name = 'None'  # Placeholder so that the error handler can report a name even if setup fails early
     try:  ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         sequence = self.info['Sequence'].upper()
         name = self.info['Name'][:4] + rje.randomString(8)
         tmp = name + '.tmp'
         ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
         apath = self.info['ANCHOR']
         if os.path.basename(apath) == 'anchor':
             apath = os.path.dirname(apath)
         anchor = rje.makePath(apath) + 'anchor'
         if not os.path.exists(anchor):
             self.errorLog('Path "%s" not found!' % anchor,
                           printerror=False)
             retry = 0  # Retrying cannot help if the program is missing
             raise IOError
         ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         # The temp file must be closed (flushed) before the external program reads it
         with open(tmp, 'w') as TMP:
             TMP.write('>%s\n%s\n' % (name, sequence))
         # NOTE(review): the command is run through the shell and tmp is derived from the sequence name
         acmd = '%s %s -d %s' % (anchor, tmp, apath)
         try:
             APIPE = os.popen(acmd)
             try:
                 dlines = APIPE.readlines()
             finally:
                 APIPE.close()
         finally:  # Each attempt uses a new random temp name, so clean up even if the run failed
             try:
                 os.unlink(tmp)
             except OSError:
                 self.errorLog('Cannot delete %s!' % tmp)
         ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         if self.info['Name'] not in ['', 'None']: name = self.info['Name']
         self.list['ResidueDisorder'] = []
         for d in dlines:
             if d[:1] == '#': continue
             dm = rje.matchExp(r'^(\d+)\s+(\S)\s+(\S+)', d)
             if not dm: continue
             pos = int(dm[0])
             aa = dm[1]
             score = float(dm[2])
             i = len(self.list['ResidueDisorder'])  # 0-based index of the residue expected next
             if sequence[i] != aa:
                 self.log.errorLog(
                     '%s: Position %d is %s in sequence but %s in ANCHOR output!'
                     % (name, pos, sequence[i], aa),
                     printerror=False)
                 raise ValueError
             if pos != (i + 1):
                 self.log.errorLog(
                     '%s: Position %d reached in ANCHOR output but previous results missing!'
                     % (name, pos),
                     printerror=False)
                 raise ValueError
             self.list['ResidueDisorder'].append(score)
         if len(self.list['ResidueDisorder']) != len(sequence):
             self.log.errorLog(
                 '%s: Sequence = %d aa but ANCHOR results stop at %s!' %
                 (name, len(sequence), len(self.list['ResidueDisorder'])),
                 printerror=False)
             raise ValueError
         ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         start = 0   # 1-based start of the open disorder region (0 = none open)
         fstart = 0  # 1-based start of the open fold region (0 = none open)
         i = 0
         dx = 0      # Total number of disordered residues
         while i < len(sequence):
             score = self.list['ResidueDisorder'][i]
             i += 1  # i is now the 1-based position of the residue being scored
             if not start and score > self.stat['IUCut']:  ### Start new disorder ###
                 start = i
             elif start and score <= self.stat['IUCut']:  ### End!
                 self.list['RegionDisorder'].append((start, i - 1))
                 dx += i - start
                 start = 0
             if not fstart and score <= self.stat['IUCut']:  ### Start new fold ###
                 fstart = i
             elif fstart and score > self.stat['IUCut']:  ### End!
                 self.list['RegionFold'].append((fstart, i - 1))
                 fstart = 0
         if start:  # Disorder region running to the end of the sequence
             self.list['RegionDisorder'].append((start, len(sequence)))
             dx += len(sequence) + 1 - start
         if fstart: self.list['RegionFold'].append((fstart, len(sequence)))
         self.minRegion()
         if self.opt['PrintLog']:
             self.log.printLog(
                 '\r#DIS',
                 'ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa'
                 % (len(self.list['RegionDisorder']), dx))
         return True
     except:
         if retry:
             self.printLog('#RETRY', 'Trying %s again...' % name)
             return self.ANCHOR(retry - 1)
         self.log.errorLog(
             'Error in Disorder.ANCHOR(%s). Disorder prediction failed.' %
             name)
         self.list['RegionDisorder'] = []
         self.list['RegionFold'] = []
         return False
Esempio n. 50
0
 def readHMMPFamSearch(self,resfile=None,readaln=False):  ### Reads HMM PFam Search Results into objects
     '''
     Reads hmmpfam Search Results into objects.

     One search object is added (via self._addSearch()) per domain named in the 'Parsed for domains:' tables and
     one hit plus alignment per table row, storing SbjStart, SbjEnd, Expect and BitScore.
     If self.opt['CleanRes'] is set, the results file is replaced with a reduced version that keeps only the
     sequences with hits, plus a footer line recording the number of sequences in the original file.

     >> resfile:str = Results File. Must exist: there is no fallback if None.
     >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
     << True if the file was read. False if it was missing, not an hmmpfam file, or an error occurred.
     '''
     try:
         ### Setup ###
         if not resfile or not os.path.exists(resfile):
             self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
             return False
         ## RegExps for one-line domain hits and for the footer of a previously reduced file ##
         re_domhit = r'\s+'.join([r'^(\S+)',r'\S+',r'(\d+)',r'(\d+)\D.+',r'(\S+)',r'(\S+)\s*$'])
         re_footer = r'End of rje_hmm reduced results file: (\d+) sequences in original'
         ## Search dictionary as results come back per sequence, not per HMM! ##
         pfam = {}   # Dictionary of {PFam name:search}
         hitx = 0    # Total number of hits
         ### Read in Search results ###
         with open(resfile,'r') as RESFILE: firstline = RESFILE.readline()
         if firstline.find('hmmpfam') != 0:
             self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
             if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                 os.unlink(resfile)
                 self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
             return False
         hitname = None
         hx = 0      # Number of query sequences read from this file
         seqx = 0    # Number of sequences in the original file (read from the footer of a reduced file)
         newresout = True    # Whether the current chunk of lines should be kept in the reduced file
         newresfile = '%s.partial' % resfile
         if os.path.exists(newresfile): os.unlink(newresfile)
         with open(resfile,'r') as RESFILE:     # Closed before the file is replaced below
             #i# The header line seeds the first chunk, so the loop starts at the second line to avoid duplicating it
             newres = [rje.chomp(RESFILE.readline())]
             line = RESFILE.readline()
             while line:
                 self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                 line = rje.chomp(line)
                 ## New Sequence ##
                 if rje.matchExp(r'^Query sequence:\s+(\S+)',line):
                     #i# Output the previous chunk if it had hits, then start a new one
                     if newres and newresout and self.opt['CleanRes']:
                         with open(newresfile,'a') as NEWRES: NEWRES.write('\n'.join(newres))
                     newres = ['',line]; newresout = False
                     hitname = rje.matchExp(r'^Query sequence:\s+(\S+)',line)[0]; hx += 1
                 ## One Line Data for hits ##
                 elif line.find('Parsed for domains:') == 0:
                     #i# Keep this line and the two table header lines, then read the first table row
                     newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                     line = rje.chomp(RESFILE.readline()); newres.append(line)
                     #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                     #--------        ------- ----- -----    ----- -----      -----  -------
                     #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                     # ... else ...
                     #         [no hits above thresholds]
                     while rje.matchExp(re_domhit,line):
                         newresout = True
                         (dom,start,end,score,evalue) = rje.matchExp(re_domhit,line)
                         if dom not in pfam:
                             pfam[dom] = self._addSearch()
                             pfam[dom].info['Name'] = dom
                         hit = pfam[dom]._addHit()
                         hit.info['Name'] = hitname
                         aln = hit._addAln()
                         aln.setStat({'SbjStart':int(start),'SbjEnd':int(end),'Expect':float(evalue),'BitScore':float(score)})
                         hitx += 1
                         self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                         line = rje.chomp(RESFILE.readline()); newres.append(line)
                 ## End of Protein ##
                 elif line[:2] == '//': hitname = None; newres.append(line)
                 ## Footer of reduced file: not kept, as a new footer is written below ##
                 elif rje.matchExp(re_footer,line):
                     seqx = int(rje.matchExp(re_footer,line)[0])
                 elif newres: newres.append(line)
                 line = RESFILE.readline()
         if newres and newresout and self.opt['CleanRes']:
             with open(newresfile,'a') as NEWRES: NEWRES.write('\n'.join(newres))
         if not seqx: seqx = hx
         if self.opt['CleanRes']:
             with open(newresfile,'a') as NEWRES:
                 NEWRES.write('\n'.join(['','End of rje_hmm reduced results file: %d sequences in original' % seqx]))
             os.unlink(resfile)
             os.rename(newresfile,resfile)
             self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
         self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
         return True
     except:
         self.log.errorLog('Calamity during readHMMPFamSearch(%s)' % (resfile))
         return False
Esempio n. 51
0
    def readHMMSearch(self,resfile=None,readaln=False):  ### Reads HMM Search Results into objects    #!# Needs tidying! #!#
        '''
        Reads hmmsearch Search Results into objects.

        A search object is added (via self._addSearch()) for each 'HMM file:' line. Hits are added from the
        'Scores for complete sequences' table and alignments (with QrySeq, AlnSeq and SbjSeq) from the
        alignment blocks that follow.

        >> resfile:str = Results File. Must exist: there is no fallback if None.
        >> readaln:boolean = whether to bother reading Alignments into objects [False] (!!!currently always True!!!)
        << True if the whole file was read, else False (the error is logged along with the stage reached).
        '''
        _stage = '<a> Setup'
        try:
            ### <a> ### Setup
            if not resfile or not os.path.exists(resfile):
                self.log.errorLog('Results file (%s) missing!' % resfile,False,False)
                raise IOError
            _hit_elements = [r'^(\S+):','domain',r'(\d+)','of',r'(\d+),','from',r'(\d+)','to',r'(\d+):','score',r'(\S+),','E','=',r'(\S+)']
            _hit_re = r'\s+'.join(_hit_elements)

            ### <b> ### Read in Search results
            _stage = '<b> Read Results'
            self.verbose(0,4,'Reading %s HMMer search results' % resfile,0)
            with open(resfile, 'r') as RESFILE:
                resline = [re.sub('\n','',line) for line in RESFILE.readlines()]
            search = None
            i = 0
            hitaln = 0
            if not resline or resline[0].find('hmmsearch') != 0:
                self.log.errorLog("File %s does not appear to be an hmmsearch results file" % resfile)
                raise ValueError

            while i < len(resline):
                line = resline[i]
                ## <i> ## Basic Search Info
                _stage = '<b-i> Basic Search Info'
                if line.find('HMM file:') == 0:
                    search = self._addSearch()
                    search.info['Name'] = rje.matchExp(r'HMM file:\s+(\S+)',line)[0]
                    self.verbose(0,4,'.',0)
                    self.verbose(1,3,'\n%s' % search.info['Name'],0)
                elif line.find('Sequence database:') == 0:
                    search.info['SearchDB'] = rje.matchExp(r'Sequence database:\s+(\S+)', line)[0]
                elif line.find('Total sequences searched:') == 0:
                    dbnum = rje.matchExp(r'Total sequences searched:\s+(\d\S*)', line)[0]
                    dbnum = re.sub(r'\D','',dbnum)     # Strip any non-digit characters from the number
                    search.stat['DBNum'] = int(dbnum)
                ## <ii> ## One-line hit data (BLASTHit)
                elif line.find('Scores for complete sequences') == 0: # One-line hits
                    _stage = '<b-ii> One-line hits'
                    i += 3  # Skip two lines
                    while re.search(r'^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+(\d+)\s*$',resline[i]):
                        match = rje.matchExp(r'^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+\d+\s*$',resline[i])
                        self.verbose(2,3,'\n - %s (%s, %s)' % match,0)
                        hit = search._addHit()
                        hit.info['Name'] = match[0]
                        hit.stat['BitScore'] = float(match[1])
                        evalue = match[2]
                        if evalue.find('e') == 0:   # E-value given as exponent only, e.g. e-100
                            evalue = '1' + evalue
                        hit.stat['E-Value'] = float(evalue)
                        i += 1
                    line = resline[i]   # End of one-lines (blank line)
                    self.verbose(1,3,'=> %d Hits' % search.hitNum(),1)
                    hitaln = 0
                #!# Make new No hits pattern match
                elif line.find('***** No hits found ******') >= 0:  # No Hits
                    search.hit = []
                    self.verbose(1,3,'=> %d Hits' % search.hitNum(),1)
                    hitaln = 0
                ## <iii> ## Aln Hit data (PWAln)
                #!# Consider reading in the 'parsed for domains' section instead/as well
                elif re.search(_hit_re,line):   # New aln hit
                    _stage = '<b-iii> Aln Hit Info'
                    # Identify hit object
                    _hit_detail = rje.matchExp(_hit_re,line)
                    hitname = _hit_detail[0]
                    try:
                        # Look for the one-line hit with the same name (the last one found is used)
                        for hit in search.hit:
                            if hit.info['Name'] == hitname:
                                hitaln = search.hit.index(hit)
                        if hitname != search.hit[hitaln].info['Name']:
                            self.log.errorLog('Problem with HMM results %s - %s single-line hits and alignments do not match' % (hitname,search.info['Name']),printerror=False,quitchoice=True)
                            i += 1
                            continue
                    except:
                        self.log.errorLog('Problem with HMM results reconciling %s - %s single-line hits and alignments.' % (hitname,search.info['Name']),True,True)
                        i += 1
                        continue
                    hit = search.hit[hitaln]
                    hitaln += 1
                    # Add details
                    _stage = '<b-iii> Add Aln Hit Info'
                    aln = hit._addAln()
                    aln.stat['SbjStart'] = int(_hit_detail[3])
                    aln.stat['SbjEnd'] = int(_hit_detail[4])
                    aln.stat['BitScore'] = float(_hit_detail[5])
                    aln.stat['Expect'] = float(_hit_detail[6])
                    ## <iv> ## Alignments
                    readaln = True
                    i += 1
                    while readaln:
                        _stage = '<b-iv> Read alignments'
                        line = resline[i]
                        block = rje.matchExp(r'^(\s+)(\S+)',line)
                        if block:
                            # Query Line
                            leadlen = len(block[0])
                            seqblock = block[1]
                            if block[1][:3] == '*->':    # Start
                                leadlen += 3
                                seqblock = seqblock[3:]
                            if block[1][-3:] == '<-*':  # End
                                seqblock = seqblock[:-3]
                                readaln = False
                            aln.info['QrySeq'] += seqblock
                            # Alignment Line
                            i += 1
                            aln.info['AlnSeq'] += resline[i][leadlen:(leadlen+len(seqblock))]
                            # Subject Line
                            i += 1
                            aln.info['SbjSeq'] += resline[i][leadlen:(leadlen+len(seqblock))]
                            # Skip Blank line
                            i += 2
                        else:
                            i += 1

                i += 1
            self.verbose(0,1,'Reading of %s HMM results complete! (%d Searches)' % (resfile,len(self.search)),2)
            return True
        except:
            self.log.errorLog('Calamity during readHMMSearch(%s) %s.' % (resfile,_stage))
            return False
Esempio n. 52
0
    def makeFlySeq(self, flybase='/scratch/Databases/NewDB/FlyBase/Fasta/', release='r5.5',
                   outfile='flybase_DROME.genes.fas'):  ### Main run method
        '''
        Main run method. Loads the FlyBase gene, CDS and exon fasta files, attaches each CDS and exon to its
        parent gene and rewrites every gene name to include its CDS and exon positions relative to the gene.
        - flybase = directory holding the dmel-all-*.fasta files (default is the original hard-coded path).
        - release = release tag used in the fasta file names.
        - outfile = fasta file that the re-annotated genes are saved to.
        Any exception is reported through self.log.errorLog(); nothing is returned.
        '''
        def parseLoc(pos):  ### Returns (start,end) integers parsed from a location string
            '''
            Returns (start,end) integers parsed from a complement(...), join(...) or plain X..Y location.
            For complement() the two numbers are swapped on purpose, so start > end marks the other strand.
            '''
            try: (end,start) = rje.matchExp(r'^complement\((\d+)\..*\.(\d+)\)',pos)
            except (TypeError,ValueError):    # No match means there is nothing to unpack
                try: (start,end) = rje.matchExp(r'^join\((\d+)\..*\.(\d+)\)',pos)
                except (TypeError,ValueError): (start,end) = rje.matchExp(r'^(\d+)\.\.(\d+)',pos)
            return (int(start),int(end))

        def storeLoc(seq,pos):  ### Stores parsed start/end and strand flag on a sequence object
            '''Stores parsed start/end and strand flag on a sequence object.'''
            (start,end) = parseLoc(pos)
            seq.opt['Complement'] = start > end        # Sequence on "lagging" strand
            seq.setStat({'Start':start,'End':end})

        def relativePos(gene,seqlist):  ### Returns sorted unique 'start-end' strings relative to gene start
            '''Returns sorted unique 'start-end' strings for seqlist, relative to the start of gene.'''
            glen = gene.aaLen()
            poslist = []
            for seq in seqlist:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    (start,end) = (gene.stat['Start'] - seq.stat['Start'],gene.stat['Start'] - seq.stat['End'])
                else:
                    (start,end) = (seq.stat['Start'] - gene.stat['Start'],seq.stat['End'] - gene.stat['Start'])
                poslist.append('%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen)))
            return rje.sortUnique(poslist,xreplace=False)

        try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            flybase = rje.makePath(flybase)
            scmd = ['accnr=F','seqnr=F','gnspacc=F']
            genes = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-gene-%s.fasta' % (flybase,release)]+scmd)
            cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-%s.fasta' % (flybase,release)]+scmd)
            exons = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-exon-%s.fasta' % (flybase,release)]+scmd)

            ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
            genedict = {}   # Dictionary of {ID:Sequence object}
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                (fbid,scaffold,pos,name,glen) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
                if int(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % fbid, printerror=False)
                genedict[fbid] = gene
                gene.setInfo({'Scaffold':scaffold,'Gene':name})
                storeLoc(gene,pos)
                gene.list['CDS'] = []       # Will add CDS sequences here
                gene.list['Exon'] = []      # Will add exon sequences here
            self.log.printLog('\r#GENE','Processing Gene Annotation complete!')

            ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (cx,ctot) = (0.0,cds.seqNum())
            for seq in cds.seq:
                self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
                cx += 100
                try: (fbid,scaffold,pos,name,glen,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the unparsed header before aborting
                    raise
                if int(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % fbid, printerror=False)
                seq.obj['Parent'] = gene = genedict[parent]
                storeLoc(seq,pos)
                gene.list['CDS'].append(seq)
            self.log.printLog('\r#CDS','Processing CDS Annotation complete!')

            ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            (ex,etot) = (0.0,exons.seqNum())
            for seq in exons.seq:
                self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
                ex += 100
                try: (fbid,scaffold,pos,name,parent) = rje.matchExp(r'^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
                except:
                    self.log.errorLog(seq.info['Name'])     # Report the unparsed header before aborting
                    raise
                seq.obj['Parent'] = gene = genedict[parent.split(',')[0]]   # First listed parent only
                storeLoc(seq,pos)
                gene.list['Exon'].append(seq)
            self.log.printLog('\r#EXON','Processing Exon Annotation complete!')

            ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            (gx,gtot) = (0.0,genes.seqNum())
            for gene in genes.seq:
                self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
                gx += 100
                clist = relativePos(gene,gene.list['CDS'])
                elist = relativePos(gene,gene.list['Exon'])
                gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),','.join(clist),','.join(elist))
            self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
            ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            genes.saveFasta(seqfile=outfile)

        except: self.log.errorLog(rje_zen.Zen().wisdom())
Esempio n. 53
0
    def makeHTML(self): ### Generates HTML pages for interactive navigation.
        '''
        Generates HTML pages for interactive navigation.
        Writes a front page (<basefile>.html) with tabbed tables of target hits, GABLAM results (only if the
        <basefile>.gablam.tdt table loaded) and candidate sequences, followed by one page per hit inside the
        <basefile>.HAQESAC/ directory with Previous/Next links and one tab for each output file found.
        Any exception is reported through self.errorLog(); nothing is returned.
        '''
        try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            basefile = self.basefile()
            scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'),'autoload=T','autofilter=F','seqmode=file']
            candseq = rje_seqlist.SeqList(self.log,scmd)
            # All files and directories are named after basefile:
            # *.fas = original target PROTEIN sequences (with original descriptions)
            scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'),'autoload=T','autofilter=F','seqmode=file']
            seqlist = rje_seqlist.SeqList(self.log,scmd)
            # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
            gabfile = '%s.gablam.tdt' % basefile
            gdb = self.db().addTable(gabfile,mainkeys=['Qry','Hit'],name='gablam',expect=False)
            # - Contains candidate proteins as Queries and Target proteins as hits
            # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
            haqdir = rje.makePath('./%s.HAQESAC/' % basefile)

            ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            hfile = '%s.html' % basefile
            hobj = self.obj['HTML']
            hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                        'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
            html = hobj.htmlHead(basefile)
            # Front page should have:
            html += '<h1>%s</h1>\n\n' % basefile
            htabs = []      # (tab_id, tab_html_text[, tab_title])
            # Target protein list (with links to HAQ HTML)
            # NOTE(review): 'Descripton' header spelling kept as-is in case anything downstream keys on it.
            ctext = '%s\n' % '\t'.join(['Name','Descripton','Length'])
            seqdict = seqlist.makeSeqNameDic('short')
            if gdb: hitlist = gdb.indexKeys('Hit')
            else: hitlist = rje.sortKeys(seqdict)
            for name in hitlist:
                seq = seqdict[name]
                cseq = [name,seqlist.seqDesc(seq),'%s aa' % seqlist.seqLen(seq)]
                acc = seqlist.seqAcc(seq)
                if os.path.exists('%s%s.log' % (haqdir,acc)):   # Only link hits that have a HAQESAC log
                    cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,cseq[0])
                ctext += '%s\n' % '\t'.join(cseq)
            htabs.append(('Hits',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Target sequences hit by candidates.'))
            # GABLAM/HMM table (with above links)
            if gdb:
                ctext = '%s\n' % '\t'.join(gdb.fields())
                with open(gabfile,'r') as gabin: glines = gabin.readlines()[1:]    # Skip header row
                for gline in glines:
                    # Strip the line ending so rows are newline-terminated once, like the other tables here
                    gline = gline.rstrip('\r\n')
                    if not gline: continue      # Blank lines have no Qry/Hit columns to link
                    gdata = gline.split('\t')
                    acc = gdata[0].split('__')[-1]
                    gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,gdata[0])
                    acc = gdata[1].split('__')[-1]
                    gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,gdata[1])
                    ctext += '%s\n' % '\t'.join(gdata)
                htabs.append(('GABLAM',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'GABLAM hit table.'))
            # Candidate list (with DB links)
            if candseq.seqNum():
                ctext = '%s\n' % '\t'.join(['AccNum','ID','Descripton','Length'])
                accdict = candseq.makeSeqNameDic('accnum')
                for acc in rje.sortKeys(accdict):
                    seq = accdict[acc]
                    cseq = [acc,candseq.seqID(seq),candseq.seqDesc(seq),'%s aa' % candseq.seqLen(seq)]
                    cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,acc)
                    ctext += '%s\n' % '\t'.join(cseq)
                htabs.append(('Candidates',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Candidate sequences to search.'))
            html += hobj.tabberHTML('GABLAM',htabs)
            html += hobj.htmlTail()
            with open(hfile,'w') as hout: hout.write(html)

            ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            #?# Move this to HAQESAC or MultiHAQ
            linkre = r'(: (\S+)_(\S+)__(\S+) )'     # ': ID_SPEC__ACC ' labels still lacking links
            for (i,fullhit) in enumerate(hitlist):
                hit = fullhit.split('__')[-1]
                logfile = '%s%s.log' % (haqdir,hit)
                seqbase = logfile[:-4]
                hfile = '%s.html' % seqbase
                html = hobj.htmlHead(seqbase)
                # Front page should have:
                html += '<h1>%s</h1>\n\n' % seqbase
                html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
                if i: html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i-1].split('__')[-1],hitlist[i-1])
                if i < len(hitlist)-1: html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (hitlist[i+1].split('__')[-1],hitlist[i+1])
                html += '</p>\n'
                htabs = []      # (tab_id, tab_html_text[, tab_title])
                for ftype in ['png','tree.txt','fas','nwk','log']:
                    seqfile = '%s.%s' % (seqbase,ftype)
                    if not os.path.exists(seqfile): continue
                    seqname = os.path.basename(seqfile)
                    tabtext = '<p><a href="./%s">./%s</a></p>\n' % (seqname,seqname)
                    if ftype == 'png':
                        tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (seqname,seqname)
                        tabdesc = 'PNG of %s tree.' % seqbase
                    else:
                        with open(seqfile,'r') as seqin: tabtext += '<pre>%s</pre>\n' % seqin.read()
                        if ftype == 'tree.txt':
                            # First link hits that have their own page, then send the rest to UniProt
                            for xref in hitlist:
                                reptext = '<a href="./%s.html">%s</a>' % (xref.split('__')[-1],xref)
                                tabtext = tabtext.replace(': %s ' % xref,': %s ' % reptext)
                            linkmatch = rje.matchExp(linkre,tabtext)
                            while linkmatch:
                                (oldtext,sid,spec,spacc) = linkmatch
                                newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid,spec,spec,spacc,spacc)
                                tabtext = tabtext.replace(oldtext,newtext)
                                linkmatch = rje.matchExp(linkre,tabtext)
                        tabdesc = '%s output' % seqfile
                    htabs.append((ftype,tabtext,tabdesc))
                if htabs: html += hobj.tabberHTML(os.path.basename(seqbase),htabs)
                else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
                html += hobj.htmlTail()
                with open(hfile,'w') as hout: hout.write(html)
        except: self.errorLog('Problem with %s.makeHTML()' % self.prog())
Esempio n. 54
0
    def loadTimePoints(self, filename):  ### Load TimePoints from file of various formats
        '''
        Load TimePoints from file of various formats.

        The format is chosen from the first line of the file:
        - starts with 'TimePoint Name': delimited text file, loaded as a temporary table and copied over;
        - starts with '(': database string, one quoted comma-separated record per line;
        - anything else: glossary text, where only lines containing '(TimePoint)' are parsed.
        Entries are added to the table returned by self.db('TimePoints').

        Returns True once the input has been summarised, False if the file is empty or an exception was
        raised, and whatever self.errorLog() returns if the file does not exist.
        '''
        try:  ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if not os.path.exists(filename):
                return self.errorLog('File %s missing!' % filename)
            with open(filename, 'r') as infile:
                data = infile.readlines()
            if not data:
                self.errorLog('File %s is empty!' % filename)
                return False
            db = self.db('TimePoints')

            ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            # The header field contains a space, so it must be matched as a prefix: a whitespace split can
            # never return 'TimePoint Name' as a single token.
            if data[0].startswith('TimePoint Name'):
                ftype = 'delimited text file'
                temp = self.db().addTable(filename,
                                          mainkeys=['TimePoint Name'],
                                          name='temp')
                for entry in temp.entries():
                    db.addEntry(entry)
                # NOTE(review): the temp table is removed via the database object that created it; the
                # original called deleteTable() on the TimePoints table itself - confirm against rje_db.
                self.db().deleteTable(temp)
            ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif data[0][:1] == '(':
                ftype = 'database string'
                fields = db.fields()
                for line in data:
                    line = rje.chomp(line).rstrip(' ')
                    if not line: continue   # Blank lines carry no record
                    # Drop the leading "('" and trailing "')," wrapper, then split on the quoted commas
                    pdata = line[2:-3].replace(', ', ',').split("','")
                    if rje.matchExp(r'^(\d+)$', pdata[0]):
                        pdata.pop(0)  # Database output with key ID numbers
                    entry = {}
                    for (i, field) in enumerate(fields):
                        entry[field] = pdata[i]
                    db.addEntry(entry)
            ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            else:
                ftype = 'glossary text file'
                for line in data:
                    if '(TimePoint)' not in line: continue
                    # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                    pdata = line.split('. ')
                    if pdata[2][-2:] == 'ya':
                        # The time was split in two by the '. ' separator: glue the parts back together
                        pdata[1] = '%s. %s' % (pdata[1], pdata.pop(2))
                    entry = {'TimePoint Name': pdata[0]}
                    try:
                        entry['Source URL'] = rje.matchExp(
                            r'Source: <(\S+)>', line)[0]
                    except:
                        self.errorLog('Cannot read Source URL')
                    try:
                        entry['TimePoint Description'] = rje.matchExp(
                            r'^(\S.+\S) Source: <',
                            '. '.join(pdata[2:]))[0]
                    except:
                        self.errorLog('Cannot read TimePoint Description: %s' %
                                      line)
                    if pdata[1][-2:] == 'ya':
                        [entry['Year'],
                         entry['yearUnit']] = pdata[1].split()[-2:]
                    else:
                        try:
                            ydata = rje.matchExp(r'(\d+) (\S+), (\d+) (\S+)$',
                                                 pdata[1])
                            if ydata:
                                for (key, value) in zip(
                                        ['Year', 'yearUnit', 'month', 'day'],
                                        ydata):
                                    entry[key] = value
                            else:
                                (entry['Year'],
                                 entry['yearUnit']) = rje.matchExp(
                                     r'(\d+) (\S+)$', pdata[1])
                        except:
                            self.errorLog('Cannot parse time from %s' %
                                          pdata[1])
                    kfield = [
                        'keyword1', 'keyword2', 'keyword3', 'keyword4',
                        'keyword5'
                    ]
                    try:
                        keywords = rje.matchExp(r'\(Keywords: (\S.+)\)',
                                                pdata[-1])[0].split(', ')
                        while keywords and kfield:
                            entry[kfield.pop(0)] = keywords.pop(0)
                        while kfield:   # Pad unused keyword fields
                            entry[kfield.pop(0)] = 'blank'
                        if keywords:
                            self.printLog(
                                '#ERR', '%d extra Keywords (%s)!' %
                                (len(keywords), ', '.join(keywords)))
                    except:
                        self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                    db.addEntry(entry)
            ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            self.printLog(
                '#TP', 'Timepoints read from %s: %s TimePoints total.' %
                (ftype, db.entryNum()))
            return True
        except:
            self.errorLog('%s.loadTimePoints(%s) error' % (self, filename))
            return False