def ensLoci(self):  ### Reads from EnsLoci file if it exists and parses into dictionaries.
    '''
    Reads from the EnsLoci fasta file (if present) and parses the headers into:
    - self.dict['EnsLoci'] = {EnsEMBL gene : short sequence name}
    - self.dict['EnsDesc'] = {EnsEMBL gene : description text}
    - self.dict['UniEns']  = {UniProt accession : EnsEMBL gene}
    If opt['FullEns'] is set, every parsed gene is also added to list['Genes']
    and given a placeholder dict['GeneCard'] entry.
    '''
    self.dict['EnsLoci'] = {}   # {EnsGene:shortName()}
    self.dict['EnsDesc'] = {}   # {EnsGene:Description}
    self.dict['UniEns'] = {}    # {UniProt accession:EnsGene}
    if not os.path.exists(self.info['EnsLoci']): return  # Nothing to parse
    elines = self.loadFromFile(self.info['EnsLoci'])
    (ex, etot) = (0.0, len(elines))
    while elines:
        ex += 100.0
        line = elines.pop(0)
        if line[:1] != '>': continue  # Annotation lives in fasta headers only
        if rje.matchExp('^>(\S+).+ gene:(\S+)\]', line):
            (name, gene) = rje.matchExp('^>(\S+).+ gene:(\S+)\]', line)
        else:
            self.log.errorLog('Problem with EnsLoci line: %s' % line, printerror=False)
            continue
        # Optional UniProt accession cross-reference
        try: acc = rje.matchExp('\[acc:(\S+)', line)[0]
        except: acc = ''
        if acc: self.dict['UniEns'][acc] = gene
        self.dict['EnsLoci'][gene] = name
        # Description = header text after the name, up to the ' [acc:' tag
        self.dict['EnsDesc'][gene] = string.join(string.split(string.split(line, ' [acc:')[0][1:])[1:])
        if self.opt['FullEns'] and gene not in self.list['Genes']: self.list['Genes'].append(gene)
        if self.opt['FullEns'] and gene not in self.dict['GeneCard']:
            self.dict['GeneCard'][gene] = {'EnsEMBL': gene, 'Symbol': '!FAILED!'}
        self.log.printLog('\r#ENS', 'Parsing EnsLoci %.1f%%: %s genes' % (ex / etot, rje.integerString(len(self.dict['EnsLoci']))), newline=False, log=False)
    self.log.printLog('\r#ENS', 'Parsing EnsLoci complete: %s genes' % (rje.integerString(len(self.dict['EnsLoci']))))
def tabulatePPIRegion(self):  ### Tabulates regions of known PPI from DAT file
    '''
    Tabulates regions of known PPI from DAT file. Greps ID and interaction
    REGION features from a UniFake DAT file and writes them to ppi_region.tdt
    with headers Protein/Start/End/Interactor. Returns True on success.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tabfile = 'ppi_region.tdt'
        unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
        # Reuse existing output unless force=T
        if os.path.exists(tabfile) and not self.opt['Force']:
            return self.printLog('#REGTAB', '%s found. (Force=F)' % tabfile)
        headers = ['Protein', 'Start', 'End', 'Interactor']
        rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
        ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # grep pulls each REGION line together with the preceding ID line (-B 1)
        gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
        self.printLog('#GREP', gcmd)
        prot = None; rx = 0; plist = []; ilist = []
        for gline in os.popen(gcmd).readlines():
            if rje.matchExp('ID (\S+)', gline): prot = rje.matchExp('ID (\S+)', gline)[0]
            if rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline):
                (rstart, rend, rint) = rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline)
                # One row per interactor name found after "interacts with"
                for ppi in string.split(rint):
                    if rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi):
                        datadict = {'Protein': prot, 'Start': rstart, 'End': rend,
                                    'Interactor': rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)[0]}
                        rje.delimitedFileOutput(self, tabfile, headers, datadict=datadict); rx += 1
                        if prot not in plist: plist.append(prot)
                        if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
            self.progLog('\r#REGTAB', 'Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx)))
        self.printLog('\r#REGTAB', 'Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx), tabfile))
        return True
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def convert(self, filelist=[], outfile=None):  ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile.
    >> filelist:list [] = input files; defaults to self.list['FileList'].
    >> outfile:str [None] = output file; defaults to self.info['Name'].
    << True on success, False if there is nothing to convert.
    '''
    try:
        ### Setup ###
        _stage = 'Setup'
        if len(filelist) < 1: filelist = self.list['FileList']
        if not outfile: outfile = self.info['Name']
        if len(filelist) < 1:
            self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile, printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        # Offer to swap the 3-letter extension if it disagrees with the delimiter
        if ext != outfile[-3:]:
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)): outfile = newfile
        self.log.printLog('#OUT', 'Converting %d file(s), output to %s.' % (len(filelist), outfile))
        ### Output File ###
        _stage = 'Output File'
        if not self.opt['Append'] or not os.path.exists(outfile):
            OUTFILE = open(outfile, 'w')  # Fresh file: write header row first
            headers = ['seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score', 'percentile', 'matchseq', 'sa']
            rje.writeDelimit(OUTFILE, headers, delimit)
        else:
            OUTFILE = open(outfile, 'a')
        ### Conversion ###
        _stage = 'Conversion'
        sx = 0  # Total results across all files
        for infile in filelist:
            if not os.path.exists(infile):
                self.log.errorLog('Input file %s does not exist! :o(' % infile, False, False)
                continue
            fx = 0  # Results from this file
            INFILE = open(infile, 'r')
            inline = rje.nextLine(INFILE)
            while inline != None:
                if rje.matchExp(re_scansite, inline):  # re_scansite is a module-level pattern
                    scanlist = rje.matchExp(re_scansite, inline)
                    rje.writeDelimit(OUTFILE, scanlist, delimit)
                    sx += 1; fx += 1
                    rje.progressPrint(self, sx)
                inline = rje.nextLine(INFILE)
            self.log.printLog('#OUT', '%s scansite results from %s. (%s Total.)' % (rje.integerString(fx), infile, rje.integerString(sx)))
            INFILE.close()
        ### End ###
        _stage = 'End'
        OUTFILE.close()
        self.log.printLog('#OUT', '%s scansite results output to %s.' % (rje.integerString(sx), outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage, printerror=True, quitchoice=False)
        raise
def parseDisorder(self):  ### Parses disordered regions from sequence name (e.g. DisProt download)
    '''
    Parses disordered regions from sequence name (e.g. DisProt download).
    #X-Y = disordered region [1.0]; &X-Y = ordered region [0.0]; All else neutral [0.5];
    '''
    try:
        ### Setup sequence and name ###
        sequence = self.info['Sequence']
        name = self.info['Name']
        self.list['ResidueDisorder'] = [0.5] * len(sequence)  # Neutral by default
        self.list['RegionDisorder'] = []
        scoredict = {'#': 1.0, '&': 0.0}  # Marker -> per-residue score
        ### Process ###
        for region in string.split(name)[1:]:  # Tokens after the sequence name
            if not rje.matchExp('^[#&](\d+)-(\d+)', region): continue  # Not a region token
            (i, x, y) = rje.matchExp('^([#&])(\d+)-(\d+)', region)
            score = scoredict[i]
            start = string.atoi(x) - 1  # 1-based input -> 0-based index
            end = string.atoi(y)
            for r in range(start, end): self.list['ResidueDisorder'][r] = score
            if i == '#': self.list['RegionDisorder'].append((start, end))
        self.minRegion()
        if self.opt['PrintLog']:
            self.log.printLog('\r#DIS', 'DisProt Disorder parsing complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']), self.list['ResidueDisorder'].count(1.0)))
        return True
    except:
        self.log.errorLog('Error in Disorder.foldIndex(%s)' % self.info['Name'], quitchoice=True)
        return False
def readAAProp(self, filename=None):  ### Reads AA Property Matrix from file
    '''
    Reads AA Property Matrix from file.
    >> filename:str = Filename. If None, will use self.info['Name']
    '''
    try:
        ### <a> ### Load and read
        if filename: self.info['Name'] = filename
        else: filename = self.info['Name']
        readtxt = 'Reading AA Properties from %s...' % filename
        self.progLog('\r#AAPROP', readtxt)
        proplines = self.loadFromFile(filename, v=2)
        ### <b> ### Process
        self.alphabet = []
        self.prop = {}
        ## <i> ## Properties and alphabet
        for line in proplines:
            line = rje.chomp(line)
            if line.find('#') == 0:  # Comment line
                continue
            elif line.find('PROP') == 0:  # Header line - has amino acids
                line = rje.matchExp('^\S+(\s.+)', line)[0]
                while re.search('^\s+\S.*', line):  # Consume one AA code at a time
                    (aa, line) = rje.matchExp('^\s+(\S)(.*)', line)
                    self.alphabet.append(aa)
                readtxt += ' ...%s' % string.join(self.alphabet)
                self.progLog('\r#AAPROP', readtxt)
            elif re.search('^\S', line) and self.alphabet:  # Property line
                (aaproperty, line) = rje.matchExp('^(\S+)(\s.+)', line)
                readtxt += ' ...%s' % aaproperty
                self.progLog('\r#AAPROP', readtxt)
                self.prop[aaproperty] = {}
                for aa in self.alphabet:  # One value per alphabet entry, in order
                    (p, line) = rje.matchExp('^\s+(\S)(.*)', line)
                    self.prop[aaproperty][aa] = p
        readtxt += ' ...Done!'
        self.printLog('\r#AAPROP', readtxt)
    except IOError:
        self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'], True)
        raise
    except:
        self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'], True)
        return
    # Ensure generic 'X' (any) and '-' (gap) characters are in the alphabet
    add = []
    if 'X' not in self.alphabet: add.append('X')
    if '-' not in self.alphabet: add.append('-')
    if add:
        add = self.alphabet + add
        self.useAlphabet(alphabet=add)
    self.makePropDif()
def parseOMIM(self):  ### Main parsing method
    '''
    Main parsing method. Reads the OMIM flat file (self.info['Name']) and fills:
    - self.dict['Records'] = {gene : [record numbers]}
    - self.dict['Mutations'] = {gene : {subid : (disease, mutation)}}
    Only single amino acid substitutions with recognised 3-letter codes are kept.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        # Upper-case list of valid 3-letter amino acid codes
        aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
        oline = os.path.exists(self.info['Name'])  # Truthy sentinel to enter the read loop
        (olen, ox, mx) = (len(open(self.info['Name'], 'r').readlines()), 0.0, 0)
        OMIM = open(self.info['Name'], 'r')
        ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        record = gene = subid = disease = mutation = ''
        av = False  # Whether reading *FIELD* AV for mutation data
        while oline:  # readline() returns '' at EOF, ending the loop
            oline = OMIM.readline()
            self.log.printLog('\r#OMIM', 'Processing OMIM: %.2f%% (%s genes)' % (ox / olen, rje.integerString(len(self.dict['Records']))), newline=False, log=False)
            ox += 100.0
            if not av and oline[:1] != '*': continue  # Only field markers matter outside AV
            line = rje.chomp(oline)
            while line[-1:] == ' ': line = line[:-1]  # Strip trailing spaces
            ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if line == '*RECORD*': (record, av) = ('', False)
            elif line == '*FIELD* NO':  # New record
                record = rje.chomp(OMIM.readline())
                gene = ''
                ox += 100.0
            ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* TI':  # New gene: symbol is last word of title line
                gene = string.split(rje.chomp(OMIM.readline()))[-1]
                subid = ''
                av = False
                ox += 100.0
            ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* AV': av = True  # Start of mutation records
            elif av and rje.matchExp('^(\.\d+)', line):  # New subid mutation record
                subid = rje.matchExp('^(\.\d+)', line)[0]
                disease = rje.chomp(OMIM.readline())
                ox += 100.0
                try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene, rje.chomp(OMIM.readline()))[0]
                except: continue  # No mutation or not coding change
                ox += 100.0
                subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)', mutation)
                if subaa[0] not in aas or subaa[1] not in aas: continue  # Not a real AA substitution
                if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                mx += 1
                self.dict['Mutations'][gene][subid] = (disease, mutation)
        ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OMIM.close()
        self.log.printLog('\r#OMIM', 'Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])), rje.integerString(mx)))
        self.saveMutations()
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise  # Delete this if method error not terrible
def readAAProp(self, filename=None):  ### Reads AA Property Matrix from file
    '''
    Reads AA Property Matrix from file into self.prop = {property:{aa:value}}
    and self.alphabet (list of single-letter codes from the PROP header line).
    >> filename:str = Filename. If None, will use self.info['Name']
    '''
    try:
        ### <a> ### Load and read
        if filename: self.info['Name'] = filename
        else: filename = self.info['Name']
        readtxt = 'Reading AA Properties from %s...' % filename
        self.progLog('\r#AAPROP', readtxt)
        proplines = self.loadFromFile(filename, v=2)
        ### <b> ### Process
        self.alphabet = []
        self.prop = {}
        ## <i> ## Properties and alphabet
        for propline in proplines:
            propline = rje.chomp(propline)
            if propline.find('#') == 0: continue  # Comment line
            elif propline.find('PROP') == 0:  # Header line - has amino acids
                propline = rje.matchExp('^\S+(\s.+)', propline)[0]
                while re.search('^\s+\S.*', propline):
                    (aa, propline) = rje.matchExp('^\s+(\S)(.*)', propline)
                    self.alphabet.append(aa)
                readtxt += ' ...%s' % string.join(self.alphabet)
                self.progLog('\r#AAPROP', readtxt)
            elif re.search('^\S', propline) and self.alphabet:  # Property line
                (aaproperty, propline) = rje.matchExp('^(\S+)(\s.+)', propline)
                readtxt += ' ...%s' % aaproperty
                self.progLog('\r#AAPROP', readtxt)
                self.prop[aaproperty] = {}
                # Values appear in the same order as the header alphabet
                for aa in self.alphabet:
                    (p, propline) = rje.matchExp('^\s+(\S)(.*)', propline)
                    self.prop[aaproperty][aa] = p
        readtxt += ' ...Done!'
        self.printLog('\r#AAPROP', readtxt)
    except IOError:
        self.log.errorLog('AA Property matrix file %s missing?' % self.info['Name'], True)
        raise
    except:
        self.log.errorLog('Major Problem reading AA Property matrix(%s)' % self.info['Name'], True)
        return
    # Add generic 'X' and gap '-' characters if missing
    add = []
    if 'X' not in self.alphabet: add.append('X')
    if '-' not in self.alphabet: add.append('-')
    if add:
        add = self.alphabet + add
        self.useAlphabet(alphabet=add)
    self.makePropDif()
def foldIndex(self):  ### Runs FoldIndex disorder prediction
    '''
    Runs FoldIndex disorder prediction via the Weizmann web service, retrying
    up to stat['FILoop'] times with stat['FISleep'] seconds between attempts.
    Fills list['RegionDisorder'] with (start,end) tuples and
    list['ResidueDisorder'] with 1.0/0.0 per residue. Returns True/False.
    '''
    try:
        ### Setup sequence and name ###
        sequence = self.info['Sequence']
        ### Run Disorder ###
        retry = self.stat['FILoop']
        url = "http://bioportal.weizmann.ac.il/fldbin/findex"
        params = "m=xml&sq=" + sequence + " "
        while retry:
            try: flines = urllib2.urlopen(url, params).readlines()
            except: flines = []  # Server/network failure: retry after a pause
            if flines: break
            retry -= 1
            time.sleep(self.stat['FISleep'])
        if not flines:  # All attempts failed
            self.log.errorLog('FoldIndex run for "%s" failed.' % self.info['Name'], printerror=False)
            self.list['ResidueDisorder'] = []
            self.list['RegionDisorder'] = []
            return False
        ### Process ###
        self.list['ResidueDisorder'] = [0.0] * len(sequence)  # Ordered by default
        self.list['RegionDisorder'] = []
        for f in flines:
            if rje.matchExp('<segment start="(\d+)" end="(\d+)" len="(\d+)"', f):
                fm = rje.matchExp('<segment start="(\d+)" end="(\d+)" len="(\d+)"', f)
                self.list['RegionDisorder'].append((string.atoi(fm[0]), string.atoi(fm[1])))
                # Mark each residue of the disordered segment (1-based -> 0-based)
                for i in range(string.atoi(fm[0]) - 1, string.atoi(fm[1])):
                    self.list['ResidueDisorder'][i] = 1.0
        self.minRegion()
        if self.opt['PrintLog']:
            self.log.printLog('\r#DIS', 'FoldIndex Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']), sum(self.list['ResidueDisorder'])))
        self.opt['Flat'] = True
        return True
    except:
        self.log.errorLog('Error in Disorder.foldIndex(%s)' % self.info['Name'], quitchoice=True)
        return False
def setup(self):  ### Main class setup method. Makes sumfile if necessary.
    '''
    Main class setup method. Makes sumfile if necessary: reads each MASCOT
    results file, maps protein accessions to sequences and writes one row per
    (search, protein, peptide) to self.info['SumFile']. Returns False on error.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.debug(self.getStrLC('SumFile')); self.debug(self.getStr('SumFile'))
        if self.getStrLC('Basefile') in ['', 'none']: self.baseFile(rje.baseFile(self.info['SumFile']))
        if self.getStrLC('SumFile') in ['', 'none']: self.info['SumFile'] = '%s.tdt' % self.basefile()
        self.printLog('#SUM', 'Summary file: %s' % self.getStr('SumFile'))
        # Existing summary file can be reused unless force=T
        if os.path.exists(self.info['SumFile']) and not self.opt['Force']:
            if rje.yesNo('%s found. Use these results?' % self.info['SumFile']):
                return self.printLog('#SUM', 'Summary results file found. No MASCOT processing.')
        mapgi = False
        ### ~ [2] Process MASCOT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mfile in self.list['ResFiles']:
            bud = budapest.Budapest(self.log, self.cmd_list + ['mascot=%s' % mfile])
            bud.info['Name'] = mfile
            bud.readMascot()
            self.dict['Searches'][mfile] = bud.dict['Hits']
            protacclist = rje.sortKeys(bud.dict['Hits'])
            for protacc in protacclist:
                if rje.matchExp('gi\|(\d+)', protacc): mapgi = True  # NCBI gi numbers present
            accfile = '%s.%s.protacc' % (self.baseFile(), rje.baseFile(mfile))
            self.debug(accfile)
            open(accfile, 'w').write(string.join(protacclist, '\n'))
            self.printLog('#MFILE', '%s: %s proteins.' % (mfile, rje.iLen(protacclist)))
        ## ~ [2a] gi Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        #if mapgi:
        #    mapgi = self.dict['MapGI'] = seqlist.seqNameDic('NCBI')
        #    open('mapgi.tmp','w').write(string.join(rje.sortKeys(mapgi),'\n'))
        ### ~ [3] Setup seqlist ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqlist = rje_seq.SeqList(self.log, ['gnspacc=T'] + self.cmd_list)
        self.dict['Acc2Seq'] = seqlist.seqNameDic('Max')
        ### ~ [4] Generate Summary File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        sumhead = string.split('search,prot_hit_num,prot_acc,prot_desc,pep_seq', ',')
        rje.delimitedFileOutput(self, self.info['SumFile'], sumhead, rje_backup=True)
        for mfile in rje.sortKeys(self.dict['Searches']):
            bud = self.dict['Searches'][mfile]
            for protacc in rje.sortKeys(bud)[0:]:
                protname = bud[protacc]['prot_acc']
                protdesc = bud[protacc]['prot_desc']
                if rje.matchExp('gi\|(\d+)', protacc):
                    gi = rje.matchExp('gi\|(\d+)', protacc)[0]
                    # Prefer the mapped sequence's name/description if available
                    try:
                        protname = self.dict['Acc2Seq'][gi].shortName()
                        protdesc = self.dict['Acc2Seq'][gi].info['Description']
                    except: protname = 'gi_UNK__%s' % gi
                #x#print protname, protdesc, bud[protacc]
                for pep in bud[protacc]['Peptides']:
                    data = {'search': rje.baseFile(mfile, True), 'prot_desc': protdesc, 'prot_acc': protname,
                            'pep_seq': pep, 'prot_hit_num': bud[protacc]['prot_hit_num']}
                    rje.delimitedFileOutput(self, self.info['SumFile'], sumhead, datadict=data)
    except:
        self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def readSLiMSearchOcc(self, motifs=[]):  ### Reads SLiMSearch results into data dictionary
    '''
    Reads SLiMSearch results into self.dict['Occ'] = {motif:{type:{gene:[occ data]}}},
    where type is 'ELM', 'Rev' (reversed motif) or 'Scr' (scrambled motif).
    Occurrences below stat['MinHom'] homologues, outside `motifs`, or for genes
    without GO annotation are skipped.
    >> motifs:list [] = motifs to keep.
    '''
    try:
        ### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): this warns but does not return; with motifs=[] every occurrence
        # is filtered out below, so the net result is an empty dict['Occ'].
        if not motifs: self.printLog('#OCC', 'Cannot process occurrences for No motifs!')
        occfile = '%s.csv' % self.info['ResFile']
        delimit = rje.delimitFromExt(filename=occfile)
        data = rje.dataDict(self, occfile, mainkeys=['Motif', 'Seq', 'Start_Pos', 'End_Pos'],
                            datakeys=string.split('Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA', ','))
        self.dict['Occ'] = {}
        ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (mx, ox, otot) = (0, 0.0, len(data))
        for occ in data:
            self.progLog('\r#OCC', 'Processing occurrences (%d motifs): %.2f%%' % (mx, ox / otot)); ox += 100.0
            #x#self.deBug('%s vs MinHom %d' % (data[occ],self.stat['MinHom']))
            if string.atoi(data[occ]['HomNum']) < self.stat['MinHom']: continue  # Too few homologues
            (motif, seq, start, end) = string.split(occ, delimit)
            if motif not in motifs: continue
            # Gene must be parseable from the description and have GO annotation
            try:
                gene = rje.matchExp('gene:(\S+)\]', data[occ]['Desc'])[0]
                self.deBug('%s:%s' % (gene, self.ensGO(gene)))
                if not self.ensGO(gene): continue
            except: continue
            # Classify motif: plain ELM, reversed control or scrambled control
            if motif[-3:] == 'rev': (motif, type) = (motif[:-4], 'Rev')
            elif motif[-5:] == 'scram': (motif, type) = (motif[:-6], 'Scr')
            else: type = 'ELM'
            if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1
            if type not in self.dict['Occ'][motif]: self.dict['Occ'][motif][type] = {}
            if gene not in self.dict['Occ'][motif][type]: self.dict['Occ'][motif][type][gene] = []
            self.dict['Occ'][motif][type][gene].append(data[occ])
        self.printLog('\r#OCC', 'Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot), mx))
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
def report(self):  ### Run qstat to get job list then showstart on each job
    '''Run qstat to get job list then showstart on each job.'''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        qidlist = []   # Queued job IDs, in qstat order
        qidjob = {}    # {job ID : job name}
        ### ~ [2] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for qline in os.popen('qstat'):
            try:  # Header/footer lines will not match and are skipped
                (qid, job) = rje.matchExp('^(\d+)\.\S+\s+(\S+)', qline)
                qidlist.append(qid)
                qidjob[qid] = job
            except: continue
        ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#QSTAT', '%d jobs in queue.' % len(qidlist))
        for qid in qidlist:
            self.printLog('#JOB', '%s = %s' % (qid, qidjob[qid]), timeout=False)
            for qline in os.popen('showstart %s' % qid):
                if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False)
        self.printLog('#ZEN', rje_zen.Zen().wisdom())
    except:
        self.errorLog('QSub.report problem')
def setupStatFilter(callobj, statlist=[], filterlist=[]):  ### Makes StatFilter dictionary from statlist and filterlist
    '''
    Makes StatFilter dictionary from statlist and filterlist (from cmd_list)
    !!! Changes case of statfilter keys. !!!
    >> callobj:RJE_Object [None] = calling object for Error Messages etc.
    >> statlist:list of stats that are allowed for filtering. Generally column headers for output.
    >> filterlist:list of StatFilters read in from commandline consisting of StatOperatorValue
    << statfilter:dictionary of StatFilter {Stat:(Operator,String,Numeric)}
    '''
    try:
        ## Setup dictionary ##
        statfilter = {}
        for filtercmd in filterlist:
            ## Extract details ##
            match = rje.matchExp('^(\S*[A-Za-z0-9])(>|>=|=<|=>|<=|==|=|<|!=|<>)(-*[A-Za-z0-9]\S*)$', filtercmd)
            if not match:
                callobj.log.errorLog('Filter "%s" not recognised.' % filtercmd, printerror=False)
                continue
            (stat, op, cutoff) = match
            ## Normalise operator to one of: >= <= == != > < ##
            if op == '<>': op = '!='
            if op == '=': op = '=='
            if op in ['=>', '=<']: op = rje.strReverse(op)  # '=>' -> '>=', '=<' -> '<='
            # Bug fix: the whitelist previously contained the pre-normalised forms
            # '=>'/'=<' (which can never occur after the conversion above) and
            # omitted '>='/'<=', so valid >=/<= filters were wrongly rejected.
            if op not in ['>=', '<=', '!=', '==', '>', '<']:
                callobj.log.errorLog('Filter "%s" operator "%s" not known!' % (filtercmd, op), printerror=False)
                continue
            ## Check for numeric value ##
            try: numcut = float(cutoff)
            except: numcut = None  # Non-numeric cutoff: string comparison only
            ## Check stat (case-insensitive match against allowed list) ##
            if stat not in statlist:
                for h in statlist:
                    if h.lower() == stat.lower():
                        stat = h
                        break
            if stat not in statlist:
                callobj.log.errorLog('Stat "%s" in filter "%s" not found.' % (stat, filtercmd), printerror=False)
                continue
            ## Update dictionary ##
            statfilter[stat] = (op, cutoff, numcut)
        ### Finish ###
        return statfilter
    except:
        callobj.log.errorLog('Error in rje_scoring.setupStatFilter()', quitchoice=True)
        return statfilter
def run(self, iterate=None, log=True):  ### Main run method
    '''
    Main run method. Builds a SLiM regular expression from self.list['Peptides']
    and (optionally) iterates, rebuilding from the matched subset until the SLiM
    matches all remaining peptides.
    >> iterate:bool [None] = override for the Iterate option.
    >> log:bool [True] = whether to write progress to the log.
    << (slim, message) tuple.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        slim = ''
        if iterate == None: iterate = self.getBool('Iterate')
        elif iterate: self.setBool({'Iterate': True})
        self.setup(log=log)
        self.setInt({'MinSeq': max(1, self.getInt('MinSeq'))})
        if len(self.list['Peptides']) < self.getInt('MinSeq'):
            if log: self.printLog('#MIN', 'Too few peptides (%d) for minseq=%d' % (len(self.list['Peptides']), self.getInt('MinSeq')))
            return ('', 'Too few peptides (%d) for minseq=%d' % (len(self.list['Peptides']), self.getInt('MinSeq')))
        if not self.list['Input']: self.list['Input'] = self.list['Peptides'][0:]  # Keep original input for REST output
        equiv = []
        if self.getBool('ExtendAA'):
            #self.warnLog('Equivalence mode (extendaa=T) not yet implemented! Please contact author.')
            self.printLog('#EQUIV', '[%s]' % string.join(self.list['Equiv'], '] ['))
            equiv = self.list['Equiv'][0:]
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        slim = rje_slim.makeSlim(self.list['Peptides'], self.getInt('MinSeq'), self.getNum('MinFreq'), self.getInt('MaxAA'), self, self.getStr('Ignore'), self.getBool('VarLength'), equiv)
        self.dict['Output']['slim'] = slim
        if log: self.printLog('#SLIM', 'SLiM generated: "%s"' % slim)
        if not slim: return (slim, 'Unable to make a SLiM with these settings and peptides')
        ## ~ [2a] ~ Assess matches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        matched = []
        # Wildcard character differs for DNA vs protein SLiMs
        if self.getBool('DNA'): regexp = string.replace(slim, 'N', '.')
        else: regexp = string.replace(slim, 'X', '.')
        for peptide in self.list['Peptides']:
            # Pad with X unless anchored to a terminus; strip gaps before matching
            searchpep = string.replace('X%sX' % peptide, '$X', '')
            searchpep = string.replace(searchpep, 'X^', '')
            searchpep = string.replace(searchpep, '-', '')
            try:
                if rje.matchExp('(%s)' % regexp, searchpep): matched.append(peptide)
            except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp, searchpep))
        sx = len(matched)
        matchstr = 'SLiM matches %d of %d sequences (%.1f%%).' % (sx, len(self.list['Peptides']), (100.0 * sx) / len(self.list['Peptides']))
        if log: self.printLog('#FREQ', matchstr)
        if iterate:
            self.dict['Output']['iterate'] += '%s: %s\n' % (slim, matchstr)
            self.dict['Output']['iterate'] += '-> %s\n' % string.join(matched, ',')
        # Iterate: rebuild from the matching subset until stable
        if iterate and (len(matched) != len(self.list['Peptides'])):
            if not matched: return (slim, 'Unable to make an interative SLiM with these settings and peptides')
            if self.getStrLC('PeptAlign'): self.list['Peptides'] = string.split(string.replace(string.join(matched), '-', ''))
            else: self.list['Peptides'] = matched
            return self.run(iterate=True)
        ### ~ [3] REST Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if iterate:
            (matchstr, matched) = self.inputMatches(regexp)  # Re-assess against the full input
            if log: self.printLog('#FREQ', matchstr)
        self.dict['Output']['match'] = matchstr
        self.dict['Output']['matches'] = string.join(matched, '\n')
        try:
            unmatched = self.list['Input'][0:]
            for pep in matched: unmatched.remove(pep)
            self.dict['Output']['unmatched'] = string.join(unmatched, '\n')
        except: self.dict['Output']['unmatched'] = self.errorLog('SLiMMaker Umatched Error')
        return (slim, matchstr)
    except: return (slim, self.errorLog('SLiMMaker Error'))
def addLinks(self, nested):  ### Adds href aname links to definitions.
    '''
    Adds href aname links to definitions. Walks the nested glossary dictionary;
    for each '=' definition it tokenises the text, converts <url>[text] markup
    into <a href> links, links recognised glossary terms to their anchors (or
    <scaps> for the 'tab' HTML style), and stores the result under '+'.
    Recurses into sub-dictionaries for all other keys.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        endstrip = [')', '.', ',', ':', ';', '!']  # Punctuation that may trail a linked term
        if self.getBool('Plurals'): endstrip.append('s')
        for term in rje.sortKeys(nested):
            if term == '=':
                linkdef = []  # Output tokens
                # '( ' padding lets a leading bracket precede a linkable word
                rawdef = string.split(string.replace(nested['='], '(', '( '))
                while rawdef:
                    glossary = self.dict['Glossary']
                    ## ~ Explicit <url>[text] hyperlinks ~ ##
                    if self.getBool('HRef') and rje.matchExp('<(\S+)>', rawdef[0]):
                        safetynet = rawdef[0:]  # Restore point if URL parsing fails
                        url = rje.matchExp('<(\S+)>', rawdef[0])[0]
                        if rje.matchExp('<(\S+)>\[(\S+)', rawdef[0]): rawdef[0] = '[%s' % rje.matchExp('<(\S+)>\[(\S+)', rawdef[0])[1]
                        elif rje.matchExp('<(\S+)>(\S+)', rawdef[0]): rawdef[0] = '[%s]%s' % (url, rje.matchExp('<(\S+)>(\S+)', rawdef[0])[1])
                        else: rawdef[0] = '[%s]' % url
                        try:
                            # Pull tokens until the closing ']' of the link text
                            while ']' not in rawdef[0]: rawdef[0] = '%s %s' % (rawdef[0], rawdef.pop(1))
                            (linktext, linkextra) = rje.matchExp('\[(.+)\](\S*)', rawdef.pop(0))
                            if url[:3] not in ['htt', 'ftp']: url = 'http://%s' % url
                            linkdef.append('<a href="%s">%s</a>%s' % (url, linktext, linkextra))
                            continue
                        except:
                            self.errorLog('Problem parsing URL from "%s"' % nested['='])
                            rawdef = safetynet
                    ## ~ Plain token: link only if it is (or strips to) a glossary term ~ ##
                    if rawdef[0].lower() not in glossary:
                        if rawdef[0].lower()[:-1] not in glossary or rawdef[0].lower()[-1] not in endstrip:
                            linkdef.append(rawdef.pop(0)); continue
                    ## ~ Walk as deep as possible into nested glossary terms ~ ##
                    akey = []; alink = []  # Anchor key parts and original tokens
                    while rawdef and (rawdef[0].lower() in glossary or rawdef[0].lower()[:-1] in glossary):
                        if rawdef[0].lower() in glossary and '=' in glossary[rawdef[0].lower()]: rterm = rawdef[0].lower()
                        elif len(rawdef) > 1 and rawdef[0].lower() in glossary and (rawdef[1].lower() in glossary[rawdef[0].lower()] or rawdef[1].lower()[:-1] in glossary[rawdef[0].lower()]): rterm = rawdef[0].lower()
                        elif rawdef[0].lower()[-1] in endstrip and rawdef[0].lower()[:-1] in glossary: rterm = rawdef[0].lower()[:-1]
                        elif rawdef[0].lower() in glossary: rterm = rawdef[0].lower()
                        else: break
                        glossary = glossary[rterm]
                        akey.append(rterm)
                        alink.append(rawdef.pop(0))
                    akey = string.join(akey, '_')
                    if '=' in glossary:  # Reached a real definition: emit link/scaps
                        alink = string.join(alink)
                        if nested == glossary: linkdef.append(alink)  # Do not self-link
                        elif self.getStr('HTMLStyle') != 'tab':
                            if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<a href="#%s">%s</a>%s' % (akey, alink[:-1], alink[-1]))
                            else: linkdef.append('<a href="#%s">%s</a>' % (akey, alink))
                        else:
                            if alink[-1] in endstrip and alink[-1] != 's': linkdef.append('<scaps>%s</scaps>%s' % (alink[:-1], alink[-1]))
                            else: linkdef.append('<scaps>%s</scaps>' % (alink))
                    else:  # Dead end: emit first token and push the rest back
                        linkdef.append(alink[0])
                        rawdef = alink[1:] + rawdef
                nested['+'] = string.replace(string.join(linkdef), '( ', '(')
                # Convert _word_ emphasis markup to <i>word</i>
                while rje.matchExp(' _([^_]+)_', nested['+']):
                    italics = rje.matchExp(' _([^_]+)_', nested['+'])[0]
                    nested['+'] = string.replace(nested['+'], ' _%s_' % italics, ' <i>%s</i>' % italics)
                #self.deBug(nested)
            elif term != '+': self.addLinks(nested[term])
    except: self.errorLog('%s.addLinks error' % self)
def stripTags(html, keeptags=[]):  ### Strips all HTML tag text from html code, except listed keeptags
    '''
    Strips all HTML tag text from html code, except listed keeptags.
    >> html:str = HTML text to strip.
    >> keeptags:list [] = tag names to leave in place (case-insensitive).
    << stripped text with double spaces collapsed.
    '''
    keeptags = string.split(string.join(keeptags).lower())  # Lower-cased working copy
    tagsplit = string.split(html, '<')
    newhtml = tagsplit.pop(0)  # Text before the first tag
    while tagsplit:
        tagtxt = tagsplit.pop(0)
        # Tag name = leading alphanumerics (after an optional backslash)
        tag = rje.matchExp('^\\\\?([A-Za-z0-9]+)', tagtxt)
        if tag and tag[0].lower() in keeptags: newhtml += '<%s' % tagtxt  # Keep whole tag
        elif tagtxt.find('>') >= 0: newhtml += ' %s' % tagtxt[tagtxt.find('>') + 1:]  # Drop tag, keep trailing text
    return string.replace(newhtml, '  ', ' ')
def run(self, iterate=None, log=True):  ### Main run method
    '''
    Main run method. Builds a SLiM regular expression from self.list['Peptides'],
    optionally iterating on the matched subset until the SLiM matches everything.
    >> iterate:bool [None] = override for the Iterate option.
    >> log:bool [True] = whether to write progress to the log.
    << (slim, message) tuple.
    '''
    try:
        ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        slim = ''
        if iterate == None: iterate = self.getBool('Iterate')
        elif iterate: self.setBool({'Iterate': True})
        if not self.setup(log=log): return ('', 'SLiMMaker setup failed. Check log.')
        if not self.list['Input']: self.list['Input'] = self.list['Peptides'][0:]  # Keep original input for REST output
        equiv = []
        if self.getBool('ExtendAA'):
            #self.warnLog('Equivalence mode (extendaa=T) not yet implemented! Please contact author.')
            self.printLog('#EQUIV', '[%s]' % string.join(self.list['Equiv'], '] ['))
            equiv = self.list['Equiv'][0:]
        ### ~ [2] ~ Add main run code here ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        slim = rje_slim.makeSlim(self.list['Peptides'], self.getInt('MinSeq'), self.getNum('MinFreq'), self.getInt('MaxAA'), self, self.getStr('Ignore'), self.getBool('VarLength'), equiv)
        self.dict['Output']['slim'] = slim
        if log: self.printLog('#SLIM', 'SLiM generated: "%s"' % slim)
        if not slim: return (slim, 'Unable to make a SLiM with these settings and peptides')
        ## ~ [2a] ~ Assess matches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        matched = []
        # Wildcard differs for DNA vs protein SLiMs
        if self.getBool('DNA'): regexp = string.replace(slim, 'N', '.')
        else: regexp = string.replace(slim, 'X', '.')
        for peptide in self.list['Peptides']:
            # Pad with X unless terminus-anchored; strip gaps before matching
            searchpep = string.replace('X%sX' % peptide, '$X', '')
            searchpep = string.replace(searchpep, 'X^', '')
            searchpep = string.replace(searchpep, '-', '')
            try:
                if rje.matchExp('(%s)' % regexp, searchpep): matched.append(peptide)
            except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp, searchpep))
        sx = len(matched)
        matchstr = 'SLiM matches %d of %d sequences (%.1f%%).' % (sx, len(self.list['Peptides']), (100.0 * sx) / len(self.list['Peptides']))
        if log: self.printLog('#FREQ', matchstr)
        if iterate:
            self.dict['Output']['iterate'] += '%s: %s\n' % (slim, matchstr)
            self.dict['Output']['iterate'] += '-> %s\n' % string.join(matched, ',')
        # Iterate: rebuild from the matching subset until stable
        if iterate and (len(matched) != len(self.list['Peptides'])):
            if not matched: return (slim, 'Unable to make an interative SLiM with these settings and peptides')
            if self.getStrLC('PeptAlign'): self.list['Peptides'] = string.split(string.replace(string.join(matched), '-', ''))
            else: self.list['Peptides'] = matched
            return self.run(iterate=True)
        ### ~ [3] REST Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if iterate:
            (matchstr, matched) = self.inputMatches(regexp)  # Re-assess against the full input
            if log: self.printLog('#FREQ', matchstr)
        self.dict['Output']['match'] = matchstr
        self.dict['Output']['matches'] = string.join(matched, '\n')
        try:
            unmatched = self.list['Input'][0:]
            for pep in matched: unmatched.remove(pep)
            self.dict['Output']['unmatched'] = string.join(unmatched, '\n')
        except: self.dict['Output']['unmatched'] = self.errorLog('SLiMMaker Umatched Error')
        return (slim, matchstr)
    except: return (slim, self.errorLog('SLiMMaker Error'))
def parse(self):    ### Parse REST file into dictionaries
    '''
    Parse REST output (file or server job retrieval) into dictionaries.
    Populates self.dict['Output'] (REST key -> content), self.dict['Outfile'] (REST key -> output
    filename) and self.list['RestKeys'] (ordered REST keys).
    << returns True on success; False on error. In PureAPI mode, returns the raw server response.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RestKeys'] = []
        # Base path for output files; may be replaced with the parsed job ID below if RestBase unset.
        rbase = '%s%s' % (self.getStr('RestOutDir'),rje.baseFile(self.getStr('RestBase'),strip_path=True,keepext=True))
        # RestIn is either an existing file, or a purely numeric job ID to fetch from the REST server.
        if rje.exists(self.getStr('RestIn')): restin = open(self.getStr('RestIn'),'r').read()
        elif rje.matchExp('^(\d+)$',self.getStr('RestIn')):
            url = '%sretrieve&jobid=%s&password=%s' % (self.getStr('RestURL'),self.getStr('RestIn'),self.getStr('Password'))
            if self.getBool('PureAPI') and self.getStrLC('Rest'): url += '&rest=%s' % (self.getStr('Rest'))
            else: url += '&rest=full'
            restin = urllib2.urlopen(url).read()
            if self.getBool('PureAPI'): return restin   # PureAPI: return raw response unparsed
        else: raise IOError('%s not found!' % self.getStr('RestIn'))
        jobid = None    # Set from the first (intro) section; flags that the header has been parsed
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Sections are separated by a fixed '###~~~...~~~###' delimiter line.
        for restdata in string.split(restin,'###~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~###\n'):
            if not jobid:
                # First section = intro: parse program name and job ID.
                self.dict['Output']['intro'] = restdata
                prog = rje.matchExp('Output for (\S+)',restdata)[0]
                self.dict['Output']['prog'] = prog
                jobid = rje.matchExp('JobID: (\d+)',restdata)[0]
                self.dict['Output']['jobid'] = jobid
                if not self.getStrLC('RestBase'): rbase = '%s%s' % (self.getStr('RestOutDir'),jobid)
                self.dict['Outfile']['jobid'] = '%s.jobid' % (rbase)
                continue
            # Subsequent sections start with a '# key: [filename]' header line.
            restlines = string.split(restdata,'\n')
            rparse = string.split(restlines.pop(0))
            if rparse[0] != '#': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            if rparse[1][-1] != ':': self.errorLog('REST output format error: %s' % string.join(rparse),printerror=False); continue
            rkey = rparse[1][:-1]
            try: rfile = '%s.%s' % (rbase,rje.baseFile(rparse[2],strip_path=True,keepext=True))
            except: rfile = ''  # No filename in header: fall back to the REST key
            if not rfile: rfile = '%s.%s' % (rbase,rkey)
            # Collapse a doubled job ID in the filename (e.g. "123.123." -> "123.").
            rfile = string.replace(rfile,'%s.%s.' % (jobid,jobid),'%s.' % jobid)
            self.dict['Output'][rkey] = string.join(restlines,'\n')
            self.dict['Outfile'][rkey] = rfile
            self.list['RestKeys'].append(rkey)
        self.printLog('#PARSE','Parsed %s: %d REST outputs.' % (self.getStr('RestIn'),len(self.dict['Output'])))
        return True
    except: self.errorLog('%s.parse error' % self); return False
def foldIndex(self): ### Runs FoldIndex disorder prediction '''Runs FoldIndex disorder prediction.''' try: ### Setup sequence and name ### sequence = self.info['Sequence'] ### Run Disorder ### retry = self.stat['FILoop'] url = "http://bioportal.weizmann.ac.il/fldbin/findex" params = "m=xml&sq=" + sequence + " " while retry: try: flines = urllib2.urlopen(url, params).readlines() except: flines = [] if flines: break retry -= 1 time.sleep(self.stat['FISleep']) if not flines: self.log.errorLog('FoldIndex run for "%s" failed.' % self.info['Name'],printerror=False) self.list['ResidueDisorder'] = [] self.list['RegionDisorder'] = [] return False ### Process ### self.list['ResidueDisorder'] = [0.0] * len(sequence) self.list['RegionDisorder'] = [] for f in flines: if rje.matchExp('<segment start="(\d+)" end="(\d+)" len="(\d+)"',f): fm = rje.matchExp('<segment start="(\d+)" end="(\d+)" len="(\d+)"',f) self.list['RegionDisorder'].append((string.atoi(fm[0]),string.atoi(fm[1]))) for i in range(string.atoi(fm[0])-1,string.atoi(fm[1])): self.list['ResidueDisorder'][i] = 1.0 self.minRegion() if self.opt['PrintLog']: self.log.printLog('\r#DIS','FoldIndex Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']),sum(self.list['ResidueDisorder']))) self.opt['Flat'] = True return True except: self.log.errorLog('Error in Disorder.foldIndex(%s)' % self.info['Name'],quitchoice=True) return False
def parseDisorder( self ): ### Parses disordered regions from sequence name (e.g. DisProt download) ''' Parses disordered regions from sequence name (e.g. DisProt download). #X-Y = disordered region [1.0]; &X-Y = ordered region [0.0]; All else neutral [0.5]; ''' try: ### Setup sequence and name ### sequence = self.info['Sequence'] name = self.info['Name'] self.list['ResidueDisorder'] = [0.5] * len(sequence) self.list['RegionDisorder'] = [] scoredict = {'#': 1.0, '&': 0.0} ### Process ### for region in string.split(name)[1:]: if rje.matchExp('^[#&](\d+)-(\d+)', region): (i, x, y) = rje.matchExp('^([#&])(\d+)-(\d+)', region) score = scoredict[i] start = string.atoi(x) - 1 end = string.atoi(y) for r in range(start, end): self.list['ResidueDisorder'][r] = score if i == '#': self.list['RegionDisorder'].append((start, end)) self.minRegion() if self.opt['PrintLog']: self.log.printLog( '\r#DIS', 'DisProt Disorder parsing complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']), self.list['ResidueDisorder'].count(1.0))) return True except: self.log.errorLog('Error in Disorder.foldIndex(%s)' % self.info['Name'], quitchoice=True) return False
def taxDict(self,taxid,store=False,skipuni=False):  ### Extracts taxonomy details from SpecFile for taxid
    '''
    Extracts taxonomy details from SpecFile for taxid. If taxid is a list, will process each element.
    >> taxid:str/int or list = NCBI taxonomy identifier(s).
    >> store:bool [False] = passed through on recursive (list) calls.
    >> skipuni:bool [False] = if True, skip the Uniprot SpecFile grep and go straight to NameMap.
    << taxdict:dict = {'spcode':..,'name':..,'common':..} for one taxid, or {taxid:taxdict} for a list.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxdict = {}
        ### ~ [1] Taxa List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Duck-typed list check: only a list will have sort(); a str/int taxid raises and drops through.
        tlist = True
        try: taxid.sort()
        except: tlist = False
        if tlist:
            tx = 0.0; ttot = len(taxid); mx = 0
            for t in taxid:
                self.progLog('\r#SPEC','Extracting Uniprot species details: %.1f%%' % (tx/ttot)); tx += 100.0
                taxdict[t] = self.taxDict(t,store)
                if not taxdict[t]: mx += 1
            self.printLog('\r#SPEC','Extracted Uniprot/NCBI species details for %s TaxID: %s missing' % (rje.iStr(ttot),rje.iStr(mx)))
            return taxdict
        ### ~ [2] Individual taxa ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        taxid = '%s' % taxid    # Normalise to string for dictionary keys and text matching
        if taxid in self.dict['TaxDict']: return self.dict['TaxDict'][taxid]
        ## ~ [2a] ~ Species code, scientific and common name from Uniprot SpecFile ~~~~~~~~~~~~ ##
        if not skipuni:
            # SpecFile entries: 'CODE KINGDOM TAXID: N=Scientific name' followed by 'C=Common name'.
            greplines = os.popen('grep -A 1 " %s:" %s' % (taxid, self.getStr('SpecFile'))).readlines()
            for entry in greplines:
                nmatch = rje.matchExp('^(\S+)\s+\S+\s+(\d+):\s+N=(\S.+)\s*$',entry)
                if nmatch and nmatch[1] != taxid: break # Next taxon
                if nmatch: taxdict['spcode'] = nmatch[0]; taxdict['name'] = nmatch[2]
                elif rje.matchExp('C=(\S.+)\s*$',entry): taxdict['common'] = rje.matchExp('C=(\S.+)\s*$',entry)[0]
        #if not taxdict and taxid in self.list['RankID']: self.warnLog('Cannot find TaxID "%s" in %s!' % (taxid,self.getStr('SpecFile')),'Missing_TaxID',suppress=True)
        ## ~ [2b] ~ Adding missing scientific names from NameMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not taxdict:
            # NameMap (NCBI names.dmp) lines: 'taxid\t|\tname\t|\tunique name\t|\tname class'.
            for entry in os.popen('grep -i -e "^%s\t" %s' % (taxid, self.getStr('NameMap'))).readlines():
                tdata = string.split(entry,'\t|\t')
                if not tdata[3].startswith('scientific name'): continue
                tname = tdata[1]
                # Bugfix: previously 'TaxID %d' was applied to a string taxid (TypeError) and
                # taxdict[name] referenced an undefined variable (NameError) - the warning itself
                # crashed when a second scientific name was hit. Now uses %s and taxdict['name'].
                if 'name' in taxdict: self.warnLog('TaxID %s hits "%s" and "%s"!' % (taxid, taxdict['name'],tname))
                else: taxdict['name'] = tname
        return taxdict
    except: self.errorLog('%s.taxDict() error' % (self)); raise
def parseTMHMM(self,file=None): ### Parses TMHMM into dictionary ''' Parses TMHMM into dictionary self.tmhmm. >> file:str = will read from file if given, else self.info['TMHMM'] ''' try: ### <a> ### Setup _stage = '<a> Setup' tmhmm_pattern = '^(\S+)\s+(len.+)$' if file == None: file = self.info['TMHMM'] if file == 'None': self.verbose(0,1,'No TMHMM file given.',2) return self.verbose(0,3,'Parsing TMHMM file %s...' % file,0) TMRES = open(file, 'r') ### <b> ### Read in _stage = '<b> Read in' while 1: tmline = re.sub('\t',' ',TMRES.readline()) if tmline: tmres = rje.matchExp(tmhmm_pattern,tmline) if tmres: acc = tmres[0] if rje.matchExp('^\S+__(\S+)',acc): acc = rje.matchExp('^\S+__(\S+)',acc)[0] self.tmhmm[acc] = {} reslist = string.split(tmres[1]) for res in reslist: split = string.split(res,'=') self.tmhmm[acc][split[0]] = split[1] else: break TMRES.close() self.verbose(0,1,'Done!',2) except: self.log.errorLog('Problem with parseTMHMM() %s.' % _stage)
def inputMatches(self,regexp): ### Returns the matches for the original peptides '''Returns the matches for the original peptides.''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### matched = [] for peptide in self.list['Input']: searchpep = string.replace('X%sX' % peptide,'$X','') searchpep = string.replace(searchpep,'X^','') searchpep = string.replace(searchpep,'-','') try: if rje.matchExp('(%s)' % regexp,searchpep): matched.append(peptide) except: self.errorLog('Error with SLiM/peptide match, %s vs %s' % (regexp,searchpep)) sx = len(matched) matchstr = 'SLiM matches %d of %d input sequences (%.1f%%).' % (sx,len(self.list['Input']),(100.0*sx)/len(self.list['Input'])) except: self.errorLog('Error with inputMatches()'); matchstr = 'Error with inputMatches()'; matched = [] return (matchstr,matched)
def setupStatFilter(callobj, statlist=[], filterlist=[]): ### Makes StatFilter dictionary from statlist and filterlist """ Makes StatFilter dictionary from statlist and filterlist (from cmd_list) !!! Changes case of statfilter keys. !!! >> callobj:RJE_Object [None] = calling object for Error Messages etc. >> statlist:list of stats that are allowed for filtering. Generally column headers for output. >> filterlist:list of StatFilters read in from commandline consisting of StatOperatorValue << statfilter:dictionary of StatFilter {Stat:(Operator,String,Numeric)} """ try: ## Setup dictionary ## statfilter = {} for filter in filterlist: ## Extract details ## match = rje.matchExp("^(\S*[A-Za-z0-9])(>|>=|=<|=>|<=|==|=|<|!=|<>)(-*[A-Za-z0-9]\S*)$", filter) if not match: callobj.log.errorLog('Filter "%s" not recognised.' % filter, printerror=False) continue (stat, op, cutoff) = match if op == "<>": op = "!=" if op == "=": op = "==" if op in ["=>", "=<"]: op = rje.strReverse(op) if op not in ["=>", "=<", "!=", "==", ">", "<"]: callobj.log.errorLog('Filter "%s" operator "%s" not known!' % (filter, op), printerror=False) continue ## Check for numeric value ## try: numcut = float(cutoff) except: numcut = None ## Check stat ## if stat not in statlist: for h in statlist: if h.lower() == stat.lower(): stat = h break if stat not in statlist: callobj.log.errorLog('Stat "%s" in filter "%s" not found.' % (stat, filter), printerror=False) continue ## Update dictionary ## statfilter[stat] = (op, cutoff, numcut) ### Finish ### return statfilter except: callobj.log.errorLog("Error in rje_scoring.setupStatFilter()", quitchoice=True) return statfilter
def saveMutations(self): ### Outputs parsed mutations into a delimited file '''Outputs parsed mutations into a delimited file.''' try:### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease'] outfile = 'omim_mutations.tdt' rje.delimitedFileOutput(self,outfile,headers,'\t',rje_backup=True) ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for gene in rje.sortKeys(self.dict['Mutations']): for subid in rje.sortKeys(self.dict['Mutations'][gene]): (disease,mutation) = self.dict['Mutations'][gene][subid] (wild,pos,mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)',mutation) datadict = {'OMIM_ID':string.join(self.dict['Records'][gene],'; '),'SubID':subid,'Gene':gene, 'Pos':pos,'WildAA':wild,'MutAA':mut,'Disease':disease} rje.delimitedFileOutput(self,outfile,headers,'\t',datadict) self.log.printLog('#OUT','OMIM Mutation output to %s complete' % outfile) except: self.log.errorLog(rje_zen.Zen().wisdom())
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. Loads pairwise PPI data, filters out likely complex-only interactions,
    and loads the matching EnsEMBL loci sequences.
    Populates self.dict['PPI'], self.dict['SeqObj'], self.dict['Gene2Seq'], self.dict['Seq2Gene']
    and self.obj['SeqList'].
    << returns True if setup successful; False on error.
    '''
    try:### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
        self.progLog('\r#PPI','Loading pairwise data...')
        pairwise = rje.dataDict(self,ppipairwise,['Hub','Spoke'],['Spoke','SpokeSeq','Evidence'])
        gene2seq = {}; seq2gene = {}    # Gene symbol <-> sequence name mappings
        fullppi = {}; px = 0.0; ptot = len(pairwise); ppix = 0
        # sortKeys() returns a static key list, so popping entries from pairwise during the loop
        # (to free memory as we go) is safe.
        for pair in rje.sortKeys(pairwise):
            self.progLog('\r#PPI','Processing full pairwise PPI: %.2f%%' % (px/ptot)); px += 100.0
            [hub,spoke] = string.split(pair,'\t')
            if spoke not in gene2seq:
                sseq = pairwise[pair]['SpokeSeq']
                # Sequence names are of the form 'ACC__rest'; map the accession back to the gene.
                gene2seq[spoke] = sseq; seq2gene[string.split(sseq,'__')[0]] = spoke
            if hub not in fullppi: fullppi[hub] = {}
            if spoke not in fullppi[hub]: fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']; ppix += 1
        # ppix counts each undirected interaction twice (hub->spoke and spoke->hub), hence /2.
        self.printLog('\r#PPI','Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)),rje.integerString(ppix/2)))
        ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # goodppi lists evidence types (e.g. binary/hybrid methods) that are trusted directly.
        goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
        goodppi = self.loadFromFile(goodppifile,chomplines=True)
        self.dict['PPI'] = {}
        px = 0.0; ptot = len(fullppi); fppix = ppix; ppix = 0
        for hub in fullppi:
            self.progLog('\r#PPI','Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px/ptot,rje.integerString(len(self.dict['PPI'])),rje.integerString(ppix))); px +=100.0
            self.dict['PPI'][hub] = []
            for spoke in fullppi[hub]:
                # Keep the spoke if its evidence string contains a trusted evidence type...
                goodspoke = False
                for ptype in goodppi:
                    if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]): goodspoke = True; break
                if goodspoke: self.dict['PPI'][hub].append(spoke); continue
                # ...otherwise keep it only if NO third partner of the hub also interacts with the
                # spoke (shared partners suggest co-membership of a complex rather than direct PPI).
                goodspoke = True
                for spoke2 in fullppi[hub]:
                    if spoke2 in [hub,spoke]: continue
                    if spoke2 in fullppi[spoke]: goodspoke = False; break
                if goodspoke: self.dict['PPI'][hub].append(spoke)
            ppix += len(self.dict['PPI'][hub])
            if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)     # Drop hubs left with no PPI
        self.printLog('\r#PPI','Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)),rje.integerString(len(self.dict['PPI'])),rje.integerString(fppix/2),rje.integerString(ppix/2)))
        ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
        scmd = ['accnr=F','seqnr=F','seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,scmd)
        self.dict['SeqObj'] = seqlist.seqNameDic('Max')
        self.dict['Gene2Seq'] = gene2seq; self.dict['Seq2Gene'] = seq2gene
        return True     # Setup successful
    except: self.errorLog('Problem during %s setup.' % self); return False  # Setup failed
def taxaChildren(self,taxid): ### Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts. '''Extracts TaxID children from TaxMap file and updates RankID and TaxMap dicts.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### # NB. This is very slow and so reading the while. self.debug(taxid) taxmap = self.dict['TaxMap'] if taxid in taxmap: return taxmap[taxid] ### ~ [1] Parse from TaxMap ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### taxmap[taxid] = [] for tline in os.popen('grep -e "\s%s\s" %s' % (taxid,self.getStr('TaxMap'))).readlines(): try: (child,parent,taxtype) = rje.matchExp('^(\d+)\s+\|\s+(\d+)\s+\|\s+(\S+)\s+',tline) except: continue if parent not in taxmap: taxmap[parent] = [] taxmap[parent].append(child) if taxtype in ['species','subspecies']: self.list['RankID'].append(child) self.progLog('\r#TAXID','Reading %s: %s TaxID' % (self.getStr('TaxMap'),rje.iLen(taxmap))) return taxmap[taxid] except: self.errorLog('%s.taxaChildren(%s) error' % (self,taxid)); raise
def report(self): ### Run qstat to get job list then showstart on each job '''Run qstat to get job list then showstart on each job .''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qidlist = [] qidjob = {} ### ~ [2] ~ Read in List of IDs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for qline in os.popen('qstat'): try: (qid,job) = rje.matchExp('^(\d+)\.\S+\s+(\S+)',qline) qidlist.append(qid) qidjob[qid] = job except: continue ### ~ [3] ~ Report ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.printLog('#QSTAT','%d jobs in queue.' % len(qidlist)) for qid in qidlist: self.printLog('#JOB', '%s = %s' % (qid,qidjob[qid]), timeout=False) for qline in os.popen('showstart %s' % qid): if rje.chomp(qline): self.printLog('#INFO', qline, timeout=False) self.printLog('#ZEN',rje_zen.Zen().wisdom()) except: self.errorLog('QSub.report problem')
def absCons(callobj,Occ,hithom,seqfrag,seqwt): ### Absolute conservation score. '''Absolute conservation score.''' try: ### Absolute matching of motif in corresponding homologous region ### Motif = Occ.obj['Motif'] hitcon = {} # Dictionary of {seq:conservation} for seq in hithom: hitcon[seq] = 0.0 if callobj.opt['ConsAmb']: # Search degenerate motif vlist = Motif.dict['Search'][0] else: # Search with matched variant vlist = [Occ.getData('Variant')] for variant in vlist: searchvar = '(%s)' % string.replace(variant,'X','[A-Z]') if rje.matchExp(searchvar,seqfrag[seq]): hitcon[seq] = 1.0 break ### Weight by distance? ### return consWeight(callobj,hitcon,seqwt) except: callobj.log.errorLog('Error in rje_motif_cons.absCons()',quitchoice=True) return
def saveMutations( self): ### Outputs parsed mutations into a delimited file '''Outputs parsed mutations into a delimited file.''' try: ### ~ [1] Setup output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### headers = [ 'OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease' ] outfile = 'omim_mutations.tdt' rje.delimitedFileOutput(self, outfile, headers, '\t', rje_backup=True) ### ~ [2] Output mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for gene in rje.sortKeys(self.dict['Mutations']): for subid in rje.sortKeys(self.dict['Mutations'][gene]): (disease, mutation) = self.dict['Mutations'][gene][subid] (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)', mutation) datadict = { 'OMIM_ID': string.join(self.dict['Records'][gene], '; '), 'SubID': subid, 'Gene': gene, 'Pos': pos, 'WildAA': wild, 'MutAA': mut, 'Disease': disease } rje.delimitedFileOutput(self, outfile, headers, '\t', datadict) self.log.printLog('#OUT', 'OMIM Mutation output to %s complete' % outfile) except: self.log.errorLog(rje_zen.Zen().wisdom())
def codons(self):   ### Main codons analysis method
    '''
    Main codons analysis method. Loads FlyBase CDS sequences, builds codon and nucleotide-triplet
    frequency tables (observed vs predicted from NT and codon frequencies, respecting exon
    boundaries), and outputs them to quad_triplet.tdt.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F', 'seqnr=F', 'gnspacc=F']
        cds = rje_seq.SeqList(self.log, self.cmd_list + ['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase] + scmd)
        gcode = rje_sequence.genetic_code   # {RNA codon: amino acid}; DNA codons converted T->U below
        ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nts = ['A', 'C', 'G', 'T']
        ntfreq = cds.aaFreq(alphabet=nts)
        codons = []         # List of codons
        obs_cfreq = {}      # Observed codon frequencies, per AA: {aa:{codon:freq}}
        nts_cfreq = {}      # Codon frequencies from NT frequencies, per AA
        obs_tfreq = {}      # Observed triplet frequencies (all reading frames)
        nts_tfreq = {}      # Predicted triplet frequencies from NT frequencies
        ocd_tfreq = {}      # Predicted triplet frequencies from observed codon frequencies
        ncd_tfreq = {}      # Predicted triplet frequencies from nt-predicted codon frequencies
        ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for n1 in nts:
            for n2 in nts:
                for n3 in nts:
                    cod = '%s%s%s' % (n1, n2, n3)
                    codons.append(cod)
                    aa = gcode[string.replace(cod, 'T', 'U')]
                    if aa not in obs_cfreq: obs_cfreq[aa] = {}
                    if aa not in nts_cfreq: nts_cfreq[aa] = {}
                    obs_cfreq[aa][cod] = 0.0
                    nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    obs_tfreq[cod] = 0.0
                    nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    ocd_tfreq[cod] = 0.0
                    ncd_tfreq[cod] = 0.0
        nts_tfreq = rje.dictFreq(nts_tfreq, total=False)    # Normalise triplet freq.
        for aa in nts_cfreq:
            nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa], total=False)    # Normalise codon freq. per AA
        self.log.printLog('#FREQ', 'Frequency dictionaries set up.')
        ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx, stot) = (0.0, cds.seqNum())
        # Iterate a copy (cds.seq[0:]) since bad sequences are removed from cds.seq during the loop.
        for seq in cds.seq[0:]:
            self.log.printLog('\r#OBS', 'Calculating observed codon frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
            sx += 100.0
            # Parse FlyBase FASTA name: ID, scaffold location, exon positions, name, length, parent.
            try:
                (id, scaffold, pos, name, glen, parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;', seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Exon coordinates come as complement(..), join(..) or a bare start..end range.
            try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)', pos)[0]
            except:
                try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)', pos)[0]
                except: exons = rje.matchExp('^(\d+\.\.\d+)', pos)[0]
            self.deBug(exons)
            exons = string.split(exons, ',')
            elen = []   # Exon lengths in transcript order
            try:
                for exon in exons:
                    (start, end) = string.split(exon, '..')
                    elen.append(string.atoi(end) - string.atoi(start) + 1)
            except:
                self.log.errorLog(id)
                cds.seq.remove(seq)
                continue
            # Complement strand: exon order in the name is genomic, so reverse for transcript order.
            if pos[:4] == 'comp': elen.reverse()
            seq.list['ExonLen'] = elen
            self.deBug(elen)
            if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id, printerror=False)
            # Integer vs float division check: true only when length is not a multiple of 3.
            if seq.aaLen() / 3 != seq.aaLen() / 3.0:
                self.log.errorLog('%s not a multiple of 3nt long!' % id, printerror=False)
                cds.seq.remove(seq)
                continue
            #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
            sequence = seq.info['Sequence'][0:]
            if string.count(sequence, 'N') > 0:
                self.log.errorLog('%s has 1+ Ns!' % id, printerror=False)
                cds.seq.remove(seq)
                continue
            # Count codons three bases at a time.
            while sequence:
                cod = sequence[:3]
                sequence = sequence[3:]
                aa = gcode[string.replace(cod, 'T', 'U')]
                obs_cfreq[aa][cod] += 1
        for aa in obs_cfreq:
            obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa], total=False)    # Normalise codon freq.
        self.log.printLog('\r#OBS', 'Calculating observed codon frequencies complete.')
        ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (sx, stot) = (0.0, cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#TRIP', 'Calculating triplet frequencies: %.1f%%' % (sx / stot), newline=False, log=False)
            sx += 100.0
            elen = seq.list['ExonLen']
            sequence = seq.info['Sequence'][0:]
            aa = ''
            cod = ''
            ax = 0  # Measure sequence length processed for exon boundary checks
            while sequence:
                prevcod = cod
                cod = sequence[:3]
                prevaa = aa
                sequence = sequence[3:]
                aa = gcode[string.replace(cod, 'T', 'U')]
                ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                # Triplets spanning an exon boundary are excluded (elen[0] = current exon end).
                for cod2 in obs_cfreq[aa]:
                    if elen[0] > ax + 3:    # Exon boundary beyond this codon
                        ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                        ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                    if prevaa:  # Look at overlap with previous codon
                        for cod1 in obs_cfreq[prevaa]:
                            for i in range(1, 3):
                                if elen[0] > ax + i:    # Exon boundary beyond overlap
                                    acod = cod1[i:] + cod2[:i]
                                    ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                    ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if elen[0] > ax + 3:    # Exon boundary beyond this codon
                    obs_tfreq[cod] += 1
                if prevcod:     # Look at overlap with previous codon
                    for i in range(1, 3):
                        if elen[0] > ax + i:    # Exon boundary beyond overlap
                            acod = prevcod[i:] + cod[:i]
                            obs_tfreq[acod] += 1
                # Check exons #
                ax += 3
                if ax >= elen[0]: ax -= elen.pop(0)     # Move into next exon, carrying remainder
        obs_tfreq = rje.dictFreq(obs_tfreq, total=False)
        ocd_tfreq = rje.dictFreq(ocd_tfreq, total=False)
        ncd_tfreq = rje.dictFreq(ncd_tfreq, total=False)
        self.log.printLog('\r#TRIP', 'Calculating triplet frequencies complete.')
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = [
            'Triplet', 'AA', 'Degen', 'Obs_Codon', 'NT_Codon', 'Obs_Trip',
            'NT_Trip', 'ObCod_Trip', 'NTCod_Trip'
        ]
        tfile = 'quad_triplet.tdt'
        rje.delimitedFileOutput(self, tfile, headers, rje_backup=True)
        for cod in codons:
            aa = gcode[string.replace(cod, 'T', 'U')]
            datadict = {
                'Triplet': cod,
                'AA': aa,
                'Degen': len(obs_cfreq[aa]),    # Number of synonymous codons for this AA
                'Obs_Codon': obs_cfreq[aa][cod],
                'NT_Codon': nts_cfreq[aa][cod],
                'Obs_Trip': obs_tfreq[cod],
                'NT_Trip': nts_tfreq[cod],
                'ObCod_Trip': ocd_tfreq[cod],
                'NTCod_Trip': ncd_tfreq[cod]
            }
            rje.delimitedFileOutput(self, tfile, headers, datadict=datadict)
        self.log.printLog('#OUT', 'Triplet & codon data output to %s' % tfile)
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def addLinks(self, nested):     ### Adds href aname links to definitions.
    '''
    Adds href/aname links to glossary definitions, recursing through the nested glossary dictionary.
    For each node, the raw definition text under key '=' is tokenised and rewritten into key '+'
    with: (a) <url> markers converted to <a href> links; (b) runs of tokens that spell out another
    glossary term converted to internal '#anchor' links (or <scaps> in 'tab' HTML style);
    (c) _underscored_ words converted to <i>italics</i>.
    >> nested:dict = glossary (sub)tree; keys are terms, plus '=' (raw definition) and '+' (linked).
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Trailing characters that may be stripped from a token when matching glossary terms.
        endstrip = [')', '.', ',', ':', ';', '!']
        if self.getBool('Plurals'): endstrip.append('s')    # Allow plural 's' to be stripped too
        for term in rje.sortKeys(nested):
            if term == '=':
                linkdef = []    # Output tokens (with links added)
                # '(' is padded to '( ' so bracketed words split into separate tokens; undone below.
                rawdef = string.split(string.replace(nested['='], '(', '( '))
                while rawdef:
                    glossary = self.dict['Glossary']    # Reset to glossary root for each token run
                    ## URL tokens of the form <url>, <url>[text] or <url>text ##
                    if self.getBool('HRef') and rje.matchExp('<(\S+)>', rawdef[0]):
                        safetynet = rawdef[0:]  # Copy to restore token list if URL parsing fails
                        url = rje.matchExp('<(\S+)>', rawdef[0])[0]
                        # Rewrite token into '[linktext]extra' form for the extraction below.
                        if rje.matchExp('<(\S+)>\[(\S+)', rawdef[0]):
                            rawdef[0] = '[%s' % rje.matchExp('<(\S+)>\[(\S+)', rawdef[0])[1]
                        elif rje.matchExp('<(\S+)>(\S+)', rawdef[0]):
                            rawdef[0] = '[%s]%s' % (url, rje.matchExp('<(\S+)>(\S+)', rawdef[0])[1])
                        else: rawdef[0] = '[%s]' % url
                        try:
                            # Pull in following tokens until the closing ']' of the link text.
                            while ']' not in rawdef[0]:
                                rawdef[0] = '%s %s' % (rawdef[0], rawdef.pop(1))
                            (linktext, linkextra) = rje.matchExp('\[(.+)\](\S*)', rawdef.pop(0))
                            if url[:3] not in ['htt', 'ftp']: url = 'http://%s' % url
                            linkdef.append('<a href="%s">%s</a>%s' % (url, linktext, linkextra))
                            continue
                        except:
                            self.errorLog('Problem parsing URL from "%s"' % nested['='])
                            rawdef = safetynet  # Restore tokens and fall through to term matching
                    ## Plain token: keep as-is unless it (possibly minus a stripped last char)
                    ## starts a glossary term.
                    if rawdef[0].lower() not in glossary:
                        if rawdef[0].lower()[:-1] not in glossary or rawdef[0].lower()[-1] not in endstrip:
                            linkdef.append(rawdef.pop(0))
                            continue
                    ## Walk down the glossary tree consuming as many tokens as match a term path. ##
                    akey = []   # Matched term path components (for the '#anchor' name)
                    alink = []  # Original tokens consumed (for the link text)
                    while rawdef and (rawdef[0].lower() in glossary or rawdef[0].lower()[:-1] in glossary):
                        # Preference order: exact token that is itself a defined term; exact token
                        # whose subtree continues with the next token; stripped token; exact token.
                        if rawdef[0].lower() in glossary and '=' in glossary[rawdef[0].lower()]:
                            rterm = rawdef[0].lower()
                        elif len(rawdef) > 1 and rawdef[0].lower() in glossary and (rawdef[1].lower() in glossary[rawdef[0].lower()] or rawdef[1].lower()[:-1] in glossary[rawdef[0].lower()]):
                            rterm = rawdef[0].lower()
                        elif rawdef[0].lower()[-1] in endstrip and rawdef[0].lower()[:-1] in glossary:
                            rterm = rawdef[0].lower()[:-1]
                        elif rawdef[0].lower() in glossary:
                            rterm = rawdef[0].lower()
                        else: break
                        glossary = glossary[rterm]
                        akey.append(rterm)
                        alink.append(rawdef.pop(0))
                    akey = string.join(akey, '_')
                    if '=' in glossary:
                        # Full term matched: emit an internal link (or <scaps> for 'tab' style).
                        alink = string.join(alink)
                        if nested == glossary: linkdef.append(alink)    # Don't link a term to itself
                        elif self.getStr('HTMLStyle') != 'tab':
                            # Keep stripped punctuation outside the link (but not plural 's').
                            if alink[-1] in endstrip and alink[-1] != 's':
                                linkdef.append('<a href="#%s">%s</a>%s' % (akey, alink[:-1], alink[-1]))
                            else: linkdef.append('<a href="#%s">%s</a>' % (akey, alink))
                        else:
                            if alink[-1] in endstrip and alink[-1] != 's':
                                linkdef.append('<scaps>%s</scaps>%s' % (alink[:-1], alink[-1]))
                            else: linkdef.append('<scaps>%s</scaps>' % (alink))
                    else:
                        # Partial match only: emit first token plainly and push the rest back.
                        linkdef.append(alink[0])
                        rawdef = alink[1:] + rawdef
                nested['+'] = string.replace(string.join(linkdef), '( ', '(')
                # Convert ' _word_' markup to italics.
                while rje.matchExp(' _([^_]+)_', nested['+']):
                    italics = rje.matchExp(' _([^_]+)_', nested['+'])[0]
                    nested['+'] = string.replace(nested['+'], ' _%s_' % italics, ' <i>%s</i>' % italics)
                #self.deBug(nested)
            elif term != '+': self.addLinks(nested[term])   # Recurse into sub-terms
    except: self.errorLog('%s.addLinks error' % self)
def makeHistory(self):  ### Extracts history information from docstrings.
    '''
    Extracts version history information from module history() method docstrings, adds new
    version entries to the 'history' database table, and optionally writes an updates.html summary.
    Reads the 'Module' and 'Method' tables; uses self.obj['PyDoc'] for HTML settings.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hdb = self.db('history')
        mdb = self.db('Module')
        pydoc = self.obj['PyDoc']
        uhtml = []  # Update HTML text
        udir = ''   # SourceDir of the last HTML section header emitted
        ### ~ [1] Work through python modules ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for mkey in mdb.dataKeys():
            entry = mdb.data(mkey)
            pyfile = entry['File']
            mod = entry['Module']
            prev = '-'  # Most recent version already present in the history table
            lastv = ''  # Most recent version parsed from the history docstring
            ## ~ [1a] Parse out history text ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            history = []    # History lines
            updates = []    # Version number entries
            for mentry in self.db('Method').indexEntries('File', pyfile):
                if mentry['Method'] == 'history':
                    try:
                        history = string.split(mentry['DocString'], '\n')
                        break
                    except:
                        history = []
                        self.errorLog('History parsing problem: %s' % mentry, printerror=False)
            for dline in history:
                # History lines look like '# 1.2.3 - Description of update'.
                # NOTE(review): the guard regex uses '\d+\.\d+' but the extraction regex uses
                # '\d\.\d+' (single leading digit) - a line with a multi-digit major version
                # (e.g. '# 10.0 - ...') passes the guard but fails extraction, raising here and
                # aborting via the outer except. Confirm and align the two patterns.
                if rje.matchExp('# (\d+\.\d+\.?\d*)\s?-\s(\S.+)$', dline):
                    (v, text) = rje.matchExp('# (\d\.\d+\.?\d*)\s?-\s(\S.+)$', dline)
                    if v == lastv:  # Continuation of update
                        if prev == lastv: continue  # In previous release
                        updates[-1]['Update'] += ' %s' % text
                        continue
                    lastv = v
                    ventry = {
                        'Dir': entry['SourceDir'],
                        'Module': mod,
                        'Version': v,
                        'Update': text,
                        'Release': rje.dateTime(dateonly=True)
                    }
                    vkey = hdb.makeKey(ventry)
                    # Already in the history table = part of a previous release; else a new update.
                    if hdb.data(vkey): prev = v
                    else: updates.append(ventry)
            ## ~ [1b] Assess/report updates ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if prev != lastv:   # New version entries found since the last recorded release
                if entry['SourceDir'] != udir:
                    uhtml.append('<h2>Updates in %s/:</h2>\n' % entry['SourceDir'])
                    udir = entry['SourceDir']
                if prev == '-' and lastv:
                    # No previous history at all: new/renamed/moved module.
                    self.printLog('#VNUM', '%s: Creation -> Version %s' % (mod, entry['Version']))
                    uhtml.append('<p><b>&bull; %s:</b> <i>Created/Renamed/moved.</i>' % mod)
                elif prev not in ['-', lastv]:
                    self.printLog('#VNUM', '%s: Version %s -> Version %s' % (mod, prev, entry['Version']))
                    uhtml.append('<p><b>&bull; %s:</b> <i>Updated from Version %s.</i>' % (mod, prev))
                for ventry in updates:
                    self.printLog('#V%s' % ventry['Version'], ventry['Update'])
                    uhtml.append('<br>&rarr; Version %s: %s' % (ventry['Version'], ventry['Update']))
                    hdb.addEntry(ventry)
                if uhtml and uhtml[-1] != '</p>': uhtml.append('</p>')
            if lastv != entry['Version']:
                self.warnLog('Module %s Version %s but history() ends at %s' % (mod, entry['Version'], lastv))
                self.deBug('>>>')
        if 'history' in self.list['Output']: hdb.saveToFile(backup=False)
        ### ~ [2] Make updates.html file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if 'updates' in self.list['Output']:
            htmlfile = '%s.updates.html' % self.basefile()
            title = 'SLiMSuite updates'
            stylesheets = []
            for css in pydoc.list['StyleSheets']: stylesheets.append(pydoc.getStr('StylePath') + css)
            htmlhead = rje_html.htmlHead(title, stylesheets, tabber=True, frontpage=False, nobots=False, keywords=pydoc.list['Keywords'], javascript=pydoc.getStr('StylePath'))
            htmlbody = string.join(['<h1>SLiMSuite updates</h1>'] + uhtml, '\n')
            # Tail is '<author> <year>' (year = last word of asctime).
            htmltail = rje_html.htmlTail('%s %s' % (pydoc.getStr('Author'), string.split(time.asctime(time.localtime(time.time())))[-1]))
            open(htmlfile, 'w').write(htmlhead + htmlbody + htmltail)
            self.printLog('#HTML', 'HTML update summary output to %s' % (htmlfile))
    except: self.errorLog('Error in %s.makeHistory()' % self.prog())
def codons(self):   ### Main codons analysis method
    '''
    Main codons analysis method. Loads D. melanogaster CDS sequences, builds codon and nucleotide
    triplet frequency tables (observed vs predicted from nt/codon frequencies, respecting exon
    boundaries), and outputs them to quad_triplet.tdt.
    '''
    try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): hard-coded FlyBase r5.5 CDS path - presumably site-specific; confirm before reuse.
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F','seqnr=F','gnspacc=F']
        cds = rje_seq.SeqList(self.log, self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
        gcode = rje_sequence.genetic_code       # RNA codon -> amino acid lookup (hence T->U below)
        ### ~ [1] ~ Make codon frequency tables (a) Observed, (b) Based on NTFreq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        nts = ['A','C','G','T']
        ntfreq = cds.aaFreq(alphabet=nts)       # Single-nucleotide frequencies across the CDS set
        codons = []         # List of codons
        obs_cfreq = {}      # Observed codon frequencies, keyed {aa:{codon:freq}}
        nts_cfreq = {}      # Codon frequencies predicted from NT frequencies, keyed {aa:{codon:freq}}
        obs_tfreq = {}      # Observed triplet frequencies (any reading frame)
        nts_tfreq = {}      # Predicted triplet frequencies from NT frequencies
        ocd_tfreq = {}      # Predicted triplet frequencies from observed codon frequencies
        ncd_tfreq = {}      # Predicted triplet frequencies from nt-predicted codon frequencies
        ## ~ [1a] ~ Setup dictionaries using nt freqs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        for n1 in nts:
            for n2 in nts:
                for n3 in nts:
                    cod = '%s%s%s' % (n1,n2,n3)
                    codons.append(cod)
                    aa = gcode[string.replace(cod,'T','U')]
                    if aa not in obs_cfreq: obs_cfreq[aa] = {}
                    if aa not in nts_cfreq: nts_cfreq[aa] = {}
                    obs_cfreq[aa][cod] = 0.0
                    nts_cfreq[aa][cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    obs_tfreq[cod] = 0.0
                    nts_tfreq[cod] = ntfreq[n1] * ntfreq[n2] * ntfreq[n3]
                    ocd_tfreq[cod] = 0.0
                    ncd_tfreq[cod] = 0.0
        nts_tfreq = rje.dictFreq(nts_tfreq,total=False)     # Normalise triplet freq.
        for aa in nts_cfreq: nts_cfreq[aa] = rje.dictFreq(nts_cfreq[aa],total=False)    # Normalise codon freq. (per aa)
        self.log.printLog('#FREQ','Frequency dictionaries set up.')
        ## ~ [1b] ~ Observed codon freq ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx,stot) = (0.0,cds.seqNum())
        for seq in cds.seq[0:]:     # Iterate a copy: failing sequences are removed from cds.seq below
            self.log.printLog('\r#OBS','Calculating observed codon frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
            sx += 100.0
            # Parse FlyBase CDS name line for location/exon structure
            try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Exon coordinate string may be complement(...), join(...) or a plain start..end range
            try: exons = rje.matchExp('^complement\((\d+\..*\.\d+)\)',pos)[0]
            except:
                try: exons = rje.matchExp('^join\((\d+\..*\.\d+)\)',pos)[0]
                except: exons = rje.matchExp('^(\d+\.\.\d+)',pos)[0]
            self.deBug(exons)
            exons = string.split(exons,',')
            elen = []       # Exon lengths, 5' -> 3' of the coding sequence
            try:
                for exon in exons:
                    (start,end) = string.split(exon,'..')
                    elen.append(string.atoi(end) - string.atoi(start) + 1)
            except:
                self.log.errorLog(id)
                cds.seq.remove(seq)
                continue
            if pos[:4] == 'comp': elen.reverse()    # Reverse strand: exon order reversed in CDS
            seq.list['ExonLen'] = elen
            self.deBug(elen)
            if sum(elen) != seq.aaLen(): self.log.errorLog('%s exon length error' % id,printerror=False)
            if seq.aaLen()/3 != seq.aaLen()/3.0:    # Length not a multiple of 3 => reject sequence
                self.log.errorLog('%s not a multiple of 3nt long!' % id,printerror=False)
                cds.seq.remove(seq)
                continue
            #!# Add use exon option - single full-length exon if false (mature mRNA) #!#
            sequence = seq.info['Sequence'][0:]
            if string.count(sequence,'N') > 0:      # Ambiguous bases would break codon lookup
                self.log.errorLog('%s has 1+ Ns!' % id,printerror=False)
                cds.seq.remove(seq)
                continue
            while sequence:     # Count each in-frame codon
                cod = sequence[:3]
                sequence = sequence[3:]
                aa = gcode[string.replace(cod,'T','U')]
                obs_cfreq[aa][cod] += 1
        for aa in obs_cfreq: obs_cfreq[aa] = rje.dictFreq(obs_cfreq[aa],total=False)    # Normalise codon freq.
        self.log.printLog('\r#OBS','Calculating observed codon frequencies complete.')
        ### ~ [2] ~ Generate Triplet freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (sx,stot) = (0.0,cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#TRIP','Calculating triplet frequencies: %.1f%%' % (sx/stot),newline=False,log=False)
            sx += 100.0
            elen = seq.list['ExonLen']
            sequence = seq.info['Sequence'][0:]
            aa = ''
            cod = ''
            ax = 0      # Measure sequence length processed for exon boundary checks
            while sequence:
                prevcod = cod
                cod = sequence[:3]
                prevaa = aa
                sequence = sequence[3:]
                aa = gcode[string.replace(cod,'T','U')]
                ## ~ [2a] ~ Predicted Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                for cod2 in obs_cfreq[aa]:      # All synonymous codons for current aa
                    if elen[0] > ax + 3:        # Exon boundary beyond this codon
                        ocd_tfreq[cod2] += obs_cfreq[aa][cod2]
                        ncd_tfreq[cod2] += nts_cfreq[aa][cod2]
                    if prevaa:                  # Look at overlap with previous codon
                        for cod1 in obs_cfreq[prevaa]:
                            for i in range(1,3):
                                if elen[0] > ax + i:    # Exon boundary beyond overlap
                                    acod = cod1[i:] + cod2[:i]      # Off-frame triplet spanning codons
                                    ocd_tfreq[acod] += (obs_cfreq[prevaa][cod1] * obs_cfreq[aa][cod2])
                                    ncd_tfreq[acod] += (nts_cfreq[prevaa][cod1] * nts_cfreq[aa][cod2])
                ## ~ [2b] ~ Observed Triplet Freq. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if elen[0] > ax + 3:            # Exon boundary beyond this codon
                    obs_tfreq[cod] += 1
                if prevcod:                     # Look at overlap with previous codon
                    for i in range(1,3):
                        if elen[0] > ax + i:    # Exon boundary beyond overlap
                            acod = prevcod[i:] + cod[:i]
                            obs_tfreq[acod] += 1
                # Check exons #
                ax += 3
                if ax >= elen[0]: ax -= elen.pop(0)     # Crossed an exon boundary: move to next exon
        obs_tfreq = rje.dictFreq(obs_tfreq,total=False)
        ocd_tfreq = rje.dictFreq(ocd_tfreq,total=False)
        ncd_tfreq = rje.dictFreq(ncd_tfreq,total=False)
        self.log.printLog('\r#TRIP','Calculating triplet frequencies complete.')
        ### ~ [3] ~ Output results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        headers = ['Triplet','AA','Degen','Obs_Codon','NT_Codon','Obs_Trip','NT_Trip','ObCod_Trip','NTCod_Trip']
        tfile = 'quad_triplet.tdt'
        rje.delimitedFileOutput(self,tfile,headers,rje_backup=True)
        for cod in codons:
            aa = gcode[string.replace(cod,'T','U')]
            datadict = {'Triplet':cod,'AA':aa,'Degen':len(obs_cfreq[aa]),'Obs_Codon':obs_cfreq[aa][cod],
                        'NT_Codon':nts_cfreq[aa][cod],'Obs_Trip':obs_tfreq[cod],'NT_Trip':nts_tfreq[cod],
                        'ObCod_Trip':ocd_tfreq[cod],'NTCod_Trip':ncd_tfreq[cod]}
            rje.delimitedFileOutput(self,tfile,headers,datadict=datadict)
        self.log.printLog('#OUT','Triplet & codon data output to %s' % tfile)
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def mapHit(self,seq,hits,hitdict,method): ### Tries to map seq onto hitseq and returns hit if successful ''' Tries to map seq onto hitseq and returns hit if successful. >> seq:Query Sequence Object >> hits:List of hits in rough order of goodness >> hitdict:Dictionary of {hitname:stats} >> method:Mapping method to use ''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (name,sequence) = seq data = rje_sequence.extractNameDetails(name,self) data['Sequence'] = seq[1] data['ShortName'] = string.split(seq[0])[0] for hit in hitdict: hitdict[hit]['Data'] = rje_sequence.extractNameDetails(hitdict[hit]['Seq'][0],self) hitdict[hit]['Data']['Sequence'] = hitdict[hit]['Seq'][1] hitdict[hit]['Data']['ShortName'] = string.split(hitdict[hit]['Seq'][0])[0] ### SkipGene ### if method == 'id' and rje.matchExp('^(\S+)_\S+',data['ID']): gene = rje.matchExp('^(\S+)_\S+',data['ID']) if gene in self.list['SkipGene']: return None ### Name, AccNum, Sequence and ID ### if method_info[method] in ['Name', 'AccNum', 'Sequence', 'ID']: for hit in hits: hitdata = hitdict[hit['Hit']]['Data'] if hitdata[method_info[method]] == data[method_info[method]]: if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit ### DescAcc ### if method == 'descacc': for hit in hits: hitdata = hitdict[hit['Hit']]['Data'] if rje.matchExp('\W(%s)\W' % data['AccNum'],hitdata['Name']): if self.i() < 2 or rje.yesNo('Map %s to %s?' 
% (data['ShortName'],hitdata['ShortName'])): return hit ### GABLAM ### if method != 'gablam': return None focus = self.str['MapFocus'][:1].upper() + self.str['MapFocus'][1:].lower() gstat = gstat_type[self.str['MapStat'].lower()] possibles = [] # List of Hits that meet MinMap criterion for hit in hits: hitname = hit['Hit'] hitdata = hitdict[hit['Hit']]['Data'] if self.getNum('AutoMap') > 0.0 and hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('AutoMap'): if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit elif hitdict[hitname]['%s_%s' % (focus,gstat)] >= self.getNum('MinMap'): possibles.append(hit) ### Manual GABLAM Choice ### if self.i() < 0 or not possibles: return None possibles.reverse() print '\nMapping options for %s:\n' % data['ShortName'] for p in range(len(possibles)): hit = possibles[p] hitname = hit['Hit'] hitdata = hitdict[hit['Hit']]['Data'] print '<%d> %s (%d aa) =\t' % (len(possibles)-p,hitdata['Name'],hit['Length']), print '%.1f%% Qry Len,' % (100.0 * hit['Length'] / len(seq[1])), print '%.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Hit_ID'],hitdict[hitname]['Hit_Sim'],hitdict[hitname]['Hit_Len']), print '(Qry: %.1f%% ID (%.1f%% Sim, %.1f%% Cov.)' % (hitdict[hitname]['Query_ID'],hitdict[hitname]['Query_Sim'],hitdict[hitname]['Query_Len']) choice = -1 print '<0> No mapping.\n' ## Choice ## while 1: choice = rje.getInt('Select sequence to replace %s?' % data['ShortName'],default=1,confirm=True) i = len(possibles) - choice if choice == 0: # No mapping if self.i() < 2 or rje.yesNo('No GABLAM mapping for %s?' % (data['ShortName'])): return None elif choice > 0 and choice <= len(possibles): hit = possibles[i] hitdata = hitdict[hit['Hit']]['Data'] if self.i() < 2 or rje.yesNo('Map %s to %s?' % (data['ShortName'],hitdata['ShortName'])): return hit except: self.errorLog('Problem during SeqMapper.mapHit(%s)' % method,quitchoice=True) return None
def iuPred(self, retry=2):  ### Runs IUPred disorder prediction
    '''
    Runs IUPred disorder prediction on self.info['Sequence'], populating self.list['ResidueDisorder']
    (per-residue scores) and self.list['RegionDisorder']/['RegionFold'] (1-based (start,end) tuples
    partitioned by the self.stat['IUCut'] threshold).
    >> retry:int = number of times to re-attempt the whole prediction on failure [2]
    << True on success; False once retries are exhausted.
    '''
    mydir = os.path.abspath(os.curdir)      # Remembered so chdir can be reversed on exit/error
    try:
        ### Setup sequence and temp file ###
        sequence = self.info['Sequence'].upper()
        name = self.info['Name'][:4] + rje.randomString(8)  # Random suffix avoids tmp-file clashes
        tmp = name + '.tmp'
        ### Run Disorder ###
        iupath = string.join(string.split(self.info['IUPath'], os.sep)[:-1], os.sep)    # Directory part of IUPred path
        iupred = string.split(self.info['IUPath'], os.sep)[-1]                          # Executable name
        if self.opt['IUChDir']: os.chdir(string.join(string.split(self.info['IUPath'], os.sep)[:-1], os.sep))
        open(tmp, 'w').write('>%s\n%s\n' % (name, sequence))
        # Build command differently for Win32 vs ./ execution when running from the IUPred directory
        if self.opt['IUChDir'] and self.opt['Win32']: iucmd = '%s %s %s' % (iupred, tmp, self.info['IUMethod'].lower())
        elif self.opt['IUChDir']: iucmd = './%s %s %s' % (iupred, tmp, self.info['IUMethod'].lower())
        else: iucmd = '%s %s %s' % (self.info['IUPath'], tmp, self.info['IUMethod'].lower())
        dlines = os.popen(iucmd).readlines()
        try: os.unlink(tmp)
        except: self.errorLog('Cannot delete %s!' % tmp)
        if self.opt['IUChDir']: os.chdir(mydir)
        if self.info['Name'] not in ['', 'None']: name = self.info['Name']  # Restore real name for messages
        self.list['ResidueDisorder'] = []
        for d in dlines:
            # Expected output row: "<pos> <aa> <score>" (leading whitespace allowed)
            if rje.matchExp('^\s*(\d+)\s+(\S)\s+(\S+)', d):
                dm = rje.matchExp('^\s*(\d+)\s+(\S)\s+(\S+)', d)
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])   # Next expected 0-based residue index
                if sequence[i] != aa:   # Output out of register with the input sequence
                    self.log.errorLog('%s: Position %d is %s in sequence but %s in IUPred output!' % (name, pos, sequence[i], aa), printerror=False)
                    raise ValueError
                if pos != (i + 1):      # Missing rows in the output
                    self.log.errorLog('%s: Position %d reached in IUPred output but previous results missing!' % (name, pos), printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
        if len(self.list['ResidueDisorder']) != len(sequence):  # Truncated output
            self.log.errorLog('%s: Sequence = %d aa but IUPred results stop at %s!' % (name, len(sequence), len(self.list['ResidueDisorder'])), printerror=False)
            raise ValueError
        ### Make Regions ###
        # Walk the per-residue scores once, opening/closing disorder and fold regions at the IUCut threshold.
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        start = 0       # 1-based start of current disorder region (0 = none open)
        fstart = 0      # 1-based start of current fold (ordered) region (0 = none open)
        i = 0
        dx = 0          # Total count of disordered residues
        while i < len(sequence):
            score = self.list['ResidueDisorder'][i]
            i += 1
            if not start and score > self.stat['IUCut']:    ### Start new disorder ###
                start = i
            elif start and score <= self.stat['IUCut']:     ### End!
                self.list['RegionDisorder'].append((start, i - 1))
                dx += i - start
                start = 0
            if not fstart and score <= self.stat['IUCut']:  ### Start new fold ###
                fstart = i
            elif fstart and score > self.stat['IUCut']:     ### End!
                self.list['RegionFold'].append((fstart, i - 1))
                fstart = 0
        if start:       # Close a disorder region running to the end of the sequence
            self.list['RegionDisorder'].append((start, len(sequence)))
            dx += len(sequence) + 1 - start
        if fstart: self.list['RegionFold'].append((fstart, len(sequence)))
        self.minRegion()
        if self.opt['PrintLog']: self.log.printLog('\r#DIS', 'IUPred (%s) Disorder prediction complete: %d disorder regions, %d disordered aa' % (self.info['IUMethod'].lower(), len(self.list['RegionDisorder']), dx))
        return True
    except:
        if self.opt['IUChDir']: os.chdir(mydir)     # Always restore working directory before retry/exit
        if retry:
            self.printLog('#RETRY', 'Trying %s again...' % name)
            return self.iuPred(retry - 1)
        self.log.errorLog('Error in Disorder.iuPred(%s). Disorder prediction failed. Check (setenv?) IUPred_PATH environment variable.' % name)
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        #try: os.system('rm %s*tmp' % (rje.makePath(os.path.split(self.info['IUPath'])[0])))
        #except: pass
        return False
def convert(self, filelist=None, outfile=None):     ### Converts scansite output files in FileList to Outfile
    '''
    Converts scansite output files in FileList to Outfile (delimited text with one row per match of
    the module-level re_scansite pattern).
    >> filelist:list = scansite files to convert; falls back to self.list['FileList'] if empty [None]
    >> outfile:str = output file name; falls back to self.info['Name'] [None]
    << True on success, False if there was nothing to convert; re-raises on error.
    '''
    try:
        ### Setup ###
        _stage = 'Setup'
        # NOTE: Fixed mutable default argument (filelist=[]) - replaced with None sentinel.
        # Behaviour unchanged: an empty/omitted list falls back to self.list['FileList'].
        if filelist is None: filelist = []
        if len(filelist) < 1: filelist = self.list['FileList']
        if not outfile: outfile = self.info['Name']
        if len(filelist) < 1:
            self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile, printerror=False)
            return False
        delimit = rje.getDelimit(self.cmd_list)
        ext = rje.delimitExt(delimit)
        if ext != outfile[-3:]:     # Extension does not match chosen delimiter: offer rename
            newfile = outfile[:-3] + ext
            if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)): outfile = newfile
        self.log.printLog('#OUT', 'Converting %d file(s), output to %s.' % (len(filelist), outfile))
        ### Output File ###
        _stage = 'Output File'
        if not self.opt['Append'] or not os.path.exists(outfile):   # Create with header
            OUTFILE = open(outfile, 'w')
            headers = ['seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score', 'percentile', 'matchseq', 'sa']
            rje.writeDelimit(OUTFILE, headers, delimit)
        else: OUTFILE = open(outfile, 'a')      # Appending: header assumed present already
        ### Conversion ###
        _stage = 'Conversion'
        sx = 0      # Total results written across all files
        for infile in filelist:
            if not os.path.exists(infile):
                self.log.errorLog('Input file %s does not exist! :o(' % infile, False, False)
                continue
            fx = 0      # Results written from this file
            INFILE = open(infile, 'r')
            inline = rje.nextLine(INFILE)
            while inline != None:
                if rje.matchExp(re_scansite, inline):   # re_scansite is a module-level pattern
                    scanlist = rje.matchExp(re_scansite, inline)
                    rje.writeDelimit(OUTFILE, scanlist, delimit)
                    sx += 1
                    fx += 1
                    rje.progressPrint(self, sx)
                inline = rje.nextLine(INFILE)
            self.log.printLog('#OUT', '%s scansite results from %s. (%s Total.)' % (rje.integerString(fx), infile, rje.integerString(sx)))
            INFILE.close()
        ### End ###
        _stage = 'End'
        OUTFILE.close()
        self.log.printLog('#OUT', '%s scansite results output to %s.' % (rje.integerString(sx), outfile))
        return True
    except:
        self.log.errorLog('Error in convert(%s)' % _stage, printerror=True, quitchoice=False)
        raise
def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
    '''
    Tabulates regions of known PPI from DAT file. Greps ID and interaction REGION features from a
    UniFake DAT file and writes one row per (protein, region, interactor) to ppi_region.tdt.
    << True on success (or the printLog return when skipping); re-raises on error.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tabfile = 'ppi_region.tdt'
        # NOTE(review): hard-coded site-specific DAT file path - confirm before reuse elsewhere.
        unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
        if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB', '%s found. (Force=F)' % tabfile)
        headers = ['Protein', 'Start', 'End', 'Interactor']
        rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True)
        ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # grep pulls each REGION feature line preceded (-B 1) by its ID line
        gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
        self.printLog('#GREP', gcmd)
        prot = None     # Current protein ID (from the most recent ID line)
        rx = 0          # Region rows output
        plist = []      # Unique proteins seen
        ilist = []      # Unique interactors seen
        for gline in os.popen(gcmd).readlines():
            if rje.matchExp('ID (\S+)', gline): prot = rje.matchExp('ID (\S+)', gline)[0]
            if rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline):
                (rstart, rend, rint) = rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline)
                for ppi in string.split(rint):      # Interactor text may list several names
                    if rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi):
                        datadict = {'Protein': prot, 'Start': rstart, 'End': rend,
                                    'Interactor': rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)[0]}
                        rje.delimitedFileOutput(self, tabfile, headers, datadict=datadict)
                        rx += 1
                        # NOTE(review): nesting reconstructed from mangled source - these membership
                        # updates are taken to sit inside the interactor match block (datadict in scope).
                        if prot not in plist: plist.append(prot)
                        if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
            self.progLog('\r#REGTAB', 'Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx)))
        self.printLog('\r#REGTAB', 'Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)), rje.integerString(len(ilist)), rje.integerString(rx), tabfile))
        return True
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def run(self):  ### Main Run Method
    '''
    Main Run Method. Parses/loads OMIM mutation data, maps genes to EnsLoci sequences via Pingu,
    cross-references mutation positions against SLiMFinder occurrence files, and reports SLiM
    coverage plus the binomial probability of the observed mutation/SLiM overlap.
    '''
    try:### ~ [1] Parse/Read Mutation data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['Force'] or not self.loadMutations(): self.parseOMIM()
        ### ~ [2] Additional Pingu incorporation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #!# Load PPI data using Pingu, map genes to sequences and check mutation residues #!#
        ## ~ [2a] Setup Pingu ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        import pingu
        pcmd = self.cmd_list + ['fulloutput=F']
        ping = self.obj['Pingu'] = pingu.PINGU(self.log, pcmd)
        ping.run()
        ## ~ [2b] Read in EnsLoci sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not ping.obj['GeneCards']:
            return self.log.errorLog('Cannot map EnsLoci without GeneCards.', printerror=False)
        genecards = ping.obj['GeneCards'].dict['GeneCard']  # GeneCards dictionary
        ensloci = ping.getEnsLoci()     # EnsLoci SeqList object (ping.obj['EnsLoci'])
        seqdict = ensloci.seqNameDic()
        if not seqdict:
            return self.log.errorLog('Failed to read in EnsLoci sequences.', printerror=False)
        ## ~ [2c] Calculate fudge factor for each gene ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        self.dict['Fudge'] = {}
        ensback = {}        # Dictionary of {EnsLoci name:OMIM gene}
        mutations = {}      # Reorganised dictionary of {gene:{pos:Mutation}}
        for gene in rje.sortKeys(self.dict['Mutations']):
            try: seq = seqdict[genecards[gene]['EnsLoci']]
            except:
                self.log.printLog('#MAP', 'No EnsLoci protein mapped for %s' % gene)
                continue
            mutations[gene] = {}
            ensback[genecards[gene]['EnsLoci']] = gene
            mutpos = {}     # Dictionary of {pos:AA} to map onto sequence
            for subid in rje.sortKeys(self.dict['Mutations'][gene]):
                (disease, mutation) = self.dict['Mutations'][gene][subid]
                (wild, pos, mut) = rje.matchExp('(\D\D\D)(\d+)(\D\D\D)', mutation)  # e.g. ALA123VAL
                mutpos[int(pos)] = rje_sequence.aa_3to1[wild.upper()]
                mutations[gene][int(pos)] = self.dict['Mutations'][gene][subid]
            self.dict['Fudge'][seq] = seq.fudgeFactor(mutpos)   # Offset reconciling OMIM vs EnsLoci numbering
        self.deBug(self.dict['Fudge'])
        ### ~ [3] Cross-reference to SLiMFinder ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        allslims = {}       # Full dictionary of SLiMFinder results matching OMIM genes
        slimomim = []       # List of (gene,pos) overlapping with SLiMs
        outfile = 'rje_omim.slimfinder.tdt'
        dataheaders = string.split('Dataset,Rank,Pattern,Hit,Pos,EndPos,SeqLen,Variant,Match,AbsChg,NetChg,PepSeq,PepDesign', ',')
        headers = ['Gene', 'OMIM', 'SubID', 'Mutation', 'Disease'] + dataheaders
        rje.delimitedFileOutput(self, outfile, headers, delimit='\t', rje_backup=True)
        for file in glob.glob(self.info['SlimDir'] + '*.occ.csv'):  # Potential SLiM occurrence files
            slimdata = rje.dataDict(self, file, ['Pattern', 'Hit', 'Pos', 'Match'], dataheaders, delimit=',')
            for occ in slimdata:
                if slimdata[occ]['Hit'] in ensback:     # OMIM gene - possible overlap
                    gene = ensback[slimdata[occ]['Hit']]
                    (start, end) = (int(slimdata[occ]['Pos']), int(slimdata[occ]['EndPos']))
                    if gene not in allslims: allslims[gene] = {}
                    allslims[gene][occ] = slimdata[occ]
                    for mpos in mutations[gene]:
                        # Apply the per-sequence fudge offset before testing overlap with the SLiM span
                        if start <= (mpos + self.dict['Fudge'][seqdict[genecards[gene]['EnsLoci']]]) <= end:
                            self.log.printLog('#OMIMSLIM', '%s %s %s (%d-%d) = %s' % (slimdata[occ]['Dataset'], slimdata[occ]['Hit'], slimdata[occ]['Pattern'], start, end, mutations[gene][mpos]))
                            slimdata[occ]['Gene'] = gene
                            slimdata[occ]['OMIM'] = string.join(self.dict['Records'][gene])
                            slimdata[occ]['Mutation'] = mutations[gene][mpos][1]
                            slimdata[occ]['Disease'] = mutations[gene][mpos][0]
                            rje.delimitedFileOutput(self, outfile, headers, '\t', slimdata[occ])
                            if (gene, mpos) not in slimomim: slimomim.append((gene, mpos))
        ### ~ [4] Calculate coverage of SLiMs for "significance" assessment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (inslim, resx, mutx) = (0, 0, 0)    # No. of residues in SLiMs, total residue count + no. mutations that may overlap
        for gene in mutations:      # These are just the genes that mapped to sequences
            mutx += len(mutations[gene])
            resx += seqdict[genecards[gene]['EnsLoci']].aaLen()
            if gene in allslims:    # Partially covered by SLiMs
                res = [0] * seqdict[genecards[gene]['EnsLoci']].aaLen()     # Per-residue coverage mask
                for occ in allslims[gene]:
                    (start, end) = (int(allslims[gene][occ]['Pos']) - 1, int(allslims[gene][occ]['EndPos']))
                    # NOTE: Fixed off-by-one - the original tail slice was res[end-1:], which
                    # duplicated residue end-1, growing the mask by one element per occurrence and
                    # inflating the coverage count. Pos is 1-based inclusive, so with start=Pos-1
                    # the [1] block covers indices start..end-1 and the tail resumes at end.
                    res = res[:start] + [1] * (end - start) + res[end:]
                self.deBug('%s %d (%d)' % (gene, sum(res), seqdict[genecards[gene]['EnsLoci']].aaLen()))
                inslim += sum(res)
        self.log.printLog('#COV', 'SLiMs have %.1f%% coverage of OMIM gene sequences' % (100.0 * inslim / resx))
        self.log.printLog('#MUT', '%d mutations that could potentially occur in SLiMs' % mutx)
        self.log.printLog('#PROB', 'Probability of observed %d mutation overlap = %.4f' % (len(slimomim), rje.binomial(len(slimomim), mutx, float(inslim) / resx, callobj=self)))
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def readHMMSearch(self, resfile=None, readaln=False):   ### Reads HMM Search Results into objects #!# Needs tidying! #!#
    '''
    Reads HMM Search Results into objects.
    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] (!!!currently always True!!!)
    << True on success; False on error.
    '''
    try:
        ### <a> ### Setup
        _stage = '<a> Setup'
        if not resfile or not os.path.exists(resfile):
            # NOTE: Fixed broken format string - original read 'Results file (%) missing!', where
            # '%)' is an invalid conversion and raised ValueError on this error path, masking the
            # real problem. Changed to '%s'.
            self.log.errorLog('Results file (%s) missing!' % resfile, False, False)
            raise IOError
        # Regexp matching the header line of each per-domain alignment section
        _hit_elements = ['^(\S+):', 'domain', '(\d+)', 'of', '(\d+),', 'from', '(\d+)', 'to', '(\d+):', 'score', '(\S+),', 'E', '=', '(\S+)']
        _hit_re = string.join(_hit_elements, '\s+')
        ### <b> ### Read in Search results
        _stage = '<b> Read Results'
        self.verbose(0, 4, 'Reading %s HMMer search results' % resfile, 0)
        RESFILE = open(resfile, 'r')
        lines = RESFILE.readlines()
        RESFILE.close()
        resline = []
        for line in lines: resline.append(re.sub('\n', '', line))   # Strip newlines, keep other whitespace
        search = None
        i = 0
        hitaln = 0
        if resline[i].find('hmmsearch') != 0:
            self.log.errorLog("File %s does not appear to be an hmmsearch results file" % resfile)
            raise       # Bare raise: relies on the enclosing except to report and return False
        while i < len(resline):
            line = resline[i]
            ## <i> ## Basic Search Info
            _stage = '<b-i> Basic Search Info'
            if line.find('HMM file:') == 0:     # New search section
                search = self._addSearch()
                search.info['Name'] = rje.matchExp('HMM file:\s+(\S+)', line)[0]
                self.verbose(0, 4, '.', 0)
                self.verbose(1, 3, '\n%s' % search.info['Name'], 0)
            elif line.find('Sequence database:') == 0:
                search.info['SearchDB'] = rje.matchExp('Sequence database:\s+(\S+)', line)[0]
            elif line.find('Total sequences searched:') == 0:
                dbnum = rje.matchExp('Total sequences searched:\s+(\d\S*)', line)[0]
                dbnum = re.sub('\D', '', dbnum)     # Strip thousands separators etc.
                search.stat['DBNum'] = string.atoi(dbnum)
            ## <ii> ## One-line hit data (BLASTHit)
            elif line.find('Scores for complete sequences') == 0:   # One-line hits
                _stage = '<b-ii> One-line hits'
                i += 3      # Skip two lines
                while re.search('^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+(\d+)\s*$', resline[i]):
                    match = rje.matchExp('^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+\d+\s*$', resline[i])
                    self.verbose(2, 3, '\n - %s (%s, %s)' % match, 0)
                    hit = search._addHit()
                    hit.info['Name'] = match[0]
                    hit.stat['BitScore'] = string.atof(match[1])
                    eval = match[2]
                    if eval.find('e') == 0: eval = '1' + eval   # Bare 'e-30' style => prefix mantissa
                    hit.stat['E-Value'] = string.atof(eval)
                    i += 1
                line = resline[i]   # End of one-lines (blank line)
                self.verbose(1, 3, '=> %d Hits' % search.hitNum(), 1)
                hitaln = 0
            #!# Make new No hits pattern match
            elif line.find('***** No hits found ******') >= 0:  # No Hits
                search.hit = []
                self.verbose(1, 3, '=> %d Hits' % search.hitNum(), 1)
                hitaln = 0
            ## <iii> ## Aln Hit data (PWAln)
            #!# Consider reading in the 'parsed for domains' section instead/as well
            elif re.search(_hit_re, line):      # New aln hit
                _stage = '<b-iii> Aln Hit Info'
                # Identify hit object from the one-line hits parsed above
                _hit_detail = rje.matchExp(_hit_re, line)
                hitname = _hit_detail[0]
                try:
                    for hit in search.hit:
                        if hit.info['Name'] == hitname: hitaln = search.hit.index(hit)
                    if hitname != search.hit[hitaln].info['Name']:
                        self.log.errorLog('Problem with HMM results %s - %s single-line hits and alignments do not match' % (hitname, search.info['Name']), printerror=False, quitchoice=True)
                        i += 1
                        continue
                except:
                    self.log.errorLog('Problem with HMM results reconciling %s - %s single-line hits and alignments.' % (hitname, search.info['Name']), True, True)
                    i += 1
                    continue
                hit = search.hit[hitaln]
                hitaln += 1
                # Add details
                _stage = '<b-iii> Add Aln Hit Info'
                aln = hit._addAln()
                aln.stat['SbjStart'] = string.atoi(_hit_detail[3])
                aln.stat['SbjEnd'] = string.atoi(_hit_detail[4])
                aln.stat['BitScore'] = string.atof(_hit_detail[5])
                aln.stat['Expect'] = string.atof(_hit_detail[6])
                ## <iv> ## Alignments: blocks of query/match/subject lines until the '<-*' terminator
                readaln = True
                i += 1
                while readaln:
                    _stage = '<b-iv> Read alignments'
                    line = resline[i]
                    block = rje.matchExp('^(\s+)(\S+)', line)
                    if block:
                        # Query Line
                        leadlen = len(block[0])     # Leading whitespace fixes the column offset
                        seqblock = block[1]
                        if block[1][:3] == '*->':   # Start of alignment marker
                            leadlen += 3
                            seqblock = seqblock[3:]
                        if block[1][-3:] == '<-*':  # End of alignment marker
                            seqblock = seqblock[:-3]
                            readaln = False
                        aln.info['QrySeq'] += seqblock
                        # Alignment Line (same columns as the query block)
                        i += 1
                        aln.info['AlnSeq'] += resline[i][leadlen:(leadlen + len(seqblock))]
                        # Subject Line
                        i += 1
                        aln.info['SbjSeq'] += resline[i][leadlen:(leadlen + len(seqblock))]
                        # Skip Blank line
                        i += 2
                    else:
                        i += 1
            i += 1
        self.verbose(0, 1, 'Reading of %s HMM results complete! (%d Searches)' % (resfile, len(self.search)), 2)
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s) %s.' % (resfile, _stage))
        return False
def readHMMPFamSearch(self, resfile=None, readaln=False):   ### Reads HMM PFam Search Results into objects
    '''
    Reads HMM Search Results into objects. Results arrive per query sequence, not per HMM, so hits
    are accumulated in a {PFam domain:Search} dictionary. When self.opt['CleanRes'] is set, a reduced
    copy of the results file (hit-containing sections only) replaces the original.
    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
    << True on success; False on error.
    '''
    try:
        ### Setup ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile, printerror=False)
            return False
        ## Make RegExp for starting next alignment ##
        # (Currently unused: alignments are not read - kept for parity with readHMMSearch.)
        re_hit = string.join(['^(\S+):', 'domain', '(\d+)', 'of', '(\d+),', 'from', '(\d+)', 'to', '(\d+):', 'score', '(\S+),', 'E', '=', '(\S+)'], '\s+')
        ## Search dictionary as results come back per sequence, not per HMM! ##
        pfam = {}       # Dictionary of {PFam name:search}
        hitx = 0        # Total number of hits
        hitlist = []    # List of sequences processed from file (may or may not include zero hit sequences)
        ### Read in Search results ###
        if open(resfile, 'r').readline().find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile, printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)', default='N'):
                os.unlink(resfile)
                self.printLog('#DEL', 'Dodgy results file "%s" deleted.' % resfile)
            return False
        hitname = None  # Current query sequence name
        i = 0
        hx = 0          # Number of query sequences read
        seqx = 0        # Sequence count recorded in a previously reduced results file
        RESFILE = open(resfile, 'r')
        line = RESFILE.readline()
        newres = [rje.chomp(line)]      # Lines for the reduced ('partial') results file
        newresout = True                # Whether the current newres section contains hits worth keeping
        newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)
        while line:
            self.progLog('\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString(len(pfam)), rje.integerString(hitx)))
            line = rje.chomp(line)
            ## New Sequence ##
            if rje.matchExp('^Query sequence:\s+(\S+)', line):
                # Flush the previous sequence's section if it had hits
                if newres and newresout and self.opt['CleanRes']: open(newresfile, 'a').write(string.join(newres, '\n'))
                newres = ['', line]
                newresout = False
                hitname = rje.matchExp('^Query sequence:\s+(\S+)', line)[0]
                hx += 1
            ## One Line Data for hits ##
            elif line.find('Parsed for domains:') == 0:
                newres += [line, rje.chomp(RESFILE.readline()), rje.chomp(RESFILE.readline())]  # Keep the two header lines
                line = rje.chomp(RESFILE.readline())
                newres.append(line)
                # Domain table rows: "Model Domain seq-f seq-t ... score E-value"
                while rje.matchExp(string.join(['^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)', '(\S+)\s*$'], '\s+'), line):
                    newresout = True
                    (dom, start, end, score, eval) = rje.matchExp(string.join(['^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)', '(\S+)\s*$'], '\s+'), line)
                    if not pfam.has_key(dom):       # First hit for this PFam domain => new Search
                        pfam[dom] = self._addSearch()
                        pfam[dom].info['Name'] = dom
                    hit = pfam[dom]._addHit()
                    hit.info['Name'] = hitname
                    aln = hit._addAln()
                    aln.setStat({'SbjStart': string.atoi(start), 'SbjEnd': string.atoi(end), 'Expect': string.atof(eval), 'BitScore': string.atof(score)})
                    hitx += 1
                    self.progLog('\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString(len(pfam)), rje.integerString(hitx)))
                    line = rje.chomp(RESFILE.readline())
                    newres.append(line)
            ## End of Protein ##
            elif line[:2] == '//':
                hitname = None
                newres.append(line)
            # NOTE: Fixed dead branch - the original condition regex used literal '(%d)' (printf
            # syntax) where the extraction below used '(\d+)', so the footer of a previously
            # reduced results file was never recognised and seqx stayed 0. Both now use '(\d+)'.
            elif rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original', line):
                seqx = string.atoi(rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original', line)[0])
            elif newres: newres.append(line)
            line = RESFILE.readline()
        # Flush the final section
        if newres and newresout and self.opt['CleanRes']: open(newresfile, 'a').write(string.join(newres, '\n'))
        if not seqx: seqx = hx      # No recorded original count => use sequences actually read
        if self.opt['CleanRes']:    # Replace the full results file with the reduced version
            open(newresfile, 'a').write(string.join(['', 'End of rje_hmm reduced results file: %d sequences in original' % seqx], '\n'))
            os.unlink(resfile)
            os.rename(newresfile, resfile)
            self.printLog('\r#RED', 'Results file %s replaced with reduced version (%s Hits only)' % (resfile, rje.integerString(hitx)))
        self.printLog('\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(seqx), rje.integerString(len(pfam)), rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def parseOMIM(self):    ### Main parsing method
    '''
    Main parsing method. Parses the OMIM flat file named by self.info['Name'], populating:
    - self.dict['Records'] = {gene symbol:[OMIM record numbers]}
    - self.dict['Mutations'] = {gene symbol:{subid:(disease,mutation)}}
    Only allelic variant (*FIELD* AV) entries matching a simple coding substitution of
    standard amino acids (e.g. ALA123THR) are stored. Calls self.saveMutations() at the end.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}       # {gene:[OMIM record numbers]}
        self.dict['Mutations'] = {}     # {gene:{subid:(disease,mutation)}}
        # Upper-case three-letter codes of the standard amino acids, for variant filtering
        aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
        # NOTE(review): oline is initialised to a boolean (file existence) purely to prime the
        # read loop below; it is replaced by real file lines from the first iteration onwards.
        oline = os.path.exists(self.info['Name'])
        # Total line count (for % progress), progress counter, mutation counter
        (olen, ox, mx) = (len(open(self.info['Name'], 'r').readlines()), 0.0, 0)
        OMIM = open(self.info['Name'], 'r')
        ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        record = gene = subid = disease = mutation = ''
        av = False  # Whether reading *FIELD* AV for mutation data
        while oline:
            oline = OMIM.readline()
            self.log.printLog('\r#OMIM', 'Processing OMIM: %.2f%% (%s genes)' % (ox / olen, rje.integerString(len(self.dict['Records']))), newline=False, log=False)
            ox += 100.0
            # Outside an AV section, only '*'-prefixed field marker lines are of interest
            if not av and oline[:1] != '*': continue
            line = rje.chomp(oline)
            while line[-1:] == ' ': line = line[:-1]    # Strip trailing spaces
            ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if line == '*RECORD*': (record, av) = ('', False)
            elif line == '*FIELD* NO':  # New record: record number is on the following line
                record = rje.chomp(OMIM.readline())
                gene = ''
                ox += 100.0     # Extra readline() above, so bump progress again
            ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* TI':  # New gene: symbol taken as last word of following line
                gene = string.split(rje.chomp(OMIM.readline()))[-1]
                subid = ''
                av = False
                ox += 100.0
            ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* AV': av = True    # Start of mutation records
            elif av and rje.matchExp('^(\.\d+)', line):     # New subid mutation record
                subid = rje.matchExp('^(\.\d+)', line)[0]
                disease = rje.chomp(OMIM.readline())    # Disease description follows the subid line
                ox += 100.0
                # Mutation expected as "GENE, XxxNNNYyy" on the next line
                try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene, rje.chomp(OMIM.readline()))[0]
                except: continue    # No mutation or not coding change
                ox += 100.0
                subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)', mutation)
                # Skip substitutions involving non-standard amino acid codes
                if subaa[0] not in aas or subaa[1] not in aas: continue
                if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                mx += 1
                self.dict['Mutations'][gene][subid] = (disease, mutation)
        ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OMIM.close()
        self.log.printLog('\r#OMIM', 'Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])), rje.integerString(mx)))
        self.saveMutations()
    except:
        self.log.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def mySQLOut(self,tmfile='tm.tdt',domfile='domains.tdt',sigfile='signalp.tdt',makenew=False):   ### Output to tdt files
    '''
    Output to tdt files.
    >> tmfile:str = File to save TM numbers in (no save if None)
    >> domfile:str = File to save domain data in (no save if None)
    >> sigfile:str = File to save choice SignalP data in (no save if None)
    >> makenew:boolean [False] = whether to make new files (True) or append (False)
    '''
    try:
        ### <a> ### Setup
        _stage = '<a> Setup'    # _stage tracks progress for the error message in the except clause
        # Output column order for the SignalP table (mapped to raw result keys via sigdic below)
        siglist = ['nn_cleavemax','nn_cleavepos','nn_cleave','nn_sig_mean','nn_sig','nn_dscore','nn_d',
                   'hmm_cmax','hmm_cpos','hmm_cleave','hmm_sigprob','hmm_sig']
        self.deBug(self.tmhmm)
        # NOTE(review): each file handle below is only bound (and later closed) when the
        # corresponding data dictionary is non-empty; if data arrives without the handle
        # having been opened the write raises and is caught by the except clause.
        if tmfile and self.tmhmm.keys():
            _stage = '<a-i> TM File'
            if makenew or os.access(tmfile, os.F_OK) == False:
                TMFILE = open(tmfile, 'w')
                TMFILE.write('acc_num\ttm\tnterm\tcterm\n')
            else: TMFILE = open(tmfile, 'a')
        if domfile and (self.tmhmm.keys()+self.signalp.keys()):
            _stage = '<a-ii> Dom File'
            if makenew or os.access(domfile, os.F_OK) == False:
                DOMFILE = open(domfile, 'w')
                DOMFILE.write('acc_num\tdomain\tdom_start\tdom_end\tsource\n')
            else: DOMFILE = open(domfile, 'a')
        if sigfile and self.signalp.keys():
            _stage = '<a-iii> Sig File'
            if makenew or os.access(sigfile, os.F_OK) == False:
                SIGFILE = open(sigfile, 'w')
                sheader = string.join(siglist,'\t')
                SIGFILE.write('acc_num\t%s\n' % sheader)
            else: SIGFILE = open(sigfile, 'a')
        ### <b> ### TMHMM
        for acc in self.tmhmm.keys():
            _stage = '<b> TMHMM'
            TMFILE.write('%s\t%s\t%s\t%s\n' % (acc,self.tmhmm[acc]['PredHel'],self.tmhmm[acc]['Topology'][0],self.tmhmm[acc]['Topology'][-1]))
            # Convert the TMHMM topology string into boundary positions: 'i'/'o' side markers
            # are replaced by '-' and the string split on them, yielding alternating region
            # start/end positions between position 1 and the sequence length.
            domains = self.tmhmm[acc]['Topology']
            tm = False
            dom = 'CYTOPLASMIC'     # First non-TM region side: 'i' = inside, 'o' = outside
            if domains[0] == 'o': dom = 'EXTRACELLULAR'
            domains = re.sub('o', '-', domains)
            domains = re.sub('i', '-', domains)
            domains = string.split('1' + domains + self.tmhmm[acc]['len'],'-')
            started = False
            while len(domains) > 1:
                _stage = '<b-i> TM Dom Write'
                start = domains.pop(0)
                end = domains[0]
                if tm: type = 'TRANSMEMBRANE'   # NOTE(review): shadows builtin type()
                else: type = dom
                if started:
                    # Subsequent regions start one position after the previous boundary
                    start = '%d' % (string.atoi(start) + 1)
                else: started = True
                if len(domains) > 1:
                    # All but the final region end one position before the next boundary
                    end = '%d' % (string.atoi(end) - 1)
                DOMFILE.write('%s\n' % string.join([acc,type,start,end,self.info['Source']],'\t'))
                # Alternate TM / non-TM, flipping membrane side after each crossing
                if tm:
                    tm = False
                    if dom == 'CYTOPLASMIC': dom = 'EXTRACELLULAR'
                    else: dom = 'CYTOPLASMIC'
                else: tm = True
            _stage = '<b-ii> TM Dom Check'
            if tm == False:     # Boundaries should pair up; ending with tm False means a mismatch
                self.log.errorLog('Problem with %s TM domains - wrong number of domains!' % acc)
        # Maps output columns (siglist) to raw SignalP result dictionary keys
        sigdic = {'nn_cleavemax':'nn_ymax','nn_cleavepos':'nn_ymaxpos','nn_cleave':'nn_ymax?','nn_sig_mean':'nn_smean','nn_sig':'nn_smean?',
                  'nn_dscore':'nn_d','nn_d':'nn_d?','hmm_cmax':'hmm_cmax','hmm_cpos':'hmm_cmaxpos','hmm_cleave':'hmm_cmax?',
                  'hmm_sigprob':'hmm_sprob','hmm_sig':'hmm_sprob?'}
        ### <c> ### SignalP
        for acc in self.signalp.keys():
            _stage = '<c> SignalP'
            accout = acc
            # Reduce "..._HUMAN_ACC" style names to the trailing accession for output
            if re.search('_HUMAN_(\S+)$', acc): accout = rje.matchExp('_HUMAN_(\S+)$', acc)[0]
            writelist = [accout]
            for stat in siglist:
                writelist.append(self.signalp[acc][sigdic[stat]])
            SIGFILE.write('%s\n' % string.join(writelist,'\t'))
            _stage = '<c-ii> SingalP domains'
            # Convert 1-based cleavage positions to the last residue of the signal peptide
            nn_y = string.atoi(self.signalp[acc]['nn_ymaxpos']) - 1
            hmm_c = string.atoi(self.signalp[acc]['hmm_cmaxpos']) - 1
            # Write signal peptide / cleavage site domains where the Y/N flags say 'Y'
            if self.signalp[acc]['nn_d?'] == 'Y': DOMFILE.write('%s\n' % string.join([accout,'SIGNALP','1','%d' % nn_y,'signalp-NN'],'\t'))
            if self.signalp[acc]['nn_ymax?'] == 'Y': DOMFILE.write('%s\n' % string.join([accout,'CLEAVAGE','%d' % nn_y,'%d' % (nn_y+1),'signalp-NN'],'\t'))
            if self.signalp[acc]['hmm_sprob?'] == 'Y': DOMFILE.write('%s\n' % string.join([accout,'SIGNALP','1','%d' % hmm_c,'signalp-HMM'],'\t'))
            if self.signalp[acc]['hmm_cmax?'] == 'Y': DOMFILE.write('%s\n' % string.join([accout,'CLEAVAGE','%d' % hmm_c,'%d' % (hmm_c+1),'signalp-HMM'],'\t'))
        ### <d> ### Finish
        _stage = '<d> Finish'
        if tmfile and self.tmhmm.keys(): TMFILE.close()
        if domfile and (self.tmhmm.keys()+self.signalp.keys()): DOMFILE.close()
        if sigfile and self.signalp.keys(): SIGFILE.close()
        return
    except: self.log.errorLog('Problem with mySQLOut() %s.' % _stage)
def alignmentToLocal(self,alignment=[],protqry=False):  ### Converts alignment into local hits table
    '''
    Converts alignment into local hits table.
    >> alignment:list of alignment text strings parsed from exonerate output.
    >> protqry:bool[False] = Whether query is protein
    << returns local database table.
    Also populates self.obj['DNAHits'] and self.obj['ProtHits'] SeqLists with one combined
    hit sequence per alignment, and builds a reduced 'unique' table via self.reduceLocal().
    NOTE(review): the mutable default alignment=[] is only copied (alignment[0:]), never
    mutated, so it is safe here.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        vfields = ['Qry','Hit','AlnID','Score','Expect','Length','Identity','Positives','QryStart','QryEnd','HitStart','HitEnd','QrySeq','HitSeq','AlnSeq','Rank','Phase','HitStrand']
        vdb = self.db().addEmptyTable('local',vfields,['Qry','Hit','AlnID'])
        ### ~ [2] Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Example of the exonerate output being parsed:
        '''
        Query: FAXD1_NOTSC (P82807) Venom prothrombin activator notecarin-D1 [Notechis scutatus scutatus]
        Target: ahap_PSETE__EBS10XV2AHAP187 haploidB edges=694320..157489 left=833615 right=281503 ver=1.9 style=4:[revcomp]
        Model: protein2genome:local
        Raw score: 1170
        Query range: 19 -> 295
        Target range: 12312786 -> 12307250

              20 : AlaGluSerAsnValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg :   37
                   ..!...|||  |||||||||||||||||||||||||||||||||||||||||||
                   CysSerSerLeuValPheLeuLysSerLysValAlaAsnArgPheLeuGlnArg
        12312786 : TGTTCTTCTTTAGTATTCTTAAAAAGCAAAGTGGCAAATAGATTTTTGCAAAGA : 12312735

             264 : {G}  >>>> Target Intron 7 >>>>  {ly}GluIleAspIleSerArg :  270
                   {|}            1304 bp          {||}|||||||||||||||!!!
                   {G}++                         ++{ly}GluIleAspIleSerSer
        12308652 : {G}gt.........................ag{GG}GAAATAGACATATCAAGC : 12307328

             289 : ValProProAsnTyrTyrTyr :  295
                   ||||||   !!!..|||  !!|||
                   ValProAlaThrTyrAspTyr
        12307273 : GTTCCTGCCACGTATGACTAT : 12307251
        '''
        qry = None
        hit = None
        alnx = {}       # {(qry,hit):alignment count} used to assign AlnID
        ventry = {}     # Current local table entry being assembled
        parsing = alignment[0:]
        rank = 1        # Order of alignment in the output
        while parsing:
            line = parsing.pop(0)
            # Query line starts a new entry; save the previous one first
            if rje.matchExp('Query: (\S+)',line):
                if ventry: vdb.addEntry(ventry)
                ventry = {'Qry':rje.matchExp('Query: (\S+)',line)[0],'QrySeq':'','HitSeq':'','AlnSeq':'','Rank':rank}
                rank += 1
            # Hit (Target) line
            if rje.matchExp('Target: (\S+)',line):
                ventry['Hit'] = rje.matchExp('Target: (\S+)',line)[0]
                qh = (ventry['Qry'],ventry['Hit'])
                if qh in alnx: alnx[qh] += 1
                else: alnx[qh] = 1
                ventry['AlnID'] = alnx[qh]
            # Score ('core: ' matches the "Raw score:" line)
            if rje.matchExp('core: (\S+)',line): ventry['Score'] = int(rje.matchExp('core: (\S+)',line)[0])
            # Alignment block: "  START : seq : END" query line, match line, then hit line
            if rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line):
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',line)
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                x = line.find(aln)      # Column offset of alignment text (same for match line)
                if 'QryStart' not in ventry: ventry['QryStart'] = start
                ventry['QryEnd'] = end
                ventry['QrySeq'] += aln
                line = parsing.pop(0)   # Match/mismatch line between query and hit
                ventry['AlnSeq'] += line[x:x+len(aln)]
                adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                if not adata:
                    # One extra (translation) line may precede the hit coordinates line
                    adata = rje.matchExp('^\s+(\d+) : (.+) :\s+(\d+)',parsing.pop(0))
                    if not adata: raise ValueError('Partial alignment! Truncated output?')
                start = int(adata[0])
                end = int(adata[2])
                aln = adata[1]
                if 'HitStart' not in ventry: ventry['HitStart'] = start
                ventry['HitEnd'] = end
                ventry['HitSeq'] += aln
        if ventry: vdb.addEntry(ventry)     # Save the final entry
        ## Seq Check: Qry/Aln/Hit strings must all be the same length ##
        for ventry in vdb.entries():
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        ### ~ [3] Split on introns ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.obj['DNAHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F','dna=T'])
        self.obj['ProtHits'] = rje_seqlist.SeqList(self.log,self.cmd_list+['seqin=None','seqmode=tuple','autoload=F'])
        #i# Protein Position Conversion: convert aa positions to nt space for the splitting below
        if protqry:
            for ventry in vdb.entries():
                # 1->1, 2->4, 3->7 = 1+3*(n-1)
                ventry['QryStart'] = 1+3*(ventry['QryStart']-1)
                if ventry['QrySeq'].startswith('{'):    # Split codon carried over an intron
                    codend = ventry['QrySeq'].find('}')
                    # {X} = phase 2, find = 2
                    if codend == 2: ventry['QryStart'] += 2
                    # {XX} = phase 1, find = 3
                    elif codend == 3: ventry['QryStart'] += 1
                    else: raise ValueError('QrySeq {} bracket mismatch!: %s' % ventry)
                ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
        vdb.newKey(['Qry','Rank','Hit','AlnID'])
        for vkey in vdb.dataKeys():
            ventry = vdb.data(vkey)
            #i# Make a combined hitseq to output to fasta
            #># phap_PSETE__EBS10XV2PHAP187.FAXD1_NOTSC.XXX
            hitname = '%s.ex%s %s %s-%s' % (ventry['Qry'],ventry['Rank'],ventry['Hit'],rje.iStr(ventry['HitStart']),rje.iStr(ventry['HitEnd']))
            hitseq = ''
            phase = (ventry['QryStart'] + 2) % 3
            alnx = 1    # Reused here as the exon/block counter for this alignment
            vkeyentries = [ventry]
            dirn = 1    # Hit strand direction (+1 forward, -1 reverse)
            if ventry['HitEnd'] < ventry['HitStart']:
                dirn = -1
                ventry['HitStrand'] = '-'
            else: ventry['HitStrand'] = '+'
            # Strip split-codon brackets before measuring exon lengths
            for seq in ['HitSeq','QrySeq','AlnSeq']:
                ventry[seq] = string.replace(ventry[seq],'}','')
                ventry[seq] = string.replace(ventry[seq],'{','')
            # Split the entry at each "Target Intron" spacer, adding one entry per exon
            while rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq']):
                intron = rje.matchExp('(\s+>>>> Target Intron \d+ >>>>\s+)',ventry['QrySeq'])[0]
                x = ventry['QrySeq'].find(intron)
                y = x + len(intron)
                intronlen = int(rje.matchExp('(\d+) bp',ventry['AlnSeq'][x:y])[0])
                #i# Create a new entry of the first exon
                newentry = rje.combineDict({},ventry)
                for seq in ['HitSeq','QrySeq','AlnSeq']: newentry[seq] = newentry[seq][:x]
                newentry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx); alnx += 1
                newentry['QryEnd'] = newentry['QryStart'] + len(newentry['QrySeq']) - string.count(newentry['QrySeq'],'-') - 1
                newentry['HitEnd'] = newentry['HitStart'] + (len(newentry['HitSeq']) - string.count(newentry['HitSeq'],'-') - 1) * dirn
                newentry['Length'] = x
                newentry['Identity'] = string.count(newentry['AlnSeq'],'|')
                vkeyentries.append(vdb.addEntry(newentry))
                hitseq += newentry['HitSeq']
                #i# Update ventry to be the rest of the hit
                for seq in ['HitSeq','QrySeq','AlnSeq']: ventry[seq] = ventry[seq][y:]
                ventry['QryStart'] = newentry['QryEnd'] + 1
                if protqry: ventry['QryEnd'] = ventry['QryStart'] + len(ventry['QrySeq']) - string.count(ventry['QrySeq'],'-') - 1
                ventry['HitStart'] = newentry['HitEnd'] + intronlen * dirn
            #i# Calculate length and identity of final exon
            ventry['AlnID'] = '%s.%d' % (ventry['AlnID'],alnx)
            ventry['Length'] = len(ventry['AlnSeq'])
            ventry['Identity'] = string.count(ventry['AlnSeq'],'|')
            #i# Add sequence hits
            hitname += ' (%d alignment blocks)' % alnx
            hitseq += ventry['HitSeq']
            hitseq = string.replace(hitseq,'-','')
            protseq = rje_sequence.dna2prot('%s%s' % ('N' * phase,hitseq))
            self.obj['ProtHits']._addSeq(hitname,protseq)
            if ventry['HitStart'] > ventry['HitEnd']: hitseq = rje_sequence.reverseComplement(hitseq)
            self.obj['DNAHits']._addSeq(hitname,hitseq)
            #i# Update AlnID for proper float sorting (zero-pad the exon index)
            for ventry in vkeyentries:
                (vcore,vx) = string.split(ventry['AlnID'],'.')
                ventry['AlnID'] = '%s.%s' % (vcore,rje.preZero(int(vx),alnx))
        vdb.dataFormat({'AlnID':'string'})
        vdb.remakeKeys()
        self.debug(vdb.dataKeys())
        ## Seq Check (repeated after intron splitting) ##
        for ventry in vdb.entries():
            if len(ventry['QrySeq']) != len(ventry['AlnSeq']) or len(ventry['QrySeq']) != len(ventry['HitSeq']):
                self.debug(ventry)
                raise ValueError('Alignment sequence length mismatch! Qry:%d ; Aln:%d ; Hit:%d' % (len(ventry['QrySeq']),len(ventry['AlnSeq']),len(ventry['HitSeq'])))
        udb = self.reduceLocal(byqry=True)      # Reduced, query-unique subset of local hits
        udb.rename('unique')
        udb.newKey(['Qry','Rank','Hit','AlnID'])
        self.debug(vdb.dataKeys())
        #i# Calculate exon phase
        for ventry in vdb.entries() + udb.entries(): ventry['Phase'] = (ventry['QryStart'] - 1) % 3
        #i# Protein Position Conversion (back from nt to aa; Python 2 integer division)
        if protqry:
            for ventry in vdb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
            for ventry in udb.entries():
                ventry['QryStart'] = (ventry['QryStart']+2)/3
                ventry['QryEnd'] = (ventry['QryEnd']+2)/3
        return vdb
    except: self.errorLog('%s.alignmentToLocal error' % self.prog()); raise
def run(self):  ### Main run method
    '''
    Main run method. Pipeline:
    [1] Reformat *.fasta files to *.fas with renamed identifiers; format each as a BLAST DB.
    [2] Read CSV search result files, collecting hit accessions per sample (presumably
        Mascot-style output - header line 'prot_hit_num,prot_acc' - TODO confirm).
    [3] Extract hit sequences and GABLAM them against MC58_1.fas.
    [4] Identify hits with zero BLAST hits, GABLAM those against EMBL bacteria, and
        tabulate results to MC58_6RF_Zeros.tdt.
    '''
    try:
        ### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # *.fasta -> *.fas
            if os.path.exists(fas): continue    # Already reformatted
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    # Split name from optional description
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    if len(string.split(name,'|')) == 3:    # Presumably 6-reading-frame IDs
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:  # Presumably reference protein IDs
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas, rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)     # Sequence line: copy unchanged
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas, rje.integerString(sx), fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')    # Truncate/create the accession output file
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]   # Sample name (NOTE: shadows builtin file)
            hits = False    # True once the hit table header line has been seen
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    try: name = string.split(name,'|')[2]
                    except: continue    # Not a pipe-delimited identifier: skip
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)   # NOTE: shadows builtin id
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []      # Queries with no BLAST hits against MC58_1
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        # Merge hitsum data with best hit from the matching gablam table and tabulate
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'      # No gablam entry for this query
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
def setup(self):    ### Main class setup method.
    '''
    Main class setup method. Loads the pairwise PPI table, filters out complex-derived
    interactions, and loads the sequence data, populating:
    - self.dict['PPI'] = {hub gene:[spoke genes]} (filtered)
    - self.dict['SeqObj'], self.dict['Gene2Seq'], self.dict['Seq2Gene']
    - self.obj['SeqList']
    << returns True if setup successful, else False.
    '''
    try:
        ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt'
        self.progLog('\r#PPI', 'Loading pairwise data...')
        pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'], ['Spoke', 'SpokeSeq', 'Evidence'])
        gene2seq = {}       # {spoke gene:sequence name}
        seq2gene = {}       # {sequence short name:spoke gene}
        fullppi = {}        # {hub:{spoke:evidence}} - unfiltered PPI
        px = 0.0
        ptot = len(pairwise)
        ppix = 0            # Pairwise interaction count (each PPI counted in both directions)
        for pair in rje.sortKeys(pairwise):
            self.progLog('\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (px / ptot))
            px += 100.0
            [hub, spoke] = string.split(pair, '\t')
            if spoke not in gene2seq:
                sseq = pairwise[pair]['SpokeSeq']
                gene2seq[spoke] = sseq
                seq2gene[string.split(sseq, '__')[0]] = spoke
            if hub not in fullppi: fullppi[hub] = {}
            if spoke not in fullppi[hub]:
                # pop() frees memory as the pairwise dict is converted to fullppi
                fullppi[hub][spoke] = pairwise.pop(pair)['Evidence']
                ppix += 1
        self.printLog('\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)), rje.integerString(ppix / 2)))
        ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # goodppi lists evidence types that are accepted directly (one per line)
        goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt'
        goodppi = self.loadFromFile(goodppifile, chomplines=True)
        self.dict['PPI'] = {}
        px = 0.0
        ptot = len(fullppi)
        fppix = ppix        # Keep the unfiltered count for the summary message
        ppix = 0
        for hub in fullppi:
            self.progLog('\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px / ptot, rje.integerString(len(self.dict['PPI'])), rje.integerString(ppix)))
            px += 100.0
            self.dict['PPI'][hub] = []
            for spoke in fullppi[hub]:
                # Keep the spoke if any accepted evidence type supports the interaction...
                goodspoke = False
                for ptype in goodppi:
                    if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]):
                        goodspoke = True
                        break
                if goodspoke:
                    self.dict['PPI'][hub].append(spoke)
                    continue
                # ...otherwise keep it only if hub and spoke share no third partner
                # (shared partners suggest co-membership of a complex rather than direct PPI)
                goodspoke = True
                for spoke2 in fullppi[hub]:
                    if spoke2 in [hub, spoke]: continue
                    if spoke2 in fullppi[spoke]:
                        goodspoke = False
                        break
                if goodspoke: self.dict['PPI'][hub].append(spoke)
            ppix += len(self.dict['PPI'][hub])
            if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub)     # Drop empty hubs
        self.printLog('\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString(len(fullppi)), rje.integerString(len(self.dict['PPI'])), rje.integerString(fppix / 2), rje.integerString(ppix / 2)))
        ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas'
        scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T']
        seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd)
        self.dict['SeqObj'] = seqlist.seqNameDic('Max')
        self.dict['Gene2Seq'] = gene2seq
        self.dict['Seq2Gene'] = seq2gene
        return True     # Setup successful
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False    # Setup failed
def ANCHOR(self, retry=2):  ### Runs ANCHOR disorder prediction
    '''
    Runs ANCHOR disorder prediction on self.info['Sequence'], populating:
    - self.list['ResidueDisorder'] = per-residue scores (floats)
    - self.list['RegionDisorder'] / self.list['RegionFold'] = [(start,end)] 1-based regions
      above / at-or-below the self.stat['IUCut'] threshold.
    >> retry:int [2] = number of times to re-run on failure (recursive).
    << returns True on success, False on final failure.
    '''
    try:
        ### ~ [0] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [0a] ~ Setup sequence and temp file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        sequence = self.info['Sequence'].upper()
        # Randomised temp name avoids clashes between concurrent runs
        name = self.info['Name'][:4] + rje.randomString(8)
        tmp = name + '.tmp'
        ## ~ [0b] ~ Setup ANCHOR ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        apath = self.info['ANCHOR']     # May be the binary itself or its directory
        if os.path.basename(apath) == 'anchor': apath = os.path.dirname(apath)
        anchor = rje.makePath(apath) + 'anchor'
        if not os.path.exists(anchor):
            self.errorLog('Path "%s" not found!' % anchor, printerror=False)
            retry = 0   # Missing binary: pointless to retry
            raise IOError
        ### ~ [1] Run ANCHOR Disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        open(tmp, 'w').write('>%s\n%s\n' % (name, sequence))
        acmd = '%s %s -d %s' % (anchor, tmp, apath)     # -d = data directory for ANCHOR
        dlines = os.popen(acmd).readlines()
        try: os.unlink(tmp)
        except: self.errorLog('Cannot delete %s!' % tmp)
        ### ~ [2] Read in results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Name'] not in ['', 'None']: name = self.info['Name']  # Restore real name for messages
        self.list['ResidueDisorder'] = []
        for d in dlines:
            if d[:1] == '#': continue   # Skip comment lines
            # Data lines: "position residue score"
            if rje.matchExp('^(\d+)\s+(\S)\s+(\S+)', d):
                dm = rje.matchExp('^(\d+)\s+(\S)\s+(\S+)', d)
                pos = string.atoi(dm[0])
                aa = dm[1]
                score = string.atof(dm[2])
                i = len(self.list['ResidueDisorder'])   # Expected 0-based position
                # Sanity checks: residue must match the input sequence, position must be contiguous
                if sequence[i] != aa:
                    self.log.errorLog('%s: Position %d is %s in sequence but %s in ANCHOR output!' % (name, pos, sequence[i], aa), printerror=False)
                    raise ValueError
                if pos != (i + 1):
                    self.log.errorLog('%s: Position %d reached in ANCHOR output but previous results missing!' % (name, pos), printerror=False)
                    raise ValueError
                self.list['ResidueDisorder'].append(score)
        if len(self.list['ResidueDisorder']) != len(sequence):
            self.log.errorLog('%s: Sequence = %d aa but ANCHOR results stop at %s!' % (name, len(sequence), len(self.list['ResidueDisorder'])), printerror=False)
            raise ValueError
        ### ~ [3] ~ Make Regions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['RegionDisorder'] = []    # Regions with score > IUCut
        self.list['RegionFold'] = []        # Regions with score <= IUCut
        start = 0       # Current disorder region start (0 = none open)
        fstart = 0      # Current fold region start (0 = none open)
        i = 0
        dx = 0          # Total disordered residue count
        while i < len(sequence):
            score = self.list['ResidueDisorder'][i]
            i += 1      # i is now the 1-based position of this residue
            if not start and score > self.stat['IUCut']:    ### Start new disorder ###
                start = i
            elif start and score <= self.stat['IUCut']:     ### End!
                self.list['RegionDisorder'].append((start, i - 1))
                dx += i - start
                start = 0
            if not fstart and score <= self.stat['IUCut']:  ### Start new fold ###
                fstart = i
            elif fstart and score > self.stat['IUCut']:     ### End!
                self.list['RegionFold'].append((fstart, i - 1))
                fstart = 0
        # Close any region still open at the end of the sequence
        if start:
            self.list['RegionDisorder'].append((start, len(sequence)))
            dx += len(sequence) + 1 - start
        if fstart: self.list['RegionFold'].append((fstart, len(sequence)))
        self.minRegion()
        if self.opt['PrintLog']:
            self.log.printLog('\r#DIS', 'ANCHOR Disorder prediction complete: %d disorder regions, %d disordered aa' % (len(self.list['RegionDisorder']), dx))
        return True
    except:
        if retry:
            self.printLog('#RETRY', 'Trying %s again...' % name)
            return self.ANCHOR(retry - 1)   # Recursive retry
        self.log.errorLog('Error in Disorder.ANCHOR(%s). Disorder prediction failed.' % name)
        self.list['RegionDisorder'] = []
        self.list['RegionFold'] = []
        return False
def readHMMPFamSearch(self,resfile=None,readaln=False):     ### Reads HMM PFam Search Results into objects
    '''
    Reads hmmpfam search results into PFam-keyed search objects.
    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!!
    << returns True if the file was read successfully, else False.

    Results arrive per query sequence but searches are stored per HMM (PFam domain), so a
    {domain:search object} dictionary is built as the file is read. If self.opt['CleanRes'] is
    set, the results file is replaced by a reduced copy containing hit sections only, with a
    trailer line recording the original sequence count.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False)
            return False
        ## Make RegExp for starting next alignment (readaln is currently always False) ##
        re_hit = string.join(['^(\S+):','domain','(\d+)','of','(\d+),','from','(\d+)','to','(\d+):','score','(\S+),','E','=','(\S+)'],'\s+')
        ## Search dictionary as results come back per sequence, not per HMM! ##
        pfam = {}       # Dictionary of {PFam name:search}
        hitx = 0        # Total number of hits
        hitlist = []    # List of sequences processed from file (may or may not include zero hit sequences)
        ### ~ [1] Check file type ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if open(resfile,'r').readline().find('hmmpfam') != 0:
            self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False)
            if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'):
                os.unlink(resfile)
                self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile)
            return False
        ### ~ [2] Read in search results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hitname = None      # Current query sequence name
        hx = 0; seqx = 0    # Query count / original sequence count (from reduced file trailer)
        RESFILE = open(resfile,'r')
        line = RESFILE.readline()
        newres = [rje.chomp(line)]      # Lines retained for the reduced results file
        newresout = True                # Whether current newres block contains hits worth keeping
        newresfile = '%s.partial' % resfile
        if os.path.exists(newresfile): os.unlink(newresfile)
        while line:
            self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
            line = rje.chomp(line)
            ## New Sequence ##
            if rje.matchExp('^Query sequence:\s+(\S+)',line):
                # Flush the previous sequence block to the reduced file if it had hits
                if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
                newres = ['',line]; newresout = False
                hitname = rje.matchExp('^Query sequence:\s+(\S+)',line)[0]; hx += 1
            ## One Line Data for hits ##
            elif line.find('Parsed for domains:') == 0:
                # Keep the header plus the two column-header lines that follow
                newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())]
                line = rje.chomp(RESFILE.readline()); newres.append(line)
                #Model           Domain  seq-f seq-t    hmm-f hmm-t      score  E-value
                #--------        ------- ----- -----    ----- -----      -----  -------
                #Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44
                # ... else "[no hits above thresholds]"
                while rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line):
                    newresout = True
                    (dom,start,end,score,expect) = rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line)
                    if dom not in pfam:     # `in` replaces Py2-only has_key()
                        pfam[dom] = self._addSearch()
                        pfam[dom].info['Name'] = dom
                    hit = pfam[dom]._addHit()
                    hit.info['Name'] = hitname
                    aln = hit._addAln()
                    aln.setStat({'SbjStart':string.atoi(start),'SbjEnd':string.atoi(end),'Expect':string.atof(expect),'BitScore':string.atof(score)})
                    hitx += 1
                    self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx)))
                    line = rje.chomp(RESFILE.readline()); newres.append(line)
            ## End of Protein ##
            elif line[:2] == '//': hitname = None; newres.append(line)
            ## Trailer of a previously reduced file: recover the original sequence count ##
            # BUGFIX: the pattern previously contained a literal '(%d)' (an unsubstituted format
            # code), so it could never match the numeric trailer written below; use (\d+).
            elif rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line):
                seqx = string.atoi(rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line)[0])
            elif newres: newres.append(line)
            line = RESFILE.readline()
        RESFILE.close()     # Close before unlink/rename below (required on some platforms)
        if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n'))
        if not seqx: seqx = hx      # No trailer found: original count = queries read
        ### ~ [3] Replace results with reduced version ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.opt['CleanRes']:
            open(newresfile,'a').write(string.join(['','End of rje_hmm reduced results file: %d sequences in original' % seqx],'\n'))
            os.unlink(resfile)
            os.rename(newresfile,resfile)
            self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx)))
        self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx)))
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile))
        return False
def readHMMSearch(self,resfile=None,readaln=False):     ### Reads HMM Search Results into objects
    #!# Needs tidying! #!#
    '''
    Reads HMM Search Results into objects. Parses the plain-text output of hmmsearch in a single
    pass, walking an index `i` over the lines: basic search info, then the one-line hit scores,
    then the per-hit alignment blocks. (NOTE(review): format matched here looks like HMMER2
    hmmsearch text output — confirm against the HMMER version actually in use.)
    >> resfile:str = Results File (set as self.info['OutFile'])
    >> readaln:boolean = whether to bother reading Alignments into objects [False] (!!!currently always True!!!)
    << returns True on success; False after logging any exception (with the _stage reached).
    '''
    try:
        ### <a> ### Setup
        _stage = '<a> Setup'
        #print resfile
        if not resfile or not os.path.exists(resfile):
            self.log.errorLog('Results file (%) missing!' % resfile,False,False)
            raise IOError
        # Regex for the alignment header line, e.g. "seqname: domain 1 of 2, from 10 to 99: score 45.6, E = 1e-12"
        _hit_elements = ['^(\S+):','domain','(\d+)','of','(\d+),','from','(\d+)','to','(\d+):','score','(\S+),','E','=','(\S+)']
        _hit_re = string.join(_hit_elements,'\s+')

        ### <b> ### Read in Search results
        _stage = '<b> Read Results'
        self.verbose(0,4,'Reading %s HMMer search results' % resfile,0)
        RESFILE = open(resfile, 'r')
        lines = RESFILE.readlines()
        RESFILE.close()
        resline = []
        for line in lines:
            resline.append(re.sub('\n','',line))    # Strip newlines only; leading whitespace is significant for alignment blocks
        search = None
        i = 0
        hitaln = 0
        if resline[i].find('hmmsearch') != 0:   # First line must identify the program
            self.log.errorLog("File %s does not appear to be an hmmsearch results file" % resfile)
            raise
        while i < len(resline):
            line = resline[i]
            #print line
            ## <i> ## Basic Search Info
            _stage = '<b-i> Basic Search Info'
            if line.find('HMM file:') == 0:     # New search: named after the HMM file
                search = self._addSearch()
                search.info['Name'] = rje.matchExp('HMM file:\s+(\S+)',line)[0]
                self.verbose(0,4,'.',0)
                self.verbose(1,3,'\n%s' % search.info['Name'],0)
            elif line.find('Sequence database:') == 0:
                search.info['SearchDB'] = rje.matchExp('Sequence database:\s+(\S+)', line)[0]
            elif line.find('Total sequences searched:') == 0:
                dbnum = rje.matchExp('Total sequences searched:\s+(\d\S*)', line)[0]
                dbnum = re.sub('\D','',dbnum)   # Remove thousands separators etc. before conversion
                search.stat['DBNum'] = string.atoi(dbnum)
            ## <ii> ## One-line hit data (BLASTHit)
            elif line.find('Scores for complete sequences') == 0:   # One-line hits
                _stage = '<b-ii> One-line hits'
                i += 3  # Skip two lines
                while re.search('^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+(\d+)\s*$',resline[i]):
                    match = rje.matchExp('^(\S+)\s.+\s(\S*\d)\s+(\S*\d)\s+\d+\s*$',resline[i])
                    self.verbose(2,3,'\n - %s (%s, %s)' % match,0)
                    hit = search._addHit()
                    hit.info['Name'] = match[0]
                    hit.stat['BitScore'] = string.atof(match[1])
                    #print hit.stat['BitScore'], resline[i], match
                    eval = match[2]
                    if eval.find('e') == 0:     # Bare exponent like "e-30" needs a leading mantissa
                        eval = '1' + eval
                    hit.stat['E-Value'] = string.atof(eval)
                    i += 1
                line = resline[i]   # End of one-lines (blank line)
                self.verbose(1,3,'=> %d Hits' % search.hitNum(),1)
                hitaln = 0
            #!# Make new No hits pattern match
            elif line.find('***** No hits found ******') >= 0:  # No Hits
                search.hit = []
                self.verbose(1,3,'=> %d Hits' % search.hitNum(),1)
                hitaln = 0
            ## <iii> ## Aln Hit data (PWAln)
            #!# Consider reading in the 'parsed for domains' section instead/as well
            elif re.search(_hit_re,line):   # New aln hit
                _stage = '<b-iii> Aln Hit Info'
                # Identify hit object: reconcile this alignment header with the one-line hits read above
                _hit_detail = rje.matchExp(_hit_re,line)
                #print _hit_detail
                hitname = _hit_detail[0]
                #hitaln += 1 - string.atoi(_hit_detail[1])
                #print hitname
                try:
                    #if hitname != search.hit[hitaln].info['Name']:
                    for hit in search.hit:
                        if hit.info['Name'] == hitname:
                            hitaln = search.hit.index(hit)
                    if hitname != search.hit[hitaln].info['Name']:
                        self.log.errorLog('Problem with HMM results %s - %s single-line hits and alignments do not match' % (hitname,search.info['Name']),printerror=False,quitchoice=True)
                        i += 1
                        continue
                except:
                    self.log.errorLog('Problem with HMM results reconciling %s - %s single-line hits and alignments.' % (hitname,search.info['Name']),True,True)
                    i += 1
                    continue
                hit = search.hit[hitaln]
                #print hit
                hitaln += 1
                # Add details
                _stage = '<b-iii> Add Aln Hit Info'
                aln = hit._addAln()
                aln.stat['SbjStart'] = string.atoi(_hit_detail[3])
                aln.stat['SbjEnd'] = string.atoi(_hit_detail[4])
                aln.stat['BitScore'] = string.atof(_hit_detail[5])
                aln.stat['Expect'] = string.atof(_hit_detail[6])
                ## <iv> ## Alignments: blocks of three lines (query / match / subject) + blank line
                readaln = True
                i += 1
                while readaln:
                    _stage = '<b-iv> Read alignments'
                    line = resline[i]
                    #print line
                    block = rje.matchExp('^(\s+)(\S+)',line)
                    #print block
                    if block:
                        # Query Line: leading whitespace length fixes the column window for the next two lines
                        leadlen = len(block[0])
                        seqblock = block[1]
                        #print block, leadlen, (leadlen+len(seqblock))
                        if block[1][:3] == '*->':   # Start
                            leadlen += 3
                            #print seqblock[3:]
                            seqblock = seqblock[3:]
                        if block[1][-3:] == '<-*':  # End
                            #print seqblock[:-3]
                            seqblock = seqblock[:-3]
                            readaln = False     # '<-*' marks the final alignment block for this hit
                        #print block, leadlen, (leadlen+len(seqblock))
                        aln.info['QrySeq'] += seqblock
                        # Alignment Line
                        i += 1
                        aln.info['AlnSeq'] += resline[i][leadlen:(leadlen+len(seqblock))]
                        # Subject Line
                        i += 1
                        aln.info['SbjSeq'] += resline[i][leadlen:(leadlen+len(seqblock))]
                        # Skip Blank line
                        i += 2
                    else:
                        #print 'This should be a block!:\n', line
                        i += 1
            i += 1
        #print self.search
        #print self.search[0].hit
        #print self.search[0].hit[0].aln
        self.verbose(0,1,'Reading of %s HMM results complete! (%d Searches)' % (resfile,len(self.search)),2)
        return True
    except:
        self.log.errorLog('Calamity during readHMMSearch(%s) %s.' % (resfile,_stage))
        return False
def makeFlySeq(self):   ### Main run method
    '''
    Main run method. Loads FlyBase gene, CDS and exon fasta files, maps CDS/exon coordinates onto
    their parent genes, converts absolute scaffold positions into gene-relative positions, and
    saves a regenerated gene fasta with CDS/exon positions encoded in the sequence names.
    '''
    try:
        ### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        flybase = rje.makePath('/scratch/Databases/NewDB/FlyBase/Fasta/')
        scmd = ['accnr=F','seqnr=F','gnspacc=F']    # Disable redundancy filtering / name reformatting on load
        genes = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%sdmel-all-gene-r5.5.fasta' % flybase]+scmd)
        cds = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%sdmel-all-CDS-r5.5.fasta' % flybase]+scmd)
        exons = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%sdmel-all-exon-r5.5.fasta' % flybase]+scmd)
        ### ~ [1] ~ Read in full-length gene and note start and end positions in parent scaffold ~~~~~~~~~~~~~~~~ ###
        genedict = {}   # Dictionary of {ID:Sequence object}
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            self.log.printLog('\r#GENE','Processing Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            # Pull ID, scaffold, location, name and length out of the FlyBase fasta description line
            (id,scaffold,pos,name,glen) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);',gene.info['Name'])
            if string.atoi(glen) != gene.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            genedict[id] = gene
            gene.setInfo({'Scaffold':scaffold,'Gene':name})
            # complement(a..b) = reverse strand: captured as (end,start) so that start > end below
            try: (end,start) = rje.matchExp('^complement\((\d+)\.\.(\d+)\)',pos)
            except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            gene.opt['Complement'] = start > end    # Sequence on "lagging" strand
            gene.setStat({'Start':start,'End':end})
            gene.list['CDS'] = []   # Will add CDS sequences here
            gene.list['Exon'] = []  # Will add exon sequences here
        self.log.printLog('\r#GENE','Processing Gene Annotation complete!')
        ### ~ [2] ~ Read in associated CDS sequences and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (cx,ctot) = (0.0,cds.seqNum())
        for seq in cds.seq:
            self.log.printLog('\r#CDS','Processing CDS Annotation: %.1f%%' % (cx/ctot),newline=False,log=False)
            cx += 100
            try: (id,scaffold,pos,name,glen,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+length=(\d+);.+parent=(\S+),\S+;',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            if string.atoi(glen) != seq.aaLen(): self.log.errorLog('%s Length mismatch!' % id, printerror=False)
            seq.obj['Parent'] = gene = genedict[parent]     # Link CDS to its parent gene object
            # Location may be complement(...), join(...) or a plain a..b range
            try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
            except:
                try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start':start,'End':end})
            gene.list['CDS'].append(seq)
        self.log.printLog('\r#CDS','Processing CDS Annotation complete!')
        ### ~ [3] ~ Read in associated exons and note start and end positions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        (ex,etot) = (0.0,exons.seqNum())
        for seq in exons.seq:
            self.log.printLog('\r#EXON','Processing Exon Annotation: %.1f%%' % (ex/etot),newline=False,log=False)
            ex += 100
            try: (id,scaffold,pos,name,parent) = rje.matchExp('^(\S+)\s.+loc=(\S+):(\S+);.+name=(\S+);.+parent=(\S+);',seq.info['Name'])
            except:
                self.log.errorLog(seq.info['Name'])
                raise
            # Exons may list several parents, comma-separated: use the first
            seq.obj['Parent'] = gene = genedict[string.split(parent,',')[0]]
            try: (end,start) = rje.matchExp('^complement\((\d+)\..*\.(\d+)\)',pos)
            except:
                try: (start,end) = rje.matchExp('^join\((\d+)\..*\.(\d+)\)',pos)
                except: (start,end) = rje.matchExp('^(\d+)\.\.(\d+)',pos)
            (start,end) = (string.atoi(start),string.atoi(end))
            seq.opt['Complement'] = start > end     # Sequence on "lagging" strand
            seq.setStat({'Start':start,'End':end})
            gene.list['Exon'].append(seq)
        self.log.printLog('\r#EXON','Processing Exon Annotation complete!')
        ### ~ [4] ~ Regenerate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [4a] ~ Convert to relative positions and store ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (gx,gtot) = (0.0,genes.seqNum())
        for gene in genes.seq:
            glen = gene.aaLen()
            self.log.printLog('\r#GENE','Generating new Gene Annotation: %.1f%%' % (gx/gtot),newline=False,log=False)
            gx += 100
            clist = []
            for seq in gene.list['CDS']:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))     # Zero-padded for sortable text
                clist.append(pos)
            clist = rje.sortUnique(clist,xreplace=False)
            elist = []
            for seq in gene.list['Exon']:
                if gene.opt['Complement']:  # Must subtract from "wrong" end and reverse
                    start = gene.stat['Start'] - seq.stat['Start']
                    end = gene.stat['Start'] - seq.stat['End']
                else:
                    start = seq.stat['Start'] - gene.stat['Start']
                    end = seq.stat['End'] - gene.stat['Start']
                pos = '%s-%s' % (rje.preZero(start,glen),rje.preZero(end,glen))
                elist.append(pos)
            elist = rje.sortUnique(elist,xreplace=False)
            # New name: gene_SPEC__ACC Length=...; CDS=...; Exons=...;
            gene.info['Name'] = '%s_%s__%s Length=%d; CDS=%s; Exons=%s;' % (gene.info['Gene'],gene.info['SpecCode'],gene.info['AccNum'],gene.aaLen(),string.join(clist,','),string.join(elist,','))
        self.log.printLog('\r#GENE','Generating new Gene Annotation complete!')
        ## ~ [4b] ~ Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        genes.saveFasta(seqfile='flybase_DROME.genes.fas')
    except: self.log.errorLog(rje_zen.Zen().wisdom())
def makeHTML(self):     ### Generates HTML pages for interactive navigation.
    '''
    Generates HTML pages for interactive navigation. Builds a tabbed front page (hits, GABLAM
    table, candidates) plus one page per hit sequence linking the HAQESAC output files.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        basefile = self.basefile()
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('Candidates'),'autoload=T','autofilter=F','seqmode=file']
        candseq = rje_seqlist.SeqList(self.log,scmd)
        # All files and directories are named after basefile:
        # *.fas = original target PROTEIN sequences (with original descriptions)
        scmd = self.cmd_list + ['seqin=%s' % self.getStr('SeqIn'),'autoload=T','autofilter=F','seqmode=file']
        seqlist = rje_seqlist.SeqList(self.log,scmd)
        # *.gablam.tdt = GABLAM results with match details. (Might have *.hmmer.tdt instead.)
        gdb = self.db().addTable('%s.gablam.tdt' % basefile,mainkeys=['Qry','Hit'],name='gablam',expect=False)
        # - Contains candidate proteins as Queries and Target proteins as hits
        # *.HAQESAC/ = directory containing individual HAQESAC runs, named after Hit accnum
        haqdir = rje.makePath('./%s.HAQESAC/' % basefile)
        ### ~ [2] Generate front page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        hfile = '%s.html' % basefile
        hobj = self.obj['HTML']
        hobj.list['StyleSheets'] = ['http://www.slimsuite.unsw.edu.au/stylesheets/rje_tabber.css',
                                    'http://www.slimsuite.unsw.edu.au/stylesheets/slimhtml.css']
        html = hobj.htmlHead(basefile)
        # Front page should have:
        html += '<h1>%s</h1>\n\n' % basefile
        htabs = []  # (tab_id, tab_html_text[, tab_title])
        # Target protein list (with links to HAQ HTML)
        ctext = '%s\n' % string.join(['Name','Descripton','Length'],'\t')
        seqdict = seqlist.makeSeqNameDic('short')
        if gdb: hitlist = gdb.indexKeys('Hit')
        else: hitlist = rje.sortKeys(seqdict)
        for name in hitlist:
            seq = seqdict[name]
            cseq = [name,seqlist.seqDesc(seq),'%s aa' % seqlist.seqLen(seq)]
            acc = seqlist.seqAcc(seq)
            # Only link to the HAQESAC page if a run was actually performed (log file exists)
            if os.path.exists('%s%s.log' % (haqdir,acc)): cseq[0] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,cseq[0])
            ctext += '%s\n' % string.join(cseq,'\t')
        htabs.append(('Hits',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Target sequences hit by candidates.'))
        # GABLAM/HMM table (with above links)
        if gdb:
            ctext = '%s\n' % string.join(gdb.fields(),'\t')
            for gline in open('%s.gablam.tdt' % basefile,'r').readlines()[1:]:
                gdata = string.split(gline,'\t')
                # Qry -> UniProt link; Hit -> local HAQESAC page link. AccNum is the last '__'-separated element.
                acc = string.split(gdata[0],'__')[-1]
                gdata[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,gdata[0])
                acc = string.split(gdata[1],'__')[-1]
                gdata[1] = '<a href="%s%s.html">%s</a>' % (haqdir,acc,gdata[1])
                ctext += '%s\n' % string.join(gdata,'\t')
            htabs.append(('GABLAM',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'GABLAM hit table.'))
        # Candidate list (with DB links)
        if candseq.seqNum():
            ctext = '%s\n' % string.join(['AccNum','ID','Descripton','Length'],'\t')
            accdict = candseq.makeSeqNameDic('accnum')
            for acc in rje.sortKeys(accdict):
                seq = accdict[acc]
                cseq = [acc,candseq.seqID(seq),candseq.seqDesc(seq),'%s aa' % candseq.seqLen(seq)]
                cseq[0] = '<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a>' % (acc,acc)
                ctext += '%s\n' % string.join(cseq,'\t')
            htabs.append(('Candidates',rje_html.tableToHTML(ctext,'\t',tabid='parse'),'Candidate sequences to search.'))
        html += hobj.tabberHTML('GABLAM',htabs)
        html += hobj.htmlTail()
        open(hfile,'w').write(html)
        ### ~ [3] Generate sequence-specific pages ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        #?# Move this to HAQESAC or MultiHAQ
        for i in range(len(hitlist)):
            hit = string.split(hitlist[i],'__')[-1]
            logfile = '%s%s.log' % (haqdir,hit)
            seqbase = logfile[:-4]      # Strip '.log' to get the per-hit file basename
            hfile = '%s.html' % seqbase
            html = hobj.htmlHead(seqbase)
            # Front page should have:
            html += '<h1>%s</h1>\n\n' % seqbase
            html += '<p>Click <a href="../%s.html">here</a> to return to results summary. \n' % basefile
            # Previous/Next navigation links between hit pages
            if i: html += 'Previous: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i-1],'__')[-1],hitlist[i-1])
            if i < len(hitlist)-1: html += 'Next: <a href="./%s.html"><code>%s</code></a>. \n' % (string.split(hitlist[i+1],'__')[-1],hitlist[i+1])
            html += '</p>\n'
            htabs = []  # (tab_id, tab_html_text[, tab_title])
            # One tab per HAQESAC output file type found on disk
            for ftype in ['png','tree.txt','fas','nwk','log']:
                seqfile = '%s.%s' % (seqbase,ftype)
                if not os.path.exists(seqfile): continue
                tabtext = '<p><a href="./%s">./%s</a></p>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                if ftype == 'png':
                    tabtext += '<a href="./%s"><img src="%s" width="100%%"></a>\n' % (os.path.basename(seqfile),os.path.basename(seqfile))
                    tabdesc = 'PNG of %s tree.' % seqbase
                else:
                    tabtext += '<pre>%s</pre>\n' % open(seqfile,'r').read()
                    if ftype == 'tree.txt':
                        # Cross-link sequence names inside the text tree to their pages / UniProt
                        for xref in hitlist:
                            reptext = '<a href="./%s.html">%s</a>' % (string.split(xref,'__')[-1],xref)
                            tabtext = string.replace(tabtext,': %s ' % xref,': %s ' % reptext)
                        while rje.matchExp('(: \S+_(\S+)__(\S+) )',tabtext):
                            (oldtext,sid,spec,spacc) = rje.matchExp('(: (\S+)_(\S+)__(\S+) )',tabtext)
                            newtext = ': %s_<a href="http://www.uniprot.org/taxonomy/?query=%s&sort=score" target="_blank">%s</a>__<a href="http://www.uniprot.org/uniprot/%s" target="_blank">%s</a> ' % (sid,spec,spec,spacc,spacc)
                            tabtext = string.replace(tabtext,oldtext,newtext)
                    tabdesc = '%s output' % seqfile
                htabs.append((ftype,tabtext,tabdesc))
            if htabs: html += hobj.tabberHTML(os.path.basename(seqbase),htabs)
            else: html += '<p><i>No output found for <code>%s</code>!</i></p>\n' % hit
            html += hobj.htmlTail()
            open(hfile,'w').write(html)
    except: self.errorLog('Problem with %s.makeHTML()' % self.prog())
def loadTimePoints(self,filename):  ### Load TimePoints from file of various formats
    '''
    Load TimePoints from file of various formats into the 'TimePoints' database table.

    Recognised formats (detected from the first line of the file):
    (a) delimited text with a 'TimePoint Name' header field;
    (b) database string dump, one "('value1','value2',...)" tuple per line;
    (c) glossary text: "Name. (TimePoint) <date>. <description> Source: <url>[...]. (Keywords: ...)".

    >> filename:str = Input file name.
    << returns True on success; False (or the errorLog return) on failure.
    '''
    try:
        ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(filename): return self.errorLog('File %s missing!' % filename)
        data = open(filename,'r').readlines()
        db = self.db('TimePoints')
        ### ~ [2] Load from File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Delimited File Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        # Bug fix: string.split(data[0])[0] could never equal the two-word header 'TimePoint Name'
        # (whitespace split yields 'TimePoint'), so this branch was unreachable.
        if data[0].startswith('TimePoint Name'):
            ftype = 'delimited text file'   # Bug fix: previously unset here -> NameError at summary log [3]
            temp = self.db().addTable(filename,mainkeys=['TimePoint Name'],name='temp')
            for entry in temp.entries(): db.addEntry(entry)
            db.deleteTable(temp)
        ## ~ [2b] File of Database Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        elif data[0][0] == '(':
            ftype = 'database string'
            for line in data:
                line = rje.chomp(line)
                while line[-1:] == ' ': line = line[:-1]    # Strip trailing spaces
                # Convert "('v1', 'v2', ...)" into a list of field values
                pdata = string.split(string.replace(line[2:-3],', ',','),"','")
                if not pdata: continue
                if rje.matchExp('^(\d+)$',pdata[0]): pdata.pop(0)   # Database output with key ID numbers
                entry = {}
                for field in db.fields(): entry[field] = pdata[db.fields().index(field)]
                db.addEntry(entry)
        ## ~ [2c] Glossary Text File ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        else:
            ftype = 'glossary text file'
            for line in data:
                if '(TimePoint)' not in line: continue
                # American Independence. (TimePoint) 1776 AD, 4 July. The US declared independence from the British Empire. Source: <http://en.wikipedia.org/wiki/United_States_Declaration_of_Independence>[Wikipedia]. (Keywords: history)
                pdata = string.split(line,'. ')
                # Rejoin "X ya"-style dates (e.g. "4 mya. ") that were split on '. '
                if pdata[2][-2:] == 'ya': pdata[1] = '%s. %s' % (pdata[1],pdata.pop(2))
                entry = {'TimePoint Name':pdata[0]}
                try: entry['Source URL'] = rje.matchExp('Source: <(\S+)>',line)[0]
                except: self.errorLog('Cannot read Source URL')
                try: entry['TimePoint Description'] = rje.matchExp('^(\S.+\S) Source: <',string.join(pdata[2:],'. '))[0]
                except: self.errorLog('Cannot read TimePoint Description: %s' % line)
                # Parse the date portion into Year/yearUnit (plus optional month/day)
                if pdata[1][-2:] == 'ya': [entry['Year'],entry['yearUnit']] = string.split(pdata[1])[-2:]
                else:
                    try:
                        ydata = rje.matchExp('(\d+) (\S+), (\d+) (\S+)$',pdata[1])
                        if ydata:
                            for i in range(4): entry[['Year','yearUnit','month','day'][i]] = ydata[i]
                        else: (entry['Year'],entry['yearUnit']) = rje.matchExp('(\d+) (\S+)$',pdata[1])
                    except: self.errorLog('Cannot parse time from %s' % pdata[1])
                # Up to five keywords; pad unused keyword fields with 'blank'
                kfield = ['keyword1','keyword2','keyword3','keyword4','keyword5']
                try:
                    keywords = string.split(rje.matchExp('\(Keywords: (\S.+)\)',pdata[-1])[0],', ')
                    while keywords and kfield: entry[kfield.pop(0)] = keywords.pop(0)
                    while kfield: entry[kfield.pop(0)] = 'blank'
                    if keywords: self.printLog('#ERR','%d extra Keywords (%s)!' % (len(keywords),string.join(keywords,', ')))
                except: self.errorLog('Cannot read Keywords (%s)' % pdata[-1])
                db.addEntry(entry)
        ### ~ [3] Summarise Input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.printLog('#TP','Timepoints read from %s: %s TimePoints total.' % (ftype,db.entryNum()))
        return True
    except:
        self.errorLog('%s.loadTimePoints(%s) error' % (self,filename))
        return False