def makePPIDatasets(self): ### Generate PPI datasets from pairwise data '''Generate PPI datasets from pairwise data.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rje.mkDir(self, 'YeastPPI/') seqdict = self.dict['SeqDict'] ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (hx, htot, fx) = (0.0, len(self.dict['PPI']), 0) for hub in rje.sortKeys(self.dict['PPI']): self.progLog( '\r#FAS', 'Generating %s PPI fasta files: %.2f' % (rje.integerString(fx), hx / htot)) hx += 100.0 if len(self.dict['PPI'][hub]) < 3: continue seqs = [] for spoke in self.dict['PPI'][hub]: if spoke not in seqdict: continue seqs.append(seqdict[spoke]) if len(seqs) < 3: continue self.obj['SeqList'].saveFasta(seqs, rje.makePath('YeastPPI/%s.fas' % hub, wholepath=True), log=False) fx += 1 self.printLog( '\r#FAS', 'Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx), rje.integerString(htot))) except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
def startElement(self, tag, attributes): ### Called when a new element begins ### ~ [1] Generate XML object for element ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.printLog( '\r#PARSE', 'Parsing %s: %s elements (%s level 1; %s retained)' % (self.xml.info['Name'], rje.integerString( self.e), rje.integerString(self.x), rje.integerString(self.r)), False, False) self.e += 1 if not self.parsing: myxml = self.xml # Very first Element goes in main XML Object myxml.list['ParentTags'] = [tag] else: myxml = XML(log=self.xml.log, cmd_list=self.xml.cmd_list) myxml.obj['Parent'] = self.parsing[-1] myxml.list[ 'ParentTags'] = myxml.obj['Parent'].list['ParentTags'] + [tag] if self.parsing[-1] == self.xml: self.x += 1 if tag in self.xml.list['Elements'] or not self.xml.list['Elements']: if myxml.obj['Parent']: self.parsing[-1].list['XML'].append(myxml) if myxml.list['ParentTags'] not in self.schemalist: self.schemalist.append(myxml.list['ParentTags']) myxml.info['Name'] = tag self.parsing.append(myxml) myxml.stat['Level'] = len(self.parsing) ### ~ [2] Update Attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### schjoin = string.join(myxml.list['ParentTags'], ':') if schjoin not in self.schatts: self.schatts[schjoin] = [] for q in attributes.getQNames(): if q in self.xml.list['Attributes'] or not self.xml.list[ 'Attributes']: # Only add if wanted myxml.dict['Attributes'][q] = attributes.getValueByQName(q) if q not in self.schatts[schjoin]: self.schatts[schjoin].append(q)
def dpi(self): ### Domain-protein interactions '''Domain-protein interactions.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not self.dict['Domain']: return outdir = 'SLiMPID_DPI' rje.mkDir(self, outdir) dpi = {} # Dictionary of {domain:[interactors]} badname = [] ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dom in rje.sortKeys(self.dict['Domain']): dpi[dom] = [] for hub in self.dict['Domain'][dom]: if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][ hub] # Add with redundancy for spoke in dpi[dom][0:]: if dpi[dom].count(spoke) == 1: dpi[dom].remove( spoke) # Must have 2+ domain interactions for hub in self.dict['Domain'][dom]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in dpi[dom]: self.dict['PPI'][hub].remove(spoke) if spoke in self.dict['PPI'] and hub in self.dict[ 'PPI'][spoke]: self.dict['PPI'][spoke].remove(hub) dpi[dom] = rje.sortUnique(dpi[dom], False, False) acc = [] for name in dpi[dom]: if not name: continue if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum']) elif name not in badname: badname.append(name) open('%s/%s.dpi.acc' % (outdir, dom), 'w').write(string.join(acc, '\n')) self.printLog('#DPI', '%s domain => %d interactors' % (dom, len(acc))) if badname: badname.sort() self.printLog( '#BAD', '%d "bad" protein names: %s' % (len(badname), string.join(badname, '; '))) ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### hx = len(self.dict['PPI']) for hub in rje.sortKeys(self.dict['PPI']): if hub and self.dict['PPI'][hub]: continue self.dict['PPI'].pop(hub) self.printLog('#DPI', 'No %s PPI left after DPI removed' % hub, screen=False) self.printLog( '#PPX', '%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len( self.dict['PPI'])), rje.integerString(hx))) except: self.errorLog('Problem with SLiMPID.dpi()', quitchoice=True)
def loadFeatures(self, ftfile): ### Loads features from given file '''Loads features from given file.''' try: ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if ftfile in ['', 'none']: return if not os.path.exists(ftfile): return self.printLog('#ERR', 'Features file "%s" missing') delimit = rje.delimitFromExt(filename=ftfile) ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## headers = rje.readDelimit(open(ftfile, 'r').readline(), delimit) mainkeys = [headers[0]] hmap = {} for h in headers: hmap[h.lower()] = h pos = '' # Leader for start/end positions if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_' for h in [ 'feature', '%sstart' % pos, '%send' % pos, 'description' ]: if h not in hmap: return self.printLog( '#ERR', 'No %s field detected in "%s" features file' % (h, ftfile)) mainkeys.append(hmap[h]) mainkeys.remove(hmap['description']) ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ftdata = rje.dataDict(self, ftfile, mainkeys, ['description'], delimit, headers, lists=True) (mx, mtot, fx) = (0.0, len(ftdata), 0) for mainkey in rje.sortKeys(ftdata): self.progLog( '\r#FT', 'Loading features from %s: %.2f%%' % (ftfile, mx / mtot)) mx += 100.0 (id, ft, start, end) = string.split(mainkey, delimit) if id == mainkeys[0]: continue if id not in self.dict['Features']: self.dict['Features'][id] = [] for desc in ftdata[mainkey][hmap['description']]: fx += 1 self.dict['Features'][id].append({ 'Type': ft, 'Start': int(start), 'End': int(end), 'Desc': desc }) self.printLog( '\r#FT', 'Loaded %s features for %s IDs from %s' % (rje.integerString(fx), rje.integerString(len(self.dict['Features'])), ftfile)) except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def ddi(self): ### Domain-domain interactions '''Domain-domain interactions.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ddx = 0 (dx, dtot) = (0.0, len(self.dict['DDI'])) if not self.dict['DDI'] or not self.dict['Domain']: return ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dom in rje.sortKeys(self.dict['DDI']): self.progLog( '\r#DDI', 'Screening domain-domain interactions: %.1f%%; %s removed' % ((dx / dtot), rje.integerString(ddx))) dx += 100 if dom not in self.dict['Domain']: self.printLog('#DOM', 'No sequences with "%s" domains' % dom) continue for ddi in self.dict['DDI'][dom]: if ddi not in self.dict['Domain']: continue for hub in self.dict['Domain'][dom]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in self.dict['Domain'][ddi]: ddx += 1 self.dict['PPI'][hub].remove(spoke) for hub in self.dict['Domain'][ddi]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in self.dict['Domain'][dom]: ddx += 1 self.dict['PPI'][hub].remove(spoke) self.printLog( '\r#DDI', 'Screening domain-domain interactions complete: %s removed.' % (rje.integerString(ddx))) ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### hx = len(self.dict['PPI']) for hub in rje.sortKeys(self.dict['PPI']): if hub and self.dict['PPI'][hub]: continue self.dict['PPI'].pop(hub) self.printLog('#DDI', 'No %s interactions left after DDI removed' % hub, screen=False) self.printLog( '#PPX', '%s of %s PPI hubs remain after DDI removed' % (rje.integerString(len( self.dict['PPI'])), rje.integerString(hx))) except: self.errorLog('Problem with SLiMPID.ddi()', quitchoice=True)
def loadPPI(self): ### Load pairwise interaction data '''Load pairwise interaction data.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not rje.checkForFile(self.info['PPIFile']): return False ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for line in open(self.info['PPIFile'],'r').readlines(): try: [pa,pb] = string.split(rje.chomp(line))[:2] except: continue for ppi in [(pa,pb),(pb,pa)]: if ppi[0] not in self.dict['PPI']: self.dict['PPI'][ppi[0]] = [] if ppi[1] not in self.dict['PPI'][ppi[0]]: self.dict['PPI'][ppi[0]].append(ppi[1]) self.progLog('\r#PPI','Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI']))) self.printLog('\r#PPI','Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI']))) except: self.errorLog(rje_zen.Zen().wisdom()); raise # Delete this if method error not terrible
def loadPillars(self): ### Load YGOB Pillar data '''Load YGOB Pillar data.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not rje.checkForFile(self.info['Pillars']): return False ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for line in self.loadFromFile(filename=self.info['Pillars'],chomplines=True): pillars = string.split(line) #self.deBug('%s = %d' % (pillars,len(pillars))) if len(pillars) < 17: continue pillars = pillars[:5] + pillars[6:] # Remove ancestral gene while '---' in pillars: pillars.remove('---') #self.deBug('%s = %d' % (pillars,len(pillars))) if pillars: self.list['Pillars'].append(pillars) self.progLog('\r#YGOB','Loading Pillar data: %s loci' % rje.integerString(len(self.list['Pillars']))) self.printLog('\r#YGOB','Loaded Pillar data for %s loci' % rje.integerString(len(self.list['Pillars']))) except: self.errorLog(rje_zen.Zen().wisdom()); raise # Delete this if method error not terrible
def loadAlias(self, sourcefile): ### Loads Alias data ''' Loads Alias data. >> sourcefile:str = Source filename ''' try: ### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if sourcefile.lower() in ['', 'none']: return if not os.path.exists(sourcefile): return self.log.errorLog('Alias file "%s" not found' % (sourcefile), printerror=False) data = rje.dataDict(self, sourcefile, datakeys=['Aliases'], lists=True) ### ~ [2] Parse out Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (hx, htot) = (0.0, len(data)) for id in data: self.log.printLog('\r#ALIAS', 'Processing %s: %.1f%%' % (sourcefile, hx / htot), newline=False, log=False) hx += 100.0 ## ~ [2a] Update self.dict ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## for alist in data[id]['Aliases']: for alias in string.split(alist, ','): self.addAlias(id, alias) if id in self.dict['Aliases']: self.dict['Aliases'][id].sort() self.log.printLog( '\r#ALIAS', 'Processed %s: %s IDs with aliases' % (sourcefile, rje.integerString(len(self.dict['Aliases'])))) except: self.log.errorLog(rje_zen.Zen().wisdom())
def ddi(self): ### Domain-domain interactions '''Domain-domain interactions.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ddx = 0 (dx,dtot) = (0.0,len(self.dict['DDI'])) if not self.dict['DDI'] or not self.dict['Domain']: return ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dom in rje.sortKeys(self.dict['DDI']): self.progLog('\r#DDI','Screening domain-domain interactions: %.1f%%; %s removed' % ((dx/dtot),rje.integerString(ddx))); dx += 100 if dom not in self.dict['Domain']: self.printLog('#DOM','No sequences with "%s" domains' % dom); continue for ddi in self.dict['DDI'][dom]: if ddi not in self.dict['Domain']: continue for hub in self.dict['Domain'][dom]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in self.dict['Domain'][ddi]: ddx+=1; self.dict['PPI'][hub].remove(spoke) for hub in self.dict['Domain'][ddi]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in self.dict['Domain'][dom]: ddx+=1; self.dict['PPI'][hub].remove(spoke) self.printLog('\r#DDI','Screening domain-domain interactions complete: %s removed.' % (rje.integerString(ddx))) ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### hx = len(self.dict['PPI']) for hub in rje.sortKeys(self.dict['PPI']): if hub and self.dict['PPI'][hub]: continue self.dict['PPI'].pop(hub) self.printLog('#DDI','No %s interactions left after DDI removed' % hub,screen=False) self.printLog('#PPX','%s of %s PPI hubs remain after DDI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx))) except: self.errorLog('Problem with SLiMPID.ddi()',quitchoice=True)
def loadFeatures(self,ftfile): ### Loads features from given file '''Loads features from given file.''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if ftfile in ['','none']: return if not os.path.exists(ftfile): return self.printLog('#ERR','Features file "%s" missing') delimit = rje.delimitFromExt(filename=ftfile) ## ~ [1a] ~ Establish headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## headers = rje.readDelimit(open(ftfile,'r').readline(),delimit) mainkeys = [headers[0]] hmap = {} for h in headers: hmap[h.lower()] = h pos = '' # Leader for start/end positions if 'ft_start' in hmap or 'ft_end' in hmap: pos = 'ft_' for h in ['feature','%sstart' % pos,'%send' % pos,'description']: if h not in hmap: return self.printLog('#ERR','No %s field detected in "%s" features file' % (h,ftfile)) mainkeys.append(hmap[h]) mainkeys.remove(hmap['description']) ### ~ [2] ~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ftdata = rje.dataDict(self,ftfile,mainkeys,['description'],delimit,headers,lists=True) (mx,mtot,fx) = (0.0,len(ftdata),0) for mainkey in rje.sortKeys(ftdata): self.progLog('\r#FT','Loading features from %s: %.2f%%' % (ftfile,mx/mtot)) mx += 100.0 (id,ft,start,end) = string.split(mainkey,delimit) if id == mainkeys[0]: continue if id not in self.dict['Features']: self.dict['Features'][id] = [] for desc in ftdata[mainkey][hmap['description']]: fx += 1 self.dict['Features'][id].append({'Type':ft,'Start':int(start),'End':int(end),'Desc':desc}) self.printLog('\r#FT','Loaded %s features for %s IDs from %s' % (rje.integerString(fx),rje.integerString(len(self.dict['Features'])),ftfile)) except: self.errorLog('UniFake.loadFeatures error ["%s"]' % ftfile)
def seqBySeq( self ): ### Runs in SeqBySeq Mode #V1.0 ''' In SeqBySeq mode, the program assumes that seqin=FILE and basefile=X are given and farm states the program to be run. Seqin will then be worked through in turn and each sequence farmed out to the farm program. Outputs given by OutList are then compiled, as is the Log, into the correct basefile=X given. In the case of *.csv and *.tdt files, the header row is copied for the first file and then excluded for all subsequent files. For all other files extensions, the whole output is copied. ''' try: ### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.getStr('Farm')[-3:] == '.py': self.str['Farm'] = self.str['Farm'][:-3] self.list['Seq'] = rje_seq.SeqList( self.log, self.cmd_list + ['autoload=T', 'accnr=F', 'seqnr=F']).seq[0:] while self.getStrLC('StartFrom') and self.list['Seq']: if self.list['Seq'][0].shortName() != self.getStr('StartFrom'): self.list['Seq'] = self.list['Seq'][1:] else: self.str['StartFrom'] = '' self.printLog( '#SEQ', '%s query sequences to farm out' % rje.integerString(len(self.list['Seq']))) self.list['Pickup'] = self.pickupList() ### ~ [2] ~ Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.runJobs() return True except SystemExit: raise # Child except: self.errorLog('JobFarmer.seqBySeq error') return False
def ensLoci(self): ### Reads from EnsLoci file if it exists and parses into dictionaries. '''Reads from EnsLoci file if it exists and parses into dictionaries.''' self.dict['EnsLoci'] = {} # Dictionary of {EnsGene:shortName()} self.dict['EnsDesc'] = {} # Dictionary of {EnsGene:Description} self.dict['UniEns'] = {} # Dictionary of {UniProt?:EnsGene} if os.path.exists(self.info['EnsLoci']): elines = self.loadFromFile(self.info['EnsLoci']) (ex,etot) = (0.0,len(elines)) while elines: ex += 100.0 line = elines.pop(0) if line[:1] != '>': continue if rje.matchExp('^>(\S+).+ gene:(\S+)\]',line): (name,gene) = rje.matchExp('^>(\S+).+ gene:(\S+)\]',line) else: self.log.errorLog('Problem with EnsLoci line: %s' % line,printerror=False) continue try: acc = rje.matchExp('\[acc:(\S+)',line)[0] except: acc = '' if acc: self.dict['UniEns'][acc] = gene self.dict['EnsLoci'][gene] = name self.dict['EnsDesc'][gene] = string.join(string.split(string.split(line,' [acc:')[0][1:])[1:]) if self.opt['FullEns'] and gene not in self.list['Genes']: self.list['Genes'].append(gene) if self.opt['FullEns'] and gene not in self.dict['GeneCard']: self.dict['GeneCard'][gene] = {'EnsEMBL':gene,'Symbol':'!FAILED!'} self.log.printLog('\r#ENS','Parsing EnsLoci %.1f%%: %s genes' % (ex/etot,rje.integerString(len(self.dict['EnsLoci']))),newline=False,log=False) self.log.printLog('\r#ENS','Parsing EnsLoci complete: %s genes' % (rje.integerString(len(self.dict['EnsLoci']))))
def convert(self,filelist=[],outfile=None): ### Converts scansite output files in FileList to Outfile ''' Converts scansite output files in FileList to Outfile. ''' try: ### Setup ### _stage = 'Setup' if len(filelist) < 1: filelist = self.list['FileList'] if not outfile: outfile = self.info['Name'] if len(filelist) < 1: self.log.errorLog('No scansite files to convert! %s unchanged/not made.' % outfile,printerror=False) return False delimit = rje.getDelimit(self.cmd_list) ext = rje.delimitExt(delimit) if ext != outfile[-3:]: newfile = outfile[:-3] + ext if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)): outfile = newfile self.log.printLog('#OUT','Converting %d file(s), output to %s.' % (len(filelist),outfile)) ### Output File ### _stage = 'Output File' if not self.opt['Append'] or not os.path.exists(outfile): # Create with header OUTFILE = open(outfile,'w') headers = ['seq_id','enzyme','enz_group','aa','pos','score','percentile','matchseq','sa'] rje.writeDelimit(OUTFILE,headers,delimit) else: OUTFILE = open(outfile,'a') ### Conversion ### _stage = 'Conversion' sx = 0 for infile in filelist: if not os.path.exists(infile): self.log.errorLog('Input file %s does not exist! :o(' % infile,False,False) continue fx = 0 INFILE = open(infile,'r') inline = rje.nextLine(INFILE) while inline != None: if rje.matchExp(re_scansite,inline): scanlist = rje.matchExp(re_scansite,inline) rje.writeDelimit(OUTFILE,scanlist,delimit) sx += 1 fx += 1 rje.progressPrint(self,sx) inline = rje.nextLine(INFILE) self.log.printLog('#OUT','%s scansite results from %s. (%s Total.)' % (rje.integerString(fx),infile,rje.integerString(sx))) INFILE.close() ### End ### _stage = 'End' OUTFILE.close() self.log.printLog('#OUT','%s scansite results output to %s.' % (rje.integerString(sx),outfile)) return True except: self.log.errorLog('Error in convert(%s)' % _stage,printerror=True,quitchoice=False) raise
def loadMutations(self): ### Inputs parsed mutations back into dictionaries '''Inputs parsed mutations back into dictionaries.''' try:### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.dict['Records'] = {} self.dict['Mutations'] = {} headers = ['OMIM_ID','SubID','Gene','Pos','WildAA','MutAA','Disease'] infile = 'omim_mutations.tdt' if not os.path.exists(infile): return False datadict = rje.dataDict(self,infile,headers[:2],headers,'\t') mx = len(datadict) ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dkey in datadict.keys()[0:]: data = datadict.pop(dkey) record = data['OMIM_ID'] subid = data['SubID'] gene = data['Gene'] mutation = '%s%s%s' % (data['WildAA'],data['Pos'],data['MutAA']) disease = data['Disease'] if gene not in self.dict['Records']: self.dict['Records'][gene] = [record] if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record] if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {} self.dict['Mutations'][gene][subid] = (disease,mutation) self.log.printLog('\r#OMIM','Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx),rje.integerString(len(self.dict['Records'])))) return True except: self.log.errorLog(rje_zen.Zen().wisdom()) return False
def outputCards(self): ### Outputs cards to delimited file '''Outputs cards to delimited file.''' ### ~ Setup for output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### genelist = self.list['Genes'] if self.opt['Purify'] and self.opt['Restrict']: for gene in genelist[0:]: if self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: # Replace with symbol genelist.remove(gene) if self.dict['GeneCard'][gene]['Symbol'] not in genelist: genelist.append(self.dict['GeneCard'][gene]['Symbol']) delimit = rje.delimitFromExt(filename=self.info['CardOut']) CARDOUT = open(self.info['CardOut'],'a') ### ~ Generate output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (noens,noloci,ox) = (0,0,0) for gene in rje.sortKeys(self.dict['GeneCard']): if self.opt['Restrict'] and gene not in genelist: continue elif self.opt['Purify'] and self.dict['GeneCard'][gene]['Symbol'] not in [gene,'!FAILED!']: continue self.progLog('\r#OUT','Output for %s parsed genes' % rje.iStr(ox)); ox += 1 self.dict['GeneCard'][gene]['Alias'] = gene self.dict['GeneCard'][gene]['Species'] = self.info['Species'] rje.delimitedFileOutput(self,CARDOUT,self.list['Headers'],delimit,self.dict['GeneCard'][gene]) if self.dict['GeneCard'][gene]['Symbol'] == gene: # Not an alias if 'EnsEMBL' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsEMBL']: noens += 1 if 'EnsLoci' not in self.dict['GeneCard'][gene] or not self.dict['GeneCard'][gene]['EnsLoci']: noloci += 1 CARDOUT.close() self.printLog('\r#OUT','Parsed info for %d genes output to %s' % (len(self.list['Genes']),self.info['CardOut'])) self.printLog('#ENS','%s without EnsGene; %s without EnsLoci' % (rje.integerString(noens),rje.integerString(noloci)))
def run(self,batch=False): ### Main run method '''Main run method.''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not batch: self.setupResults() ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not batch and not self.obj['SeqList'].seqs(): ### Look for batch files and run for each batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0) self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles))) if not batchfiles: self.errorLog('No input files found!',printerror=False) else: bx = 0 for infile in batchfiles: bx += 1 self.printLog('#BATCH','Batch running %s' % infile) bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile] self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd) self.run(batch=True) self.opt['Append'] = True self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False) if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32 return ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.info['Special'].lower() == 'allbyall': self.printLog('#RUN','Performing special "all-by-all" pairwise run') self.info['Special'] = '' for i in range(len(self.seqs())-1): self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i] for j in range(i+1,len(self.seqs())): self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1) self.run(batch=True) self.opt['Append'] = True self.info['Special'] = 'allbyall'; return ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.setup() ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.fitness() self.phenotype() self.grouping() for vector in ['Fitness','Phenotype','SeqGroup']: if len(self.list[vector]) != self.qry().seqLen(): self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),self.qry().seqLen()),printerror=False) raise ValueError results = self.price() ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True) results['Query'] = self.qry().shortName() results['Fitness'] = self.info['Fmethod'] results['Phenotype'] = self.info['Pmethod'] results['SeqGroup'] = self.info['SeqGroup'] rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results) self.printLog('#OUT','Results output to %s' % self.info['ResFile']) except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
def caseChange(self): ### Returns groupings based on Case boundaries of query '''Returns groupings based on Case boundaries of query.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() self.deBug(qry.getSequence(case=True)) grplist = ['UC'] * qry.seqLen() # List of groups (None = no group) ### ~ [1] Map Case ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for (start,end) in self.qry().dict['Case']['Lower']: for i in range(start-1,end): grplist[i] = 'LC' caselist = grplist[0:] gx = 1 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if not self.opt['QryGaps'] and q == '-': grplist[r] = 0 elif r > 0 and caselist[r] != caselist[r-1]: gx += 1 grplist[r] = gx self.printLog('#GRP','%s case groups from %s%s' % (rje.integerString(gx),rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) self.deBug(grplist) return grplist except: self.errorLog(rje_zen.Zen().wisdom()); raise
def setup(self): ### Main class setup method. '''Main class setup method.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] ~ Sequence file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list) #!# Add code for memsaver/autoload=F #!# self.printLog('#SCAP','%s sequences loaded for SCAP analysis' % rje.integerString(seqlist.seqNum())) ## ~ [1b] ~ Xmer background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## mseqfile = self.info['XmerBack'] if mseqfile.lower() in ['','none']: mseqfile = self.info['XmerBack'] = seqlist.info['Name'] markov = self.obj['Markov'] = rje_markov.Markov(self.log,['autoload=T','accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % mseqfile,'direction=both','markov=F','scap=T']) markov.setup() maxx = markov.stat['MaxXmer'] if self.info['Basefile'].lower() in ['','none']: self.info['Basefile'] = '%s.scap' % rje.baseFile(seqlist.info['Name'],True) if markov.opt['Sorted']: self.info['Basefile'] = '%s.sorted' % self.info['Basefile'] basefile = self.info['Basefile'] self.printLog('#MARKOV','Markov setup complete') ## ~ [1c] ~ SCAP Background file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## scapfile = self.info['ScapBack'] if scapfile.lower() in ['','none',seqlist.info['Name'].lower()]: self.obj['ScapBack'] = self.obj['SeqList'] elif scapfile == mseqfile: self.obj['ScapBack'] = markov.obj['SeqList'] else: self.obj['ScapBack'] = rje_seq.SeqList(self.log,['accnr=F','seqnr=F']+self.cmd_list+['seqin=%s' % scapfile]) self.printLog('#SCAP','%s sequences for SCAP Background' % rje.integerString(seqlist.seqNum())) ### ~ [2] Markov Chains ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if mseqfile == seqlist.info['Name']: markov.obj['SeqList'] = seqlist elif mseqfile == self.obj['ScapBack'].info['Name']: markov.obj['SeqList'] = self.obj['ScapBack'] mpickle = markov.unpickleMe() if mpickle: markov = self.obj['Markov'] = mpickle if not markov.suftree() or not markov.pretree() or maxx > markov.stat['MaxXmer']: markov.run() markov.pickleMe() markov.opt['DeBug'] = self.opt['DeBug'] self.deBug(markov.opt) self.deBug(markov.stat) #self.deBug(markov.suftree()) #self.deBug(markov.pretree()) return True # Setup successful except: self.errorLog('Problem during %s setup.' % self); return False # Setup failed
def loadMutations( self): ### Inputs parsed mutations back into dictionaries '''Inputs parsed mutations back into dictionaries.''' try: ### ~ [1] Setup input ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.dict['Records'] = {} self.dict['Mutations'] = {} headers = [ 'OMIM_ID', 'SubID', 'Gene', 'Pos', 'WildAA', 'MutAA', 'Disease' ] infile = 'omim_mutations.tdt' if not os.path.exists(infile): return False datadict = rje.dataDict(self, infile, headers[:2], headers, '\t') mx = len(datadict) ### ~ [2] Process into dictionaries ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dkey in datadict.keys()[0:]: data = datadict.pop(dkey) record = data['OMIM_ID'] subid = data['SubID'] gene = data['Gene'] mutation = '%s%s%s' % (data['WildAA'], data['Pos'], data['MutAA']) disease = data['Disease'] if gene not in self.dict['Records']: self.dict['Records'][gene] = [record] if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record] if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {} self.dict['Mutations'][gene][subid] = (disease, mutation) self.log.printLog( '\r#OMIM', 'Loaded %s OMIM mutations (%s genes).' % (rje.integerString(mx), rje.integerString(len(self.dict['Records'])))) return True except: self.log.errorLog(rje_zen.Zen().wisdom()) return False
def loadPPI(self): ### Load pairwise interaction data '''Load pairwise interaction data.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not rje.checkForFile(self.info['PPIFile']): return False ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for line in open(self.info['PPIFile'], 'r').readlines(): try: [pa, pb] = string.split(rje.chomp(line))[:2] except: continue for ppi in [(pa, pb), (pb, pa)]: if ppi[0] not in self.dict['PPI']: self.dict['PPI'][ppi[0]] = [] if ppi[1] not in self.dict['PPI'][ppi[0]]: self.dict['PPI'][ppi[0]].append(ppi[1]) self.progLog( '\r#PPI', 'Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI']))) self.printLog( '\r#PPI', 'Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI']))) except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
def loadPillars(self): ### Load YGOB Pillar data '''Load YGOB Pillar data.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not rje.checkForFile(self.info['Pillars']): return False ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for line in self.loadFromFile(filename=self.info['Pillars'], chomplines=True): pillars = string.split(line) #self.deBug('%s = %d' % (pillars,len(pillars))) if len(pillars) < 17: continue pillars = pillars[:5] + pillars[6:] # Remove ancestral gene while '---' in pillars: pillars.remove('---') #self.deBug('%s = %d' % (pillars,len(pillars))) if pillars: self.list['Pillars'].append(pillars) self.progLog( '\r#YGOB', 'Loading Pillar data: %s loci' % rje.integerString(len(self.list['Pillars']))) self.printLog( '\r#YGOB', 'Loaded Pillar data for %s loci' % rje.integerString(len(self.list['Pillars']))) except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
def test(self): ### Development method '''Development method.''' self.readGO() self.mapEnsGO() gohead = ['EnsG','GO_ID','GO_Type','GO_Desc'] gofile = 'test.go.tdt' rje.delimitedFileOutput(self,gofile,gohead,rje_backup=True) gx = 0.0; gtot = len(self.dict['EnsGO']) for gene in rje.sortKeys(self.dict['EnsGO']): self.progLog('\r#ENSGO','Compiling %s: %.2f%%' % (gofile,gx/gtot)); gx += 100.0 for goid in self.dict['EnsGO'][gene]: godata = {'EnsG':gene, 'GO_ID':goid} godata['GO_Type'] = self.dict['GO'][goid]['type'] godata['GO_Desc'] = self.dict['GO'][goid]['name'] rje.delimitedFileOutput(self,gofile,gohead,datadict=godata) self.printLog('\r#ENSGO','Compiling %s all done: %s genes.' % (gofile,rje.integerString(gtot)))
def codons(self): ### Returns grouping vector based on DNA codon positions (three groups) '''Returns grouping vector based on DNA codon positions (three groups).''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() grplist = [0] * qry.seqLen() # List of groups (0 = no group) ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### trip = 1 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if self.opt['QryGaps'] and q == '-': continue grplist[r] = trip if trip == 3: trip = 1 else: trip += 1 self.printLog('#GRP','3 codon groups from %s%s' % (rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) return grplist except: self.errorLog(rje_zen.Zen().wisdom()); raise
def fpi(self): ### Family-protein interactions '''Family-protein interactions.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not self.dict['Domain']: return outdir = 'SLiMPID_FPI' rje.mkDir(self,outdir) fpi = {} # Dictionary of {family:[interactors]} badname = [] ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for qry in rje.sortKeys(self.dict['PPI']): try: fam = self.dict['Fam'][qry] if len(fam) < 2: continue except: self.errorLog('Problem with "%s" protein family' % qry); continue fpi[qry] = [] for hub in fam: if hub not in self.dict['PPI']: continue fpi[qry] += self.dict['PPI'][hub] # Add with redundancy for spoke in fpi[qry][0:]: if fpi[qry].count(spoke) == 1: fpi[qry].remove(spoke) # Must have 2+ family interactions for hub in fam: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in fpi[qry]: self.dict['PPI'][hub].remove(spoke) if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub) fpi[qry] = rje.sortUnique(fpi[qry],False,False) acc = [] gene = self.dict['Gene'][qry] for name in fpi[qry]: if not name: continue if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum']) elif name not in badname: badname.append(name) open('%s/%s.fpi.acc' % (outdir,gene),'w').write(string.join(acc,'\n')) self.printLog('#FPI','%s family => %d interactors' % (gene,len(acc))) if badname: badname.sort() self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; '))) ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### hx = len(self.dict['PPI']) for hub in rje.sortKeys(self.dict['PPI']): if hub and self.dict['PPI'][hub]: continue self.dict['PPI'].pop(hub) self.printLog('#FPI','No %s PPI left after FPI removed' % hub) self.printLog('#PPX','%s of %s PPI hubs remain after FPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx))) except: self.errorLog('Problem with SLiMPID.fpi()',quitchoice=True)
def triplets(self): ### Returns grouping vector based on DNA triplets '''Returns grouping vector based on DNA triplets.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() grplist = [0] * qry.seqLen() # List of groups (0 = no group) gx = 0 ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### trip = 0 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if self.opt['QryGaps'] and q == '-': continue if not trip: gx += 1 grplist[r] = gx if trip == 2: trip = 0 else: trip += 1 self.printLog('#GRP','%s triplet groups from %s%s' % (rje.integerString(gx),rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) return grplist except: self.errorLog(rje_zen.Zen().wisdom()); raise
def makePPIDatasets(self): ### Generate PPI datasets from pairwise data '''Generate PPI datasets from pairwise data.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### rje.mkDir(self,'YeastPPI/') seqdict = self.dict['SeqDict'] ### ~ [2] Parse data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (hx,htot,fx) = (0.0,len(self.dict['PPI']),0) for hub in rje.sortKeys(self.dict['PPI']): self.progLog('\r#FAS','Generating %s PPI fasta files: %.2f' % (rje.integerString(fx),hx/htot)); hx += 100.0 if len(self.dict['PPI'][hub]) < 3: continue seqs = [] for spoke in self.dict['PPI'][hub]: if spoke not in seqdict: continue seqs.append(seqdict[spoke]) if len(seqs) < 3: continue self.obj['SeqList'].saveFasta(seqs,rje.makePath('YeastPPI/%s.fas' % hub,wholepath=True),log=False); fx+=1 self.printLog('\r#FAS','Generation of %s PPI fasta files from %s hubs complete.' % (rje.integerString(fx),rje.integerString(htot))) except: self.errorLog(rje_zen.Zen().wisdom()); raise # Delete this if method error not terrible
def dpi(self): ### Domain-protein interactions '''Domain-protein interactions.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not self.dict['Domain']: return outdir = 'SLiMPID_DPI' rje.mkDir(self,outdir) dpi = {} # Dictionary of {domain:[interactors]} badname = [] ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for dom in rje.sortKeys(self.dict['Domain']): dpi[dom] = [] for hub in self.dict['Domain'][dom]: if hub in self.dict['PPI']: dpi[dom] += self.dict['PPI'][hub] # Add with redundancy for spoke in dpi[dom][0:]: if dpi[dom].count(spoke) == 1: dpi[dom].remove(spoke) # Must have 2+ domain interactions for hub in self.dict['Domain'][dom]: if hub not in self.dict['PPI']: continue for spoke in self.dict['PPI'][hub][0:]: if spoke in dpi[dom]: self.dict['PPI'][hub].remove(spoke) if spoke in self.dict['PPI'] and hub in self.dict['PPI'][spoke]: self.dict['PPI'][spoke].remove(hub) dpi[dom] = rje.sortUnique(dpi[dom],False,False) acc = [] for name in dpi[dom]: if not name: continue if name in self.dict['Seq']: acc.append(self.dict['Seq'][name].info['AccNum']) elif name not in badname: badname.append(name) open('%s/%s.dpi.acc' % (outdir,dom),'w').write(string.join(acc,'\n')) self.printLog('#DPI','%s domain => %d interactors' % (dom,len(acc))) if badname: badname.sort() self.printLog('#BAD','%d "bad" protein names: %s' % (len(badname),string.join(badname,'; '))) ### ~ [3] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### hx = len(self.dict['PPI']) for hub in rje.sortKeys(self.dict['PPI']): if hub and self.dict['PPI'][hub]: continue self.dict['PPI'].pop(hub) self.printLog('#DPI','No %s PPI left after DPI removed' % hub,screen=False) self.printLog('#PPX','%s of %s PPI hubs remain after DPI removed' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(hx))) except: self.errorLog('Problem with SLiMPID.dpi()',quitchoice=True)
def seqBySeq(self): ### Runs in SeqBySeq Mode #V1.0 ''' In SeqBySeq mode, the program assumes that seqin=FILE and basefile=X are given and farm states the program to be run. Seqin will then be worked through in turn and each sequence farmed out to the farm program. Outputs given by OutList are then compiled, as is the Log, into the correct basefile=X given. In the case of *.csv and *.tdt files, the header row is copied for the first file and then excluded for all subsequent files. For all other files extensions, the whole output is copied. ''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if self.getStr('Farm')[-3:] == '.py': self.str['Farm'] = self.str['Farm'][:-3] self.list['Seq'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T','accnr=F','seqnr=F']).seq[0:] while self.getStrLC('StartFrom') and self.list['Seq']: if self.list['Seq'][0].shortName() != self.getStr('StartFrom'): self.list['Seq'] = self.list['Seq'][1:] else: self.str['StartFrom'] = '' self.printLog('#SEQ','%s query sequences to farm out' % rje.integerString(len(self.list['Seq']))) self.list['Pickup'] = self.pickupList() ### ~ [2] ~ Run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.runJobs() return True except SystemExit: raise # Child except: self.errorLog('JobFarmer.seqBySeq error') return False
def addToGeneCards(self,cards,addcards=True): ### Reconfigures and adds parsed HPRD data to GeneCards ''' Reconfigures and adds parsed HPRD data to GeneCards. >> cards:rje_genecards.GeneCards object >> addcards:boolean [True] = whether to add genes from HPRD to the GeneCards dictionary ''' ### Add relevant headers for future output ### for h in ['HPRD','OMIM','EntrezCheck','Desc']: if h not in cards.list['Headers']: cards.list['Headers'].append(h) for gene in cards.list['Genes']: if h not in cards.dict['GeneCard'][gene]: cards.dict['GeneCard'][gene][h] = '' ### Add to GeneCards ### (hx,htot) = (0.0,len(self.dict['HPRD'])) for hprd in self.dict['HPRD']: self.log.printLog('\r#HPRD','Adding HPRD to GeneCards: %.1f%%' % (hx/htot),newline=False,log=False) hx += 100.0 self.deBug(self.dict['HPRD'][hprd]) gene = self.dict['HPRD'][hprd]['gene'] omim = self.dict['HPRD'][hprd]['omim'] entrez = self.dict['HPRD'][hprd]['entrez'] if gene in cards.list['Genes']: if cards.dict['GeneCard'][gene]['HPRD'] == '': cards.dict['GeneCard'][gene]['HPRD'] = hprd elif hprd not in string.split(cards.dict['GeneCard'][gene]['HPRD'],','): cards.dict['GeneCard'][gene]['HPRD'] = string.join(string.split(cards.dict['GeneCard'][gene]['HPRD'],',')+[hprd],',') if cards.dict['GeneCard'][gene]['OMIM'] == '': cards.dict['GeneCard'][gene]['OMIM'] = omim elif omim not in string.split(cards.dict['GeneCard'][gene]['OMIM'],','): cards.dict['GeneCard'][gene]['OMIM'] = string.join(string.split(cards.dict['GeneCard'][gene]['OMIM'],',')+[omim],',') if cards.dict['GeneCard'][gene]['EntrezCheck'] == '': cards.dict['GeneCard'][gene]['EntrezCheck'] = entrez elif entrez not in string.split(cards.dict['GeneCard'][gene]['EntrezCheck'],','): cards.dict['GeneCard'][gene]['EntrezCheck'] = string.join(string.split(cards.dict['GeneCard'][gene]['EntrezCheck'],',')+[entrez],',') elif addcards: if gene == '-': gene = 'HPRD' + hprd cards.list['Genes'].append(gene) cards.dict['GeneCard'][gene] = {'Symbol':'!FAILED!','HPRD':hprd,'OMIM':omim,'EntrezCheck':entrez,'Desc':self.dict['HPRD'][hprd]['desc']} self.log.printLog('\r#HPRD','Added %s HPRD genes to GeneCards.' % (rje.integerString(htot)))
def domainFasta( self ): ### Outputs parsed domain and domain PPI datasets in Fasta format '''Outputs parsed PPI datasets in Fasta format.''' try: ### ~ Tab delimited domain-HPRD pairs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### headers = ['Domain', 'HPRD', 'Gene'] dfile = self.info['OutDir'] + 'HPRD.domains.tdt' rje.delimitedFileOutput(self, dfile, headers, '\t') sfile = self.info['OutDir'] + 'HPRD.domsource.tdt' shead = ['Domain', 'Source'] rje.delimitedFileOutput(self, sfile, shead, '\t') dx = 0.0 for domain in rje.sortKeys(self.dict['Domains']): self.log.printLog('\r#DOM', 'HPRD Domain output (%s): %.1f%%' % (dfile, dx / len(self.dict['Domains'])), newline=False, log=False) dx += 100.0 for hid in self.dict['Domains'][domain]: datadict = { 'Domain': domain, 'HPRD': hid, 'Gene': self.dict['HPRD'][hid]['gene'] } rje.delimitedFileOutput(self, dfile, headers, '\t', datadict) for source in self.dict['DomainSource'][domain]: datadict = {'Domain': domain, 'Source': source} rje.delimitedFileOutput(self, sfile, shead, '\t', datadict) self.log.printLog( '\r#DOM', 'HPRD Domain output (%s): %s domains.' % (dfile, rje.integerString(len(self.dict['Domains'])))) ### ~ Domain PPI Dataset Outputs ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### datpath = self.info['OutDir'] + rje.makePath( 'HPRD_Domain_Datasets/') rje.mkDir(self, datpath) for domain in rje.sortKeys(self.dict['Domains']): ## Generate a list of all interactors with domain-containing proteins ## plist = [] for p1 in self.dict['Domains'][domain]: if p1 not in self.dict['PPI']: continue for p2 in self.dict['PPI'][p1]: if p2 not in plist: plist.append(p2) plist.sort() ## Generate Sequence list and output ## mylist = [] for p in plist: if self.opt['AllIso']: mylist += self.dict['HPRD'][p]['Seq'] else: mylist.append(self.dict['HPRD'][p]['Seq']) sfile = '%s%s_hprd.fas' % (datpath, domain) if mylist: self.obj['SeqList'].saveFasta(seqs=mylist, seqfile=sfile) else: self.log.printLog( '#DOM', 'No PPI partners for domain "%s"' % domain) self.log.printLog('\r#DOM', 'HPRD Domain fasta output complete.') except: self.log.errorLog('Error in HPRD.saveFasta()', printerror=True, quitchoice=False) raise
def readHMMPFamSearch( self, resfile=None, readaln=False): ### Reads HMM PFam Search Results into objects ''' Reads HMM Search Results into objects. >> resfile:str = Results File (set as self.info['OutFile']) >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!! ''' try: ### Setup ### if not resfile or not os.path.exists(resfile): self.log.errorLog('Results file "%s" missing!' % resfile, printerror=False) return False ## Make RegExp for starting next alignment ## re_hit = string.join([ '^(\S+):', 'domain', '(\d+)', 'of', '(\d+),', 'from', '(\d+)', 'to', '(\d+):', 'score', '(\S+),', 'E', '=', '(\S+)' ], '\s+') ## Search dictionary as results come back per sequence, not per HMM! ## pfam = {} # Dictionary of {PFam name:search} hitx = 0 # Total number of hits hitlist = [ ] # List of sequences processed from file (may or may not include zero hit sequences) ### Read in Search results ### if open(resfile, 'r').readline().find('hmmpfam') != 0: self.errorLog( 'File "%s" does not appear to be an hmmpfam results file' % resfile, printerror=False) if rje.yesNo( 'Delete incorrect results file? (Check that hmmpfam=T is right!)', default='N'): os.unlink(resfile) self.printLog('#DEL', 'Dodgy results file "%s" deleted.' % resfile) return False hitname = None i = 0 hx = 0 seqx = 0 RESFILE = open(resfile, 'r') #x#resline = self.loadFromFile(resfile,chomplines=True) #x#while i < len(resline): line = RESFILE.readline() newres = [rje.chomp(line)] newresout = True newresfile = '%s.partial' % resfile if os.path.exists(newresfile): os.unlink(newresfile) while line: self.progLog( '\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString(len(pfam)), rje.integerString(hitx))) line = rje.chomp(line) #print line ## New Sequence ## if rje.matchExp('^Query sequence:\s+(\S+)', line): if newres and newresout and self.opt['CleanRes']: open(newresfile, 'a').write(string.join(newres, '\n')) newres = ['', line] newresout = False hitname = rje.matchExp('^Query sequence:\s+(\S+)', line)[0] hx += 1 #x#if hitname not in hitlist: hitlist.append(hitname) ## One Line Data for hits ## elif line.find('Parsed for domains:') == 0: #x#i += 3 # Skip two complete lines newres += [ line, rje.chomp(RESFILE.readline()), rje.chomp(RESFILE.readline()) ] line = rje.chomp(RESFILE.readline()) newres.append(line) #Model Domain seq-f seq-t hmm-f hmm-t score E-value #-------- ------- ----- ----- ----- ----- ----- ------- #Lep_receptor_Ig 1/1 24 114 .. 1 103 [] 158.4 1.7e-44 # ... else ... 
# [no hits above thresholds] while rje.matchExp( string.join([ '^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)', '(\S+)\s*$' ], '\s+'), line): newresout = True (dom, start, end, score, eval) = rje.matchExp( string.join([ '^(\S+)', '\S+', '(\d+)', '(\d+)\D.+', '(\S+)', '(\S+)\s*$' ], '\s+'), line) if not pfam.has_key(dom): pfam[dom] = self._addSearch() pfam[dom].info['Name'] = dom hit = pfam[dom]._addHit() hit.info['Name'] = hitname aln = hit._addAln() aln.setStat({ 'SbjStart': string.atoi(start), 'SbjEnd': string.atoi(end), 'Expect': string.atof(eval), 'BitScore': string.atof(score) }) hitx += 1 self.progLog( '\r#RES', 'Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(hx), rje.integerString( len(pfam)), rje.integerString(hitx))) line = rje.chomp(RESFILE.readline()) newres.append(line) ## End of Protein ## elif line[:2] == '//': hitname = None newres.append(line) elif rje.matchExp( 'End of rje_hmm reduced results file: (%d) sequences in original', line): seqx = string.atoi( rje.matchExp( 'End of rje_hmm reduced results file: (\d+) sequences in original', line)[0]) elif newres: newres.append(line) #x#i += 1 line = RESFILE.readline() if newres and newresout and self.opt['CleanRes']: open(newresfile, 'a').write(string.join(newres, '\n')) if not seqx: seqx = hx if self.opt['CleanRes']: open(newresfile, 'a').write( string.join([ '', 'End of rje_hmm reduced results file: %d sequences in original' % seqx ], '\n')) os.unlink(resfile) os.rename(newresfile, resfile) self.printLog( '\r#RED', 'Results file %s replaced with reduced version (%s Hits only)' % (resfile, rje.integerString(hitx))) self.printLog( '\r#RES', 'Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile, rje.integerString(seqx), rje.integerString( len(pfam)), rje.integerString(hitx))) return True except: self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile)) return False
def altPAM(self): ### Alternative PAM matrix construction '''Alternative PAM matrix construction.''' try: ### Setup ## wlines = self.loadFromFile(self.info['AltPam']) if not wlines: raise IOError aas = string.split(wlines[0].upper()) codes = string.split(wlines[1]) rawfreqs = string.split(wlines[2]) freq = {} for i in range(len(rawfreqs)): freq[aas[i]] = string.atof(rawfreqs[i]) prob = {} for r in range(3, 22): subs = string.split(wlines[r]) for i in range(len(subs)): prob['%s%s' % (aas[i], aas[r - 2])] = string.atof(subs[i]) prob['%s%s' % (aas[r - 2], aas[i])] = string.atof(subs[i]) ### Alternative freqs ### if self.info['SeqIn'].lower() not in [ '', 'none' ] and os.path.exists(self.info['SeqIn']): ## Clear freq ## freq = {} for a in aas: freq[a] = 0.0 ## Count freq ## slines = self.loadFromFile(self.info['SeqIn']) for line in slines: if line[:1] == '>': continue for a in aas: freq[a] += string.count(line.upper(), a) ## Convert to freq ## total = sum(freq.values()) if total > 0: for a in aas: freq[a] = freq[a] / total self.log.printLog( '#AA', 'Rescaling matrix based on %s aa from %s.' % (rje.integerString(total), self.info['SeqIn'])) ### Calculate s ### s = 0.01 step = 0.000001 solve = True bests = 1.000000 bestdif = -1 while solve and s >= step: ## Scaler ## s = s - step self.log.printLog( '\r#WAG', 'Considering s = %.6f; Best s = %.6f (Dif = %.6f)' % (s, bests, bestdif), log=False, newline=False) ## Self Subs ## newprobs = rje.scaledict(dict=prob, scale=s) toobig = False for a in aas: newprobs['%s%s' % (a, a)] = 1.0 for key in prob.keys(): if key[0] == a: newprobs['%s%s' % (a, a)] -= newprobs[key] if newprobs['%s%s' % (a, a)] < 0.0: # Overshot possibility toobig = True break if toobig: break if toobig: continue #print 'PAM!!', ## PAM1 ## dsum = 0.0 for a in aas: dsum += freq[a] * newprobs['%s%s' % (a, a)] dif = 0.99 - dsum if dif < 0: dif = -dif if dif < bestdif or bestdif < 0: bestdif = dif bests = s ### Output best s ### self.log.printLog( '\r#WAG', 'Considered all s <= 0.010000; Best s = %.6f (Dif = %.6f)' % (bests, bestdif)) if self.info['PamOut'].lower() in ['', 'none']: self.info['PamOut'] = self.info['AltPam'] + '.pam' self.log.printLog( '#PAM', 'Rescaled PAM matrix output to %s' % self.info['PamOut']) PAM = open(self.info['PamOut'], 'w') rje.writeDelimit(PAM, aas, ' ') newprobs = rje.scaledict(dict=prob, scale=bests) for a in aas: newprobs['%s%s' % (a, a)] = 1.0 for key in prob.keys(): if key[0] == a: newprobs['%s%s' % (a, a)] -= newprobs[key] for i in range(len(aas)): out = [codes[i]] a = aas[i] for b in aas: out.append('%.6f' % newprobs['%s%s' % (a, b)]) rje.writeDelimit(PAM, out, ' ') PAM.close() self.info['Name'] = self.info['PamOut'] except: self.log.errorLog('Major Error with PamCtrl.altPAM().', quitchoice=True)
def singleSeqAQ(self,seqlist,focus=[0,-1]): ### Performs SAQ on seqlist, adding seq.info['SAQ'] ''' Performs SAQ on seqlist, adding seq.info['SAQ']. >> seqlist:rje_seq.SeqList Object - NB. This object will itself have sequences removed from it, so beware! - A new info key will be added: SAQX = SAQ sequences with individual Xs - A new info key will be added: SAQ = SAQ sequences with aligment Xs >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. ''' ### <SAQ1> ### Setup try: _stage = '<1> Setup' haqlist = seqlist # SeqList Object to store individually Xd sequences query = haqlist.obj['QuerySeq'] if self.opt['NoQuery']: query = None badres = [-1,0] # List of how many bad residues in total dataset block_align = {} # Dictionary of whether residue in block of sequence that is well-aligned or not res_align = {} # Dictionary of whether residue of sequence is well-aligned or not res_gap = {} # Dictionary of whether residue of sequence is a gap or not gap_align = {} # Dictionary of whether residue of sequence is a gap in a well-aligned block or not for seq in haqlist.seq: seq.info['SAQ'] = seq.info['Sequence'][0:] # Note! Sequence is modified and SAQ not, then they are swapped at end! block_align[seq] = [False] * seq.seqLen() res_align[seq] = [False] * seq.seqLen() res_gap[seq] = [False] * seq.seqLen() gap_align[seq] = [False] * seq.seqLen() ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks #X#self.deBug(self.stat) _stage = '<2> BlockID' while badres[-1] != badres[-2]: # Change in number of bad residues total_res = 0 badres.append(0) # badres[-1] is the current number of bad residues infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2) for seq in haqlist.seq: myinfo = '%s %.1f%%' % (infotxt,(100.0 * haqlist.seq.index(seq) / haqlist.seqNum())) self.log.printLog('\r#SAQ',myinfo,log=False,newline=False) #self.verbose(0,3,'\r%45s' % myinfo,0) ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped _stage = '<2a> Mark Residues' for r in range(seq.seqLen()): gap_align[seq][r] = False res_align[seq][r] = False if block_align[seq][r] or len(badres) == 3: # After first cycle, look only at well-aligned blocks (well-aligned for sequence not whole alignment) a = seq.info['Sequence'][r] res_gap[seq][r] = False if a == '-': res_gap[seq][r] = True gap_align[seq][r] = True else: # 'X' handled by self._saqCon conx = 0 # Matches with good regions of otherseqs (*including self*) for otherseq in haqlist.seq[0:]: if otherseq == seq: # > so self not counted! 
continue if len(otherseq.info['Sequence']) != len(seq.info['Sequence']): self.log.errorLog('Sequence lengths do not match - should be aligned!',printerror=False) raise ValueError if (block_align[otherseq][r] or len(badres) == 3): conx += self._saqCon(a, otherseq.info['Sequence'][r]) #if seq == query and r > 590: # print seq.shortName(),r,conx,'vs',self.stat['SAQCon'], if conx >= self.stat['SAQCon']: res_align[seq][r] = True #if seq == query and r > 590: # print r, res_align[seq][r] ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence _stage = '<2b> Mark Regions' ## <i> ## Clear first _stage = '<2b-i> Mark Regions' for r in range(seq.seqLen()): block_align[seq][r] = False ## <ii> ## Recalculate _stage = '<2b-ii> Mark Regions' for r in range(seq.seqLen()): _stage = '<2b-ii> Blocks' if res_align[seq][r]: # Start of potential block blen = 0 # Block length (SAQBlock) = AAs win = 0 # Window length = all sequence matchx = 1 # Good residues in window (first residue must be good!) (SAQMatch) while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']: win += 1 if (r + win) >= seq.seqLen() or seq.info['Sequence'][r+win] == 'X': # Hit Bad Region: Abort break else: # Better region if gap_align[seq][r+win]: # Decent gap continue else: blen += 1 # Increase Block if res_align[seq][r+win]: # Good residue matchx += 1 #if seq == query and r > 590: # print seq.shortName(),r,matchx,'vs',self.stat['SAQMatch'], if matchx >= self.stat['SAQMatch']: for w in range((win+1)): block_align[seq][r+w] = True #if seq == query and r > 590: # print r, block_align[seq][r] ## <iii> ## Update bad residue count for r in range(seq.seqLen()): _stage = '<2b-iii> Mark Regions' #print seq.shortName(), r, seq.seqLen(), block_align[seq][r], res_gap[seq][r], badres[-1] # Bad residue if not block_align[seq][r] and not res_gap[seq][r]: # Bad residue badres[-1] += 1 if not res_gap[seq][r]: total_res += 1 myinfo = '%s 100.0%%' % infotxt myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]),rje.integerString(total_res)) self.log.printLog('\r#SAQ',myinfo) #self.verbose(0,3,'\r%45s' % myinfo,0) if badres[-1] == total_res: self.log.errorLog('All residues marked as bad in SAQ!',printerror=False,quitchoice=True) # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False) ### <SAQ3> ### X out badly-aligned blocks _stage = '<3> X-Out' self.log.printLog('#SAQ','SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False) #self.verbose(0,3,'SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2),0) for seq in haqlist.seq: newseq = '' for r in range(seq.seqLen()): if block_align[seq][r] or seq.info['Sequence'][r] == '-': #!# Was backwards? res_gap[seq][r] == False: newseq += seq.info['Sequence'][r] else: # Bad residue newseq += 'X' seq.info['Sequence'] = newseq[0:] seq.info['SAQX'] = newseq[0:] # Stores Xd sequences for individuals for use in PAQ #!# Add saving of data in 'datafull' option ### <SAQ4> ### Remove sequences and/or badly-aligned regions _stage = '<4> Removal' self.log.printLog('\r#SAQ','SAQ%d-%d: Removing bad sequences and/or dodgy regions...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False) #self.verbose(0,3,'\rSAQ%d-%d: Removing bad sequences and/or dodgy regions...' 
% (self.stat['SAQCyc'],len(badres)-2),0) ## <SAQ4a> ## Process Query first - only interested in good regions within query _stage = '<4a> Query Removal' if self.opt['NoQuery'] or query == None: # No preprocessing of Query self.verbose(0,4,'no Master Query processing...',0) else: haqlist.mapX(query, qtrim=True, focus=focus) # Replaces other sequence ends and query X columns with Xs self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0) self.verbose(0,3,'',1) if self.opt['ManSAQ']: haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile']) ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost _stage = '<4b> Seq Removal' goodres = [0, self._getGood(haqlist.seq)] # List of number of 'good' residues goodseq = [0, haqlist.seqNum()] while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]: colgood = [0] * haqlist.seq[0].seqLen() # Good residues per column for r in range(haqlist.seq[0].seqLen()): for seq in haqlist.seq: if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X': colgood[r] += 1 ## <i> ## Compare relative loss of masking and losing each sequence keepx = {} # Dictionary of seq:number of lost residues if seq kept losex = {} # Dictionary of seq:number of lost residues if seq lost badkx = -1 # Biggest loss if kept badlx = -1 # Biggest loss if lost bads = None # Worst sequence for seq in haqlist.seq: if seq == query and self.opt['NoQuery'] == False: continue # Next sequence # Calculate keepx and losex keepx[seq] = 0 for r in range(seq.seqLen()): if seq.info['Sequence'][r] == 'X': keepx[seq] += colgood[r] losex[seq] = self._getGood([seq]) # Update bads if worse if keepx[seq] > badkx: badkx = keepx[seq] badlx = losex[seq] bads = seq elif keepx[seq] == badkx and losex[seq] < badlx: badlx = losex[seq] bads = seq ## <ii> ## Remove bad sequences and/or regions if badkx > 0: if self.opt['ManSAQ']: default = 'N' if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']: # Lose sequence! default = 'Y' if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default): seqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads) else: # X out haqlist.mapX(bads) else: self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1) #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?) if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']: # Lose sequence! haqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads) else: # X out haqlist.mapX(bads) ### <iii> ### Recalculate goodres goodres.append(self._getGood(haqlist.seq)) goodseq.append(haqlist.seqNum()) #X#self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1) ### <SAQ5> ### Reinstate UnX'd sequence: _stage = '<4b> Seq Removal' for seq in haqlist.seq: #print seq.info [seq.info['SAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['SAQ']] if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']): os.unlink('%s.mansaq.fas' % haqlist.info['Basefile']) except: self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
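#########################################################################################################################
# Illustrative sketch (not part of the original module): the core of singleSeqAQ() above is to score each aligned
# residue against the other sequences and mask poorly supported residues with 'X'. The simplified standalone function
# below captures only that masking step for a toy alignment, using plain identity matching in place of self._saqCon()
# and a single min_support threshold in place of the SAQCon/SAQBlock/SAQMatch block logic. All names are hypothetical.
def mask_unsupported_residues(aligned_seqs, min_support=2):
    """Return copies of aligned_seqs with residues masked as 'X' unless they
    match at least min_support of the other sequences at that column."""
    length = len(aligned_seqs[0])
    for seq in aligned_seqs:
        assert len(seq) == length, 'Sequences must be pre-aligned'
    masked = []
    for i, seq in enumerate(aligned_seqs):
        newseq = []
        for r in range(length):
            a = seq[r]
            if a == '-':                      # gaps are never masked
                newseq.append(a)
                continue
            support = sum(1 for j, other in enumerate(aligned_seqs)
                          if j != i and other[r] == a)
            newseq.append(a if support >= min_support else 'X')
        masked.append(''.join(newseq))
    return masked

# Example: with min_support=1 only the unsupported 'R' is masked:
# mask_unsupported_residues(['ACDEF', 'ACDEF', 'ACREF'], min_support=1)
# -> ['ACDEF', 'ACDEF', 'ACXEF']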
class Price(rje.RJE_Object): ''' Price Class. Author: Rich Edwards (2009). Info:str - Fitness = Fitness measurement [cons] - Phenotype = Phenotype measurement [cons] - ResFile = Results file [price.tdt] - SeqGroup = Sequence grouping method [triplets] - Special = Instigate special run, e.g. allbyall [None] Opt:boolean - NormFit = Normalise fitness to have mean of 1 [False] - QryGaps = Whether to include gaps in the query sequence as positions to score [False] - Weighted = Weight the mean covariance by size of group [False] Stat:numeric List:list - Batch = List of alignment files to use as input [*.fas,*.fasta] - Fitness = Fitness measurement vector (matches query sequence) - Phenotype = Phenotype measurement (matches query sequence) - SeqGroup = Sequence grouping method (matches query sequence) Dict:dictionary Obj:RJE_Objects - SeqList = Sequence list object ''' ######################################################################################################################### def qry(self): return self.obj['SeqList'].obj['QuerySeq'] def seqs(self): return self.obj['SeqList'].seqs() def dna(self): return self.obj['SeqList'].dna() ######################################################################################################################### ### <1> ### Class Initiation etc.: sets attributes # ######################################################################################################################### def _setAttributes(self): ### Sets Attributes of Object '''Sets Attributes of Object.''' ### ~ Basics ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.infolist = ['Fitness','Phenotype','ResFile','SeqGroup','Special'] self.optlist = ['QryGaps','NormFit','Weighted'] self.statlist = [] self.listlist = ['Batch','Fitness','Phenotype','SeqGroup'] self.dictlist = [] self.objlist = ['SeqList'] ### ~ Defaults ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self._setDefaults(info='None',opt=False,stat=0.0,obj=None,setlist=True,setdict=True) self.setInfo({'Fitness':'cons','Phenotype':'cons','SeqGroup':'triplets','ResFile':'price.tdt'}) self.setOpt({'Append':True,'Weighted':True}) self.list['Batch'] = ['*.fas','*.fasta'] ### ~ Other Attributes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.obj['SeqList'] = rje_seq.SeqList(self.log,['query=1']+self.cmd_list+['autoload=T']) ######################################################################################################################### def _cmdList(self): ### Sets Attributes from commandline ''' Sets attributes according to commandline parameters: - see .__doc__ or run with 'help' option ''' for cmd in self.cmd_list: try: self._generalCmd(cmd) ### General Options ### ### Class Options ### self._cmdReadList(cmd,'info',['Fitness','Phenotype','SeqGroup','Special']) self._cmdReadList(cmd,'file',['ResFile']) self._cmdReadList(cmd,'opt',['QryGaps','NormFit','Weighted']) self._cmdReadList(cmd,'list',['Batch']) except: self.errorLog('Problem with cmd:%s' % cmd) ######################################################################################################################### ### <2> ### Main Class Backbone # ######################################################################################################################### def run(self,batch=False): ### Main run method '''Main run method.''' try:### ~ [1] ~ Setup 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] ~ Results ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not batch: self.setupResults() ## ~ [1b] ~ Batch run ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if not batch and not self.obj['SeqList'].seqs(): ### Look for batch files and run for each batchfiles = rje.getFileList(self,filelist=self.list['Batch'],subfolders=False,summary=True,filecount=0) self.printLog('\r#FILES','Getting files: %5s files for batch run' % rje.integerString(len(batchfiles))) if not batchfiles: self.errorLog('No input files found!',printerror=False) else: bx = 0 for infile in batchfiles: bx += 1 self.printLog('#BATCH','Batch running %s' % infile) bcmd = ['query=1']+self.cmd_list+['autoload=T','seqin=%s' % infile] self.obj['SeqList'] = rje_seq.SeqList(self.log,bcmd) self.run(batch=True) self.opt['Append'] = True self.printLog('#BATCH','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(bx),rje.integerString(len(batchfiles)-bx)),log=False) if self.opt['Win32'] and len(sys.argv) < 2: self.verbose(0,0,'Finished!',1) # Optional pause for win32 return ## ~ [1c] ~ Special run options ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.info['Special'].lower() == 'allbyall': self.printLog('#RUN','Performing special "all-by-all" pairwise run') self.info['Special'] = '' for i in range(len(self.seqs())-1): self.obj['SeqList'].obj['QuerySeq'] = self.seqs()[i] for j in range(i+1,len(self.seqs())): self.info['Fitness'] = self.info['Phenotype'] = '%d' % (j + 1) self.run(batch=True) self.opt['Append'] = True self.info['Special'] = 'allbyall'; return ## ~ [1d] ~ General setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.setup() ### ~ [2] ~ Price calculations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.fitness() self.phenotype() self.grouping() for vector in ['Fitness','Phenotype','SeqGroup']: if len(self.list[vector]) != self.qry().seqLen(): self.errorLog('%s vector length (%s) does not match %s sequence length (%s)' % (vector,len(self.list[vector]),self.qry().seqLen()),printerror=False) raise ValueError results = self.price() ### ~ [3] ~ Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### results['Dataset'] = rje.baseFile(self.obj['SeqList'].info['Name'],True) results['Query'] = self.qry().shortName() results['Fitness'] = self.info['Fmethod'] results['Phenotype'] = self.info['Pmethod'] results['SeqGroup'] = self.info['SeqGroup'] rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],datadict=results) self.printLog('#OUT','Results output to %s' % self.info['ResFile']) except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible ######################################################################################################################### def setupResults(self): ### Main results setup method. '''Main results setup method.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.list['Headers'] = ['Dataset','Query','Fitness','Phenotype','SeqGroup','CovP','CovB','CovW','Price','Ratio'] rje.delimitedFileOutput(self,self.info['ResFile'],self.list['Headers'],rje_backup=True) except: self.errorLog('Problem during %s setupResults().' 
% self); raise ######################################################################################################################### def setup(self): ### Main class setup method. '''Main class setup method.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### seqlist = self.obj['SeqList'] seqlist._checkAln(aln=True,realign=True) if not seqlist.obj['QuerySeq']: seqlist.obj['QuerySeq'] = seqlist.seqs()[0] self.printLog('#QRY','No query sequence: will use %s' % seqlist.obj['QuerySeq'].shortName()) except: self.errorLog('Problem during %s setup.' % self); raise ######################################################################################################################### ### <3> ### Price Equation Methods # ######################################################################################################################### def fitness(self): ### Calculates fitness vector '''Calculates fitness vector.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### methodlist = ['cons','seqnumber'] self.info['Fmethod'] = method = self.info['Fitness'].lower() if method not in methodlist: try: method = string.atoi(method) try: method = self.seqs()[method-1] except: self.errorLog('Cannot use sequence "%s" for comparison!' % method); raise self.info['Fmethod'] = method.shortName() except: self.errorLog('Fitness method "%s" not recognised!' % method,printerror=False) self.errorLog('Check fitness=%s' % string.join(methodlist,'/'),printerror=False) raise ValueError ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if method == 'cons': self.list['Fitness'] = self.posPercID() elif method in self.seqs(): self.list['Fitness'] = self.posPercID(comp=method) elif os.path.exists(method): self.list['Fitness'] = rje.listFromCommand(method,checkfile=True) self.printLog('#FIT','Vector of %s fitness values read from %s' % (len(self.list['Fitness']),method)) return except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def phenotype(self): ### Calculates phenotype vector '''Calculates phenotype vector.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### methodlist = ['cons','seqnumber','hyd'] self.info['Pmethod'] = method = self.info['Phenotype'].lower() if method not in methodlist: try: method = string.atoi(method) try: method = self.seqs()[method-1] except: self.errorLog('Cannot use sequence "%s" for comparison!' % method); raise self.info['Pmethod'] = method.shortName() except: self.errorLog('Phenotype method "%s" not recognised!' 
% method,printerror=False) self.errorLog('Check phenotype=%s' % string.join(methodlist,'/'),printerror=False) raise ValueError ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if method == 'cons': self.list['Phenotype'] = self.posPercID() elif method in self.seqs(): self.list['Phenotype'] = self.posPercID(comp=method) elif method == 'hyd': self.list['Phenotype'] = rje_sequence.eisenbergHydropathy(self.qry().info['Sequence'],returnlist=True) elif os.path.exists(method): self.list['Phenotype'] = rje.listFromCommand(method,checkfile=True) self.printLog('#PHEN','Vector of %s phenotype values read from %s' % (len(self.list['Phenotype']),method)) except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def grouping(self): ### Calculates grouping vector '''Calculates grouping vector.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### methodlist = ['triplets','codons','casechange','case','disorder'] method = self.info['SeqGroup'].lower() if method not in methodlist: self.errorLog('SeqGroup method "%s" not recognised!' % method,printerror=False) self.errorLog('Check seqgroup=%s' % string.join(methodlist,'/'),printerror=False) raise ValueError ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if method == 'triplets': self.list['SeqGroup'] = self.triplets() elif method == 'codons': self.list['SeqGroup'] = self.codons() elif method == 'casechange': self.list['SeqGroup'] = self.caseChange() elif method == 'case': self.list['SeqGroup'] = self.case() elif method == 'disorder': if self.opt['QryGaps']: self.list['SeqGroup'] = self.qry().gappedDisorder() else: self.list['SeqGroup'] = self.qry().gappedDisorder(gap=None) for i in range(self.qry().seqLen()): if self.list['SeqGroup'][i]: if self.list['SeqGroup'][i] > self.qry().obj['Disorder'].stat['IUCut']: self.list['SeqGroup'][i] = 'Dis' else: self.list['SeqGroup'][i] = 'Ord' elif os.path.exists(method): self.list['SeqGroup'] = rje.listFromCommand(method,checkfile=True) self.printLog('#GRP','Vector of %s group values read from %s' % (len(self.list['SeqGroup']),method)) except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def price(self): ### Calculates price equation, using Fitness, Phenotype and SeqGroup vectors '''Calculates price equation, using Fitness, Phenotype and SeqGroup vectors.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### pop = {'z':[],'w':[]} # w = fitness, z = phenotype grp = {} # Each group will have its own w and z grpmean = {'z':[],'w':[]} # Calculate means for each group grpcov = [] # List of group covariances ### ~ [2] Populate data vectors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.deBug(self.list['SeqGroup']) self.deBug(self.list['Fitness']) self.deBug(self.list['Phenotype']) for i in range(len(self.list['SeqGroup'])): if not self.list['SeqGroup'][i]: continue g = self.list['SeqGroup'][i] w = self.list['Fitness'][i] z = self.list['Phenotype'][i] pop['z'].append(z); pop['w'].append(w) if g not in grp: grp[g] = {'z':[],'w':[]} grp[g]['z'].append(z); grp[g]['w'].append(w) ## ~ [2a] Normalise fitness? 
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if self.opt['NormFit']: meanfit = float(rje.meansd(pop['w'])[0]) for i in range(len(pop['w'])): pop['w'][i] = pop['w'][i] / meanfit for g in grp: for i in range(len(grp[g]['w'])): grp[g]['w'][i] = grp[g]['w'][i] / meanfit ## ~ [2b] Group means ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## covw = 0.0 # Mean covariance within groups for g in grp: grp[g]['cov'] = self.covariance(grp[g]['z'],grp[g]['w']) grpcov.append(grp[g]['cov']) if self.opt['Weighted']: covw += grp[g]['cov'] * len(grp[g]['w']) / len(pop['w']) else: covw += grp[g]['cov'] / len(grp) grpmean['z'].append(rje.meansd(grp[g]['z'])[0]) grpmean['w'].append(rje.meansd(grp[g]['w'])[0]) ### ~ [3] Calculate within and between group covariance ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### covp = self.covariance(pop['z'],pop['w']) # Covariance of whole population covb = self.covariance(grpmean['z'],grpmean['w']) # Covariance between groups #x#covw = rje.meansd(grpcov)[0] # Mean covariance within groups price = covp / rje.meansd(pop['w'])[0] try: ratio = covb / covw except: ratio = -1 ## ~ [3a] Perform checks of calculation ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.printLog('#CHECK','CovP = %s; (CovB + CovW) = %s' % (rje.expectString(covp),rje.expectString(covb+covw))) self.printLog('#PRICE','Price value = %s; CovB/CovW ratio = %s' % (rje.expectString(price),rje.expectString(ratio))) return {'CovP':rje.expectString(covp),'CovB':rje.expectString(covb),'CovW':rje.expectString(covw),'Price':rje.expectString(price),'Ratio':rje.expectString(ratio)} except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def covariance(self,list1,list2): ### Calculates the covariance of two lists and returns '''Calculates the covariance of two lists and returns.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### n = len(list1) if not n: self.errorLog('Lists for covariance are empty!',printerror=False); return 0.0 if len(list2) != n: self.errorLog('Lists for covariance of different lengths!',printerror=False); raise ValueError ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### return covariance(list1,list2) except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### ### <4> ### Fitness/Phenotype Methods # ######################################################################################################################### def posPercID(self,gaps=True,xval=0.0,default=1.0,comp=None): ### Returns a list of absolute pecentage conservation across each position ''' Returns a list of absolute pecentage conservation across each position. 
>> gaps:bool [True] = Whether to include gapped sequences in calculation [True] >> xval:num [0.0] = The value (0-1) to give undefined residues matching defined residues >> default:num [1.0] = Value to return if no homologues for position >> comp:Sequence object = sequence for pairwise comparison ''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() if comp: compseq = [comp] else: compseq = self.seqs()[0:]; compseq.remove(qry) poslist = [default] * qry.seqLen() # List of percentage ID values xval = min(1.0,max(0.0,xval)) ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] i = 0.0; n = 0 for seq in compseq: s = seq.info['Sequence'].upper()[r] if s == q: i += 1; n += 1 elif 'X' in [s,q]: i += xval; n += 1 elif s == '-' and not gaps: continue else: n += 1 if n: poslist[r] = i / n return poslist except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### ### <5> ### Grouping Methods # ######################################################################################################################### def triplets(self): ### Returns grouping vector based on DNA triplets '''Returns grouping vector based on DNA triplets.''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() grplist = [0] * qry.seqLen() # List of groups (0 = no group) gx = 0 ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### trip = 0 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if self.opt['QryGaps'] and q == '-': continue if not trip: gx += 1 grplist[r] = gx if trip == 2: trip = 0 else: trip += 1 self.printLog('#GRP','%s triplet groups from %s%s' % (rje.integerString(gx),rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) return grplist except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def codons(self): ### Returns grouping vector based on DNA codon positions (three groups) '''Returns grouping vector based on DNA codon positions (three groups).''' try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() grplist = [0] * qry.seqLen() # List of groups (0 = no group) ### ~ [2] Calculate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### trip = 1 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if self.opt['QryGaps'] and q == '-': continue grplist[r] = trip if trip == 3: trip = 1 else: trip += 1 self.printLog('#GRP','3 codon groups from %s%s' % (rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) return grplist except: self.errorLog(rje_zen.Zen().wisdom()); raise ######################################################################################################################### def caseChange(self): ### Returns groupings based on Case boundaries of query '''Returns groupings based on Case boundaries of query.''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### qry = self.qry() self.deBug(qry.getSequence(case=True)) grplist 
= ['UC'] * qry.seqLen() # List of groups (None = no group) ### ~ [1] Map Case ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### for (start,end) in self.qry().dict['Case']['Lower']: for i in range(start-1,end): grplist[i] = 'LC' caselist = grplist[0:] gx = 1 for r in range(qry.seqLen()): q = qry.info['Sequence'].upper()[r] if not self.opt['QryGaps'] and q == '-': grplist[r] = 0 elif r > 0 and caselist[r] != caselist[r-1]: gx += 1 grplist[r] = gx self.printLog('#GRP','%s case groups from %s%s' % (rje.integerString(gx),rje.integerString(qry.seqLen()),self.obj['SeqList'].units())) self.deBug(grplist) return grplist
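#########################################################################################################################
# Illustrative sketch (assumption-laden, not part of the original module): the price() method above rests on the
# covariance decomposition behind the Price equation, Cov(w,z) = Cov between group means + mean within-group Cov.
# The toy code below verifies that identity numerically for equal-sized groups, using population covariance
# (divide by n) throughout; the real method may weight the within-group term by group size. Names and data are
# hypothetical.
def pop_cov(xs, ys):
    """Population covariance (divide by n) of two equal-length float lists."""
    n = len(xs)
    mx = sum(xs) / n
    my = sum(ys) / n
    return sum((x - mx) * (y - my) for x, y in zip(xs, ys)) / n

def price_decomposition(groups):
    """groups: {name: (w_list, z_list)} with equal-sized groups.
    Returns (covP, covB, covW); covP should equal covB + covW."""
    all_w, all_z, mean_w, mean_z, within = [], [], [], [], []
    for w, z in groups.values():
        all_w += list(w)
        all_z += list(z)
        mean_w.append(sum(w) / len(w))
        mean_z.append(sum(z) / len(z))
        within.append(pop_cov(w, z))
    covP = pop_cov(all_w, all_z)            # covariance of the whole population
    covB = pop_cov(mean_w, mean_z)          # covariance between group means
    covW = sum(within) / len(within)        # mean within-group covariance
    return covP, covB, covW

# covP, covB, covW = price_decomposition({
#     'g1': ([1.0, 1.2, 0.8], [0.3, 0.5, 0.2]),
#     'g2': ([0.9, 1.1, 1.0], [0.6, 0.4, 0.5])})
# covP equals covB + covW (up to float error), mirroring the #CHECK log line above.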
def convert(self, filelist=[], outfile=None ): ### Converts scansite output files in FileList to Outfile ''' Converts scansite output files in FileList to Outfile. ''' try: ### Setup ### _stage = 'Setup' if len(filelist) < 1: filelist = self.list['FileList'] if not outfile: outfile = self.info['Name'] if len(filelist) < 1: self.log.errorLog( 'No scansite files to convert! %s unchanged/not made.' % outfile, printerror=False) return False delimit = rje.getDelimit(self.cmd_list) ext = rje.delimitExt(delimit) if ext != outfile[-3:]: newfile = outfile[:-3] + ext if rje.yesNo('Change file name from %s to %s?' % (outfile, newfile)): outfile = newfile self.log.printLog( '#OUT', 'Converting %d file(s), output to %s.' % (len(filelist), outfile)) ### Output File ### _stage = 'Output File' if not self.opt['Append'] or not os.path.exists( outfile): # Create with header OUTFILE = open(outfile, 'w') headers = [ 'seq_id', 'enzyme', 'enz_group', 'aa', 'pos', 'score', 'percentile', 'matchseq', 'sa' ] rje.writeDelimit(OUTFILE, headers, delimit) else: OUTFILE = open(outfile, 'a') ### Conversion ### _stage = 'Conversion' sx = 0 for infile in filelist: if not os.path.exists(infile): self.log.errorLog( 'Input file %s does not exist! :o(' % infile, False, False) continue fx = 0 INFILE = open(infile, 'r') inline = rje.nextLine(INFILE) while inline != None: if rje.matchExp(re_scansite, inline): scanlist = rje.matchExp(re_scansite, inline) rje.writeDelimit(OUTFILE, scanlist, delimit) sx += 1 fx += 1 rje.progressPrint(self, sx) inline = rje.nextLine(INFILE) self.log.printLog( '#OUT', '%s scansite results from %s. (%s Total.)' % (rje.integerString(fx), infile, rje.integerString(sx))) INFILE.close() ### End ### _stage = 'End' OUTFILE.close() self.log.printLog( '#OUT', '%s scansite results output to %s.' % (rje.integerString(sx), outfile)) return True except: self.log.errorLog('Error in convert(%s)' % _stage, printerror=True, quitchoice=False) raise
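#########################################################################################################################
# Illustrative sketch (not part of the original module): convert() above follows a simple pattern - read each input
# line, apply a results regular expression (re_scansite, not shown here), and write the captured groups as one
# delimited row. The standalone version below shows the same pattern with a deliberately hypothetical pattern and
# field list.
import re

def convert_matches(infiles, outfile, pattern, fieldnames, delimit='\t'):
    """Scan each file in infiles for lines matching pattern and write the
    captured groups to outfile as delimited rows under a single header."""
    regexp = re.compile(pattern)
    out = open(outfile, 'w')
    out.write(delimit.join(fieldnames) + '\n')
    for infile in infiles:
        for line in open(infile):
            match = regexp.search(line)
            if match:
                out.write(delimit.join(match.groups()) + '\n')
    out.close()

# Hypothetical usage with a made-up three-field pattern:
# convert_matches(['scansite1.txt'], 'scansite.tdt',
#                 r'^(\S+)\s+(\S+)\s+(\d+)$', ['seq_id', 'enzyme', 'pos'])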
def tabulatePPIRegion( self): ### Tabulates regions of known PPI from DAT file '''Tabulates regions of known PPI from DAT file.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### tabfile = 'ppi_region.tdt' unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat' if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB', '%s found. (Force=F)' % tabfile) headers = ['Protein', 'Start', 'End', 'Interactor'] rje.delimitedFileOutput(self, tabfile, headers, rje_backup=True) ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile self.printLog('#GREP', gcmd) prot = None rx = 0 plist = [] ilist = [] for gline in os.popen(gcmd).readlines(): if rje.matchExp('ID (\S+)', gline): prot = rje.matchExp('ID (\S+)', gline)[0] if rje.matchExp( 'FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline): (rstart, rend, rint) = rje.matchExp( 'FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline) for ppi in string.split(rint): if rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi): datadict = { 'Protein': prot, 'Start': rstart, 'End': rend, 'Interactor': rje.matchExp('^([A-Z0-9][A-Z0-9]+)', ppi)[0] } rje.delimitedFileOutput(self, tabfile, headers, datadict=datadict) rx += 1 if prot not in plist: plist.append(prot) if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor']) self.progLog( '\r#REGTAB', 'Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString( len(plist)), rje.integerString( len(ilist)), rje.integerString(rx))) self.printLog( '\r#REGTAB', 'Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)), rje.integerString( len(ilist)), rje.integerString(rx), tabfile)) return True except: self.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
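#########################################################################################################################
# Illustrative sketch (not part of the original module): tabulatePPIRegion() relies on grep plus a regular expression
# to pull "interaction region" features out of a UniProt-style DAT file. The standalone generator below shows the same
# extraction on a list of lines, without the external grep; the example lines are invented.
import re

def parse_interaction_regions(lines):
    """Yield (protein, start, end, interactor) tuples from DAT-style lines."""
    prot = None
    for line in lines:
        id_match = re.match(r'^ID\s+(\S+)', line)
        if id_match:
            prot = id_match.group(1)
            continue
        ft_match = re.match(
            r'^FT\s+REGION\s+(\d+)\s+(\d+).*[Ii]nteract\S*\s+with\s+(\S+)', line)
        if ft_match and prot:
            start, end, interactor = ft_match.groups()
            yield (prot, int(start), int(end), interactor.rstrip('.,;'))

# for row in parse_interaction_regions([
#         'ID   EXAMPLE_HUMAN   Reviewed;   500 AA.',
#         'FT   REGION       10     60       Interaction with EGFR.']):
#     print(row)   # -> ('EXAMPLE_HUMAN', 10, 60, 'EGFR')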
def readHMMPFamSearch(self,resfile=None,readaln=False): ### Reads HMM PFam Search Results into objects ''' Reads HMM Search Results into objects. >> resfile:str = Results File (set as self.info['OutFile']) >> readaln:boolean = whether to bother reading Alignments into objects [False] !!! Currently always False !!! ''' try: ### Setup ### if not resfile or not os.path.exists(resfile): self.log.errorLog('Results file "%s" missing!' % resfile,printerror=False) return False ## Make RegExp for starting next alignment ## re_hit = string.join(['^(\S+):','domain','(\d+)','of','(\d+),','from','(\d+)','to','(\d+):','score','(\S+),','E','=','(\S+)'],'\s+') ## Search dictionary as results come back per sequence, not per HMM! ## pfam = {} # Dictionary of {PFam name:search} hitx = 0 # Total number of hits hitlist = [] # List of sequences processed from file (may or may not include zero hit sequences) ### Read in Search results ### if open(resfile,'r').readline().find('hmmpfam') != 0: self.errorLog('File "%s" does not appear to be an hmmpfam results file' % resfile,printerror=False) if rje.yesNo('Delete incorrect results file? (Check that hmmpfam=T is right!)',default='N'): os.unlink(resfile) self.printLog('#DEL','Dodgy results file "%s" deleted.' % resfile) return False hitname = None i = 0; hx = 0; seqx = 0 RESFILE = open(resfile,'r') #x#resline = self.loadFromFile(resfile,chomplines=True) #x#while i < len(resline): line = RESFILE.readline() newres = [rje.chomp(line)]; newresout = True; newresfile = '%s.partial' % resfile if os.path.exists(newresfile): os.unlink(newresfile) while line: self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx))) line = rje.chomp(line) #print line ## New Sequence ## if rje.matchExp('^Query sequence:\s+(\S+)',line): if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n')) newres = ['',line]; newresout = False hitname = rje.matchExp('^Query sequence:\s+(\S+)',line)[0]; hx += 1 #x#if hitname not in hitlist: hitlist.append(hitname) ## One Line Data for hits ## elif line.find('Parsed for domains:') == 0: #x#i += 3 # Skip two complete lines newres += [line,rje.chomp(RESFILE.readline()),rje.chomp(RESFILE.readline())] line = rje.chomp(RESFILE.readline()); newres.append(line) #Model Domain seq-f seq-t hmm-f hmm-t score E-value #-------- ------- ----- ----- ----- ----- ----- ------- #Lep_receptor_Ig 1/1 24 114 .. 1 103 [] 158.4 1.7e-44 # ... else ... 
# [no hits above thresholds] while rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line): newresout = True (dom,start,end,score,eval) = rje.matchExp(string.join(['^(\S+)','\S+','(\d+)','(\d+)\D.+','(\S+)','(\S+)\s*$'],'\s+'),line) if not pfam.has_key(dom): pfam[dom] = self._addSearch() pfam[dom].info['Name'] = dom hit = pfam[dom]._addHit() hit.info['Name'] = hitname aln = hit._addAln() aln.setStat({'SbjStart':string.atoi(start),'SbjEnd':string.atoi(end),'Expect':string.atof(eval),'BitScore':string.atof(score)}) hitx += 1 self.progLog('\r#RES','Reading %s: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(hx),rje.integerString(len(pfam)),rje.integerString(hitx))) line = rje.chomp(RESFILE.readline()); newres.append(line) ## End of Protein ## elif line[:2] == '//': hitname = None; newres.append(line) elif rje.matchExp('End of rje_hmm reduced results file: (%d) sequences in original',line): seqx = string.atoi(rje.matchExp('End of rje_hmm reduced results file: (\d+) sequences in original',line)[0]) elif newres: newres.append(line) #x#i += 1 line = RESFILE.readline() if newres and newresout and self.opt['CleanRes']: open(newresfile,'a').write(string.join(newres,'\n')) if not seqx: seqx = hx if self.opt['CleanRes']: open(newresfile,'a').write(string.join(['','End of rje_hmm reduced results file: %d sequences in original' % seqx],'\n')) os.unlink(resfile) os.rename(newresfile,resfile) self.printLog('\r#RED','Results file %s replaced with reduced version (%s Hits only)' % (resfile,rje.integerString(hitx))) self.printLog('\r#RES','Reading %s complete: %s Seqs; %s Domains; %s Hits' % (resfile,rje.integerString(seqx),rje.integerString(len(pfam)),rje.integerString(hitx))) return True except: self.log.errorLog('Calamity during readHMMSearch(%s)' % (resfile)) return False
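#########################################################################################################################
# Illustrative sketch (not part of the original module): the heart of readHMMPFamSearch() is a regular expression over
# rows of the hmmpfam "Parsed for domains" table (Model, Domain, seq-f, seq-t, ..., score, E-value). The helper below
# isolates that row parsing using the same expression; the sample row comes from the comment in the method above and
# the returned field names are hypothetical.
import re

DOMAIN_ROW = re.compile(r'^(\S+)\s+\S+\s+(\d+)\s+(\d+)\D.+\s+(\S+)\s+(\S+)\s*$')

def parse_domain_row(line):
    """Return a dict for one hmmpfam domain table row, or None if no match."""
    match = DOMAIN_ROW.match(line)
    if not match:
        return None
    dom, start, end, score, evalue = match.groups()
    return {'Domain': dom, 'SbjStart': int(start), 'SbjEnd': int(end),
            'BitScore': float(score), 'Expect': float(evalue)}

# row = parse_domain_row('Lep_receptor_Ig   1/1      24   114 ..     1   103 []   158.4  1.7e-44')
# row['Domain'] == 'Lep_receptor_Ig'; row['Expect'] == 1.7e-44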
def setup(self): ### Loads data into attributes. '''Loads data into attributes.''' try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ## ~ [1a] ~ UniProt Object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## uniprot = self.obj['UniProt'] = rje_uniprot.UniProt(self.log,self.cmd_list) uniprot.readUniProt() if uniprot.entryNum() > 0: ### UniProt data loaded. Populate seqlist and domain dictionary. seqlist = rje_seq.SeqList(self.log,self.cmd_list+['autoload=F']) for entry in uniprot.list['Entry']: seq = entry.obj['Sequence'] seqlist.seq.append(entry.obj['Sequence']) name = seq.shortName() self.dict['Entry'][name] = entry self.dict['Seq'][name] = seq for ft in entry.list['Feature']: if ft['Type'] in self.list['DomFT']: try: dom = string.split(ft['Desc'])[0] if dom not in self.dict['Domain']: self.dict['Domain'][dom] = [] if name not in self.dict['Domain'][dom]: self.dict['Domain'][dom].append(name) except: self.errorLog('Trouble with %s feature %s' % (name,ft)) ## ~ [1b] ~ SeqList only ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## else: seqlist = rje_seq.SeqList(self.log,self.cmd_list) for seq in seqlist.seq: name = seq.shortName() self.dict['Entry'][name] = None self.dict['Seq'][name] = seq #!# Consider adding loading domains from a table #!# ## ~ [1c] ~ Add PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.dict['PPI'] # Dictionary of ShortName-centred ppi = rje.dataDict(self,self.info['PPI']) for hub in ppi: if ppi[hub]['EnsLoci'] == '-': continue ens = ppi[hub]['EnsLoci'] if ens not in self.dict['PPI']: self.dict['PPI'][ens] = [] self.dict['Gene'][ens] = hub for gene in string.split(ppi[hub]['PPI'],','): if ppi[gene]['EnsLoci'] == '-': continue if ppi[gene]['EnsLoci'] not in self.dict['PPI'][ens]: self.dict['PPI'][ens].append(ppi[gene]['EnsLoci']) ## ~ [1d] ~ Add DDI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.dict['DDI'] = {} if self.info['DDI'].lower() not in ['','none']: data = rje.dataDict(self,self.info['DDI'],mainkeys=['Name1'],datakeys=['Name2'], headers=['Pfam1','Pfam2','Name1','Name2','Acc1','Acc2','Code1','Code2'],lists=True) ## ~ Parse ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # (dx,dtot) = (0.0,len(data)) self.deBug(data) try: rje.sortKeys(data) except: self.errorLog('F**k',quitchoice=True) for p1 in rje.sortKeys(data): self.progLog('\r#DDI','Parsing DDI from iPFam: %.1f%%' % (dx/dtot)) if p1 not in self.dict['DDI']: self.dict['DDI'][p1] = [] for p2 in data[p1]['Name2']: if p2 not in self.dict['DDI']: self.dict['DDI'][p2] = [] if p2 not in self.dict['DDI'][p1]: self.dict['DDI'][p1].append(p2) if p1 not in self.dict['DDI'][p2]: self.dict['DDI'][p2].append(p1) self.printLog('\r#DDI','Parsing DDI from iPFam: %s domains' % (rje.integerString(dtot))) ## ~ [1e] ~ Family data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.dict['Fam'] = {} if self.info['Fam'].lower() not in ['','none']: data = rje.dataDict(self,self.info['Fam'],mainkeys=['Qry'],datakeys=['Hit'],lists=True) for qry in self.dict['Seq']: self.dict['Fam'][qry] = [] if qry in data: self.dict['Fam'][qry] = data[qry]['Hit'] elif self.dict['Seq'][qry].info['AccNum'] in data: self.dict['Fam'][qry] = data[self.dict['Seq'][qry].info['AccNum']]['Hit'] if qry not in self.dict['Fam'][qry]: self.dict['Fam'][qry].append(qry) except: self.errorLog('Problem with SLiMPID.setup()',quitchoice=True)
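#########################################################################################################################
# Illustrative sketch (not part of the original module): step [1d] above builds a symmetric domain-domain interaction
# lookup, recording every pair in both directions. The tiny helper below isolates that pattern; input is a plain list
# of (name1, name2) pairs rather than the iPfam table used by the module.
def symmetric_interaction_dict(pairs):
    """Return {name: [partners]} with every pair recorded in both directions."""
    ddi = {}
    for name1, name2 in pairs:
        for a, b in ((name1, name2), (name2, name1)):
            partners = ddi.setdefault(a, [])
            if b not in partners:
                partners.append(b)
    return ddi

# symmetric_interaction_dict([('SH3_1', 'Pkinase'), ('SH3_1', 'WW')])
# -> {'SH3_1': ['Pkinase', 'WW'], 'Pkinase': ['SH3_1'], 'WW': ['SH3_1']}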
def parseOMIM(self): ### Main parsing method '''Main parsing method.''' try: ### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.dict['Records'] = {} self.dict['Mutations'] = {} aas = string.split( string.join(rje_sequence.aa_code_3.values()).upper()) oline = os.path.exists(self.info['Name']) (olen, ox, mx) = (len(open(self.info['Name'], 'r').readlines()), 0.0, 0) OMIM = open(self.info['Name'], 'r') ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### record = gene = subid = disease = mutation = '' av = False # Whether reading *FIELD* AV for mutation data while oline: oline = OMIM.readline() self.log.printLog( '\r#OMIM', 'Processing OMIM: %.2f%% (%s genes)' % (ox / olen, rje.integerString(len(self.dict['Records']))), newline=False, log=False) ox += 100.0 if not av and oline[:1] != '*': continue line = rje.chomp(oline) while line[-1:] == ' ': line = line[:-1] ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if line == '*RECORD*': (record, av) = ('', False) elif line == '*FIELD* NO': # New record record = rje.chomp(OMIM.readline()) gene = '' ox += 100.0 ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## elif line == '*FIELD* TI': # New gene gene = string.split(rje.chomp(OMIM.readline()))[-1] subid = '' av = False ox += 100.0 ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## elif line == '*FIELD* AV': av = True # Start of mutation records elif av and rje.matchExp('^(\.\d+)', line): # New subid mutation record subid = rje.matchExp('^(\.\d+)', line)[0] disease = rje.chomp(OMIM.readline()) ox += 100.0 try: mutation = rje.matchExp( '^%s, (\D\D\D\d+\D\D\D)' % gene, rje.chomp(OMIM.readline()))[0] except: continue # No mutation or not coding change ox += 100.0 subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)', mutation) if subaa[0] not in aas or subaa[1] not in aas: continue if gene not in self.dict['Records']: self.dict['Records'][gene] = [record] if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record] if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {} mx += 1 self.dict['Mutations'][gene][subid] = (disease, mutation) ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### OMIM.close() self.log.printLog( '\r#OMIM', 'Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len( self.dict['Records'])), rje.integerString(mx))) self.saveMutations() except: self.log.errorLog(rje_zen.Zen().wisdom()) raise # Delete this if method error not terrible
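#########################################################################################################################
# Illustrative sketch (not part of the original module): within each *FIELD* AV block, parseOMIM() pulls simple
# missense changes of the form "GENE, XXXnnnYYY" (three-letter amino acid codes around a residue number); the real
# method also checks the codes against rje_sequence.aa_code_3, which is omitted here. The helper below shows the
# extraction on a single line; the gene name and example line are invented.
import re

def parse_av_mutation(gene, line):
    """Return (wild_aa, position, mutant_aa) for a 'GENE, ARG167TRP'-style
    line, or None if the line does not describe a simple substitution."""
    match = re.match(r'^%s, ([A-Z]{3})(\d+)([A-Z]{3})' % re.escape(gene),
                     line.strip().upper())
    if not match:
        return None
    wild, pos, mutant = match.groups()
    return (wild, int(pos), mutant)

# parse_av_mutation('VHL', 'VHL, ARG167TRP')  # -> ('ARG', 167, 'TRP')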
def _setupMapped(self):  ### Sets up list of Previously Mapped Sequences
    '''Sets up list of Previously Mapped Sequences.'''
    ### Setup ###
    self.list['Mapped'] = []    # List of mapped sequence names
    if not self.bool['Append'] or not os.path.exists(self.str['MapFas']): return
    ### Previous Sequences ###
    seqlist = rje_seq.SeqList(None,['i=-1','v=-1','autoload=F','seqin=%s' % self.str['MapFas']])
    SEQFILE = open(self.str['MapFas'],'r')   # Fixed: previously open(filename,'r') with filename undefined
    lastline = ''
    sx = 0
    ### Count ###
    while 1:
        (nextseq,lastline) = seqlist.nextFasSeq(SEQFILE,lastline)
        seqlist.seq = []
        if nextseq:
            sx += 1
            self.list['Mapped'].append(nextseq.shortName())
        else: break
    SEQFILE.close()
    self.printLog('#MAP','Read names of %s previously mapped sequences for redundancy checking' % rje.integerString(sx))
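#########################################################################################################################
# Illustrative sketch (not part of the original module): _setupMapped() only needs the short name (first word of the
# header) of every sequence already present in the output fasta. A dependency-free version of that scan:
def fasta_short_names(fasta_path):
    """Return the first word of every '>' header line in a fasta file."""
    names = []
    for line in open(fasta_path):
        if line.startswith('>'):
            header = line[1:].split()
            if header:
                names.append(header[0])
    return names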
def run(self,imenu=False,outputmap=True,returndict=False): ### Main controlling run Method ''' Main controlling run Method. >> imenu:boolean = Whether to initiate interactive menu if appropriate [False]. >> outputmap:boolean = Whether to output mapping into a file [True] >> returndict:boolean = Whether to return a dictionary of {searchname:mappedname} (no previous mapping) [False] ''' try:### ~ [0] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not self.setup(imenu): raise ValueError seqlist = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=T','seqmode=file']) if not seqlist.seqNum(): self.warnLog('No sequences loaded for mapping.'); return {} ## ~ [0a] Setup BLAST Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## blast = rje_blast.BLASTRun(self.log,['blaste=1e-4','blastv=20','blastf=F']+self.cmd_list+['v=-1']) blast.setStr({'DBase':self.getStr('MapDB'),'Type':'blastp','InFile':self.getStr('SeqIn'), 'Name':'%s-%s.blast' % (rje.baseFile(self.str['SeqIn'],True),rje.baseFile(self.str['MapDB'],True))}) blast.setStat({'HitAln':blast.getStat('OneLine')}) blast.list['ResTab'] = ['Search','Hit','GABLAM'] if seqlist.nt(): blast.str['Type'] = 'blastx' ## ~ [0b] Setup Output ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## if outputmap: self._setupOutput() ## Output Files ## if returndict: mapdict = {} else: self._setupMapped() ## Previously Mapped Sequences ## seqx = seqlist.seqNum() ## Number of sequences ## ### ~ [1] BLAST Search Mapping ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.printLog('#BLAST','BLASTing %s vs %s.\n *** This could take some time if files are large. Please be patient! ***' % (self.str['SeqIn'],self.str['MapDB']),log=False) ## ~ [1a] Perform BLAST Unless it exists ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## blast.run(format=True) self.obj['DB'] = blast.obj['DB'] ## ~ [1b] Mapping from searches ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ## self.debug(self.getStr('MapDB')) self.obj['MapDB'] = rje_seqlist.SeqList(self.log,self.cmd_list+['autoload=F','seqmode=file','seqin=%s' % self.str['MapDB']]) self.obj['MapDB'].loadSeq(self.getStr('MapDB')) self.debug('%s' % self.obj['MapDB'].list['Seq']) sx = 0 while seqlist.nextSeq() != None: search = seqlist.getSeq(format='short') sx += 1 ## Check StartFrom ## if self.str['StartFrom']: if self.str['StartFrom'] != search: self.progLog('\r#SKIP','Looking for %s: skipping %d seqs' % (self.str['StartFrom'],sx)) continue self.str['StartFrom'] = '' self.printLog('\r#SKIP','Starting from %s: skipped %d seqs' % (self.str['StartFrom'],sx)) ## Check if in Mapped ## if search in self.list['Mapped']: resdict = {'Query':search,'Hit':search,'Method':'Already Mapped!'} self.printLog('#FAS','%s already in output - not duplicating in %s' % (search,self.str['MapFas'])) rje.delimitedFileOutput(self,self.str['MapRes'],self.list['Headers'],rje.getDelimit(self.cmd_list),resdict) continue ### Map Sequence ### self.printLog('#MAP','Mapping %s seqs: %s of %s' % (self.str['SeqIn'],rje.integerString(sx),rje.integerString(seqx))) mapname = self.mapSeq(seqlist,blast,search) if returndict: mapdict[search] = mapname ### ~ [2] Finish ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### self.printLog('#MAP','Mapping of %s (%s seqs) complete.' 
% (self.str['SeqIn'],rje.integerString(seqx))) if os.path.exists(blast.str['Name']) and not (self.getBool('DeBug') or self.test()): os.unlink(blast.str['Name']) #!# Add option to keep BLAST! #!# if returndict: return mapdict except: self.errorLog('Error in SeqMapper.run()',printerror=True,quitchoice=True); raise
def pairwiseAQ(self,seqlist=None,query=None,focus=[0,0]): ### Performs PAQ on seqlist, adding seq.info['PAQ'] ''' Performs PAQ on seqlist, adding seq.info['PAQ'] >> seqlist:rje_seq.SeqList Object - NB. This object will itself have sequences removed from it, so beware! - A new info key will be added: PAQ = PAQ sequences with alignment Xs >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. ''' ### <PAQ0> ### Setup try: _stage = '<0> Setup' haqlist = seqlist # SeqList Object to store individually Xd sequences if not query: query = haqlist.obj['QuerySeq'] if self.opt['NoQuery'] or not query: query = haqlist.seq[random.randint(0,haqlist.seqNum()-1)] self.log.printLog('#QRY','Temp (random) query %s assigned for PAQ' % query.shortName()) #!# paqx = [False] * seqlist.seq[0].seqLen() # List of whether a column of the alignment is bad (has an X) [True] or not [False] #!# - make this a method?! pwaq = {} # Dictionary of lists of pairwise alignements block_align = {} # Dictionary of whether residue in block of sequence that is well-aligned or not for seq in haqlist.seq: block_align[seq] = [False] * seq.seqLen() seq.info['PAQ'] = seq.info['Sequence'][0:] if seq.info.has_key('SAQX') and len(seq.info['SAQX']) == seq.seqLen(): #!# Should no longer be issues due to length changes following realignment seq.info['Sequence'] = seq.info['SAQX'][0:] elif seq.info.has_key('SAQX'): self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(),printerror=False) for otherseq in haqlist.seq: pwaq[(seq,otherseq)] = [False] * seq.seqLen() ### <PAQ1> ### Directional Pairwise Comparisons of sequences _stage = '<1> Pairwise Comparisons' infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc'] #print self.stat for seq in haqlist.seq: for otherseq in haqlist.seq: myinfo = '%s %.1f%% %.1f%% ' % (infotxt,(100.0 * haqlist.seq.index(seq) / haqlist.seqNum()),(100.0 * haqlist.seq.index(otherseq) / haqlist.seqNum())) self.log.printLog('\r#PAQ',myinfo,log=False,newline=False) for r in range(seq.seqLen()): ar = seq.info['Sequence'][r] ## <i> ## Look for PW aligned block _stage = '<1-i> Pairwise Comparisons' if ar not in ['-','X']: # Start of test block blen = 0 # Block length (PAQBlock) = AAs win = 0 # Window length = all sequence matchx = 0 # Score for residues in window while blen < self.stat['PAQBlock'] and (r+win) < seq.seqLen(): # This time we allow overshoots in both directions ar = seq.info['Sequence'][r+win] at = otherseq.info['Sequence'][r+win] if 'X' in [ar,at]: # Hit Bad Region: Abort break else: # Better region if ar != '-': blen += 1 # Increase Block matchx += self._saqCon(ar,at) win += 1 ## <ii> ## Update pwaq if block good _stage = '<1-ii> Pairwise Comparisons' if matchx >= self.stat['PAQMatch']: for w in range(win): if seq.info['Sequence'][r+w] in ['-','X']: pwaq[(seq,otherseq)][r+w] = False else: pwaq[(seq,otherseq)][r+w] = True self.log.printLog('\r#PAQ','%s 100.0% 100.0%. ' % infotxt,log=False) ### <PAQ2> ### Link back to Query _stage = '<2> Linking to Query' ### <PAQ2a> ### Network of Pairwise Quality alignments _stage = '<2a> Linking to Query' #self.verbose(1,3,'PAQ%d: Linking Residues to Query (%s)' % (self.stat['PAQCyc'],query.shortName()),0) infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'],query.shortName()) for r in range(query.seqLen()): _stage = '<2a> Linking to Query' self.log.printLog('\r#PAQ','%s %.1f%%' % (infotxt,(100.0 * r / query.seqLen())),log=False,newline=False) qok = {} # Dictionary of whether residue in seq OK, i.e. 
linked to query for seq in haqlist.seq: qok[seq] = False qok[query] = True sok = [0,1] # List of OK sequence for residue while sok[-2] != sok[-1]: ## <i> ## Match pairs, starting with query _stage = '<2a-i> Linking to Query' for seq in haqlist.seq: if qok[seq]: for otherseq in haqlist.seq: if pwaq[(seq,otherseq)][r] or pwaq[(otherseq,seq)][r]: qok[otherseq] = True ## <ii> ## Update sok _stage = '<2a-ii> Linking to Query' sok.append(0) for seq in haqlist.seq: if qok[seq]: sok[-1] += 1 block_align[seq][r] = True _stage = '<2a-iii> Linking to Query' if sok[-1] == 1: # Only query OK! block_align[query][r] = False self.log.printLog('\r#PAQ','%s 100.0%%' % infotxt,log=False) ### <PAQ2b> ### Allow for divergence (Conserved Anchors) _stage = '<2b> Anchors' if self.opt['Anchors']: infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc'] ## <i> ## Setup gapped list gapped = [False] * query.seqLen() # Whether column of alignment is gapped for seq in haqlist.seq: self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50.0 * haqlist.seq.index(seq) / haqlist.seqNum())),log=False,newline=False) (start,end) = (0,seq.seqLen()) while seq.info['Sequence'][start] == '-': start += 1 while seq.info['Sequence'][end-1] == '-': end -=1 for r in range(start,end): if seq.info['Sequence'][r] == '-': gapped[r] = True ## <ii> ## Correction for seq in haqlist.seq: self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50 + (50.0 * haqlist.seq.index(seq) / haqlist.seqNum()))),log=False,newline=False) for r in range(seq.seqLen()): if block_align[seq][r] or gapped[r]: # No need for correction continue # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate winf = 0 fwd = True fok = False winb = 0 bwd = True bok = False while fwd or bwd: # End of seqs if (r + winf) >= seq.seqLen(): fwd = False if (r - winb) < 0: bwd = False # Gaps/OK if fwd: if gapped[r+winf]: fok = False fwd = False elif block_align[seq][r+winf]: fwd = False else: winf += 1 if bwd: if gapped[r-winb]: bok = False bwd = False elif block_align[seq][r-winb]: bwd = False else: winb += 1 if fok and bok: # Reinstate for w in range(r-winb,r+winf+1): block_align[seq][w] = True self.log.printLog('\r#PAQ','%s 100.0%% ' % infotxt,log=False) ### <PAQ3> ### X out badly-aligned blocks _stage = '<3> Making bad sequence blocks' for seq in haqlist.seq: newseq = '' for r in range(seq.seqLen()): if block_align[seq][r] or seq.info['Sequence'][r] == '-': newseq += seq.info['Sequence'][r] else: # Bad residue newseq += 'X' seq.info['Sequence'] = newseq[0:] #!# Add saving of data in 'datafull' option ### <PAQ4> ### Remove sequences and/or badly-aligned regions _stage = '<4> Removing sequences/regions' self.verbose(0,4,'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'],0) ## <PAQ4a> ## Process Query first - only interested in good regions within query if self.opt['NoQuery']: # No preprocessing of Query self.verbose(0,4,'no Master Query processing...',0) else: haqlist.mapX(query, qtrim=True, focus=focus) # Replaces other sequence ends and query X columns with Xs self.verbose(0,4,'Query (%s) processed...' 
% query.shortName(),0) self.verbose(0,3,'',1) if self.opt['ManPAQ']: haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile']) ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost goodres = [0, self._getGood(haqlist.seq)] # List of number of 'good' residues goodseq = [0, haqlist.seqNum()] while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]: colgood = [0] * haqlist.seq[0].seqLen() # Good residues per column for r in range(haqlist.seq[0].seqLen()): for seq in haqlist.seq: if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X': colgood[r] += 1 ## <i> ## Compare relative loss of masking and losing each sequence keepx = {} # Dictionary of seq:number of lost residues if seq kept losex = {} # Dictionary of seq:number of lost residues if seq lost badkx = -1 # Biggest loss if kept badlx = -1 # Biggest loss if lost bads = None # Worst sequence for seq in haqlist.seq: if seq == query and self.opt['NoQuery'] == False: continue # Next sequence # Calculate keepx and losex keepx[seq] = 0 for r in range(seq.seqLen()): if seq.info['Sequence'][r] == 'X': keepx[seq] += colgood[r] #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate? losex[seq] = self._getGood([seq]) # Update bads if worse if keepx[seq] > badkx: badkx = keepx[seq] badlx = losex[seq] bads = seq elif keepx[seq] == badkx and losex[seq] < badlx: badlx = losex[seq] bads = seq ## <ii> ## Remove bad sequences and/or regions if badkx > 0: if self.opt['ManPAQ']: default = 'N' if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']: # Lose sequence! default = 'Y' if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default): seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads) else: # X out haqlist.mapX(bads) else: self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1) #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?) if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']: # Lose sequence! seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads) else: # X out haqlist.mapX(bads) ### <iii> ### Recalculate goodres goodres.append(self._getGood(haqlist.seq)) goodseq.append(haqlist.seqNum()) self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1) ### <PAQ5> ### Reinstate UnX'd sequence: _stage = '<5> Replacing sequences' for seq in haqlist.seq: [seq.info['PAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['PAQ']] if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']): os.unlink('%s.manpaq.fas' % haqlist.info['Basefile']) except: self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
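#########################################################################################################################
# Illustrative sketch (not part of the original module): step <PAQ2a> above keeps a residue only if its sequence can
# be linked to the query at that column through a chain of pairwise well-aligned sequences. That is a simple
# transitive-closure / fixed-point computation, isolated below for one column. 'supported' is a set of (seq_a, seq_b)
# pairs judged well-aligned at the column, in either direction; all names are hypothetical.
def linked_to_query(seqs, query, supported):
    """Return the set of sequences connected to query via supported pairs."""
    linked = {query}
    changed = True
    while changed:
        changed = False
        for seq in seqs:
            if seq in linked:
                continue
            if any((seq, ok) in supported or (ok, seq) in supported
                   for ok in linked):
                linked.add(seq)
                changed = True
    return linked

# seqs = ['qry', 'a', 'b', 'c']
# linked_to_query(seqs, 'qry', {('qry', 'a'), ('a', 'b')})
# -> {'qry', 'a', 'b'}   ('c' has no support chain back to the query)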
def setup(self): ### Main class setup method. '''Main class setup method.''' try: ### ~ [1] Pairwise PPI ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### ppipairwise = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/pingu.pairwise.tdt' self.progLog('\r#PPI', 'Loading pairwise data...') pairwise = rje.dataDict(self, ppipairwise, ['Hub', 'Spoke'], ['Spoke', 'SpokeSeq', 'Evidence']) gene2seq = {} seq2gene = {} fullppi = {} px = 0.0 ptot = len(pairwise) ppix = 0 for pair in rje.sortKeys(pairwise): self.progLog( '\r#PPI', 'Processing full pairwise PPI: %.2f%%' % (px / ptot)) px += 100.0 [hub, spoke] = string.split(pair, '\t') if spoke not in gene2seq: sseq = pairwise[pair]['SpokeSeq'] gene2seq[spoke] = sseq seq2gene[string.split(sseq, '__')[0]] = spoke if hub not in fullppi: fullppi[hub] = {} if spoke not in fullppi[hub]: fullppi[hub][spoke] = pairwise.pop(pair)['Evidence'] ppix += 1 self.printLog( '\r#PPI', 'Processed full pairwise PPI: %s genes; %s ppi.' % (rje.integerString(len(fullppi)), rje.integerString(ppix / 2))) ### ~ [2] Filter complexes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### goodppifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/Pingu/hybrid.txt' goodppi = self.loadFromFile(goodppifile, chomplines=True) self.dict['PPI'] = {} px = 0.0 ptot = len(fullppi) fppix = ppix ppix = 0 for hub in fullppi: self.progLog( '\r#PPI', 'Filtering complexes: %.2f%% (%s hubs; %s ppi)' % (px / ptot, rje.integerString(len( self.dict['PPI'])), rje.integerString(ppix))) px += 100.0 self.dict['PPI'][hub] = [] for spoke in fullppi[hub]: goodspoke = False for ptype in goodppi: if rje.matchExp(':(%s)($|\|)' % ptype, fullppi[hub][spoke]): goodspoke = True break if goodspoke: self.dict['PPI'][hub].append(spoke) continue goodspoke = True for spoke2 in fullppi[hub]: if spoke2 in [hub, spoke]: continue if spoke2 in fullppi[spoke]: goodspoke = False break if goodspoke: self.dict['PPI'][hub].append(spoke) ppix += len(self.dict['PPI'][hub]) if not self.dict['PPI'][hub]: self.dict['PPI'].pop(hub) self.printLog( '\r#PPI', 'Filtered complexes: (%s -> %s hubs; %s -> %s ppi)' % (rje.integerString( len(fullppi)), rje.integerString(len(self.dict['PPI'])), rje.integerString(fppix / 2), rje.integerString(ppix / 2))) ### ~ [3] SeqList ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### seqfile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/EnsEMBL/ens_HUMAN.loci.fas' scmd = ['accnr=F', 'seqnr=F', 'seqin=%s' % seqfile] + self.cmd_list + ['autoload=T'] seqlist = self.obj['SeqList'] = rje_seq.SeqList(self.log, scmd) self.dict['SeqObj'] = seqlist.seqNameDic('Max') self.dict['Gene2Seq'] = gene2seq self.dict['Seq2Gene'] = seq2gene return True # Setup successful except: self.errorLog('Problem during %s setup.' % self) return False # Setup failed
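#########################################################################################################################
# Illustrative sketch (not part of the original module): the complex filter in step [2] above keeps a hub-spoke edge
# if (a) its evidence string contains one of the accepted interaction types, or (b) the two proteins share no third
# common partner. The simplified predicate below works on a plain adjacency dict; names are hypothetical and the
# evidence test is reduced to a substring check in place of the regular expression used above.
def keep_edge(hub, spoke, fullppi, evidence, good_types):
    """Return True if the hub-spoke edge should survive the complex filter."""
    if any(ptype in evidence for ptype in good_types):
        return True                      # directly supported by a trusted method
    shared = set(fullppi.get(hub, [])) & set(fullppi.get(spoke, []))
    shared.discard(hub)
    shared.discard(spoke)
    return not shared                    # no common third partner -> keep

# fullppi = {'A': ['B', 'C'], 'B': ['A', 'C'], 'C': ['A', 'B']}
# keep_edge('A', 'B', fullppi, 'mmc:coip', ['y2h'])  # -> False (shared partner C)
# keep_edge('A', 'B', fullppi, 'iref:y2h', ['y2h'])  # -> True  (trusted evidence)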
def readSLiMSearchOcc(self,motifs=[]): ### Reads SLiMSearch results into data dictionary '''Reads SLiMSearch results into data dictionary.''' try:### ~ [1] Read ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### if not motifs: self.printLog('#OCC','Cannot process occurrences for No motifs!') occfile = '%s.csv' % self.info['ResFile'] delimit = rje.delimitFromExt(filename=occfile) data = rje.dataDict(self,occfile,mainkeys=['Motif','Seq','Start_Pos','End_Pos'],datakeys=string.split('Seq,Desc,Start_Pos,End_Pos,Cons,HomNum,GlobID,LocID,Hyd,SA',',')) self.dict['Occ'] = {} ### ~ [2] Process ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ### (mx,ox,otot) = (0,0.0,len(data)) for occ in data: self.progLog('\r#OCC','Processing occurrences (%d motifs): %.2f%%' % (mx,ox/otot)); ox += 100.0 #x#self.deBug('%s vs MinHom %d' % (data[occ],self.stat['MinHom'])) if string.atoi(data[occ]['HomNum']) < self.stat['MinHom']: continue (motif,seq,start,end) = string.split(occ,delimit) if motif not in motifs: continue try: gene = rje.matchExp('gene:(\S+)\]',data[occ]['Desc'])[0] self.deBug('%s:%s' % (gene,self.ensGO(gene))) if not self.ensGO(gene): continue except: continue if motif[-3:] == 'rev': (motif,type) = (motif[:-4],'Rev') elif motif[-5:] == 'scram': (motif,type) = (motif[:-6],'Scr') else: type = 'ELM' if motif not in self.dict['Occ']: self.dict['Occ'][motif] = {}; mx += 1 if type not in self.dict['Occ'][motif]: self.dict['Occ'][motif][type] = {} if gene not in self.dict['Occ'][motif][type]: self.dict['Occ'][motif][type][gene] = [] self.dict['Occ'][motif][type][gene].append(data[occ]) self.printLog('\r#OCC','Processed %s occurrences: %d motifs with GO-links' % (rje.integerString(otot),mx)) except: self.log.errorLog(rje_zen.Zen().wisdom())
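#########################################################################################################################
# Illustrative sketch (not part of the original module): readSLiMSearchOcc() keys each occurrence on
# (Motif, Seq, Start_Pos, End_Pos) and bins it by motif and by whether the motif name marks a reversed or scrambled
# control. The loader below shows that binning with csv.DictReader on a hypothetical occurrence file with the same
# column names; the '_rev'/'_scram' suffix separators are an assumption (the method above strips 4 and 6 characters).
import csv

def load_occurrences(occfile):
    """Return {motif: {type: [row dicts]}} where type is ELM, Rev or Scr."""
    occ = {}
    for row in csv.DictReader(open(occfile)):
        motif = row['Motif']
        if motif.endswith('_rev'):
            motif, mtype = motif[:-4], 'Rev'
        elif motif.endswith('_scram'):
            motif, mtype = motif[:-6], 'Scr'
        else:
            mtype = 'ELM'
        occ.setdefault(motif, {}).setdefault(mtype, []).append(row)
    return occ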
def parse(self,parsedom=True,parseseq=True,parsecomplex=True):  ### HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences
    '''HPRD Parsing method. Generates Mappings, HPRD data dictionary, Domain dictionary & Sequences.'''
    try:### ~ Parse HPRD Mappings onto other database IDs from HPRD_ID_MAPPINGS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['HPRD'] = {}
        self.dict['Mapping'] = {}
        hprd = self.loadFromFile('%sHPRD_ID_MAPPINGS.txt' % self.info['HPRDPath'],v=1,checkpath=True,chomplines=True)
        hx = float(len(hprd))
        while hprd:
            entry = hprd.pop(0)
            px = 100.0 * (hx - len(hprd)) / hx
            self.log.printLog('\r#HPRD','Parsing HPRD_ID_MAPPINGS: %.1f%%' % px,newline=False,log=False)
            data = string.split(entry)
            ## Check ##
            if len(data) < 7: continue
            if self.dict['HPRD'].has_key(data[0]): self.log.errorLog('HPRD ID %s duplicated! Aaargh!' % data[0],printerror=False)
            ## Update ##
            self.dict['HPRD'][data[0].upper()] = {'gene':data[1].upper(),'gb':data[3],'entrez':data[4],'omim':data[5],
                                                  'sp':data[6].upper(),'desc':string.join(data[7:])}
            for i in [1,3,6]: self.dict['Mapping'][data[i].upper()] = data[0]
        self.log.printLog('\r#HPRD','Parsing HPRD_ID_MAPPINGS complete!')
        ### ~ Parse HPRD Domain Mappings from PROTEIN_Architecture.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Domains'] = {}
        self.dict['DomainSource'] = {}
        if parsedom:
            hprd = self.loadFromFile('%sPROTEIN_Architecture.txt' % self.info['HPRDPath'],v=1,checkpath=True,chomplines=True)
            hx = float(len(hprd))
            while hprd:
                entry = hprd.pop(0)
                px = 100.0 * (hx - len(hprd)) / hx
                self.log.printLog('\r#HPRD','Parsing PROTEIN_Architecture: %.1f%%' % px,newline=False,log=False)
                data = string.split(entry)
                ## Check ##
                if len(data) < 9: continue
                (hid,domain,type,source) = (data[0],data[4],data[5],data[8])
                if type != 'Domain': continue
                ## Update ##
                if domain not in self.dict['Domains']: self.dict['Domains'][domain] = [hid]
                elif hid not in self.dict['Domains'][domain]: self.dict['Domains'][domain].append(hid)
                if domain not in self.dict['DomainSource']: self.dict['DomainSource'][domain] = [source]
                elif source not in self.dict['DomainSource'][domain]: self.dict['DomainSource'][domain].append(source)
            self.log.printLog('\r#HPRD','Parsing PROTEIN_Architecture complete!')
        ### ~ Make SeqList from PROTEIN_SEQUENCES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if parseseq:
            scmd = self.cmd_list + ['autoload=T','gnspacc=F','seqin=%sPROTEIN_SEQUENCES.txt' % self.info['HPRDPath'],'autofilter=F','accnr=F','seqnr=F']
            self.obj['SeqList'] = rje_seq.SeqList(self.log,scmd)
            self.obj['SeqList'].info['Name'] = self.info['OutDir'] + 'hprd.fas'
            sx = 0.0
            for seq in self.obj['SeqList'].seq[0:]:     # seq.info['ID'] should be the HPRD ID #
                ## Initial processing of sequence. Only keep if AllIso or isoform 1 ##
                self.log.printLog('\r#SEQ','Processing HPRD Sequences: %.1f%%' % (sx/self.obj['SeqList'].seqNum()),newline=False,log=False)
                iso = 'X'
                h = seq.info['ID']
                try: iso = rje.matchExp('^\d+\|\d+_(\d+)\|',seq.info['Name'])[0]
                except: self.deBug(seq.info['Name'])
                try:
                    if h not in self.dict['HPRD']:
                        self.printLog('\r#ERR','Missing from HPRD_ID_MAPPINGS?: %s' % seq.info['Name'])
                        data = string.split(seq.info['Name'],'|')
                        self.dict['HPRD'][h] = {'gene':'-','gb':data[2],'entrez':'','omim':'','sp':'','desc':string.join(data[3:],'|')}
                    if not self.opt['AllIso'] and self.dict['HPRD'][h].has_key('Seq') and iso != '1':
                        self.obj['SeqList'].seq.remove(seq)
                        continue
                    #x#if h == '00001': self.deBug('%s = %s' % (h,iso))
                    sx += 100.0
                    seq.setInfo({'Gene':self.dict['HPRD'][h]['gene'],
                                 'Description':self.dict['HPRD'][h]['desc'] + ' [Gene:%s HPRD:%s; gb:%s; sp:%s]' % (self.dict['HPRD'][h]['gene'],h,self.dict['HPRD'][h]['gb'],self.dict['HPRD'][h]['sp']),
                                 'AccNum':self.dict['HPRD'][h]['sp']})
                    ## AllIso options ##
                    if self.opt['AllIso']:
                        if 'Seq' not in self.dict['HPRD'][h]: self.dict['HPRD'][h]['Seq'] = [seq]
                        else: self.dict['HPRD'][h]['Seq'].append(seq)
                        seq.setInfo({'AccNum':'%s-%s' % (h,iso)})
                    else: self.dict['HPRD'][h]['Seq'] = seq
                    #x#print h, self.dict['HPRD'][h]['Seq']
                    ## Finish formatting ##
                    if seq.info['Gene'] == '-': self.dict['HPRD'][h]['gene'] = seq.info['Gene'] = 'HPRD' + h
                    if seq.info['AccNum'] == '-': seq.info['AccNum'] = self.dict['HPRD'][h]['gb']
                    seq.info['ID'] = '%s_HUMAN' % seq.info['Gene']
                    seq.info['Name'] = '%s__%s %s' % (seq.info['ID'],seq.info['AccNum'],seq.info['Description'])
                except: self.errorLog('Protein Parse Error (%s)' % seq.info['Name'])
            self.log.printLog('\r#SEQ','Processing HPRD Sequences complete!')
        ### ~ Make PPI Data from BINARY_PROTEIN_PROTEIN_INTERACTIONS.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        missing = []
        self.dict['PPI'] = {}
        ppi = self.loadFromFile('%sBINARY_PROTEIN_PROTEIN_INTERACTIONS.txt' % self.info['HPRDPath'],v=1,checkpath=True,chomplines=True)
        hx = float(len(ppi))
        ix = 0
        while ppi:
            entry = ppi.pop(0)
            px = 100.0 * (hx - len(ppi)) / hx
            self.log.printLog('\r#PPI','Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS: %.1f%%' % px,newline=False,log=False)
            data = string.split(entry,'\t')
            ## Check ##
            if len(data) < 7: continue
            types = string.split(data[6],';')
            if not types: types = ['unknown']
            for type in types[0:]:
                if type in self.list['BadType'] or (self.list['PPIType'] and type not in self.list['PPIType']): types.remove(type)
            if not types: continue
            ix += 1
            ## Update ##
            (p1,p2) = (data[1].upper(),data[4].upper())
            if p1 not in self.dict['HPRD']:
                if p1 not in missing: missing.append(p1)
                self.log.printLog('#ERR','HPRD ID "%s" missing from HPRD_ID_MAPPINGS!' % p1,screen=False)
                continue
            if p2 not in self.dict['HPRD']:
                if p2 not in missing: missing.append(p2)
                self.log.printLog('#ERR','HPRD ID "%s" missing from HPRD_ID_MAPPINGS!' % p2,screen=False)
                continue
            if not self.dict['PPI'].has_key(p1): self.dict['PPI'][p1] = []
            if p2 not in self.dict['PPI'][p1]: self.dict['PPI'][p1].append(p2)
            if not self.dict['PPI'].has_key(p2): self.dict['PPI'][p2] = []
            if p1 not in self.dict['PPI'][p2]: self.dict['PPI'][p2].append(p1)
            if p1 not in self.dict['Evidence']: self.dict['Evidence'][p1] = {}
            if p2 not in self.dict['Evidence'][p1]: self.dict['Evidence'][p1][p2] = []
            for type in types:
                if type not in self.dict['Evidence'][p1][p2]: self.dict['Evidence'][p1][p2].append(type)
            #x#if p1 == '12422': self.deBug(self.dict['PPI'][p1])
        self.log.printLog('\r#PPI','Parsing BINARY_PROTEIN_PROTEIN_INTERACTIONS complete!')
        ### ~ Parse protein Complex data from PROTEIN_COMPLEXES.txt ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Complex'] = {}
        ppi = self.loadFromFile('%sPROTEIN_COMPLEXES.txt' % self.info['HPRDPath'],v=1,checkpath=True,chomplines=True)
        hx = float(len(ppi))
        while ppi:
            entry = ppi.pop(0)
            px = 100.0 * (hx - len(ppi)) / hx
            self.log.printLog('\r#PPI','Parsing PROTEIN_COMPLEXES: %.1f%%' % px,newline=False,log=False)
            data = string.split(entry)
            ## Check ##
            if len(data) < 5: continue
            ## Update ##
            (complex,hprd) = (data[0],data[1])
            if hprd == 'None': continue
            if not self.dict['Complex'].has_key(complex): self.dict['Complex'][complex] = []
            if hprd not in self.dict['Complex'][complex]: self.dict['Complex'][complex].append(hprd)
            #x#if p1 == '12422': self.deBug(self.dict['PPI'][p1])
        self.log.printLog('\r#PPI','Parsing PROTEIN_COMPLEXES complete!')
        ### ~ Update PPI from protein Complex data if appropriate ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        type = 'complex'
        if type not in self.list['BadType'] and (not self.list['PPIType'] or type in self.list['PPIType']):
            cx = 0.0
            for complex in self.dict['Complex']:
                self.log.printLog('\r#PPI','Adding protein complex data to PPI: %.1f%%' % (cx/len(self.dict['Complex'])),newline=False,log=False)
                cx += 100.0
                for p1 in self.dict['Complex'][complex]:
                    for p2 in self.dict['Complex'][complex]:
                        if not self.dict['PPI'].has_key(p1): self.dict['PPI'][p1] = []
                        if p2 not in self.dict['PPI'][p1]: self.dict['PPI'][p1].append(p2)
                        if p1 not in self.dict['Evidence']: self.dict['Evidence'][p1] = {}
                        if p2 not in self.dict['Evidence'][p1]: self.dict['Evidence'][p1][p2] = []
                        if type not in self.dict['Evidence'][p1][p2]: self.dict['Evidence'][p1][p2].append(type)
            self.log.printLog('\r#PPI','Added protein complex data to PPI for %s complexes' % rje.integerString(len(self.dict['Complex'])))
        ptxt = '%s proteins; %s interactions' % (rje.integerString(len(self.dict['PPI'])),rje.integerString(ix))
        self.log.printLog('\r#PPI','Parsing interactions complete: %s.' % ptxt)
        if missing: open('HPRD.missing.txt','w').write(string.join(missing,'\n'))
    except:
        self.log.errorLog('Error in HPRD.parse()',printerror=True,quitchoice=False)
        raise
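# Hedged sketch of the main dictionaries populated by parse() above. All IDs and values are invented; only the
# nesting mirrors the code (HPRD IDs as keys, symmetrical PPI lists, evidence types per interacting pair).
hprd_sketch = {
    'HPRD':     {'00001': {'gene': 'GENE1', 'gb': 'NP_000001.1', 'entrez': '1234', 'omim': '100000',
                           'sp': 'P00001', 'desc': 'Example protein 1'}},
    'Mapping':  {'GENE1': '00001', 'NP_000001.1': '00001', 'P00001': '00001'},      # other IDs -> HPRD ID
    'Domains':  {'SH3': ['00001']},                                                 # domain -> [HPRD IDs]
    'PPI':      {'00001': ['00002'], '00002': ['00001']},                           # symmetrical binary PPI
    'Evidence': {'00001': {'00002': ['in vivo', 'complex']}},                       # hub -> spoke -> [types]
    'Complex':  {'1': ['00001', '00002']},                                          # complex ID -> [members]
}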
def scap(self): ### Full SCAP method
    '''Full SCAP method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        markov = self.obj['Markov']
        minx = markov.stat['MinXmer']
        maxx = markov.stat['MaxXmer']
        headers = ['seq','type','sorted']
        for x in range(minx,maxx+1): headers.append('X%d' % x)
        delimit = rje.getDelimit(self.cmd_list,'\t')
        scapfile = '%s.%s' % (self.info['Basefile'],rje.delimitExt(delimit))
        rje.delimitedFileOutput(self,scapfile,headers,delimit,rje_backup=True)
        ### ~ [2] SCAP ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [2a] Query ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        (sx,stot) = (0.0,self.obj['SeqList'].seqNum())
        for seq in self.obj['SeqList'].seq:
            self.progLog('\r#SCAP','SCAP processing Query to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
            datadict = {'seq':seq.shortName(),'type':'qry','sorted':markov.opt['Sorted']}
            for x in range(minx,maxx+1):
                datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
            rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
        self.printLog('\r#SCAP','SCAP processed Query to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
        ## ~ [2b] Background ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if self.obj['ScapBack'] != self.obj['SeqList']:
            (sx,stot) = (0.0,self.obj['ScapBack'].seqNum())
            for seq in self.obj['ScapBack'].seq:
                self.progLog('\r#SCAP','SCAP processing Background to %s: %.2f%%' % (scapfile,(sx/stot))); sx += 100.0
                datadict = {'seq':seq.shortName(),'type':'bg','sorted':markov.opt['Sorted']}
                for x in range(minx,maxx+1):
                    datadict['X%d' % x] = self.scapSeq(seq.info['Sequence'],x)
                    if datadict['X%d' % x] > 0.001: datadict['X%d' % x] = '%.4f' % datadict['X%d' % x]
                    else: datadict['X%d' % x] = '%.3e' % datadict['X%d' % x]
                rje.delimitedFileOutput(self,scapfile,headers,delimit,datadict)
            self.printLog('\r#SCAP','SCAP processed Background to %s for %s sequences.' % (scapfile,rje.integerString(stot)))
        if markov.opt['Sorted']: self.printLog('#SCAP','Sorted SCAP run complete')
        else: self.printLog('#SCAP','UnSorted SCAP run complete')
    except: self.errorLog(rje_zen.Zen().wisdom())
def uniFake(self,seqs=[],store=False):  ### Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs.
    '''Main UniFake method. Runs on sequences in self.obj['SeqList'] if no seqs given.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        unifake = string.split(string.join(self.list['UniFake']).lower())
        seqlist = self.obj['SeqList']
        if seqs: seqlist.seq = seqs
        else: seqs = seqlist.seq
        (sx,seqnum) = (0,seqlist.seqNum())
        ## ~ [1b] Setup UniProt object and output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        uniprot = rje_uniprot.UniProt(self.log,self.cmd_list)   # UniProt object for saving data
        if self.info['DatOut'].lower() in ['','none']: self.info['DatOut'] = rje.baseFile(seqlist.info['Name']) + '.dat'
        datfile = self.info['DatOut']
        if os.path.exists(datfile): rje.backup(self,datfile)
        if store: seqlist.obj['UniProt'] = uniprot
        ## ~ [1c] Setup RJE_HMM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'pfam' in unifake:
            hmm = rje_hmm.HMMRun(self.log,self.cmd_list+['force=T'])
            hmmfile = '%s.pfam.tdt' % rje.baseFile(datfile)
            if os.path.exists(hmmfile): rje.backup(self,hmmfile)
            hmm.list['HMM'] = [self.info['PFam']]
            hmm.opt['HMMPFam'] = True
        else: hmm = None
        ## ~ [1d] Setup RJE_TM object ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if 'signalp' in unifake: tm = rje_tm.TM(self.log,self.cmd_list)
        else: tm = None
        ### ~ [2] ~ Perform UniFake processing ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for seq in seqs:
            sx += 1
            name = seq.shortName()
            self.printLog('#SEQ','Processing %s (%s aa) %s...' % (seq.shortName(),rje.integerString(seq.aaLen()),seq.info['Description'][:50]))
            try:
                ## ~ [2a] ~ Basic data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                utmp = 'tmp%s.%s' % (rje.randomString(5),seq.info['AccNum'])
                open('%s.fas' % utmp,'w').write('>%s\n%s\n' % (seq.shortName(),seq.info['Sequence']))
                udata = {'CC':['-!- Features generated using unifake.py'],'AC':[]}
                if seq.info['SpecCode'] in ['Unknown','UNK']: seq.info['SpecCode'] = self.info['SPCode']
                #x#elif seq.info['Species'] != 'None': udata['OS'] = [seq.info['Species']]   #!# Check how well this works. Add spectable? #!#
                ## ~ [2b] ~ Aliases ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if self.opt['EnsDat'] and rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name']):
                    details = rje.matchExp('\[acc:(\S+) pep:(\S+) gene:(\S+)\]',seq.info['Name'])
                    self.addAlias(seq.info['AccNum'],details[0])
                    self.addAlias(seq.info['AccNum'],details[1])
                    self.addAlias(seq.info['AccNum'],details[2])
                    udata['GN'] = [details[2]]
                for id in [seq.shortName(),seq.info['AccNum']]:
                    if id in self.dict['Aliases']: udata['AC'].append('%s;' % string.join(self.dict['Aliases'][id],'; '))
                ## ~ [2c] ~ Features ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                ft = []     # List of features for sequence
                for id in [seq.shortName(),seq.info['AccNum'],seq.info['ID']]:
                    if id in self.dict['Features']: ft += self.dict['Features'][id]
                ## ~ [2d] IUPRED disorder prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'disorder' in self.list['UniFake']:
                    try:
                        seq.disorder()
                        dis = seq.obj['Disorder']
                        for disorder in seq.obj['Disorder'].list['RegionDisorder']:
                            ft.append({'Type':'DISORDER','Desc':'Predicted disorder: %s' % seq.obj['Disorder'].info['Disorder'],'Start':disorder[0],'End':disorder[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s > %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                        for fold in seq.obj['Disorder'].list['RegionFold']:
                            ft.append({'Type':'ORDER','Desc':'Predicted order: %s' % seq.obj['Disorder'].info['Disorder'],'Start':fold[0],'End':fold[1]})
                            if dis.info['Disorder'].lower() == 'iupred': ft[-1]['Desc'] = '%s <= %.2f' % (ft[-1]['Desc'],dis.stat['IUCut'])
                    except: self.log.errorLog('UniFake disorder problem for %s.' % name)
                ## ~ [2e] PFam HMM domain prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if hmm:
                    try:
                        hmm.setInfo({'SearchDB':'%s.fas' % utmp,'HMMOut':'%s.hmm.out' % utmp})  # This will be made for each sequence
                        hmm.search = []
                        hmm.list['HMMRes'] = [hmm.hmmSearch(self.info['PFam'],outfile=hmm.info['HMMOut'])]  # Used in hmmTable
                        hmm.hmmTable(outfile=hmmfile,append=True)
                        if 'disorder' in self.list['UniFake']: disorder = seq.obj['Disorder'].list['ResidueDisorder']   # individual (IUPRed) residue results
                        else: disorder = []
                        if hmm.search: udata['CC'].append('PFam: HMMer PFam search vs %s (Modified %s)' % (self.info['PFam'],time.ctime(os.path.getmtime(self.info['PFam']))))
                        else:
                            udata['CC'].append('-!- ERROR: PFam HMMer Search failure!')
                            out = {'Type':'!ERROR!','Name':name}
                            rje.delimitedFileOutput(self,hmmfile,['Type','Name','Start','End','Eval','Score'],datadict=out)
                        for search in hmm.search:
                            for hit in search.hit:
                                for aln in hit.aln:
                                    pfamft = {'Start':aln.stat['SbjStart'],'End':aln.stat['SbjEnd'],'Type':'PFAM',
                                              'Desc':'%s PFam HMM Eval: %.2e; Score: %.1f' % (search.info['Name'],aln.stat['Expect'],aln.stat['BitScore'])}
                                    if disorder:
                                        region = disorder[aln.stat['SbjStart']-1:aln.stat['SbjEnd']]
                                        hmmdisorder = float(sum(region)) / len(region)
                                        pfamft['Desc'] = '%s; IUPRed: %.2f' % (pfamft['Desc'],hmmdisorder)
                                        if hmmdisorder < self.stat['DisDom']: pfamft['Type'] = 'DOMAIN'
                                    ft.append(pfamft)
                    except: self.log.errorLog('UniFake PFam HMM problem for %s.' % name)
                ## ~ [2f] TMHMM transmembrane topology prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'tmhmm' in unifake:
                    try:
                        tmdat = os.popen('%s %s.fas -short' % (self.info['TMHMM'],utmp)).readlines()
                        domlist = rje_tm.domainList(rje_tm.parseTMHMM(tmdat[0]))
                        for tmdom in domlist:
                            ft.append(tmdom)
                            ft[-1]['Desc'] = 'TMHMM topology prediction'
                            ft[-1]['Start'] = string.atoi(ft[-1]['Start'])
                            ft[-1]['End'] = string.atoi(ft[-1]['End'])
                        if len(domlist) > 1: udata['CC'].append('TMHMM: %d TM domains; N-Term %s' % ((len(domlist)-1)/2,domlist[0]['Type']))
                        else: udata['CC'].append('TMHMM: 0 TM domains')
                    except: self.log.errorLog('UniFake TMHMM problem for %s.' % name)
                ## ~ [2g] SIGNALP signal peptide prediction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                if 'signalp' in unifake:
                    try:
                        os.system('%s -f short -t euk %s.fas > %s.signalp' % (self.info['SignalP'],utmp,utmp))
                        tm.signalp = {}
                        tm.parseSignalP('%s.signalp' % utmp)
                        sigp = tm.signalp.pop(seq.shortName())
                        cpos = 0
                        if sigp['nn_ymax?'] == 'Y':
                            cpos = string.atoi(sigp['nn_ymaxpos'])
                            desc = 'SignalP NN prediction'
                        if sigp['hmm_cmax?'] == 'Y':
                            hmm_c = string.atoi(sigp['hmm_cmaxpos'])
                            if cpos == 0:
                                cpos = hmm_c
                                desc = 'SignalP HMM prediction'
                            else:
                                if hmm_c < cpos:
                                    cpos = hmm_c
                                    desc = 'SignalP HMM prediction (NN also Y)'
                                else: desc += ' (HMM also Y)'
                        if cpos > 0: ft.append({'Type':'SIGNALP','Desc':desc,'Start':1,'End':cpos})
                    except: self.log.errorLog('UniFake SignalP problem for %s.' % name)
                ## ~ [2h] Convert to UniProt and save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
                self.addRealUniProt(seq,udata,ft)
                self.deBug(ft)
                if not store: uniprot.list['Entry'] = []
                if uniprot.addFromSeq(seq,data=udata,ft=ft):    ### Converts into UniProtEntry object
                    if not store: uniprot.saveUniProt(datfile,append=True)
                    #x#open(self.info['DatPickup'],'a').write('%s\n' % seq.shortName())
            ## ~ [2i] Cleanup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            except: self.log.errorLog('Problem during UniFake(%s)' % name)
            for tmp in glob.glob('%s*' % utmp): os.unlink(tmp)
            self.printLog('#UNIFAKE','|---------- %s run <<<|>>> %s to go -----------|' % (rje.integerString(sx),rje.integerString(seqnum-sx)),log=False)
        if store: uniprot.saveUniProt(datfile,append=False)
        if self.opt['CleanUp']:
            for tmp in glob.glob('TMHMM*'):
                if os.path.isdir(tmp): os.rmdir(tmp)
    except: self.errorLog('Oh, the shame of it! Trouble during UniFake.uniFake()')
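# Illustrative shape of the feature dictionaries accumulated in ft by uniFake() above before conversion to a
# UniProt entry via uniprot.addFromSeq(). Positions, scores and descriptions are invented examples.
ft_sketch = [
    {'Type': 'DISORDER', 'Desc': 'Predicted disorder: iupred > 0.20', 'Start': 1, 'End': 45},
    {'Type': 'PFAM', 'Desc': 'PF_EXAMPLE PFam HMM Eval: 1.00e-10; Score: 55.0; IUPRed: 0.85', 'Start': 60, 'End': 180},
    {'Type': 'SIGNALP', 'Desc': 'SignalP NN prediction', 'Start': 1, 'End': 22},
]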
def tabulatePPIRegion(self):    ### Tabulates regions of known PPI from DAT file
    '''Tabulates regions of known PPI from DAT file.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tabfile = 'ppi_region.tdt'
        unifile = '/scratch/RJE_Filestore/SBSBINF/Databases/DBase_090505/UniFake/Human/ens_HUMAN.unifake.dat'
        if os.path.exists(tabfile) and not self.opt['Force']: return self.printLog('#REGTAB','%s found. (Force=F)' % tabfile)
        headers = ['Protein','Start','End','Interactor']
        rje.delimitedFileOutput(self,tabfile,headers,rje_backup=True)
        ### ~ [2] Extract and tabulate data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gcmd = "grep -P '(ID |REGION)' %s | grep -P '(HUMAN|interact)' -i | grep REGION -B 1" % unifile
        self.printLog('#GREP',gcmd)
        prot = None; rx = 0; plist = []; ilist = []
        for gline in os.popen(gcmd).readlines():
            if rje.matchExp('ID (\S+)',gline): prot = rje.matchExp('ID (\S+)',gline)[0]
            if rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline):
                (rstart,rend,rint) = rje.matchExp('FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)',gline)
                for ppi in string.split(rint):
                    if rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi):
                        datadict = {'Protein':prot,'Start':rstart,'End':rend,'Interactor':rje.matchExp('^([A-Z0-9][A-Z0-9]+)',ppi)[0]}
                        rje.delimitedFileOutput(self,tabfile,headers,datadict=datadict); rx += 1
                        if prot not in plist: plist.append(prot)
                        if datadict['Interactor'] not in ilist: ilist.append(datadict['Interactor'])
            self.progLog('\r#REGTAB','Tabulating regions: %s proteins; %s interactors; %s regions' % (rje.integerString(len(plist)),rje.integerString(len(ilist)),rje.integerString(rx)))
        self.printLog('\r#REGTAB','Tabulated regions (%s proteins; %s interactors; %s regions) => %s' % (rje.integerString(len(plist)),rje.integerString(len(ilist)),rje.integerString(rx),tabfile))
        return True
    except: self.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
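# Hedged standalone sketch of the region-extraction regex used by tabulatePPIRegion() above. The DAT line below is
# an invented example; real ens_HUMAN.unifake.dat lines may use different spacing, in which case the single space
# in 'FT REGION' would need adjusting. re.search approximates rje.matchExp.
import re

gline = 'FT REGION       10     50       Interacts with TP53 and MDM2.'
m = re.search(r'FT REGION\s+(\d+)\s+(\d+).+nteract\S+ with (\S.+)', gline)
if m:
    (rstart, rend, rint) = m.groups()                       # ('10', '50', 'TP53 and MDM2.')
    partners = re.findall(r'\b[A-Z0-9][A-Z0-9]+\b', rint)   # ['TP53', 'MDM2'] - gene-like tokens only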
def run(self):  ### Main run method
    '''Main run method.'''
    try:### ~ [1] Reformat Sequences ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for fasta in glob.glob('*.fasta'):
            fas = fasta[:-2]    # *.fasta -> *.fas
            if os.path.exists(fas): continue
            sx = 0
            for line in open(fasta,'r').readlines():
                if line[:1] == '>':
                    try: (name,desc) = rje.matchExp('^>(\S+) (\S.+)$',line)
                    except: name = rje.matchExp('^>(\S+)',line)[0]
                    if len(string.split(name,'|')) == 3:
                        name = '6rf_NEIME__%s' % string.split(name,'|')[2]
                        open(fas,'a').write('>%s\n' % name)
                    elif len(string.split(name,'|')) == 5:
                        name = 'ref_NEIME__%s' % string.split(name,'|')[3]
                        open(fas,'a').write('>%s %s\n' % (name,desc))
                    else: print string.split(name,'|'); raise ValueError
                    self.progLog('\r#FAS','Processing %s: %s seqs' % (fas,rje.integerString(sx))); sx += 1
                else: open(fas,'a').write(line)
            self.printLog('\r#FAS','Processed %s: %s seqs from %s' % (fas,rje.integerString(sx),fasta))
            rje_blast.BLASTRun(self.log,self.cmd_list).formatDB(fas,protein=True,force=True)
        ### ~ [2] Read in CSV Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        rfhits = {}     # Dictionary of {hit:['File:hit_num']}
        acc = 'MC58_6RF_Hits.acc'; open(acc,'w')
        gfile = 'MC58_6RF_Hits.vs.MC58_1.hitsum.tdt'
        cx = 0
        for csv in glob.glob('MC58_6RF_CSV/*.CSV'):
            cx += 1
            file = os.path.basename(csv)[:-4]
            hits = False
            for line in open(csv,'r').readlines():
                if line.find('prot_hit_num,prot_acc') == 0: hits = True
                elif hits:
                    data = rje.readDelimit(line,',')
                    if len(data) < 2: continue
                    [num,name] = data[:2]
                    try: name = string.split(name,'|')[2]
                    except: continue
                    if name not in rfhits:
                        open(acc,'a').write('6rf_NEIME__%s\n' % name)
                        rfhits[name] = []
                    id = '%s:%s' % (file,num)
                    if id not in rfhits[name]: rfhits[name].append(id)
            self.progLog('\r#CSV','Reading %d CSV files: %s 6RF Hits' % (cx,rje.integerString(len(rfhits))))
        self.printLog('\r#CSV','Read %d CSV files: %s 6RF Hits output to %s' % (cx,rje.integerString(len(rfhits)),acc))
        ### ~ [3] Extract sequences and perform GABLAM ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not os.path.exists(gfile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=%s' % acc,'fasdb=MC58_6RF.fas','seqout=MC58_6RF_Hits.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Hits.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Hits.fas','searchdb=MC58_1.fas','qryacc=F']).gablam()
        ### ~ [4] Read in GABLAM and ID Hits without genomic homology ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        gdata = rje.dataDict(self,gfile,['Qry'],['HitNum'])
        zeros = []
        for hit in gdata:
            if string.atoi(gdata[hit]['HitNum']) == 0: zeros.append(hit)
        zeros = rje.sortUnique(zeros,False)
        open('6rf_zeros.acc','w').write(string.join(zeros,'\n'))
        self.printLog('#ZERO','%d 6RF hits with 0 BLAST hits to MC58_1' % len(zeros))
        ufile = 'MC58_6RF_Zeros.vs.embl_bacteria.hitsum.tdt'
        if not os.path.exists(ufile):
            seqlist = rje_seq.SeqList(self.log,self.cmd_list+['seqin=6rf_zeros.acc','fasdb=MC58_6RF.fas','seqout=MC58_6RF_Zeros.fas','autoload=T','accnr=F','seqnr=F'])
            seqlist.info['Name'] = 'MC58_6RF_Zeros.fas'
            seqlist.saveFasta()
            gablam.GABLAM(self.log,self.cmd_list+['seqin=MC58_6RF_Zeros.fas','searchdb=/scratch/Databases/NewDB/TaxaDB/embl_bacteria.fas','qryacc=F']).gablam()
        gdata = rje.dataDict(self,ufile,['Qry'],getheaders=True)
        fdata = rje.dataDict(self,string.replace(ufile,'hitsum','gablam'),['Qry'],['Hit'],lists=True)
        headers = gdata.pop('Headers')
        headers.insert(1,'Sample')
        headers.append('BestHit')
        rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,rje_backup=True)
        for rf in rje.sortKeys(gdata):
            rfcut = string.split(rf,'__')[1]
            gdata[rf]['Sample'] = string.join(rfhits[rfcut],'; ')
            gdata[rf]['Qry'] = rfcut
            try: gdata[rf]['BestHit'] = fdata[rf]['Hit'][0]
            except: gdata[rf]['BestHit'] = '-'
            rje.delimitedFileOutput(self,'MC58_6RF_Zeros.tdt',headers,datadict=gdata[rf])
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        self.printLog('#ZEN',rje_zen.Zen().wisdom())
def addToGeneCards(self,cards,addcards=True):   ### Reconfigures and adds parsed HPRD data to GeneCards
    '''
    Reconfigures and adds parsed HPRD data to GeneCards.
    >> cards:rje_genecards.GeneCards object
    >> addcards:boolean [True] = whether to add genes from HPRD to the GeneCards dictionary
    '''
    ### Add relevant headers for future output ###
    for h in ['HPRD','OMIM','EntrezCheck','Desc']:
        if h not in cards.list['Headers']: cards.list['Headers'].append(h)
        for gene in cards.list['Genes']:
            if h not in cards.dict['GeneCard'][gene]: cards.dict['GeneCard'][gene][h] = ''
    ### Add to GeneCards ###
    (hx,htot) = (0.0,len(self.dict['HPRD']))
    for hprd in self.dict['HPRD']:
        self.log.printLog('\r#HPRD','Adding HPRD to GeneCards: %.1f%%' % (hx/htot),newline=False,log=False)
        hx += 100.0
        self.deBug(self.dict['HPRD'][hprd])
        gene = self.dict['HPRD'][hprd]['gene']
        omim = self.dict['HPRD'][hprd]['omim']
        entrez = self.dict['HPRD'][hprd]['entrez']
        if gene in cards.list['Genes']:
            if cards.dict['GeneCard'][gene]['HPRD'] == '': cards.dict['GeneCard'][gene]['HPRD'] = hprd
            elif hprd not in string.split(cards.dict['GeneCard'][gene]['HPRD'],','):
                cards.dict['GeneCard'][gene]['HPRD'] = string.join(string.split(cards.dict['GeneCard'][gene]['HPRD'],',') + [hprd],',')
            if cards.dict['GeneCard'][gene]['OMIM'] == '': cards.dict['GeneCard'][gene]['OMIM'] = omim
            elif omim not in string.split(cards.dict['GeneCard'][gene]['OMIM'],','):
                cards.dict['GeneCard'][gene]['OMIM'] = string.join(string.split(cards.dict['GeneCard'][gene]['OMIM'],',') + [omim],',')
            if cards.dict['GeneCard'][gene]['EntrezCheck'] == '': cards.dict['GeneCard'][gene]['EntrezCheck'] = entrez
            elif entrez not in string.split(cards.dict['GeneCard'][gene]['EntrezCheck'],','):
                cards.dict['GeneCard'][gene]['EntrezCheck'] = string.join(string.split(cards.dict['GeneCard'][gene]['EntrezCheck'],',') + [entrez],',')
        elif addcards:
            if gene == '-': gene = 'HPRD' + hprd
            cards.list['Genes'].append(gene)
            cards.dict['GeneCard'][gene] = {'Symbol':'!FAILED!','HPRD':hprd,'OMIM':omim,'EntrezCheck':entrez,
                                            'Desc':self.dict['HPRD'][hprd]['desc']}
    self.log.printLog('\r#HPRD','Added %s HPRD genes to GeneCards.' % (rje.integerString(htot)))
def run(self):  ### Main run method
    '''Main run method.'''
    try:### ~ [1] Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.info['Basefile'].lower() in ['','none']: self.info['Basefile'] = ''
        elif self.info['Basefile'][-1] != '.': self.info['Basefile'] += '.'
        self.obj['SeqList'] = rje_seq.SeqList(self.log,self.cmd_list+['autoload=T'])
        self.list['PlotFT'] = string.split(string.join(self.list['PlotFT']).upper())
        if self.info['OccFile'].lower() not in ['','none']:
            self.info['Delimit'] = rje.delimitFromExt(filename=self.info['OccFile'])
            self.dict['OccData'] = {}
            occdata = rje.dataDict(self,self.info['OccFile'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'],['Seq','Dataset','Pattern','Start_Pos','End_Pos'])
            for key in rje.sortKeys(occdata):
                seq = occdata[key].pop('Seq')
                if seq not in self.dict['OccData']: self.dict['OccData'][seq] = {}
                dataset = occdata[key].pop('Dataset')
                if dataset not in self.dict['OccData'][seq]: self.dict['OccData'][seq][dataset] = []
                self.dict['OccData'][seq][dataset].append(occdata[key])
            self.printLog('#OCC','Loaded data for %s occurrences in %s sequences' % (rje.integerString(len(occdata)),rje.integerString(len(self.dict['OccData']))))
            self.obj['SeqList'].autoFilter(['GoodSeq=%s' % string.join(rje.sortKeys(self.dict['OccData']),',')])
        ### ~ [2] Calculate Stats ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.list['PlotStat'] = string.split(string.join(self.list['PlotStat']).lower())
        if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']: slimcalc = rje_slimcalc.SLiMCalc(self.log,self.cmd_list)
        seqdict = self.obj['SeqList'].seqNameDic()
        for name in rje.sortKeys(seqdict):
            if self.opt['OccOnly'] and not name in self.dict['OccData']: continue
            seq = seqdict[name]
            sequence = seq.getSequence(gaps=False)
            seq.dict['PlotStat'] = {}
            if 'sa' in self.list['PlotStat']: seq.dict['PlotStat']['SA'] = rje_seq.surfaceAccessibility(sequence,returnlist=True)
            if 'hyd' in self.list['PlotStat']: seq.dict['PlotStat']['Hydropathy'] = rje_seq.eisenbergHydropathy(sequence,returnlist=True)
            if 'dis' in self.list['PlotStat']: seq.dict['PlotStat']['Disorder'] = seq.disorder(returnlist=True)
            if 'cons' in self.list['PlotStat'] or 'rel' in self.list['PlotStat']:
                slimcalc.relConListFromSeq(seq,slimcalc.stat['RelConWin'],store=True)
                try:
                    seq.dict['PlotStat']['Cons_Abs'] = seq.list.pop('Cons')
                    seq.dict['PlotStat']['Cons_Rel'] = seq.list.pop('RelCons')
                except: self.printLog('#CONS','No conservation stats for %s' % name)
            self.printLog('#STAT','PlotStats calculated for %s' % name)
            for stat in seq.dict['PlotStat']:
                if stat != 'Cons_Rel' and self.stat['PlotWin'] >= 0: seq.dict['PlotStat'][stat] = self.plotWin(seq.dict['PlotStat'][stat])
                seq.dict['PlotStat'][stat] = self.convertStat(seq.dict['PlotStat'][stat])
            self.printLog('#STAT','PlotStats converted for %s' % name)
            ### ~ [3] Output Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
            if name in self.dict['OccData']:
                for dataset in self.dict['OccData'][name]:
                    ofile = '%s%s.%s.plot.txt' % (self.info['Basefile'],dataset,seq.info['AccNum'])
                    self.output(seq,ofile,self.dict['OccData'][name][dataset])
            else: self.output(seq,'%s%s.plot.txt' % (self.info['Basefile'],seq.info['AccNum']))
        return
    except: self.errorLog(rje_zen.Zen().wisdom())
def parseOMIM(self):    ### Main parsing method
    '''Main parsing method.'''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        self.dict['Records'] = {}
        self.dict['Mutations'] = {}
        aas = string.split(string.join(rje_sequence.aa_code_3.values()).upper())
        oline = os.path.exists(self.info['Name'])   # True if the OMIM file exists; keeps the read loop below going
        (olen,ox,mx) = (len(open(self.info['Name'],'r').readlines()),0.0,0)
        OMIM = open(self.info['Name'],'r')
        ### ~ [2] Extract data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        record = gene = subid = disease = mutation = ''
        av = False      # Whether reading *FIELD* AV for mutation data
        while oline:
            oline = OMIM.readline()
            self.log.printLog('\r#OMIM','Processing OMIM: %.2f%% (%s genes)' % (ox/olen,rje.integerString(len(self.dict['Records']))),newline=False,log=False)
            ox += 100.0
            if not av and oline[:1] != '*': continue
            line = rje.chomp(oline)
            while line[-1:] == ' ': line = line[:-1]
            ## ~ [2a] New record ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            if line == '*RECORD*': (record,av) = ('',False)
            elif line == '*FIELD* NO':      # New record
                record = rje.chomp(OMIM.readline())
                gene = ''
                ox += 100.0
            ## ~ [2b] Gene ID ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* TI':      # New gene
                gene = string.split(rje.chomp(OMIM.readline()))[-1]
                subid = ''
                av = False
                ox += 100.0
            ## ~ [2c] Mutations ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
            elif line == '*FIELD* AV': av = True    # Start of mutation records
            elif av and rje.matchExp('^(\.\d+)',line):      # New subid mutation record
                subid = rje.matchExp('^(\.\d+)',line)[0]
                disease = rje.chomp(OMIM.readline())
                ox += 100.0
                try: mutation = rje.matchExp('^%s, (\D\D\D\d+\D\D\D)' % gene,rje.chomp(OMIM.readline()))[0]
                except: continue    # No mutation or not coding change
                ox += 100.0
                subaa = rje.matchExp('(\D\D\D)\d+(\D\D\D)',mutation)
                if subaa[0] not in aas or subaa[1] not in aas: continue
                if gene not in self.dict['Records']: self.dict['Records'][gene] = [record]
                if record not in self.dict['Records'][gene]: self.dict['Records'][gene] += [record]
                if gene not in self.dict['Mutations']: self.dict['Mutations'][gene] = {}
                mx += 1
                self.dict['Mutations'][gene][subid] = (disease,mutation)
        ### ~ [3] Finish & Save ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        OMIM.close()
        self.log.printLog('\r#OMIM','Processing OMIM complete! (%s genes; %s mutations)' % (rje.integerString(len(self.dict['Records'])),rje.integerString(mx)))
        self.saveMutations()
    except: self.log.errorLog(rje_zen.Zen().wisdom()); raise    # Delete this if method error not terrible
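# Hedged standalone sketch of the OMIM *FIELD* AV mutation parsing used by parseOMIM() above. The gene and
# mutation line are invented examples; re.search approximates rje.matchExp.
import re

gene = 'GENE1'
mutline = 'GENE1, CYS61GLY'
m = re.search(r'^%s, (\D\D\D\d+\D\D\D)' % gene, mutline)
if m:
    mutation = m.group(1)                                           # 'CYS61GLY'
    subaa = re.search(r'(\D\D\D)\d+(\D\D\D)', mutation).groups()    # ('CYS', 'GLY') - checked against aa_code_3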