def loadXRef(self):    ### Load Identifier XRef Data
    '''
    Load identifier cross-reference (XRef) data as the 'XRef' database table.
    - If '<Basefile>.xref.tdt' exists and self.opt['Force'] is not set, that file is loaded and returned.
    - Otherwise self.info['XRef'] is loaded, recognised long field names are renamed to short ones and the
      table is saved to '<Basefile>.xref.tdt'.
    << returns the XRef table, or False if the self.info['XRef'] file is not found.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Reuse previously processed table where possible ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tdtfile = '%s.xref.tdt' % self.info['Basefile']
        if rje.exists(tdtfile) and not self.opt['Force']:
            return self.db().addTable(tdtfile,mainkeys=['#'],datakeys='All',name='XRef')
        if not rje.checkForFile(self.info['XRef']):
            return False
        ### ~ [2] Load raw data and shorten headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.db().addTable(self.info['XRef'],mainkeys='All',datakeys='All',name='XRef')
        changehead = {'Ensembl Gene ID':'EnsG',
                      'Ensembl Protein ID':'EnsP',
                      'Associated Gene Name':'Gene',
                      'Associated Gene DB':'GeneDB',
                      'UniProt/SwissProt ID':'UniprotID',
                      'UniProt/SwissProt Accession':'UniProt',
                      'SGD Gene':'SGD'}
        for (longname,shortname) in changehead.items():
            if longname in xref.fields():
                xref.renameField(longname,shortname)
        xref.saveToFile(tdtfile)
        return xref
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def setup(self):    ### Main class setup method.
    '''
    Main class setup method.
    - Sets a default 'SaveDis' output name (and the basefile) if no SaveDis name was given.
    - Stores the peptide list as newline-joined text in self.dict['Output']['peptides'].
    - Creates the 'AADis' and 'PeptDis' DisMatrix objects.
    - Fills AADis from the 'AADis' file if one is named, else from the AAPropMatrix property differences.
    << returns True if setup completed, or False if an error was caught (and logged).
    '''
    try:### ~ [0] Setup File names etc. ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('SaveDis').lower() in ['','none']:
            base = 'peptides'
            if rje.checkForFile(self.getStr('Peptides')):
                base = rje.baseFile(self.getStr('Peptides'))
            if self.baseFile().lower() not in ['','none']:
                base = self.baseFile()
            self.baseFile(base)
            self.setStr({'SaveDis':'%s.%s.%s' % (base,self.getStr('PeptDis'),self.getStr('PeptCluster'))})
            # Recognised matrix formats give a three-letter extension; anything else is saved as .txt
            outfmt = self.getStr('OutMatrix')
            if outfmt in ['tdt','csv','png','phylip']:
                ext = outfmt[:3]
            else:
                ext = 'txt'
            self.str['SaveDis'] += '.%s' % ext
        self.dict['Output']['peptides'] = '\n'.join(self.list['Peptides'])
        ### ~ [1] Setup Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for (objkey,disname) in [('AADis','Pairwise AA distances'),('PeptDis','Pairwise peptide distances')]:
            self.obj[objkey] = rje_dismatrix.DisMatrix(self.log,['nsf2nwk=T']+self.cmd_list)
            self.obj[objkey].info['Name'] = disname
        ### ~ [2] Optional loading of AA Distance Matrix ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if self.getStr('AADis').lower() not in ['','none']:
            self.obj['AADis'].loadMatrix(self.getStr('AADis'))
        else:
            # AAPropMatrix reads its property data on construction, so pdif is ready to use
            self.obj['AAProp'] = aaprop = rje_aaprop.AAPropMatrix(self.log,self.cmd_list)
            for aapair in aaprop.pdif:
                self.obj['AADis'].addDis(aapair[0],aapair[1],aaprop.pdif[aapair])
        return True
    except:
        self.errorLog('Problem during %s setup.' % self)
        return False  # Setup failed
def processHHPID(self):    ### Process HHPID interactions
    '''
    Process HHPID interactions into the 'HHPIDMap' database table.
    - If '<basefile>.HHPIDMap.tdt' exists, it is loaded (keyed on HIV and Gene) and returned.
    - Otherwise the 'HHPID' and 'GeneMap' tables are joined on 'Entrez', reduced to the fields of interest,
      compressed on HIV and Gene, and saved. The sorted 'AccHIV' index keys are written, one per line, to
      '<Basefile>.hivacc'.
    << returns the HHPIDMap table, or False if an error was caught (and logged).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapfile = '%s.HHPIDMap.tdt' % self.basefile()
        if rje.checkForFile(mapfile):
            return self.db().addTable(mapfile,['HIV','Gene'],'All',name='HHPIDMap')
        hdb = self.db('HHPID')
        gdb = self.db('GeneMap')
        mdb = self.db().joinTables(name='HHPIDMap',join=[(hdb,'Entrez'),(gdb,'Entrez')],newkey=['#'],empties=False,keeptable=True)
        keepfields = ['#','AccHIV','EntrezHIV','HIV','Entrez','Gene','Symbol','UniProt','EnsEMBL','EnsLoci']
        for field in mdb.fields()[0:]:      # Iterate over a copy: fields are dropped inside the loop
            if field not in keepfields:
                mdb.dropField(field)
        mdb.compress(['HIV','Gene'],default='str')
        mdb.dropField('#')
        mdb.saveToFile()
        ### ~ [2] Save viral accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        acclist = rje.sortKeys(mdb.index('AccHIV'))
        # Context manager guarantees the file is flushed and closed, even if the write fails
        with open('%s.hivacc' % self.getStr('Basefile'),'w') as accfile:
            accfile.write('%s\n' % '\n'.join(acclist))
        return mdb
    except:
        self.errorLog('%s.processHHPID error' % self)
        return False
def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):    ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file. If empty or 'none', '<hmm base>.<dbase base>.hmmer' is used and an
        existing results file (optionally gzipped) newer than both inputs is reused unless self.opt['Force'].
    >> wait:boolean = whether to wait for HMMer. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm):
            self.printLog('#ERR','HMM file %s is missing!' % hmm)
            return None
        if not dbase:
            dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase):
            self.printLog('#ERR','Database file "%s" is missing!' % dbase)
            return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not outfile or outfile.lower() in ['','none']:       # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
            resfile = outfile
            if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
                resfile = '%s.gz' % outfile
            if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
                self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
                return outfile      # Already exists
            else:
                rje.backup(self,outfile,unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): the command is built by string interpolation and run through the shell, so file names
        # containing spaces or shell metacharacters will not be handled safely.
        options = ' '.join(self.list['HMMOptions'])
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (options,hmm,dbase,outfile)
        else:
            _command = 'hmmsearch %s %s %s > %s' % (options,hmm,dbase,outfile)
        self.log.printLog('#HMM',_command)
        if not wait:
            os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            # The command redirects its own stdout to outfile; anything still read from the pipe is appended.
            # Both the file and the pipe are closed explicitly rather than left to garbage collection.
            with open(outfile,'a') as out:
                pipe = os.popen(self.info['HMMerPath'] + _command)
                try:
                    out.write(pipe.read())
                finally:
                    pipe.close()
        self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
        if self.opt['GZip']:
            rje.backup(self,'%s.gz' % outfile,unlink=True)
            os.system('gzip %s' % outfile)
            self.printLog('#GZIP','%s gzipped to save space' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def loadPPI(self):    ### Load pairwise interaction data
    '''
    Load pairwise interaction data from self.info['PPIFile'] into self.dict['PPI'].
    - The first two whitespace-separated fields of each line are treated as an interacting pair, which is
      stored in both directions: self.dict['PPI'][A] lists B and self.dict['PPI'][B] lists A (no duplicates).
    - Lines with fewer than two fields are skipped.
    << returns False if the PPI file is not found; otherwise nothing is returned.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.checkForFile(self.info['PPIFile']):
            return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Context manager closes the file handle; lines are streamed rather than read into memory at once
        with open(self.info['PPIFile'],'r') as ppifile:
            for line in ppifile:
                try:
                    (pa,pb) = rje.chomp(line).split()[:2]
                except ValueError:      # Fewer than two fields on this line
                    continue
                for (hub,spoke) in [(pa,pb),(pb,pa)]:
                    if hub not in self.dict['PPI']:
                        self.dict['PPI'][hub] = []
                    if spoke not in self.dict['PPI'][hub]:
                        self.dict['PPI'][hub].append(spoke)
                self.progLog('\r#PPI','Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI'])))
        self.printLog('\r#PPI','Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI'])))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def loadXRef(self):    ### Load Identifier XRef Data
    '''
    Load identifier cross-reference (XRef) data as the 'XRef' database table.
    - If '<Basefile>.xref.tdt' exists and self.opt['Force'] is not set, that file is loaded and returned.
    - Otherwise self.info['XRef'] is loaded, recognised long field names are renamed to short ones and the
      table is saved to '<Basefile>.xref.tdt'.
    << returns the XRef table, or False if the self.info['XRef'] file is not found.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Reuse previously processed table where possible ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        tdtfile = '%s.xref.tdt' % self.info['Basefile']
        if rje.exists(tdtfile) and not self.opt['Force']:
            return self.db().addTable(tdtfile,mainkeys=['#'],datakeys='All',name='XRef')
        if not rje.checkForFile(self.info['XRef']):
            return False
        ### ~ [2] Load raw data and shorten headers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        xref = self.db().addTable(self.info['XRef'],mainkeys='All',datakeys='All',name='XRef')
        changehead = {'Ensembl Gene ID':'EnsG',
                      'Ensembl Protein ID':'EnsP',
                      'Associated Gene Name':'Gene',
                      'Associated Gene DB':'GeneDB',
                      'UniProt/SwissProt ID':'UniprotID',
                      'UniProt/SwissProt Accession':'UniProt',
                      'SGD Gene':'SGD'}
        for (longname,shortname) in changehead.items():
            if longname in xref.fields():
                xref.renameField(longname,shortname)
        xref.saveToFile(tdtfile)
        return xref
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def loadPillars(self):    ### Load YGOB Pillar data
    '''
    Load YGOB Pillar data from self.info['Pillars'] into self.list['Pillars'].
    - Each line is split on whitespace; lines with fewer than 17 fields are skipped.
    - The sixth field (ancestral gene) and all '---' placeholders are removed.
    - Each remaining non-empty list of identifiers is appended to self.list['Pillars'].
    << returns False if the Pillars file is not found; otherwise nothing is returned.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.checkForFile(self.info['Pillars']):
            return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for line in self.loadFromFile(filename=self.info['Pillars'],chomplines=True):
            pillars = line.split()
            if len(pillars) < 17:
                continue
            # Drop the ancestral gene (sixth field) and filter the '---' placeholders in a single pass
            pillars = [gene for gene in pillars[:5] + pillars[6:] if gene != '---']
            if pillars:
                self.list['Pillars'].append(pillars)
            self.progLog('\r#YGOB','Loading Pillar data: %s loci' % rje.integerString(len(self.list['Pillars'])))
        self.printLog('\r#YGOB','Loaded Pillar data for %s loci' % rje.integerString(len(self.list['Pillars'])))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def processHHPID(self):    ### Process HHPID interactions
    '''
    Process HHPID interactions into the 'HHPIDMap' database table.
    - If '<basefile>.HHPIDMap.tdt' exists, it is loaded (keyed on HIV and Gene) and returned.
    - Otherwise the 'HHPID' and 'GeneMap' tables are joined on 'Entrez', reduced to the fields of interest,
      compressed on HIV and Gene, and saved. The sorted 'AccHIV' index keys are written, one per line, to
      '<Basefile>.hivacc'.
    << returns the HHPIDMap table, or False if an error was caught (and logged).
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        mapfile = '%s.HHPIDMap.tdt' % self.basefile()
        if rje.checkForFile(mapfile):
            return self.db().addTable(mapfile,['HIV','Gene'],'All',name='HHPIDMap')
        hdb = self.db('HHPID')
        gdb = self.db('GeneMap')
        mdb = self.db().joinTables(name='HHPIDMap',join=[(hdb,'Entrez'),(gdb,'Entrez')],newkey=['#'],empties=False,keeptable=True)
        keepfields = ['#','AccHIV','EntrezHIV','HIV','Entrez','Gene','Symbol','UniProt','EnsEMBL','EnsLoci']
        for field in mdb.fields()[0:]:      # Iterate over a copy: fields are dropped inside the loop
            if field not in keepfields:
                mdb.dropField(field)
        mdb.compress(['HIV','Gene'],default='str')
        mdb.dropField('#')
        mdb.saveToFile()
        ### ~ [2] Save viral accession numbers ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        acclist = rje.sortKeys(mdb.index('AccHIV'))
        # Context manager guarantees the file is flushed and closed, even if the write fails
        with open('%s.hivacc' % self.getStr('Basefile'),'w') as accfile:
            accfile.write('%s\n' % '\n'.join(acclist))
        return mdb
    except:
        self.errorLog('%s.processHHPID error' % self)
        return False
def loadPillars(self):    ### Load YGOB Pillar data
    '''
    Load YGOB Pillar data from self.info['Pillars'] into self.list['Pillars'].
    - Each line is split on whitespace; lines with fewer than 17 fields are skipped.
    - The sixth field (ancestral gene) and all '---' placeholders are removed.
    - Each remaining non-empty list of identifiers is appended to self.list['Pillars'].
    << returns False if the Pillars file is not found; otherwise nothing is returned.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.checkForFile(self.info['Pillars']):
            return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        for line in self.loadFromFile(filename=self.info['Pillars'],chomplines=True):
            pillars = line.split()
            if len(pillars) < 17:
                continue
            # Drop the ancestral gene (sixth field) and filter the '---' placeholders in a single pass
            pillars = [gene for gene in pillars[:5] + pillars[6:] if gene != '---']
            if pillars:
                self.list['Pillars'].append(pillars)
            self.progLog('\r#YGOB','Loading Pillar data: %s loci' % rje.integerString(len(self.list['Pillars'])))
        self.printLog('\r#YGOB','Loaded Pillar data for %s loci' % rje.integerString(len(self.list['Pillars'])))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def loadPPI(self):    ### Load pairwise interaction data
    '''
    Load pairwise interaction data from self.info['PPIFile'] into self.dict['PPI'].
    - The first two whitespace-separated fields of each line are treated as an interacting pair, which is
      stored in both directions: self.dict['PPI'][A] lists B and self.dict['PPI'][B] lists A (no duplicates).
    - Lines with fewer than two fields are skipped.
    << returns False if the PPI file is not found; otherwise nothing is returned.
    Errors are logged and then re-raised.
    '''
    try:### ~ [1] Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        if not rje.checkForFile(self.info['PPIFile']):
            return False
        ### ~ [2] Load data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # Context manager closes the file handle; lines are streamed rather than read into memory at once
        with open(self.info['PPIFile'],'r') as ppifile:
            for line in ppifile:
                try:
                    (pa,pb) = rje.chomp(line).split()[:2]
                except ValueError:      # Fewer than two fields on this line
                    continue
                for (hub,spoke) in [(pa,pb),(pb,pa)]:
                    if hub not in self.dict['PPI']:
                        self.dict['PPI'][hub] = []
                    if spoke not in self.dict['PPI'][hub]:
                        self.dict['PPI'][hub].append(spoke)
                self.progLog('\r#PPI','Loading PPI data: %s proteins' % rje.integerString(len(self.dict['PPI'])))
        self.printLog('\r#PPI','Loaded PPI data for %s proteins' % rje.integerString(len(self.dict['PPI'])))
    except:
        self.errorLog(rje_zen.Zen().wisdom())
        raise   # Delete this if method error not terrible
def hmmSearch(self,hmm,dbase=None,outfile=None,wait=True):    ### Performs HMMer Search using object attributes
    '''
    Performs HMMer Search using object attributes.
    >> hmm:str = Name of HMM file
    >> dbase:str = Name of DBase file [self.info['SearchDB']]
    >> outfile:str = Name of Output file. If empty or 'none', '<hmm base>.<dbase base>.hmmer' is used and an
        existing results file (optionally gzipped) newer than both inputs is reused unless self.opt['Force'].
    >> wait:boolean = whether to wait for HMMer. [True]
    << returns outfile or None if fails
    '''
    try:### ~ [1] ~ Setup ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        ## ~ [1a] ~ Input files ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not rje.checkForFile(hmm):
            self.printLog('#ERR','HMM file %s is missing!' % hmm)
            return None
        if not dbase:
            dbase = self.info['SearchDB']
        if not rje.checkForFile(dbase):
            self.printLog('#ERR','Database file "%s" is missing!' % dbase)
            return None
        ## ~ [1b] ~ Output file ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ##
        if not outfile or outfile.lower() in ['','none']:       # Make an outfile per search
            outfile = '%s.%s.hmmer' % (rje.baseFile(hmm,True),rje.baseFile(dbase,True))
            resfile = outfile
            if not os.path.exists(outfile) and self.opt['GZip'] and os.path.exists('%s.gz' % outfile) and not self.opt['Force']:
                resfile = '%s.gz' % outfile
            if not self.opt['Force'] and rje.isYounger(resfile,hmm) == resfile and rje.isYounger(resfile,dbase) == resfile:
                self.printLog('#HMM','HMM results file "%s" exists.' % resfile)
                return outfile      # Already exists
            else:
                rje.backup(self,outfile,unlink=True)
        ### ~ [2] ~ HMM Search ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ###
        # NOTE(review): the command is built by string interpolation and run through the shell, so file names
        # containing spaces or shell metacharacters will not be handled safely.
        options = ' '.join(self.list['HMMOptions'])
        if self.opt['HMMPFam']:
            _command = 'hmmpfam --cut_ga %s %s %s > %s' % (options,hmm,dbase,outfile)
        else:
            _command = 'hmmsearch %s %s %s > %s' % (options,hmm,dbase,outfile)
        self.log.printLog('#HMM',_command)
        if not wait:
            os.system(self.info['HMMerPath'] + _command + ' &')
        elif not os.path.exists(outfile) or self.opt['Force']:
            # The command redirects its own stdout to outfile; anything still read from the pipe is appended.
            # Both the file and the pipe are closed explicitly rather than left to garbage collection.
            with open(outfile,'a') as out:
                pipe = os.popen(self.info['HMMerPath'] + _command)
                try:
                    out.write(pipe.read())
                finally:
                    pipe.close()
        self.printLog('#HMM','Outfile produced for %s: %s.' % (hmm,outfile))
        if self.opt['GZip']:
            rje.backup(self,'%s.gz' % outfile,unlink=True)
            os.system('gzip %s' % outfile)
            self.printLog('#GZIP','%s gzipped to save space' % outfile)
        return outfile
    except:
        self.log.errorLog('Fatal Error during hmmSearch(%s)' % hmm)
        return None
def pairwiseAQ(self,seqlist=None,query=None,focus=[0,0]):     ### Performs PAQ on seqlist, adding seq.info['PAQ']
    '''
    Performs PAQ on seqlist, adding seq.info['PAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: PAQ = PAQ sequences with alignment Xs
    >> query:Sequence Object to use as query [seqlist.obj['QuerySeq']]. A random sequence is used instead if
        self.opt['NoQuery'] is set or no query is available.
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. Passed on to seqlist.mapX().
    << returns nothing. Errors are caught and logged along with the processing stage reached.
    '''
    ### <PAQ0> ### Setup
    try:
        _stage = '<0> Setup'
        haqlist = seqlist   # SeqList Object to store individually Xd sequences
        if not query:
            query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery'] or not query:
            query = haqlist.seq[random.randint(0,haqlist.seqNum()-1)]
            self.log.printLog('#QRY','Temp (random) query %s assigned for PAQ' % query.shortName())
        #!# paqx = [False] * seqlist.seq[0].seqLen()  # List of whether a column of the alignment is bad (has an X) [True] or not [False]
        #!# - make this a method?!

        pwaq = {}    # Dictionary of lists of pairwise alignements
        block_align = {}    # Dictionary of whether residue in block of sequence that is well-aligned or not
        for seq in haqlist.seq:
            block_align[seq] = [False] * seq.seqLen()
            seq.info['PAQ'] = seq.info['Sequence'][0:]     # Unmasked sequence is kept here and swapped back in <PAQ5>
            if 'SAQX' in seq.info and len(seq.info['SAQX']) == seq.seqLen():  #!# Should no longer be issues due to length changes following realignment
                seq.info['Sequence'] = seq.info['SAQX'][0:]
            elif 'SAQX' in seq.info:
                self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(),printerror=False)
            for otherseq in haqlist.seq:
                pwaq[(seq,otherseq)] = [False] * seq.seqLen()

        ### <PAQ1> ### Directional Pairwise Comparisons of sequences
        _stage = '<1> Pairwise Comparisons'
        infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc']
        for seq in haqlist.seq:
            for otherseq in haqlist.seq:
                myinfo = '%s %.1f%% %.1f%% ' % (infotxt,(100.0 * haqlist.seq.index(seq) / haqlist.seqNum()),(100.0 * haqlist.seq.index(otherseq) / haqlist.seqNum()))
                self.log.printLog('\r#PAQ',myinfo,log=False,newline=False)
                for r in range(seq.seqLen()):
                    ar = seq.info['Sequence'][r]
                    ## <i> ## Look for PW aligned block
                    _stage = '<1-i> Pairwise Comparisons'
                    if ar not in ['-','X']: # Start of test block
                        blen = 0    # Block length (PAQBlock) = AAs
                        win = 0     # Window length = all sequence
                        matchx = 0  # Score for residues in window
                        while blen < self.stat['PAQBlock'] and (r+win) < seq.seqLen():     # This time we allow overshoots in both directions
                            ar = seq.info['Sequence'][r+win]
                            at = otherseq.info['Sequence'][r+win]
                            if 'X' in [ar,at]:     # Hit Bad Region: Abort
                                break
                            else:   # Better region
                                if ar != '-':
                                    blen += 1   # Increase Block
                                    matchx += self._saqCon(ar,at)
                                win += 1
                        ## <ii> ## Update pwaq if block good
                        _stage = '<1-ii> Pairwise Comparisons'
                        if matchx >= self.stat['PAQMatch']:
                            for w in range(win):
                                if seq.info['Sequence'][r+w] in ['-','X']:
                                    pwaq[(seq,otherseq)][r+w] = False
                                else:
                                    pwaq[(seq,otherseq)][r+w] = True
        # Literal percent signs must be doubled in a %-format string or the final progress message is garbled
        self.log.printLog('\r#PAQ','%s 100.0%% 100.0%%. ' % infotxt,log=False)

        ### <PAQ2> ### Link back to Query
        _stage = '<2> Linking to Query'
        ### <PAQ2a> ### Network of Pairwise Quality alignments
        _stage = '<2a> Linking to Query'
        infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'],query.shortName())
        for r in range(query.seqLen()):
            _stage = '<2a> Linking to Query'
            self.log.printLog('\r#PAQ','%s %.1f%%' % (infotxt,(100.0 * r / query.seqLen())),log=False,newline=False)
            qok = {}    # Dictionary of whether residue in seq OK, i.e. linked to query
            for seq in haqlist.seq:
                qok[seq] = False
            qok[query] = True
            sok = [0,1] # List of OK sequence for residue
            while sok[-2] != sok[-1]:   # Repeat until the number of linked sequences stops changing
                ## <i> ## Match pairs, starting with query
                _stage = '<2a-i> Linking to Query'
                for seq in haqlist.seq:
                    if qok[seq]:
                        for otherseq in haqlist.seq:
                            if pwaq[(seq,otherseq)][r] or pwaq[(otherseq,seq)][r]:
                                qok[otherseq] = True
                ## <ii> ## Update sok
                _stage = '<2a-ii> Linking to Query'
                sok.append(0)
                for seq in haqlist.seq:
                    if qok[seq]:
                        sok[-1] += 1
                        block_align[seq][r] = True
            _stage = '<2a-iii> Linking to Query'
            if sok[-1] == 1:    # Only query OK!
                block_align[query][r] = False
        self.log.printLog('\r#PAQ','%s 100.0%%' % infotxt,log=False)

        ### <PAQ2b> ### Allow for divergence (Conserved Anchors)
        _stage = '<2b> Anchors'
        if self.opt['Anchors']:
            infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc']
            ## <i> ## Setup gapped list
            gapped = [False] * query.seqLen()   # Whether column of alignment is gapped
            for seq in haqlist.seq:
                self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50.0 * haqlist.seq.index(seq) / haqlist.seqNum())),log=False,newline=False)
                (start,end) = (0,seq.seqLen())
                while seq.info['Sequence'][start] == '-':   # Leading gaps are not counted as internal gaps
                    start += 1
                while seq.info['Sequence'][end-1] == '-':   # Nor are trailing gaps
                    end -=1
                for r in range(start,end):
                    if seq.info['Sequence'][r] == '-':
                        gapped[r] = True
            ## <ii> ## Correction
            for seq in haqlist.seq:
                self.log.printLog('\r#PAQ','%s %.1f%% ' % (infotxt,(50 + (50.0 * haqlist.seq.index(seq) / haqlist.seqNum()))),log=False,newline=False)
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or gapped[r]:    # No need for correction
                        continue
                    # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate
                    # NOTE(review): fok and bok are never set to True anywhere in this method, so the
                    # reinstatement below cannot currently trigger - confirm the intended logic.
                    winf = 0
                    fwd = True
                    fok = False
                    winb = 0
                    bwd = True
                    bok = False
                    while fwd or bwd:
                        # End of seqs
                        if (r + winf) >= seq.seqLen():
                            fwd = False
                        if (r - winb) < 0:
                            bwd = False
                        # Gaps/OK
                        if fwd:
                            if gapped[r+winf]:
                                fok = False
                                fwd = False
                            elif block_align[seq][r+winf]:
                                fwd = False
                            else:
                                winf += 1
                        if bwd:
                            if gapped[r-winb]:
                                bok = False
                                bwd = False
                            elif block_align[seq][r-winb]:
                                bwd = False
                            else:
                                winb += 1
                    if fok and bok: # Reinstate
                        for w in range(r-winb,r+winf+1):
                            block_align[seq][w] = True
            self.log.printLog('\r#PAQ','%s 100.0%% ' % infotxt,log=False)

        ### <PAQ3> ### X out badly-aligned blocks
        _stage = '<3> Making bad sequence blocks'
        for seq in haqlist.seq:
            newseq = ''
            for r in range(seq.seqLen()):
                if block_align[seq][r] or seq.info['Sequence'][r] == '-':
                    newseq += seq.info['Sequence'][r]
                else:   # Bad residue
                    newseq += 'X'
            seq.info['Sequence'] = newseq[0:]
        #!# Add saving of data in 'datafull' option

        ### <PAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removing sequences/regions'
        self.verbose(0,4,'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'],0)
        ## <PAQ4a> ## Process Query first - only interested in good regions within query
        if self.opt['NoQuery']:  # No preprocessing of Query
            self.verbose(0,4,'no Master Query processing...',0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus) # Replaces other sequence ends and query X columns with Xs
            self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
        self.verbose(0,3,'',1)
        if self.opt['ManPAQ']:
            haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile'])

        ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost
        goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen() # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue    # Next sequence
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate?
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                if self.opt['ManPAQ']:
                    default = 'N'
                    if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']:   # Lose sequence!
                        default = 'Y'
                    if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default):
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else:   # X out
                        haqlist.mapX(bads)
                else:
                    self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    if badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']:   # Lose sequence!
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else:   # X out
                        haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            self.verbose(1,3,'%d -> %d "good" aa' % (goodres[-2],goodres[-1]),1)

        ### <PAQ5> ### Reinstate UnX'd sequence:
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:
            # Swap: 'PAQ' ends up holding the masked sequence and 'Sequence' the original
            [seq.info['PAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['PAQ']]
        if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.manpaq.fas' % haqlist.info['Basefile'])
    except:
        self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
def singleSeqAQ(self, seqlist, focus=[0, -1]):  ### Performs SAQ on seqlist, adding seq.info['SAQ']
    '''
    Performs SAQ on seqlist, adding seq.info['SAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: SAQX = SAQ sequences with individual Xs
    - A new info key will be added: SAQ = SAQ sequences with alignment Xs
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. Passed on to seqlist.mapX().
        NOTE(review): the default is a shared mutable list - confirm that mapX() never modifies it.
    << returns nothing. Errors are caught and logged (with quit choice) along with the stage reached.
    '''
    ### <SAQ1> ### Setup
    try:
        _stage = '<1> Setup'
        haqlist = seqlist  # SeqList Object to store individually Xd sequences
        query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery']:
            query = None
        badres = [-1, 0]  # List of how many bad residues in total dataset
        block_align = {}  # Dictionary of whether residue in block of sequence that is well-aligned or not
        res_align = {}  # Dictionary of whether residue of sequence is well-aligned or not
        res_gap = {}  # Dictionary of whether residue of sequence is a gap or not
        gap_align = {}  # Dictionary of whether residue of sequence is a gap in a well-aligned block or not
        for seq in haqlist.seq:
            seq.info['SAQ'] = seq.info['Sequence'][0:]  # Note! Sequence is modified and SAQ not, then they are swapped at end!
            block_align[seq] = [False] * seq.seqLen()
            res_align[seq] = [False] * seq.seqLen()
            res_gap[seq] = [False] * seq.seqLen()
            gap_align[seq] = [False] * seq.seqLen()

        ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks
        _stage = '<2> BlockID'
        while badres[-1] != badres[-2]:  # Change in number of bad residues
            total_res = 0
            badres.append(0)  # badres[-1] is the current number of bad residues
            infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'], len(badres) - 2)
            for seq in haqlist.seq:
                myinfo = '%s %.1f%%' % (infotxt, (100.0 * haqlist.seq.index(seq) / haqlist.seqNum()))
                self.log.printLog('\r#SAQ', myinfo, log=False, newline=False)
                ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped
                _stage = '<2a> Mark Residues'
                for r in range(seq.seqLen()):
                    gap_align[seq][r] = False
                    res_align[seq][r] = False
                    # len(badres) == 3 only during the first cycle, when every position is examined
                    if block_align[seq][r] or len(badres) == 3:  # After first cycle, look only at well-aligned blocks (well-aligned for sequence not whole alignment)
                        a = seq.info['Sequence'][r]
                        res_gap[seq][r] = False
                        if a == '-':
                            res_gap[seq][r] = True
                            gap_align[seq][r] = True
                        else:  # 'X' handled by self._saqCon
                            conx = 0  # Matches with good regions of otherseqs (self is skipped below)
                            for otherseq in haqlist.seq[0:]:
                                if otherseq == seq:  # > so self not counted!
                                    continue
                                if len(otherseq.info['Sequence']) != len(seq.info['Sequence']):
                                    self.log.errorLog('Sequence lengths do not match - should be aligned!', printerror=False)
                                    raise ValueError  # Caught by the method-level except below
                                if (block_align[otherseq][r] or len(badres) == 3):
                                    conx += self._saqCon(a, otherseq.info['Sequence'][r])
                            if conx >= self.stat['SAQCon']:
                                res_align[seq][r] = True
                ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence
                _stage = '<2b> Mark Regions'
                ## <i> ## Clear first
                _stage = '<2b-i> Mark Regions'
                for r in range(seq.seqLen()):
                    block_align[seq][r] = False
                ## <ii> ## Recalculate
                _stage = '<2b-ii> Mark Regions'
                for r in range(seq.seqLen()):
                    _stage = '<2b-ii> Blocks'
                    if res_align[seq][r]:  # Start of potential block
                        blen = 0  # Block length (SAQBlock) = AAs
                        win = 0  # Window length = all sequence
                        matchx = 1  # Good residues in window (first residue must be good!) (SAQMatch)
                        while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']:
                            win += 1
                            if (r + win) >= seq.seqLen() or seq.info['Sequence'][r + win] == 'X':  # Hit Bad Region: Abort
                                break
                            else:  # Better region
                                if gap_align[seq][r + win]:  # Decent gap: extends window but not block length
                                    continue
                                else:
                                    blen += 1  # Increase Block
                                    if res_align[seq][r + win]:  # Good residue
                                        matchx += 1
                        if matchx >= self.stat['SAQMatch']:
                            for w in range((win + 1)):
                                block_align[seq][r + w] = True
                ## <iii> ## Update bad residue count
                for r in range(seq.seqLen()):
                    _stage = '<2b-iii> Mark Regions'
                    if not block_align[seq][r] and not res_gap[seq][r]:  # Bad residue
                        badres[-1] += 1
                    if not res_gap[seq][r]:
                        total_res += 1
            myinfo = '%s 100.0%%' % infotxt
            myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]), rje.integerString(total_res))
            self.log.printLog('\r#SAQ', myinfo)
            if badres[-1] == total_res:
                self.log.errorLog('All residues marked as bad in SAQ!', printerror=False, quitchoice=True)
        # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False)

        ### <SAQ3> ### X out badly-aligned blocks
        _stage = '<3> X-Out'
        self.log.printLog('#SAQ', 'SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'], len(badres) - 2), log=False, newline=False)
        for seq in haqlist.seq:
            newseq = ''
            for r in range(seq.seqLen()):
                if block_align[seq][r] or seq.info['Sequence'][r] == '-':  #!# Was backwards? res_gap[seq][r] == False:
                    newseq += seq.info['Sequence'][r]
                else:  # Bad residue
                    newseq += 'X'
            seq.info['Sequence'] = newseq[0:]
            seq.info['SAQX'] = newseq[0:]  # Stores Xd sequences for individuals for use in PAQ
        #!# Add saving of data in 'datafull' option

        ### <SAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removal'
        self.log.printLog('\r#SAQ', 'SAQ%d-%d: Removing bad sequences and/or dodgy regions...' % (self.stat['SAQCyc'], len(badres) - 2), log=False, newline=False)
        ## <SAQ4a> ## Process Query first - only interested in good regions within query
        _stage = '<4a> Query Removal'
        if self.opt['NoQuery'] or query == None:  # No preprocessing of Query
            self.verbose(0, 4, 'no Master Query processing...', 0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)  # Replaces other sequence ends and query X columns with Xs
            self.verbose(0, 4, 'Query (%s) processed...' % query.shortName(), 0)
        self.verbose(0, 3, '', 1)
        if self.opt['ManSAQ']:
            haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile'])

        ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost
        _stage = '<4b> Seq Removal'
        goodres = [0, self._getGood(haqlist.seq)]  # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()  # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None  # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue  # Next sequence
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                if self.opt['ManSAQ']:
                    default = 'N'
                    if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']:  # Lose sequence!
                        default = 'Y'
                    if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(), rje.integerString(badkx), rje.integerString(badlx)), default):
                        # seqlist and haqlist are the same object (see Setup)
                        seqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['SAQCyc'], rje.integerString(badkx), rje.integerString(badlx)), seq=bads)
                    else:  # X out
                        haqlist.mapX(bads)
                else:
                    self.verbose(1, 3, '%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(), rje.integerString(badkx), rje.integerString(badlx)), 1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    if badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']:  # Lose sequence!
                        haqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'], rje.integerString(badkx), rje.integerString(badlx)), seq=bads)
                    else:  # X out
                        haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())

        ### <SAQ5> ### Reinstate UnX'd sequence:
        # NOTE(review): the stage label below still reads '<4b> Seq Removal', so an error raised during the
        # swap would be reported against stage 4b - confirm whether a '<5>' label was intended.
        _stage = '<4b> Seq Removal'
        for seq in haqlist.seq:
            # Swap: 'SAQ' ends up holding the masked sequence and 'Sequence' the original
            [seq.info['SAQ'], seq.info['Sequence']] = [seq.info['Sequence'], seq.info['SAQ']]
        if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.mansaq.fas' % haqlist.info['Basefile'])
    except:
        self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
def loadOrthAln(callobj,seq,gopher=True):    ### Identifies file, loads and checks alignment.
    '''
    Identifies file, loads and checks alignment.

    Candidate files are built as AlnDir + name + AlnExt, trying the sequence AccNum, ID and short name in turn. If
    none is found (or Gopher and FullForce are both set), GOPHER may be run to generate the alignment.
    If the identified file is not actually aligned, then RJE_SEQ will try to align the proteins using MUSCLE or
    ClustalW.
    >> callobj:Object containing settings for stats generation (MotifList, generally).
    >> seq:Sequence being analysed.
    >> gopher:bool [True] = whether to try to generate alignment with GOPHER if callobj.opt['Gopher']
    << aln = SeqList object containing alignment with queryseq. None is returned if no alignment file is found, the
       query cannot be found in it, it has fewer than two sequences, it is not aligned, or any error occurs.
    '''
    v = None    # Verbosity to restore on exit; stays None if setup fails before it can be read
    try:
        ### Setup Attributes ###
        v = callobj.stat['Verbose']
        alndir = rje.makePath(callobj.info['AlnDir'])
        alnext = callobj.info['AlnExt']

        ### Identify File ###
        if alnext[0] != '.': alnext = '.%s' % alnext
        # The trailing None is the "nothing found" sentinel that triggers the GOPHER/give-up branch below
        alnstart = [seq.info['AccNum'],seq.info['ID'],seq.shortName(),None]
        if v > 2: callobj.log.printLog('#PRESTO','%s' % callobj.opt)  #!# Old debugging? #!#
        if callobj.opt['Gopher'] and callobj.opt['FullForce']:
            if v > 0: callobj.log.printLog('#ALN','FullForce=T. Will call Gopher for %s regardless of existing files' % seq.shortName())
            alnstart = [None]
        for alnfile in alnstart:
            if alnfile:
                alnfile = '%s%s%s' % (alndir,alnfile,alnext)
                if rje.checkForFile(alnfile): break  # File found
            else:
                #!# Sort out logging and see if Gopher can be used directly rather than just run() #!#
                ### Run GOPHER ###
                if gopher and callobj.opt['Gopher']:  #!# Add working version for PRESTO and SlimPickings #!#
                    callobj.deBug('Run GOPHER in %s' % callobj.info['GopherDir'])
                    mydir = os.getcwd()
                    os.chdir(callobj.info['GopherDir'])
                    try:    # finally: guarantees the run directory is restored whatever happens in GopherDir
                        callobj.log.printLog('\n#GOPHER','Running GOPHER on %s' % seq.shortName())
                        try:    #!# Add log.silent() method? #!#
                            gcmd = ['orthtree'] + callobj.cmd_list + ['gnspacc=T','i=-1']
                            solo_gopher = gopher_V2.GopherFork(log=callobj.log,cmd_list=gcmd)
                            solo_gopher.info['Name'] = seq.shortName()
                            solo_gopher.obj['Sequence'] = seq
                            solo_gopher.obj['BLAST'] = gopher_V2.Gopher(callobj.log,gcmd).setupBlast()  #!# Contemplate setting up Gopher in callobj #!#
                            solo_gopher.obj['BLAST'].log = callobj.log
                            solo_gopher.run('orthalign')
                        except:
                            os.chdir(mydir)     # Return to run directory before logging the error
                            callobj.log.errorLog('Problem with Gopher run!')
                            return None
                    finally:
                        os.chdir(mydir)
                if callobj.opt['Gopher']:   # GOPHER output is named after the AccNum
                    alnfile = '%s%s%s' % (alndir,seq.info['AccNum'],alnext)
                    if not os.path.exists(alnfile): alnfile = None
                if not alnfile:
                    callobj.log.printLog('#ALN','No alignment file found for %s in %s.' % (seq.shortName(),alndir),screen=False)
                    return None

        ### Load Alignment ###
        callobj.log.stat['Verbose'] = v - 1     # Quieter logging while loading; restored below or in except
        alncmd = ['seqin=None','query=%s' % seq.shortName(),'accnr=F','seqnr=F','autofilter=F','align=T','gnspacc=F']
        aln = rje_seq.SeqList(log=callobj.log,cmd_list=callobj.cmd_list+alncmd)
        aln.loadSeqs(seqfile=alnfile,seqtype='Protein',aln=True,nodup=None)
        callobj.log.stat['Verbose'] = v

        ## Check Query ##
        if not aln.obj['QuerySeq'] and not aln.querySeq(query=seq.info['AccNum']):
            callobj.log.printLog('#ALN','Problem finding %s in %s.' % (seq.shortName(),alnfile),screen=False)
            return None

        ### Check Alignment ###
        if aln.seqNum() < 2:
            callobj.log.printLog('#ALN','Not enough sequences for %s in %s.' % (seq.shortName(),alnfile),screen=False)
            return None
        if aln._checkAln(aln=True,realign=True): return aln
        callobj.log.printLog('#ERR','%s not aligned!!!' % (alnfile))
        return None
    except:
        callobj.log.errorLog('Something bad has happened in rje_motif_stats.loadOrthAln()')
        # Only restore verbosity if it was successfully read: otherwise the handler itself would fail
        if v is not None: callobj.log.stat['Verbose'] = v
        return None
def pairwiseAQ(self, seqlist=None, query=None, focus=None):  ### Performs PAQ on seqlist, adding seq.info['PAQ']
    '''
    Performs PAQ on seqlist, adding seq.info['PAQ']
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: PAQ = PAQ sequences with alignment Xs
    >> query:Sequence object to link residues to [None] = use seqlist.obj['QuerySeq']. A random sequence from
       seqlist is used if self.opt['NoQuery'] is set or no query is available.
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. [None] = [0, 0]
    Errors are caught and reported via self.log.errorLog() along with the stage reached.
    '''
    ### <PAQ0> ### Setup
    _stage = '<0> Setup'
    try:
        if focus is None: focus = [0, 0]    # Fresh list per call: avoids sharing one mutable default
        haqlist = seqlist  # SeqList Object to store individually Xd sequences
        if not query: query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery'] or not query:
            query = haqlist.seq[random.randint(0, haqlist.seqNum() - 1)]
            self.log.printLog('#QRY', 'Temp (random) query %s assigned for PAQ' % query.shortName())
        #!# paqx = [False] * seqlist.seq[0].seqLen()  # List of whether a column of the alignment is bad (has an X) [True] or not [False]
        #!# - make this a method?!
        pwaq = {}  # Dictionary of lists of pairwise alignements
        block_align = {}  # Dictionary of whether residue in block of sequence that is well-aligned or not
        for seq in haqlist.seq:
            block_align[seq] = [False] * seq.seqLen()
            seq.info['PAQ'] = seq.info['Sequence'][0:]  # Original kept in PAQ; swapped back at the end
            if 'SAQX' in seq.info and len(seq.info['SAQX']) == seq.seqLen():  #!# Should no longer be issues due to length changes following realignment
                seq.info['Sequence'] = seq.info['SAQX'][0:]
            elif 'SAQX' in seq.info:
                self.log.errorLog('Cannot use SAQX for %s in PAQ as wrong length.' % seq.shortName(), printerror=False)
            for otherseq in haqlist.seq:
                pwaq[(seq, otherseq)] = [False] * seq.seqLen()

        ### <PAQ1> ### Directional Pairwise Comparisons of sequences
        _stage = '<1> Pairwise Comparisons'
        infotxt = 'PAQ%d: Pairwise Comparisons ...' % self.stat['PAQCyc']
        for (si, seq) in enumerate(haqlist.seq):
            for (oi, otherseq) in enumerate(haqlist.seq):
                myinfo = '%s %.1f%% %.1f%% ' % (infotxt, (100.0 * si / haqlist.seqNum()), (100.0 * oi / haqlist.seqNum()))
                self.log.printLog('\r#PAQ', myinfo, log=False, newline=False)
                for r in range(seq.seqLen()):
                    ar = seq.info['Sequence'][r]
                    ## <i> ## Look for PW aligned block
                    _stage = '<1-i> Pairwise Comparisons'
                    if ar in ['-', 'X']: continue
                    # Start of test block
                    blen = 0  # Block length (PAQBlock) = AAs
                    win = 0  # Window length = all sequence
                    matchx = 0  # Score for residues in window
                    while blen < self.stat['PAQBlock'] and (r + win) < seq.seqLen():  # This time we allow overshoots in both directions
                        ar = seq.info['Sequence'][r + win]
                        at = otherseq.info['Sequence'][r + win]
                        if 'X' in [ar, at]:  # Hit Bad Region: Abort
                            break
                        if ar != '-':  # Better region
                            blen += 1  # Increase Block
                            matchx += self._saqCon(ar, at)
                        win += 1
                    ## <ii> ## Update pwaq if block good
                    _stage = '<1-ii> Pairwise Comparisons'
                    if matchx >= self.stat['PAQMatch']:
                        for w in range(win):
                            pwaq[(seq, otherseq)][r + w] = seq.info['Sequence'][r + w] not in ['-', 'X']
        # Literal percent signs must be doubled: this string is itself a %-format template
        self.log.printLog('\r#PAQ', '%s 100.0%% 100.0%%. ' % infotxt, log=False)

        ### <PAQ2> ### Link back to Query
        ### <PAQ2a> ### Network of Pairwise Quality alignments
        _stage = '<2a> Linking to Query'
        infotxt = 'PAQ%d: Linking Residues to Query (%s) ...' % (self.stat['PAQCyc'], query.shortName())
        for r in range(query.seqLen()):
            _stage = '<2a> Linking to Query'
            self.log.printLog('\r#PAQ', '%s %.1f%%' % (infotxt, (100.0 * r / query.seqLen())), log=False, newline=False)
            qok = {}  # Dictionary of whether residue in seq OK, i.e. linked to query
            for seq in haqlist.seq:
                qok[seq] = False
            qok[query] = True
            sok = [0, 1]  # List of OK sequence for residue
            while sok[-2] != sok[-1]:  # Repeat until no new sequences get linked at this column
                ## <i> ## Match pairs, starting with query
                _stage = '<2a-i> Linking to Query'
                for seq in haqlist.seq:
                    if qok[seq]:
                        for otherseq in haqlist.seq:
                            if pwaq[(seq, otherseq)][r] or pwaq[(otherseq, seq)][r]:
                                qok[otherseq] = True
                ## <ii> ## Update sok
                _stage = '<2a-ii> Linking to Query'
                sok.append(0)
                for seq in haqlist.seq:
                    if qok[seq]:
                        sok[-1] += 1
                        block_align[seq][r] = True
            _stage = '<2a-iii> Linking to Query'
            if sok[-1] == 1:  # Only query OK!
                block_align[query][r] = False
        self.log.printLog('\r#PAQ', '%s 100.0%%' % infotxt, log=False)

        ### <PAQ2b> ### Allow for divergence (Conserved Anchors)
        _stage = '<2b> Anchors'
        if self.opt['Anchors']:
            infotxt = 'PAQ%d: Accounting for divergence within aligned regions ...' % self.stat['PAQCyc']
            ## <i> ## Setup gapped list
            gapped = [False] * query.seqLen()  # Whether column of alignment is gapped
            for (si, seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ', '%s %.1f%% ' % (infotxt, (50.0 * si / haqlist.seqNum())), log=False, newline=False)
                (start, end) = (0, seq.seqLen())
                # Skip terminal gaps; the start/end bounds stop an all-gap sequence running off the end
                while start < end and seq.info['Sequence'][start] == '-':
                    start += 1
                while end > start and seq.info['Sequence'][end - 1] == '-':
                    end -= 1
                for r in range(start, end):
                    if seq.info['Sequence'][r] == '-':
                        gapped[r] = True
            ## <ii> ## Correction
            for (si, seq) in enumerate(haqlist.seq):
                self.log.printLog('\r#PAQ', '%s %.1f%% ' % (infotxt, (50 + (50.0 * si / haqlist.seqNum()))), log=False, newline=False)
                for r in range(seq.seqLen()):
                    if block_align[seq][r] or gapped[r]:  # No need for correction
                        continue
                    # Move in both directions: if good residues (or sequence end) reached before gaps then reinstate
                    # NOTE(review): fok and bok are never set to True anywhere below, so the reinstatement step
                    # cannot currently fire - TODO confirm intended behaviour before changing.
                    winf = 0
                    fwd = True
                    fok = False
                    winb = 0
                    bwd = True
                    bok = False
                    while fwd or bwd:
                        # End of seqs
                        if (r + winf) >= seq.seqLen(): fwd = False
                        if (r - winb) < 0: bwd = False
                        # Gaps/OK
                        if fwd:
                            if gapped[r + winf]:
                                fok = False
                                fwd = False
                            elif block_align[seq][r + winf]:
                                fwd = False
                            else:
                                winf += 1
                        if bwd:
                            if gapped[r - winb]:
                                bok = False
                                bwd = False
                            elif block_align[seq][r - winb]:
                                bwd = False
                            else:
                                winb += 1
                    if fok and bok:  # Reinstate
                        for w in range(r - winb, r + winf + 1):
                            block_align[seq][w] = True
            self.log.printLog('\r#PAQ', '%s 100.0%% ' % infotxt, log=False)

        ### <PAQ3> ### X out badly-aligned blocks
        _stage = '<3> Making bad sequence blocks'
        for seq in haqlist.seq:
            newseq = []
            for r in range(seq.seqLen()):
                if block_align[seq][r] or seq.info['Sequence'][r] == '-':
                    newseq.append(seq.info['Sequence'][r])
                else:  # Bad residue
                    newseq.append('X')
            seq.info['Sequence'] = ''.join(newseq)
        #!# Add saving of data in 'datafull' option

        ### <PAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removing sequences/regions'
        self.verbose(0, 4, 'PAQ%d: Removing bad sequences and/or dodgy regions...' % self.stat['PAQCyc'], 0)
        ## <PAQ4a> ## Process Query first - only interested in good regions within query
        if self.opt['NoQuery']:  # No preprocessing of Query
            self.verbose(0, 4, 'no Master Query processing...', 0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)  # Replaces other sequence ends and query X columns with Xs
            self.verbose(0, 4, 'Query (%s) processed...' % query.shortName(), 0)
        self.verbose(0, 3, '', 1)
        if self.opt['ManPAQ']:
            haqlist.saveFasta(seqfile='%s.manpaq.fas' % haqlist.info['Basefile'])

        ## <PAQ4b> ## Cycle through other sequences (worst first) until no more good residues are lost
        goodres = [0, self._getGood(haqlist.seq)]  # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()  # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None  # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue  # Next sequence
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                #?# In Perl HAQESAC there was an option to ignore Orphans in this calculation. Reinstate?
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                losing = badkx * self.stat['PAQKeepLen'] > badlx * self.stat['PAQKeepSeq']  # Lose sequence!
                if self.opt['ManPAQ']:
                    default = 'N'
                    if losing: default = 'Y'
                    if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(), rje.integerString(badkx), rje.integerString(badlx)), default):
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['PAQCyc'], rje.integerString(badkx), rje.integerString(badlx)), seq=bads)
                    else:  # X out
                        haqlist.mapX(bads)
                else:
                    self.verbose(1, 3, '%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(), rje.integerString(badkx), rje.integerString(badlx)), 1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    if losing:
                        seqlist.removeSeq(text='PAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['PAQCyc'], rje.integerString(badkx), rje.integerString(badlx)), seq=bads)
                    else:  # X out
                        haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())
            self.verbose(1, 3, '%d -> %d "good" aa' % (goodres[-2], goodres[-1]), 1)

        ### <PAQ5> ### Reinstate UnX'd sequence:
        _stage = '<5> Replacing sequences'
        for seq in haqlist.seq:  # Swap: PAQ now holds the X'd sequence and Sequence the original
            [seq.info['PAQ'], seq.info['Sequence']] = [seq.info['Sequence'], seq.info['PAQ']]
        if self.opt['ManPAQ'] and rje.checkForFile('%s.manpaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.manpaq.fas' % haqlist.info['Basefile'])
    except:
        self.log.errorLog('rje_haq.py ~ Problem with pairwiseAQ %s.' % _stage, True)
def buildPam(self):     ### Builds PAM Matrix in memory
    '''
    Builds PAM matrix in memory.

    The PAM1 file named by self.info['Name'] is looked for as given, then prefixed by self.info['Path'], then by
    self.info['Path'] plus '../data/'. The first line sets self.alphabet and one row per alphabet letter follows.
    PAM0 and PAM1 matrices are appended to self.matrix and self.pamUp() is then called.
    Raises ValueError if no PAM file is found or a matrix row has the wrong number of columns. All errors are
    logged with self.log.errorLog() and re-raised.
    '''
    try:
        ### Check for Alternative PAM Matrix ###
        if self.info['AltPam'].lower() not in ['','none']:
            self.altPAM()
        self.verbose(0,3,"Reading PAM1 matrix from %s" % self.info['Name'],2)

        ### <a> ### Open file & Read Lines
        pamfiles = [self.info['Name'],rje.makePath(self.info['Path']) + self.info['Name'],rje.makePath(self.info['Path']) + rje.makePath('../data/') + self.info['Name']]
        self.info['Name'] = None    # Reset: only set again if one of the candidate files is found
        for pfile in pamfiles:
            if rje.checkForFile(pfile):
                pamhandle = open(pfile, 'r')
                try: file_lines = pamhandle.readlines()
                finally: pamhandle.close()  # Always release the file handle
                self.info['Name'] = pfile
                break
        if not self.info['Name']:
            for pfile in pamfiles: self.printLog('#ERR','File "%s" not found' % pfile)
            self.printLog('#ERR','No PAM file found!')
            raise ValueError

        ### <b> ### Read in alphabet
        self.verbose(0,3,file_lines[0],1)
        # Matrices that define their own X or gap symbols do not get the default X/gap values added below
        if file_lines[0].upper().find('X') >= 0: self.opt['X-Value'] = False
        if file_lines[0].find('-') >= 0: self.opt['GapValue'] = False
        self.alphabet = file_lines[0].split()

        ### <c> ### Make PAM0
        ## <i> ## Clear dics
        zeropamp = {}
        for r in self.alphabet:
            for c in self.alphabet:
                zeropamp[r + c] = 0
            zeropamp[r + r] = 1
            if self.opt['X-Value']:
                zeropamp['X' + r] = 1
                zeropamp[r + 'X'] = 1
            if self.opt['GapValue']:
                zeropamp['-' + r] = 1
                zeropamp[r + '-'] = 1
        if self.opt['X-Value']: zeropamp['XX'] = 1
        if self.opt['GapValue']: zeropamp['--'] = 1
        if self.opt['X-Value'] and self.opt['GapValue']:
            zeropamp['-X'] = 1
            zeropamp['X-'] = 1
        ## <ii> ## New Matrix
        newmatrix = PAM(pam=0,rawpamp=zeropamp,alpha=self.alphabet)
        self.matrix.append(newmatrix)

        ## <d> ## Read in PAM1
        rawpamp = {}
        line = 1    # Matrix rows start on the line after the alphabet
        for r in self.alphabet:
            pamline = file_lines[line].split()
            if len(pamline) != (len(self.alphabet)+1):  # Row label plus one value per alphabet letter
                self.log.errorLog("%s has wrong format! Does not match %s" % (pamline, self.alphabet),printerror=False,quitchoice=True)
                # Explicit exception: a bare raise has no active exception to re-raise at this point
                raise ValueError('PAM matrix row does not match alphabet')
            for (c, col) in enumerate(self.alphabet):
                rawpamp[r + col] = float(pamline[c+1])
            if self.opt['X-Value']:
                rawpamp['X' + r] = 1
                rawpamp[r + 'X'] = 1
            if self.opt['GapValue']:
                rawpamp['-' + r] = 1
                rawpamp[r + '-'] = 1
            line += 1
        if self.opt['X-Value']: rawpamp['XX'] = 1
        if self.opt['GapValue']: rawpamp['--'] = 1
        if self.opt['X-Value'] and self.opt['GapValue']:
            rawpamp['-X'] = 1
            rawpamp['X-'] = 1
        newmatrix = PAM(pam=1,rawpamp=rawpamp,alpha=self.alphabet)
        self.matrix.append(newmatrix)

        ## <e> ## Raise to pammax
        self.log.printLog('\r#PAM','Building PAM Matrices <= %d: ' % self.stat['PamMax'],log=False,newline=False)
        self.pamUp()
        self.log.printLog('\r#PAM','Building PAM Matrices <= %d: Complete.' % self.stat['PamMax'])
    except:
        self.log.errorLog('Fatal Error in PamCtrl.buildPam().')
        raise
def singleSeqAQ(self,seqlist,focus=None):    ### Performs SAQ on seqlist, adding seq.info['SAQ']
    '''
    Performs SAQ on seqlist, adding seq.info['SAQ'].
    >> seqlist:rje_seq.SeqList Object
    - NB. This object will itself have sequences removed from it, so beware!
    - A new info key will be added: SAQX = SAQ sequences with individual Xs
    - A new info key will be added: SAQ = SAQ sequences with aligment Xs
    >> focus:list of range positions [X:Y] to look at. If Y=0 then [X:]. [None] = [0,-1]
    Errors are caught and reported via self.log.errorLog() along with the stage reached.
    '''
    ### <SAQ1> ### Setup
    _stage = '<1> Setup'
    try:
        if focus is None: focus = [0,-1]    # Fresh list per call: avoids sharing one mutable default
        haqlist = seqlist   # SeqList Object to store individually Xd sequences
        query = haqlist.obj['QuerySeq']
        if self.opt['NoQuery']: query = None
        badres = [-1,0]     # List of how many bad residues in total dataset
        block_align = {}    # Dictionary of whether residue in block of sequence that is well-aligned or not
        res_align = {}      # Dictionary of whether residue of sequence is well-aligned or not
        res_gap = {}        # Dictionary of whether residue of sequence is a gap or not
        gap_align = {}      # Dictionary of whether residue of sequence is a gap in a well-aligned block or not
        for seq in haqlist.seq:
            seq.info['SAQ'] = seq.info['Sequence'][0:]      # Note! Sequence is modified and SAQ not, then they are swapped at end!
            block_align[seq] = [False] * seq.seqLen()
            res_align[seq] = [False] * seq.seqLen()
            res_gap[seq] = [False] * seq.seqLen()
            gap_align[seq] = [False] * seq.seqLen()

        ### <SAQ2> ### Repeated cycles of defining well- and badly-aligned blocks
        _stage = '<2> BlockID'
        while badres[-1] != badres[-2]:     # Change in number of bad residues
            total_res = 0
            badres.append(0)    # badres[-1] is the current number of bad residues
            firstcyc = len(badres) == 3     # First cycle looks at all residues, not just well-aligned blocks
            infotxt = 'SAQ%d-%d: Calculating "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2)
            for seq in haqlist.seq:
                myinfo = '%s %.1f%%' % (infotxt,(100.0 * haqlist.seq.index(seq) / haqlist.seqNum()))
                self.log.printLog('\r#SAQ',myinfo,log=False,newline=False)
                sequence = seq.info['Sequence']     # Not modified during this stage

                ## <SAQ2a> ## For each sequence, mark residues as aligned or gapped
                _stage = '<2a> Mark Residues'
                for r in range(seq.seqLen()):
                    gap_align[seq][r] = False
                    res_align[seq][r] = False
                    if not (block_align[seq][r] or firstcyc):   # After first cycle, look only at well-aligned blocks (well-aligned for sequence not whole alignment)
                        continue
                    a = sequence[r]
                    res_gap[seq][r] = False
                    if a == '-':
                        res_gap[seq][r] = True
                        gap_align[seq][r] = True
                        continue
                    # 'X' handled by self._saqCon
                    conx = 0    # Matches with good regions of otherseqs
                    for otherseq in haqlist.seq[0:]:
                        if otherseq == seq:     # > so self not counted!
                            continue
                        if len(otherseq.info['Sequence']) != len(sequence):
                            self.log.errorLog('Sequence lengths do not match - should be aligned!',printerror=False)
                            raise ValueError
                        if (block_align[otherseq][r] or firstcyc):
                            conx += self._saqCon(a, otherseq.info['Sequence'][r])
                    if conx >= self.stat['SAQCon']:
                        res_align[seq][r] = True

                ## <SAQ2b> ## Marked regions of well-aligned residues for each sequence
                _stage = '<2b> Mark Regions'
                ## <i> ## Clear first
                _stage = '<2b-i> Mark Regions'
                for r in range(seq.seqLen()):
                    block_align[seq][r] = False
                ## <ii> ## Recalculate
                _stage = '<2b-ii> Mark Regions'
                for r in range(seq.seqLen()):
                    _stage = '<2b-ii> Blocks'
                    if not res_align[seq][r]: continue
                    # Start of potential block
                    blen = 0    # Block length (SAQBlock) = AAs
                    win = 0     # Window length = all sequence
                    matchx = 1  # Good residues in window (first residue must be good!) (SAQMatch)
                    while blen < self.stat['SAQBlock'] and matchx < self.stat['SAQMatch']:
                        win += 1
                        if (r + win) >= seq.seqLen() or sequence[r+win] == 'X':     # Hit Bad Region: Abort
                            break
                        if gap_align[seq][r+win]:   # Decent gap: extends window but not block length
                            continue
                        blen += 1   # Increase Block
                        if res_align[seq][r+win]:   # Good residue
                            matchx += 1
                    if matchx >= self.stat['SAQMatch']:
                        for w in range((win+1)):
                            block_align[seq][r+w] = True
                ## <iii> ## Update bad residue count
                for r in range(seq.seqLen()):
                    _stage = '<2b-iii> Mark Regions'
                    if not block_align[seq][r] and not res_gap[seq][r]:     # Bad residue
                        badres[-1] += 1
                    if not res_gap[seq][r]:
                        total_res += 1
            myinfo = '%s 100.0%%' % infotxt
            myinfo += ' => %s bad of %s total residues' % (rje.integerString(badres[-1]),rje.integerString(total_res))
            self.log.printLog('\r#SAQ',myinfo)
            if badres[-1] == total_res:
                self.log.errorLog('All residues marked as bad in SAQ!',printerror=False,quitchoice=True)
        # Now have all residues in all sequences marked as good (block_align=True) or bad (block_align=False)

        ### <SAQ3> ### X out badly-aligned blocks
        _stage = '<3> X-Out'
        self.log.printLog('#SAQ','SAQ%d-%d: Masking "bad" residues ...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
        for seq in haqlist.seq:
            newseq = []
            for r in range(seq.seqLen()):
                if block_align[seq][r] or seq.info['Sequence'][r] == '-':   #!# Was backwards? res_gap[seq][r] == False:
                    newseq.append(seq.info['Sequence'][r])
                else:   # Bad residue
                    newseq.append('X')
            newseq = ''.join(newseq)
            seq.info['Sequence'] = newseq[0:]
            seq.info['SAQX'] = newseq[0:]   # Stores Xd sequences for individuals for use in PAQ
        #!# Add saving of data in 'datafull' option

        ### <SAQ4> ### Remove sequences and/or badly-aligned regions
        _stage = '<4> Removal'
        self.log.printLog('\r#SAQ','SAQ%d-%d: Removing bad sequences and/or dodgy regions...' % (self.stat['SAQCyc'],len(badres)-2),log=False,newline=False)
        ## <SAQ4a> ## Process Query first - only interested in good regions within query
        _stage = '<4a> Query Removal'
        if self.opt['NoQuery'] or query is None:    # No preprocessing of Query
            self.verbose(0,4,'no Master Query processing...',0)
        else:
            haqlist.mapX(query, qtrim=True, focus=focus)     # Replaces other sequence ends and query X columns with Xs
            self.verbose(0,4,'Query (%s) processed...' % query.shortName(),0)
        self.verbose(0,3,'',1)
        if self.opt['ManSAQ']:
            haqlist.saveFasta(seqfile='%s.mansaq.fas' % haqlist.info['Basefile'])

        ## <SAQ4b> ## Cycle through other sequences (worst first) until no more good residues or sequences are lost
        _stage = '<4b> Seq Removal'
        goodres = [0, self._getGood(haqlist.seq)]   # List of number of 'good' residues
        goodseq = [0, haqlist.seqNum()]
        while goodres[-1] != goodres[-2] or goodseq[-1] != goodseq[-2]:
            colgood = [0] * haqlist.seq[0].seqLen()    # Good residues per column
            for r in range(haqlist.seq[0].seqLen()):
                for seq in haqlist.seq:
                    if seq.info['Sequence'][r] != '-' and seq.info['Sequence'][r] != 'X':
                        colgood[r] += 1
            ## <i> ## Compare relative loss of masking and losing each sequence
            keepx = {}  # Dictionary of seq:number of lost residues if seq kept
            losex = {}  # Dictionary of seq:number of lost residues if seq lost
            badkx = -1  # Biggest loss if kept
            badlx = -1  # Biggest loss if lost
            bads = None # Worst sequence
            for seq in haqlist.seq:
                if seq == query and self.opt['NoQuery'] == False:
                    continue    # Next sequence
                # Calculate keepx and losex
                keepx[seq] = 0
                for r in range(seq.seqLen()):
                    if seq.info['Sequence'][r] == 'X':
                        keepx[seq] += colgood[r]
                losex[seq] = self._getGood([seq])
                # Update bads if worse
                if keepx[seq] > badkx:
                    badkx = keepx[seq]
                    badlx = losex[seq]
                    bads = seq
                elif keepx[seq] == badkx and losex[seq] < badlx:
                    badlx = losex[seq]
                    bads = seq
            ## <ii> ## Remove bad sequences and/or regions
            if badkx > 0:
                losing = badkx * self.stat['SAQKeepLen'] > badlx * self.stat['SAQKeepSeq']   # Lose sequence!
                if self.opt['ManSAQ']:
                    default = 'N'
                    if losing: default = 'Y'
                    if rje.yesNo('%s worst: -%s aa if kept vs -%s aa if lost. Remove?' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),default):
                        seqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost. (Manual decision.)' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else:   # X out
                        haqlist.mapX(bads)
                else:
                    self.verbose(1,3,'%s worst: -%s aa if kept vs -%s aa if lost.' % (bads.shortName(),rje.integerString(badkx),rje.integerString(badlx)),1)
                    #!# Add option for upweighting certain sequence type? (e.g. vs fragment or hypothetical?)
                    if losing:
                        haqlist.removeSeq(text='SAQ%d: -%s aa if kept vs -%s aa if lost.' % (self.stat['SAQCyc'],rje.integerString(badkx),rje.integerString(badlx)),seq=bads)
                    else:   # X out
                        haqlist.mapX(bads)
            ### <iii> ### Recalculate goodres
            goodres.append(self._getGood(haqlist.seq))
            goodseq.append(haqlist.seqNum())

        ### <SAQ5> ### Reinstate UnX'd sequence:
        _stage = '<5> Replacing sequences'  # Own stage label so errors here are not reported as <4b>
        for seq in haqlist.seq:     # Swap: SAQ now holds the X'd sequence and Sequence the original
            [seq.info['SAQ'],seq.info['Sequence']] = [seq.info['Sequence'],seq.info['SAQ']]
        if self.opt['ManSAQ'] and rje.checkForFile('%s.mansaq.fas' % haqlist.info['Basefile']):
            os.unlink('%s.mansaq.fas' % haqlist.info['Basefile'])
    except:
        self.log.errorLog('Problem with singleSeqAQ() %s.' % _stage, quitchoice=True)
def buildPam(self):  ### Builds PAM Matrix in memory
    '''
    Builds PAM matrix in memory.

    The PAM1 file named by self.info['Name'] is looked for as given, then prefixed by self.info['Path'], then by
    self.info['Path'] plus '../data/'. The first line sets self.alphabet and one row per alphabet letter follows.
    PAM0 and PAM1 matrices are appended to self.matrix and self.pamUp() is then called.
    Raises ValueError if no PAM file is found or a matrix row has the wrong number of columns. All errors are
    logged with self.log.errorLog() and re-raised.
    '''
    try:
        ### Check for Alternative PAM Matrix ###
        if self.info['AltPam'].lower() not in ['', 'none']:
            self.altPAM()
        self.verbose(0, 3, "Reading PAM1 matrix from %s" % self.info['Name'], 2)

        ### <a> ### Open file & Read Lines
        pamfiles = [
            self.info['Name'],
            rje.makePath(self.info['Path']) + self.info['Name'],
            rje.makePath(self.info['Path']) + rje.makePath('../data/') + self.info['Name']
        ]
        self.info['Name'] = None  # Reset: only set again if one of the candidate files is found
        for pfile in pamfiles:
            if rje.checkForFile(pfile):
                pamhandle = open(pfile, 'r')
                try:
                    file_lines = pamhandle.readlines()
                finally:
                    pamhandle.close()  # Always release the file handle
                self.info['Name'] = pfile
                break
        if not self.info['Name']:
            for pfile in pamfiles:
                self.printLog('#ERR', 'File "%s" not found' % pfile)
            self.printLog('#ERR', 'No PAM file found!')
            raise ValueError

        ### <b> ### Read in alphabet
        self.verbose(0, 3, file_lines[0], 1)
        # Matrices that define their own X or gap symbols do not get the default X/gap values added below
        if file_lines[0].upper().find('X') >= 0:
            self.opt['X-Value'] = False
        if file_lines[0].find('-') >= 0:
            self.opt['GapValue'] = False
        self.alphabet = file_lines[0].split()

        ### <c> ### Make PAM0
        ## <i> ## Clear dics
        zeropamp = {}
        for r in self.alphabet:
            for c in self.alphabet:
                zeropamp[r + c] = 0
            zeropamp[r + r] = 1
            if self.opt['X-Value']:
                zeropamp['X' + r] = 1
                zeropamp[r + 'X'] = 1
            if self.opt['GapValue']:
                zeropamp['-' + r] = 1
                zeropamp[r + '-'] = 1
        if self.opt['X-Value']:
            zeropamp['XX'] = 1
        if self.opt['GapValue']:
            zeropamp['--'] = 1
        if self.opt['X-Value'] and self.opt['GapValue']:
            zeropamp['-X'] = 1
            zeropamp['X-'] = 1
        ## <ii> ## New Matrix
        newmatrix = PAM(pam=0, rawpamp=zeropamp, alpha=self.alphabet)
        self.matrix.append(newmatrix)

        ## <d> ## Read in PAM1
        rawpamp = {}
        line = 1  # Matrix rows start on the line after the alphabet
        for r in self.alphabet:
            pamline = file_lines[line].split()
            if len(pamline) != (len(self.alphabet) + 1):  # Row label plus one value per alphabet letter
                self.log.errorLog(
                    "%s has wrong format! Does not match %s" % (pamline, self.alphabet),
                    printerror=False,
                    quitchoice=True)
                # Explicit exception: a bare raise has no active exception to re-raise at this point
                raise ValueError('PAM matrix row does not match alphabet')
            for (c, col) in enumerate(self.alphabet):
                rawpamp[r + col] = float(pamline[c + 1])
            if self.opt['X-Value']:
                rawpamp['X' + r] = 1
                rawpamp[r + 'X'] = 1
            if self.opt['GapValue']:
                rawpamp['-' + r] = 1
                rawpamp[r + '-'] = 1
            line += 1
        if self.opt['X-Value']:
            rawpamp['XX'] = 1
        if self.opt['GapValue']:
            rawpamp['--'] = 1
        if self.opt['X-Value'] and self.opt['GapValue']:
            rawpamp['-X'] = 1
            rawpamp['X-'] = 1
        newmatrix = PAM(pam=1, rawpamp=rawpamp, alpha=self.alphabet)
        self.matrix.append(newmatrix)

        ## <e> ## Raise to pammax
        self.log.printLog('\r#PAM',
                          'Building PAM Matrices <= %d: ' % self.stat['PamMax'],
                          log=False,
                          newline=False)
        self.pamUp()
        self.log.printLog('\r#PAM', 'Building PAM Matrices <= %d: Complete.' % self.stat['PamMax'])
    except:
        self.log.errorLog('Fatal Error in PamCtrl.buildPam().')
        raise
def badasp(out,mainlog,cmd_list,tree=None): ### Main BADASP Method
    '''
    Main BADASP Method. Automated run if interactive < 1
    <1> Load Sequences and Tree
    <2> Define Subfamilies
    <3> GASP Ancestral Sequence Prediction
    <4> Perform Functional Specificity and Sequence Conservation Calculations
    <5> Output Results
    <6> Optional (interactive only) Partial Results Output
    >> out: object providing stat['Interactive'] and verbose()
    >> mainlog: log object providing errorLog() and verbose()
    >> cmd_list: list of 'option=value' command strings. NB. Extended in place (group=, nsfin=, funcspec=)
    >> tree: optional tree object. Made with rje_tree.Tree(log=mainlog,cmd_list=cmd_list) if None
    << None. Results go to BASENAME.badasp (or the append= file) and, if requested, a partial results file

    Each stage runs in its own try block and only logs its errors (sys.exit() is passed on in stages 1-2), so
    a failed stage generally makes the following stages fail and log too.

    NOTE(review): this function was re-indented from whitespace-collapsed source. Two nestings could not be
    determined from the text alone and follow the most plausible reading - confirm against version control:
    (a) the MinFamNum check is applied to supplied trees as well as newly made ones;
    (b) the title/command lines are written to new output files only, not to appended ones.
    '''
    BADASP = None   # Output handle. Bound up front so that the output error handlers can test it
    line = []       # Current output row. Bound up front for the same reason
    try:
        ### <0> ### Setup
        _seqfile = None
        _treefile = None
        append_file = None
        basefile = None
        for cmd in cmd_list:
            if cmd.startswith('seqin='):
                _seqfile = cmd[len('seqin='):]
                if _seqfile[-4:-3] == '.':  # Slice, not index: names shorter than 4 characters must not raise
                    _seqfile = _seqfile[:-4]
            if cmd.startswith('useanc='):
                _seqfile = cmd[len('useanc='):]
                if _seqfile[-8:] == '.anc.fas':
                    _seqfile = _seqfile[:-8]
            if cmd.startswith('nsfin='):
                _treefile = cmd[len('nsfin='):]
            if cmd.startswith('append='):
                append_file = cmd[len('append='):]
            if cmd.startswith('basefile='):
                basefile = cmd[len('basefile='):]
        if _seqfile and os.path.exists('%s.grp' % _seqfile):
            cmd_list.append('group=%s.grp' % _seqfile)
        if _seqfile and _treefile is None:
            if rje.checkForFile('%s.nwk' % _seqfile):
                _treefile = '%s.nwk' % _seqfile
            else:
                _treefile = '%s.nsf' % _seqfile
            out.verbose(0,2,'Looking for treefile %s.' % _treefile,1)
            if rje.checkForFile(_treefile):
                cmd_list.append('nsfin=%s' % _treefile)
        if tree is None:
            mainlog.verbose(0,1,'Tree: %s' % cmd_list,2)
            tree = rje_tree.Tree(log=mainlog,cmd_list=cmd_list)
            #tree._setupFromCmd()
        if tree.stat['MinFamNum'] < 2:  # See NOTE(review) (a) in docstring
            tree.stat['MinFamNum'] = 2

        ### <1> ### Load Sequences and Tree
        while out.stat['Interactive'] > 0 or tree.obj['SeqList'] is None:
            tree = rje_tree.treeMenu(out,mainlog,['root=yes']+cmd_list,tree)
            if tree.obj['SeqList'] and tree.opt['Rooted']:
                break
            print('\n ** Must have loaded sequences and a rooted tree. ** \n')   # Single string: same output in Python 2 & 3
            if out.stat['Interactive'] < 0 or rje.yesNo('Quit BADASP?',default='N'):
                sys.exit()
        basename = tree.obj['SeqList'].info['Name']
        if basename[-4:] == '.fas':
            basename = basename[:-4]
        if basename[-4:] == '.anc':
            basename = basename[:-4]
        if basefile:    # basefile= command overrides the name taken from the sequence list
            basename = basefile
    except SystemExit:
        raise
    except:
        mainlog.errorLog('Major Error in badasp loading sequences and tree',True)

    try:
        ### <2> ### Define Subfamilies
        while out.stat['Interactive'] > 0 or tree.groupNum() < 2:
            tree.treeGroup(callmenu=True)
            if tree.groupNum() >= 2:
                break
            mainlog.errorLog('Must have at least two subfamilies for specificity analyses.',printerror=False)
            if out.stat['Interactive'] < 0 or rje.yesNo('Continue without specificity analyses?'):
                cmd_list.append('funcspec=')    # Read back in stage <4a> as an (empty) method list
                break
            elif rje.yesNo('Abort BADASP?'):
                sys.exit()
    except SystemExit:
        raise
    except:
        mainlog.errorLog('Major Error in BADASP subfamilies',True)

    try:
        ### <3> ### GASP Ancestral Sequence Prediction
        if tree.node[-1].obj['Sequence'] is None:   # No ancseq loaded
            while out.stat['Interactive'] > 0 and not rje.yesNo('Use %s for output filenames?' % basename):
                basename = rje.choice('FILEname (FILE.anc.fas, FILE.anc.nsf, FILE.txt)?: ', default=basename)
            mygasp = rje_ancseq.Gasp(tree=tree,ancfile=basename,cmd_list=cmd_list,log=mainlog)
            out.verbose(0,2,'%s' % mygasp.details(),1)
            if out.stat['Interactive'] > 0:
                if not rje.yesNo('Use these parameters?'):
                    mygasp.edit()
            mygasp.gasp()
    except:
        mainlog.errorLog('Major Error in BADASP GASP',True)

    try:
        ### <4> ### Perform Functional Specificity and Sequence Conservation Calculations
        _stage = '<4> Specificity/Conservation Analyses'
        aaprop = rje_aaprop.AAPropMatrix(log=mainlog,cmd_list=cmd_list)
        query = tree.obj['SeqList'].obj['QuerySeq']

        ## <a> ## Chosen Methods
        _stage = '<4a> Specificity/Conservation Analyses - Chosen Methods'
        # Copies are taken because methods may be removed below: the module-level lists must not be changed
        funcspec = rje_specificity.methodlist[0:]   # ['BAD','BADN','BADX']
        seqcon = rje_conseq.methodlist[0:]  # ['info']
        for cmd in cmd_list:
            if cmd.startswith('funcspec='):
                funcspec = cmd[len('funcspec='):].split(',')
            if cmd.startswith('seqcon='):
                seqcon = cmd[len('seqcon='):].split(',')
        if 'all' in funcspec:
            funcspec = rje_specificity.methodlist[0:]
        if 'all' in seqcon:
            seqcon = rje_conseq.methodlist[0:]
        for method in ['BADX','BADN','QPCon_Mean','QPCon_Abs','QPCon_Mean_All']:
            # Each loop ends when the method is dropped or a query is chosen; otherwise the questions repeat
            while method in funcspec and query is None:
                if rje.yesNo('Method %s needs query but none given. Drop %s from specificity methods?' % (method,method)):
                    funcspec.remove(method)
                    break
                for seq in tree.obj['SeqList'].seq:
                    if rje.yesNo('Method %s needs query but none given. Use sequence 1 (%s)?' % (method,seq.shortName()),default='N'):
                        query = seq
                        # NOTE(review): stored under 'Query' but read above from 'QuerySeq' - confirm intended key
                        tree.obj['SeqList'].obj['Query'] = seq
                        break
            while method in seqcon and query is None:
                if rje.yesNo('Method %s needs query but none given. Drop %s from conservation methods?' % (method,method)):
                    seqcon.remove(method)
                    break
                for seq in tree.obj['SeqList'].seq:
                    if rje.yesNo('Method %s needs query but none given. Use sequence 1 (%s)?' % (method,seq.shortName()),default='N'):
                        query = seq
                        tree.obj['SeqList'].obj['Query'] = seq
                        break
        qname = query
        if query:
            qname = query.info['Name']
        out.verbose(0,3,'\nQuery = %s' % qname,2)

        ## <b> ## Spec Calculations
        _stage = '<4b> Specificity Calculations'
        specmatrix = rje_specificity.FuncSpec(log=mainlog,cmd_list=cmd_list,tree=tree,aaprop=aaprop)
        specmatrix.calcScore(query=query,methods=funcspec)

        ## <c> ## Conservation Calculations
        _stage = '<4c> Specificity/Conservation Analyses - Conservation Calculations'
        conseq = rje_conseq.SeqStat(log=mainlog,cmd_list=cmd_list,tree=tree,aaprop=aaprop)
        conseq.calcScore(query=query,methods=seqcon)    ### Sends appropriate seqlist to self.calcScore()

        ## <d> ## Special Case: QPCon vs All seqs
        _stage = '<4d> Specificity/Conservation Analyses - QPCon vs All'
        qpconall = []
        #if 'QPCon_Abs_All' in seqcon and query:
        #    qpconall.append('QPCon_Abs')
        if 'QPCon_Mean_All' in seqcon and query:
            qpconall.append('QPCon_Mean')
        for qp in qpconall:     # Keep the all-sequence results under METHOD_All before <e> recalculates METHOD
            qpall = '%s_All' % qp
            conseq.score[qpall] = conseq.score[qp]
            for scoredict in [conseq.alnwin,conseq.qrywin,conseq.rank,conseq.alnrankwin,conseq.qryrankwin]:
                if qp in scoredict:
                    scoredict[qpall] = scoredict[qp]

        ## <e> ## QPCon within the subfamily of the query
        _stage = '<4d> Specificity/Conservation Analyses - FamQP'
        famqp = []
        if 'QPCon_Mean' in seqcon:
            famqp.append('QPCon_Mean')
        if 'QPCon_Abs' in seqcon:
            famqp.append('QPCon_Abs')
        if len(famqp) > 0 and query:    #!# And subfam option?
            qseq = []
            for fam in tree.subfam:
                for node in tree._nodeClade(fam):
                    if query == node.obj['Sequence']:   # Query is in this clade: use all sequences of the clade
                        for qnode in tree._nodeClade(fam):
                            qseq.append(qnode.obj['Sequence'])
            conseq.calcScore(query=query,seqlist=qseq,methods=famqp)    ### Sends appropriate seqlist to self.calcScore()
    except:
        mainlog.errorLog('Major Error in BADASP Specificity Analysis (%s):' % _stage,True)

    try:
        ### <5> ### Full Output Results
        _stage = '<5> Full Output'
        # This output is in a tab- or comma-delimited file for easy manipulation or viewing with other programs.
        # (1) statistics for a given residue;
        # (2) statistics for a given window size across
        #   - (a) the whole alignment, (node=None)
        #   - (b) the Query protein of interest (if given) and (node=QueryNode)
        #   - (c) the ancestral sequence of each subfamily; (node=ancnode)
        # (3) Predicted ancestral sequences at
        #   - (a) the root and
        #   - (b) the ancestor of each subfamily.
        delimit = rje.getDelimit(cmd_list)

        ## <a> ## Setup
        _stage = '<5a> Output - Setup'
        rankout = specmatrix.opt['Rank']
        #tree._regenerateSeqList(tree.obj['SeqList'],tree.node)
        root = tree.node[-1].obj['Sequence']    #!# At some point, make sure this is the most ancient duplication!
        out.verbose(0,3,'\nBADASP Results Output (%s.badasp) ...' % basename,0)

        ## <b> ## Header
        _stage = '<5b> Output - Header'
        _header = True  # Only used by the disabled "#if _header:" test below
        if append_file:
            if rje.checkForFile(append_file):
                _header = False
            BADASP = open(append_file, 'a')
        else:   # See NOTE(review) (b) in docstring
            BADASP = open('%s.badasp' % basename, 'w')
            BADASP.write("BADASP Output: %s\n" % (time.asctime(time.localtime(time.time()))))
            BADASP.write('%s\n\n' % cmd_list)
        header = ['aln_pos','anc_aa']   # Aln Pos and AA
        alnlen = 0
        statlist = funcspec + seqcon
        _stage = '<5b-i> Output - Header Query'
        if query:
            header += ['qry_pos','qry_aa']  # Qry Pos and AA
        _stage = '<5b-ii> Output - Header Subfam'
        for f in range(len(tree.subfam)):
            header += ['fam%d_pos' % (f+1),'fam%d_aa' % (f+1)]  # Subfam Pos and AA
        for func in statlist:
            _stage = '<5b-iii> Output - Header %s' % func
            statobj = statObj(method=func,objlist=[specmatrix,conseq])
            fs = func.lower()
            alnlen = len(statobj.score[func])   # NB. Length of the last stat's score list is the one used
            header.append(fs)   # Score
            if rankout:
                header.append('%s_rank' % fs)   # Rank
            if statobj.stat['WinSize'] > 1:
                header.append('%s_alnwin' % fs)     # Full align window
                if rankout:
                    header.append('%s_alnrankwin' % fs)     # Rank
                if query:
                    header.append('%s_qrywin' % fs)     # Qry window
                    if rankout:
                        header.append('%s_qryrankwin' % fs)     # Rank
                if func in funcspec:
                    for f in range(len(tree.subfam)):
                        header.append('%s_fam%d_win' % (fs,f+1))    # Subfam windows
                        if rankout:
                            header.append('%s_fam%d_rankwin' % (fs,f+1))    # Subfam windows
        #if _header:
        BADASP.write('%s\n' % delimit.join(header))
        out.verbose(1,3,'%s...' % delimit.join(header),0)

        ## <c> ## Stats
        _stage = '<5c> Stats'
        qr = 0  # Qry pos
        fr = [0] * len(tree.subfam)     # List of subfam positions
        aa = ''     # Root aa
        qa = ''     # Qry aa
        fa = [''] * len(tree.subfam)    # List of subfam aas
        for r in range(alnlen):
            # <i> # Positions and aas
            _stage = '<5c-i> Output - Stats, positions & aas'
            aa = root.info['Sequence'][r]
            if query:
                qa = query.info['Sequence'][r]
                if qa != '-':
                    qr += 1
            for f in range(len(tree.subfam)):
                fa[f] = tree.subfam[f].obj['Sequence'].info['Sequence'][r]
                if fa[f] != '-':
                    fr[f] += 1
            # <ii> # Positions and AAs ii
            _stage = '<5c-ii> Output - Pos & AA ii'
            line = ['%d' % (r+1), aa]   # Aln Pos and AA
            if query:
                if qa == '-':
                    line += ['-',qa]    # Qry Pos and AA
                else:
                    line += ['%d' % qr,qa]  # Qry Pos and AA
            for f in range(len(tree.subfam)):
                if fa[f] == '-':
                    line += ['-',fa[f]]     # Subfam Pos and AA
                else:
                    line += ['%d' % fr[f],fa[f]]    # Subfam Pos and AA
            # <iii> # Stats
            _stage = '<5c-iii> Output - Stats'
            for func in statlist:
                statobj = statObj(method=func,objlist=[specmatrix,conseq])
                line.append(str(statobj.score[func][r]))    # Score
                if rankout:
                    line.append(str(statobj.rank[func][r]))     # Rank
                # NOTE(review): header tests statobj.stat['WinSize'] but rows test specmatrix.stat['WinSize'];
                # columns only line up if both objects have the same WinSize - confirm
                if specmatrix.stat['WinSize'] > 1:
                    line.append(str(statobj.alnwin[func][r]))   # Full align window
                    if rankout:
                        line.append(str(statobj.alnrankwin[func][r]))   # Rank
                    if query:
                        line.append(str(statobj.qrywin[func][r]))   # Qry window
                        if rankout:
                            line.append(str(statobj.qryrankwin[func][r]))   # Rank
                    if func in funcspec:
                        for f in range(len(tree.subfam)):
                            line.append(str(statobj.famwin[func][tree.subfam[f]][r]))   # Subfam windows
                            if rankout:
                                line.append(str(statobj.famrankwin[func][tree.subfam[f]][r]))   # Subfam windows
            # <iv> # Writing
            _stage = '<5c-iv> Output - Writing'
            BADASP.write('%s\n' % delimit.join(line))
        BADASP.close()
        out.verbose(0,2,'Done!',2)
    except:
        mainlog.errorLog('Fatal Error in BADASP Full output (%s):' % _stage,True)
        # Best effort: flush the row being built, but only if there is an open file and a row to write
        if BADASP is not None and not BADASP.closed:
            try:
                if line:
                    BADASP.write('%s\n' % delimit.join(line))
            finally:
                BADASP.close()

    BADASP = None   # Reset so the handler below never sees the handle or last row of the full output
    line = []
    try:
        ### <6> ### Partial Results Output
        _stage = '<6> Partial Output'

        ## <a> ## Setup
        _stage = '<6a> Output - Setup'
        # statlist & alnlen from above
        _part_append = False
        if out.stat['Interactive'] > 0 and rje.yesNo('Output additional, filtered results?',default='N'):
            partfile = rje.choice('Name for partial results file?:','%s.partial.badasp' % basename,confirm=True)
            if rje.checkForFile(partfile) and rje.yesNo('File %s exists. Append file without headers?' % partfile):
                _part_append = True
        else:
            return  # No partial output wanted
        if rje.yesNo('Filter output columns?',default='N'):
            if not rje.yesNo('Output query details (pos,aa & win)?'):
                query = None
            f = 1
            for fam in tree.subfam[0:]:     # Loop over a copy: tree.subfam is edited in the loop
                if not rje.yesNo('Output subfam %d (%s) details (pos,aa & win)?' % (f,fam.info['CladeName'])):
                    tree.subfam.remove(fam)
                f += 1
            for func in statlist[0:]:   # Loop over a copy: statlist is edited in the loop
                if not rje.yesNo('Output %s results?' % func):
                    statlist.remove(func)
        alnout = [True] * alnlen    # Whether to output each alignment position
        if rje.yesNo('Filter Rows by Results VALUES?'):
            out.verbose(0,0,'Initial Defaults are minmum values. Accept intital default for no filtering of given Stat.',1)
            for stat in statlist:
                ### Filter by value? ###
                statobj = statObj(method=stat,objlist=[specmatrix,conseq])
                cutoff = rje.getFloat('Min. value for %s?:' % stat,default='%f' % min(statobj.score[stat]),confirm=True)
                for r in range(alnlen):
                    if statobj.score[stat][r] < cutoff:
                        alnout[r] = False
        if rankout and rje.yesNo('Filter Rows by Results RANKS?'):
            out.verbose(0,0,'Ranks range from 0 (low) to 1 (high).',1)
            for stat in statlist:
                ### Filter by Rank? ###
                statobj = statObj(method=stat,objlist=[specmatrix,conseq])
                cutoff = rje.getFloat('Min. rank for %s?:' % stat,default='0.0',confirm=True)
                for r in range(alnlen):
                    if statobj.rank[stat][r] < cutoff:
                        alnout[r] = False
        out.verbose(0,3,'\nBADASP Partial Results Output (%s) ...' % partfile,0)

        ## <b> ## Header
        _stage = '<6b> Partial Output - Header'
        if _part_append:
            BADASP = open(partfile, 'a')
        else:   # See NOTE(review) (b) in docstring
            BADASP = open(partfile, 'w')
            BADASP.write("Partial BADASP Output: %s\n" % (time.asctime(time.localtime(time.time()))))
            BADASP.write('%s\n\n' % cmd_list)
        header = ['aln_pos','anc_aa']   # Aln Pos and AA
        _stage = '<6b-i> Partial Output - Header Query'
        if query:
            header += ['qry_pos','qry_aa']  # Qry Pos and AA
        _stage = '<6b-ii> Partial Output - Header Subfam'
        for f in range(len(tree.subfam)):
            header += ['fam%d_pos' % (f+1),'fam%d_aa' % (f+1)]  # Subfam Pos and AA
        for func in statlist:
            _stage = '<6b-iii> Partial Output - Header %s' % func
            statobj = statObj(method=func,objlist=[specmatrix,conseq])
            fs = func.lower()
            header.append(fs)   # Score
            if rankout:
                header.append('%s_rank' % fs)   # Rank
            if statobj.stat['WinSize'] > 1:
                header.append('%s_alnwin' % fs)     # Full align window
                if rankout:
                    header.append('%s_alnrankwin' % fs)     # Rank
                if query:
                    header.append('%s_qrywin' % fs)     # Qry window
                    if rankout:
                        header.append('%s_qryrankwin' % fs)     # Rank
                if func in funcspec:
                    for f in range(len(tree.subfam)):
                        header.append('%s_fam%d_win' % (fs,f+1))    # Subfam windows
                        if rankout:
                            header.append('%s_fam%d_rankwin' % (fs,f+1))    # Subfam windows
        #if not _part_append:
        BADASP.write('%s\n' % delimit.join(header))
        out.verbose(1,3,'%s...' % delimit.join(header),0)

        ## <c> ## Stats
        _stage = '<6c> Stats'
        qr = 0  # Qry pos
        fr = [0] * len(tree.subfam)     # List of subfam positions
        aa = ''     # Root aa
        qa = ''     # Qry aa
        fa = [''] * len(tree.subfam)    # List of subfam aas
        for r in range(alnlen):
            # <i> # Positions and aas
            _stage = '<6c-i> Partial Output - Stats, positions & aas'
            aa = root.info['Sequence'][r]
            if query:
                qa = query.info['Sequence'][r]
                if qa != '-':
                    qr += 1
            for f in range(len(tree.subfam)):
                fa[f] = tree.subfam[f].obj['Sequence'].info['Sequence'][r]
                if fa[f] != '-':
                    fr[f] += 1
            # Filtered rows are skipped only now, after the counters above have moved on. Skipping them
            # first would leave qry_pos and famN_pos too low for every row after a filtered one.
            if not alnout[r]:
                continue
            # <ii> # Positions and AAs ii
            _stage = '<6c-ii> Partial Output - Pos & AA ii'
            line = ['%d' % (r+1), aa]   # Aln Pos and AA
            if query:
                if qa == '-':
                    line += ['-',qa]    # Qry Pos and AA
                else:
                    line += ['%d' % qr,qa]  # Qry Pos and AA
            for f in range(len(tree.subfam)):
                if fa[f] == '-':
                    line += ['-',fa[f]]     # Subfam Pos and AA
                else:
                    line += ['%d' % fr[f],fa[f]]    # Subfam Pos and AA
            # <iii> # Stats
            _stage = '<6c-iii> Partial Output - Stats'
            for func in statlist:
                statobj = statObj(method=func,objlist=[specmatrix,conseq])
                line.append(str(statobj.score[func][r]))    # Score
                if rankout:
                    line.append(str(statobj.rank[func][r]))     # Rank
                # NOTE(review): header tests statobj.stat['WinSize'] but rows test specmatrix.stat['WinSize'];
                # columns only line up if both objects have the same WinSize - confirm
                if specmatrix.stat['WinSize'] > 1:
                    line.append(str(statobj.alnwin[func][r]))   # Full align window
                    if rankout:
                        line.append(str(statobj.alnrankwin[func][r]))   # Rank
                    if query:
                        line.append(str(statobj.qrywin[func][r]))   # Qry window
                        if rankout:
                            line.append(str(statobj.qryrankwin[func][r]))   # Rank
                    if func in funcspec:
                        for f in range(len(tree.subfam)):
                            line.append(str(statobj.famwin[func][tree.subfam[f]][r]))   # Subfam windows
                            if rankout:
                                line.append(str(statobj.famrankwin[func][tree.subfam[f]][r]))   # Subfam windows
            # <iv> # Writing
            _stage = '<6c-iv> Partial Output - Writing'
            BADASP.write('%s\n' % delimit.join(line))
        BADASP.close()
        out.verbose(0,2,'Done!',2)
    except:
        mainlog.errorLog('Fatal Error in BADASP Partial output (%s):' % _stage,True)
        # Best effort: flush the row being built, but only if there is an open file and a row to write
        if BADASP is not None and not BADASP.closed:
            try:
                if line:
                    BADASP.write('%s\n' % delimit.join(line))
            finally:
                BADASP.close()