def parse_result(self):
    """
    Parse the hmmalign output file: extract the aligned query sequence
    and the matching hmm position annotation.

    @return: alignment of the search sequence and matching hmm
             positions, both with gaps
    @rtype: str, str

    @raise HmmerError: if the result file is missing or empty
    """
    ## check that the output file is there and seems valid
    if not os.path.exists(self.f_out):
        raise HmmerError(
            'Hmmeralign result file %s does not exist.' % self.f_out)

    if T.fileLength(self.f_out) < 1:
        raise HmmerError(
            'Hmmeralign result file %s seems incomplete.' % self.f_out)

    ## read result; close the handle even if read() fails (was leaked)
    hmm = open(self.f_out, 'r')
    try:
        out = hmm.read()
    finally:
        hmm.close()

    ## extract search sequence: every alignment row starting with our id
    ## NOTE(review): fastaID is spliced into the regex unescaped -- ids
    ## containing regex metacharacters would mis-match; confirm ids are
    ## plain alphanumeric before tightening with re.escape()
    fastaSeq = re.findall(self.fastaID + '[ ]+[-a-yA-Y]+', out)
    fastaSeq = ''.join([row.split()[1] for row in fastaSeq])

    ## extract hmm sequence from the '#=XX YY ..x..' annotation rows
    hmmSeq = re.findall(r'#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', out)
    hmmSeq = ''.join([row.split()[2].strip() for row in hmmSeq])

    return fastaSeq, hmmSeq
def parse_result( self ):
    """
    Read the hmmalign output file and pull out the aligned query
    sequence together with the matching hmm positions.

    @return: alignment and matching hmm positions with gaps
    @rtype: str, str

    @raise HmmerError: if the output file is missing or empty
    """
    ## sanity-check the output file before trying to parse it
    if not os.path.exists( self.f_out ):
        raise HmmerError( 'Hmmeralign result file %s does not exist.'
                          % self.f_out )

    if T.fileLength( self.f_out ) < 1:
        raise HmmerError( 'Hmmeralign result file %s seems incomplete.'
                          % self.f_out )

    ## slurp in the whole result file
    handle = open( self.f_out, 'r' )
    content = handle.read()
    handle.close()

    ## collect the query alignment rows and join their sequence columns
    queryRows = re.findall( self.fastaID + '[ ]+[-a-yA-Y]+', content )
    pieces = []
    for row in queryRows:
        pieces.append( string.split( row )[1] )
    fastaSeq = string.join( pieces, '' )

    ## same for the hmm annotation rows ('#=XX YY  ..x..')
    annotRows = re.findall( '#=[A-Z]{2}\s[A-Z]{2}\s+[.x]+', content )
    pieces = []
    for row in annotRows:
        pieces.append( string.strip( string.split( row )[2] ) )
    hmmSeq = string.join( pieces, '' )

    return fastaSeq, hmmSeq
def clusterFasta( self, fastaIn=None, simCut=1.75, lenCut=0.9, ncpu=1 ):
    """
    Cluster sequences with blastclust. The input fasta titles must be
    the sequence IDs.

    fastaClust( fastaIn [, simCut, lenCut, ncpu] )

    @param fastaIn: name of input fasta file
                    (default: outFolder + F_FASTA_ALL)
    @type  fastaIn: str
    @param simCut: similarity threshold (score < 3 or %identity)
                   (default: 1.75)
    @type  simCut: double
    @param lenCut: length threshold (default: 0.9)
    @type  lenCut: double
    @param ncpu: number of CPUs (default: 1)
    @type  ncpu: int

    @raise IOError: if fastaIn is empty
    @raise BlastError: if blastclust fails
    """
    fastaIn = fastaIn or self.outFolder + self.F_FASTA_ALL

    if T.fileLength( fastaIn ) < 1:
        raise IOError( "File %s empty. Nothing to cluster"%fastaIn )

    if self.verbose:
        self.log.add( "\nClustering sequences:\n%s"%('-'*20) )

    cmd = settings.blastclust_bin + ' -i %s -S %f -L %f -a %i' %\
          (fastaIn, simCut, lenCut, ncpu)

    if self.verbose:
        self.log.add("- Command: %s"%cmd)

    ## bugfix: at all cost prevent blastclust from using shared temp folder
    tmp = os.environ.get( 'TMPDIR', None )
    if tmp:
        del os.environ['TMPDIR']

    try:
        err, o = commands.getstatusoutput( cmd )
        if err:
            raise BlastError( "blastclust failed. Error code: "
                              + str(err) )
    finally:
        ## bugfix: restore TMPDIR even when blastclust fails; the old
        ## code left it deleted on the error path
        if tmp:
            os.environ['TMPDIR'] = tmp

    ## blastclust might write errors to file, if so the errors
    ## occur before the dateline (the line ending in 'queries')
    lines = [ l.split() for l in o.split('\n') ]
    ## guard against blank output lines -- l[-1] on an empty split
    ## raised IndexError; keep indices aligned with `lines`
    tails = [ l[-1] if l else '' for l in lines ]
    dateline = tails.index('queries')
    self.clusters = lines[dateline+1:]

    self.reportClustering( raw=o )

    self.bestOfCluster = [ self.selectFasta( ids )
                           for ids in self.clusters ]
def clusterFasta(self, fastaIn=None, simCut=1.75, lenCut=0.9, ncpu=1):
    """
    Cluster sequences with blastclust. The input fasta titles must be
    the sequence IDs.

    fastaClust( fastaIn [, simCut, lenCut, ncpu] )

    @param fastaIn: name of input fasta file
                    (default: outFolder + F_FASTA_ALL)
    @type  fastaIn: str
    @param simCut: similarity threshold (score < 3 or %identity)
                   (default: 1.75)
    @type  simCut: double
    @param lenCut: length threshold (default: 0.9)
    @type  lenCut: double
    @param ncpu: number of CPUs (default: 1)
    @type  ncpu: int

    @raise IOError: if fastaIn is empty
    @raise BlastError: if blastclust fails
    """
    fastaIn = fastaIn or self.outFolder + self.F_FASTA_ALL

    if T.fileLength(fastaIn) < 1:
        raise IOError("File %s empty. Nothing to cluster" % fastaIn)

    if self.verbose:
        self.log.add("\nClustering sequences:\n%s" % ('-' * 20))

    cmd = settings.blastclust_bin + ' -i %s -S %f -L %f -a %i' %\
        (fastaIn, simCut, lenCut, ncpu)

    if self.verbose:
        self.log.add("- Command: %s" % cmd)

    ## bugfix: at all cost prevent blastclust from using shared temp folder
    tmp = os.environ.get('TMPDIR', None)
    if tmp:
        del os.environ['TMPDIR']

    try:
        err, o = commands.getstatusoutput(cmd)
        if err:
            raise BlastError("blastclust failed. Error code: " + str(err))
    finally:
        ## bugfix: the old code only restored TMPDIR on success, so a
        ## blastclust failure left the environment permanently changed
        if tmp:
            os.environ['TMPDIR'] = tmp

    ## blastclust might write errors to file, if so the errors
    ## occur before the dateline (the line whose last word is 'queries')
    lines = [l.split() for l in o.split('\n')]
    ## blank lines split to [] and made l[-1] raise IndexError; map them
    ## to '' so positions stay in sync with `lines`
    dateline = [l[-1] if l else '' for l in lines].index('queries')
    self.clusters = lines[dateline + 1:]

    self.reportClustering(raw=o)

    self.bestOfCluster = [self.selectFasta(ids) for ids in self.clusters]
def parse_result( self ):
    """
    Parse the output from hmmpfam.

    @return: dictionary with profile names as keys and a list of lists
             containing information about the range where the profile
             matches the sequence
    @rtype: dict, [list]

    @raise HmmerError: if the result file is missing, incomplete, or
                       cannot be parsed
    """
    matches = {}
    hits = []

    ## check that the output file is there and seems valid
    if not os.path.exists( self.f_out ):
        raise HmmerError(
            'Hmmersearch result file %s does not exist.'%self.f_out )

    if T.fileLength( self.f_out ) < 10:
        raise HmmerError(
            'Hmmersearch result file %s seems incomplete.'%self.f_out )

    out = open( self.f_out, 'r' )
    try:
        try:
            while 1:
                l = out.readline()

                ## bugfix: readline() returns '' forever at EOF, so a
                ## file without the expected hit table looped infinitely
                if not l:
                    break

                ## get names and descriptions of matching profiles
                ## (the table follows a '--------  -----------' ruler)
                if re.match('^-{8}\s{7,8}-{11}.+-{3}$', l):
                    m = string.split( out.readline() )
                    while len(m) != 0:
                        matches[m[0]] = m[1:]
                        m = string.split( out.readline() )

                ## get hits, scores and alignment positions
                if re.match('^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l):
                    h = string.split( out.readline() )
                    while len(h) != 0:
                        hits += [ h ]
                        h = string.split( out.readline() )
                    break

        ## was a bare 'except:' that also caught KeyboardInterrupt and
        ## threw away the original error
        except Exception as why:
            raise HmmerError(
                'ERROR parsing hmmpfam search result: %s\n%r'
                % (self.f_out, why) )
    finally:
        ## bugfix: the old code skipped close() when the parse failed
        out.close()

    return matches, hits
def __init__( self, hmmFile, fastaFile, fastaID, **kw ):
    """
    Set up an hmmalign run for one search sequence against one profile.

    @param hmmFile: path to hmm file (profile)
    @type  hmmFile: str
    @param fastaFile: path to fasta search sequence
    @type  fastaFile: str
    @param fastaID: fasta id of search sequence
    @type  fastaID: str
    """
    ## keep the inputs around; parse_result needs fastaID later
    self.fastaID = fastaID
    self.hmmFile = hmmFile
    self.fastaFile = fastaFile

    assert T.fileLength( self.hmmFile ) > 10, \
           'input HMM file missing or empty'

    arguments = ' -q %s %s' % ( hmmFile, fastaFile )
    Executor.__init__( self, 'hmmalign', args=arguments, **kw )
def parse_result(self):
    """
    Parse the output from hmmpfam.

    @return: dictionary with profile names as keys and a list of lists
             containing information about the range where the profile
             matches the sequence
    @rtype: dict, [list]

    @raise HmmerError: if the result file is missing, incomplete, or
                       cannot be parsed
    """
    matches = {}
    hits = []

    ## check that the output file is there and seems valid
    if not os.path.exists(self.f_out):
        raise HmmerError(
            'Hmmersearch result file %s does not exist.' % self.f_out)

    if T.fileLength(self.f_out) < 10:
        raise HmmerError(
            'Hmmersearch result file %s seems incomplete.' % self.f_out)

    try:
        ## read everything up front; close explicitly instead of
        ## leaking the handle until garbage collection
        f = open(self.f_out, 'r')
        try:
            lines = f.readlines()
        finally:
            f.close()

        while lines:
            l = lines.pop(0)

            ## get names and descriptions of matching profiles
            ## (the table follows a '--------  -----------' ruler)
            if re.match('^-{8}\s{7,8}-{11}.+-{3}$', l):
                m = string.split(lines.pop(0))
                while len(m) != 0:
                    matches[m[0]] = m[1:]
                    m = string.split(lines.pop(0))

            ## get hits, scores and alignment positions
            if re.match('^-{8}\s{7,8}-{7}\s-{5}\s-{5}.+-{7}$', l):
                h = string.split(lines.pop(0))
                while len(h) != 0:
                    hits += [h]
                    h = string.split(lines.pop(0))
                break

    except Exception as why:
        raise HmmerError(
            'ERROR parsing hmmpfam search result: %s' % self.f_out +
            '\n%r' % why)

    ## bugfix: the results were computed but never returned
    return matches, hits
def __init__(self, hmmFile, fastaFile, fastaID, **kw):
    """
    Prepare an hmmalign Executor run for the given profile and
    search sequence.

    @param hmmFile: path to hmm file (profile)
    @type  hmmFile: str
    @param fastaFile: path to fasta search sequence
    @type  fastaFile: str
    @param fastaID: fasta id of search sequence
    @type  fastaID: str
    """
    ## stash the raw inputs on the instance for later use
    self.fastaID = fastaID
    self.hmmFile = hmmFile
    self.fastaFile = fastaFile

    ## the profile file must exist and be non-trivial before we launch
    assert T.fileLength(self.hmmFile) > 10, \
        'input HMM file missing or empty'

    Executor.__init__(self, 'hmmalign',
                      args=' -q %s %s' % (hmmFile, fastaFile), **kw)
def parse_result(self): """ Extract some information about the profile as well as the match state emmission scores. Keys of the returned dictionary:: 'AA', 'name', 'NrSeq', 'emmScore', 'accession', 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum' @return: dictionary with warious information about the profile @rtype: dict """ ## check that the outfut file is there and seems valid if not os.path.exists(self.f_out): raise HmmerError,\ 'Hmmerfetch result file %s does not exist.'%self.f_out if T.fileLength(self.f_out) < 10: raise HmmerError,\ 'Hmmerfetch result file %s seems incomplete.'%self.f_out profileDic = {} ## read result hmm = open(self.f_out, 'r') out = hmm.read() hmm.close() ## collect some data about the hmm profile profileDic['name'] = self.hmmName profileDic['profLength'] = \ int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] ) profileDic['accession'] = \ string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1] profileDic['NrSeq'] = \ int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] ) profileDic['AA'] = \ string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:] ## collect null emmission scores pattern = 'NULE[ ]+' + '[-0-9]+[ ]+' * 20 nullEmm = [ float(j) for j in string.split(re.findall(pattern, out)[0])[1:] ] ## get emmision scores prob = [] for i in range(1, profileDic['profLength'] + 1): pattern = "[ ]+%i" % i + "[ ]+[-0-9]+" * 20 e = [float(j) for j in string.split(re.findall(pattern, out)[0])] prob += [e] profileDic['seqNr'] = N.transpose(N.take(prob, (0, ), 1)) profileDic['emmScore'] = N.array(prob)[:, 1:] ## calculate emission probablitities emmProb, nullProb = self.hmmEmm2Prob(nullEmm, profileDic['emmScore']) ent = [ N.resize(self.entropy(e, nullProb), (1, 20))[0] for e in emmProb ] profileDic['ent'] = N.array(ent) ###### TEST ##### proba = N.array(prob)[:, 1:] ## # test set all to max score ## p = proba ## p1 = [] ## for i in range( len(p) ): ## p1 += [ N.resize( p[i][N.argmax( N.array( p[i] ) )] , N.shape( p[i] ) ) ] ## 
profileDic['maxAll'] = p1 # test set all to N.sum( abs( probabilities ) ) p = proba p2 = [] for i in range(len(p)): p2 += [N.resize(N.sum(N.absolute(p[i])), N.shape(p[i]))] profileDic['absSum'] = p2 # set all to normalized max score p = proba p4 = [] for i in range(len(p)): p_scale = (p[i] - N.average(p[i])) / math.SD(p[i]) p4 += [ N.resize(p_scale[N.argmax(N.array(p_scale))], N.shape(p[i])) ] profileDic['maxAllScale'] = p4 return profileDic
def parse_result( self ): """ Extract some information about the profile as well as the match state emmission scores. Keys of the returned dictionary:: 'AA', 'name', 'NrSeq', 'emmScore', 'accession', 'maxAllScale', 'seqNr', 'profLength', 'ent', 'absSum' @return: dictionary with warious information about the profile @rtype: dict """ ## check that the outfut file is there and seems valid if not os.path.exists( self.f_out ): raise HmmerError,\ 'Hmmerfetch result file %s does not exist.'%self.f_out if T.fileLength( self.f_out ) < 10: raise HmmerError,\ 'Hmmerfetch result file %s seems incomplete.'%self.f_out profileDic = {} ## read result hmm = open( self.f_out, 'r') out = hmm.read() hmm.close() ## collect some data about the hmm profile profileDic['name'] = self.hmmName profileDic['profLength'] = \ int( string.split(re.findall('LENG\s+[0-9]+', out)[0])[1] ) profileDic['accession'] = \ string.split(re.findall('ACC\s+PF[0-9]+', out)[0])[1] profileDic['NrSeq'] = \ int( string.split(re.findall('NSEQ\s+[0-9]+', out)[0])[1] ) profileDic['AA'] = \ string.split(re.findall('HMM[ ]+' + '[A-Y][ ]+'*20, out)[0] )[1:] ## collect null emmission scores pattern = 'NULE[ ]+' + '[-0-9]+[ ]+'*20 nullEmm = [ float(j) for j in string.split(re.findall(pattern, out)[0])[1:] ] ## get emmision scores prob=[] for i in range(1, profileDic['profLength']+1): pattern = "[ ]+%i"%i + "[ ]+[-0-9]+"*20 e = [ float(j) for j in string.split(re.findall(pattern, out)[0]) ] prob += [ e ] profileDic['seqNr'] = N.transpose( N.take( prob, (0,),1 ) ) profileDic['emmScore'] = N.array(prob)[:,1:] ## calculate emission probablitities emmProb, nullProb = self.hmmEmm2Prob( nullEmm, profileDic['emmScore']) ent = [ N.resize( self.entropy(e, nullProb), (1,20) )[0] for e in emmProb ] profileDic['ent'] = N.array(ent) ###### TEST ##### proba = N.array(prob)[:,1:] ## # test set all to max score ## p = proba ## p1 = [] ## for i in range( len(p) ): ## p1 += [ N.resize( p[i][N.argmax( N.array( p[i] ) )] , N.shape( p[i] ) ) ] 
## profileDic['maxAll'] = p1 # test set all to N.sum( abs( probabilities ) ) p = proba p2 = [] for i in range( len(p) ) : p2 += [ N.resize( N.sum( N.absolute( p[i] )), N.shape( p[i] ) ) ] profileDic['absSum'] = p2 # set all to normalized max score p = proba p4 = [] for i in range( len(p) ) : p_scale = (p[i] - N.average(p[i]) )/ math.SD(p[i]) p4 += [ N.resize( p_scale[N.argmax( N.array(p_scale) )] , N.shape( p[i] ) ) ] profileDic['maxAllScale'] = p4 return profileDic