def test_ChainSeparator(self):
    """ChainSeparator test

    Pull chains from a ChainSeparator one at a time until it returns None,
    concatenate their residue sequences, and compare the one-letter-code
    string against self.EXPECTED.
    """
    self.sep = ChainSeparator(self.fname, self.outPath, 1)

    all_chains = []
    i = 1
    self.chain = self.sep.next()

    # Identity test against None: '<>' is gone in Python 3 and would also
    # dispatch to whatever comparison the chain object implements.
    while self.chain is not None:
        residues = self.chain.sequence()

        if self.local:
            # Single pre-formatted string so the output is the same whether
            # print is a statement (Python 2) or a function (Python 3).
            print('Chain %i: %s' % (i, ''.join(singleAA(residues))))

        all_chains += residues
        self.chain = self.sep.next()
        i += 1

    if self.local:
        print('ChainSeparator log file written to: %s'
              % self.sep.log.fname)

    r = ''.join(singleAA(all_chains))
    self.assertEqual(r, self.EXPECTED)
def test_ChainSeparator(self):
    """ChainSeparator test

    Pull chains from a ChainSeparator one at a time until it returns None,
    concatenate their residue sequences, and compare the one-letter-code
    string against self.EXPECTED.
    """
    self.sep = ChainSeparator(self.fname, self.outPath, 1)

    all_chains = []
    i = 1
    self.chain = self.sep.next()

    # Identity test against None: '<>' is gone in Python 3 and would also
    # dispatch to whatever comparison the chain object implements.
    while self.chain is not None:
        residues = self.chain.sequence()

        if self.local:
            # Single pre-formatted string so the output is the same whether
            # print is a statement (Python 2) or a function (Python 3).
            print('Chain %i: %s' % (i, ''.join(singleAA(residues))))

        all_chains += residues
        self.chain = self.sep.next()
        i += 1

    if self.local:
        print('ChainSeparator log file written to: %s'
              % self.sep.log.fname)

    r = ''.join(singleAA(all_chains))
    self.assertEqual(r, self.EXPECTED)
def _removeDuplicateChains(self, chainMask=None):
    """
    Remove redundant chains from self.chains, either with an explicit
    chain mask or based on pairwise sequence identity.

    Every chain is compared against itself and all later chains via
    self._compareSequences; the scores fill the upper triangle of an
    identity matrix. Without a mask, the off-diagonals of that matrix
    are scanned in order of increasing offset. The first offset whose
    mean identity reaches self.threshold marks the start of the
    duplicated chains, and only the chains before it are kept.

    @param chainMask: chain mask for overriding the chain identity
                      checking; any false value (None, 0, empty list)
                      selects the automatic identity-based removal
                      (default: None)
    @type  chainMask: [int]

    @return: number of chains removed
    @rtype: int
    """
    chainCount = len(self.chains)

    ## convert each chain's 3-letter-code residue list into a
    ## 1-letter-code sequence once, instead of once per compared pair
    sequences = [singleAA(chain.sequence()) for chain in self.chains]
    chain_ids = [chain.chain_id for chain in self.chains]  # for log file

    ## create identity matrix for all chains against all chains
    ## (upper triangle incl. main diagonal; lower triangle stays 0).
    ## 1.0 * ... forces a float matrix in case N.zeros yields integers.
    ## NOTE: no minimum alignment length is enforced on the scores.
    matrix = 1.0 * N.zeros((chainCount, chainCount))
    for i in range(chainCount):
        for j in range(i, chainCount):
            matrix[i, j] = self._compareSequences(sequences[i],
                                                  sequences[j])

    ## report activity
    self.log.add("\n Chain ID's of compared chains: " + str(chain_ids))
    self.log.add(" Cross-Identity between chains:\n" + str(matrix))
    self.log.add(" Identity threshold used: " + str(self.threshold))

    if chainMask:
        ## override the automatic chain deletion by supplying a
        ## chain mask to this function
        if len(chainMask) == chainCount:
            ## plain list filtering keeps self.chains a list (as in the
            ## automatic branch) and never lets an array constructor
            ## reinterpret sequence-like chain objects as extra dimensions
            self.chains = [chain for chain, keep
                           in zip(self.chains, chainMask) if keep]
            self.log.add(
                "NOTE: chain mask %s used for removing chains.\n"
                % chainMask)
        else:
            ## wrong mask length: report it and leave self.chains untouched
            self.log.add("########## ERROR ###############")
            self.log.add("# Chain mask is only %i chains long"
                         % len(chainMask))
            self.log.add("# when a mask of length %i is needed"
                         % chainCount)
            self.log.add("# No cleaning will be performed.\n")
    else:
        ## look at diagonals in "identity matrix"
        ## (each chain against each)
        duplicate = chainCount
        for offset in range(1, chainCount):
            ## diagonal of 1's mark begin of duplicate
            diag = N.diagonal(matrix, offset, 0, 1)
            avg = 1.0 * N.sum(diag) / len(diag)
            if avg >= self.threshold:
                duplicate = offset
                break

        self.chains = self.chains[:duplicate]
        self.log.add("NOTE: Identity matrix will be used for removing "
                     "identical chains.")

    ## report activity: how many chains have been removed?
    removed = chainCount - len(self.chains)
    self.log.add(str(removed) + " chains have been removed.\n")

    return removed
def _removeDuplicateChains(self, chainMask=None):
    """
    Remove redundant chains from self.chains, either with an explicit
    chain mask or based on pairwise sequence identity.

    Every chain is compared against itself and all later chains via
    self._compareSequences; the scores fill the upper triangle of an
    identity matrix. Without a mask, the off-diagonals of that matrix
    are scanned in order of increasing offset. The first offset whose
    mean identity reaches self.threshold marks the start of the
    duplicated chains, and only the chains before it are kept.

    @param chainMask: chain mask for overriding the chain identity
                      checking; any false value (None, 0, empty list)
                      selects the automatic identity-based removal
                      (default: None)
    @type  chainMask: [int]

    @return: number of chains removed
    @rtype: int
    """
    chainCount = len(self.chains)

    ## convert each chain's 3-letter-code residue list into a
    ## 1-letter-code sequence once, instead of once per compared pair
    sequences = [singleAA(chain.sequence()) for chain in self.chains]
    chain_ids = [chain.chain_id for chain in self.chains]  # for log file

    ## create identity matrix for all chains against all chains
    ## (upper triangle incl. main diagonal; lower triangle stays 0).
    ## 1.0 * ... forces a float matrix in case N.zeros yields integers.
    ## NOTE: no minimum alignment length is enforced on the scores.
    matrix = 1.0 * N.zeros((chainCount, chainCount))
    for i in range(chainCount):
        for j in range(i, chainCount):
            matrix[i, j] = self._compareSequences(sequences[i],
                                                  sequences[j])

    ## report activity
    self.log.add("\n Chain ID's of compared chains: " + str(chain_ids))
    self.log.add(" Cross-Identity between chains:\n" + str(matrix))
    self.log.add(" Identity threshold used: " + str(self.threshold))

    if chainMask:
        ## override the automatic chain deletion by supplying a
        ## chain mask to this function
        if len(chainMask) == chainCount:
            ## plain list filtering keeps self.chains a list (as in the
            ## automatic branch) and never lets an array constructor
            ## reinterpret sequence-like chain objects as extra dimensions
            self.chains = [chain for chain, keep
                           in zip(self.chains, chainMask) if keep]
            self.log.add(
                "NOTE: chain mask %s used for removing chains.\n"
                % chainMask)
        else:
            ## wrong mask length: report it and leave self.chains untouched
            self.log.add("########## ERROR ###############")
            self.log.add("# Chain mask is only %i chains long"
                         % len(chainMask))
            self.log.add("# when a mask of length %i is needed"
                         % chainCount)
            self.log.add("# No cleaning will be performed.\n")
    else:
        ## look at diagonals in "identity matrix"
        ## (each chain against each)
        duplicate = chainCount
        for offset in range(1, chainCount):
            ## diagonal of 1's mark begin of duplicate
            diag = N.diagonal(matrix, offset, 0, 1)
            avg = 1.0 * N.sum(diag) / len(diag)
            if avg >= self.threshold:
                duplicate = offset
                break

        self.chains = self.chains[:duplicate]
        self.log.add("NOTE: Identity matrix will be used for removing "
                     "identical chains.")

    ## report activity: how many chains have been removed?
    removed = chainCount - len(self.chains)
    self.log.add(str(removed) + " chains have been removed.\n")

    return removed