def calcModes(self, n_modes=20, turbo=True):
    """Calculate principal (or essential) modes.  This method uses the
    :func:`scipy.linalg.eigh` or :func:`numpy.linalg.eigh` function to
    diagonalize the covariance matrix.

    :arg n_modes: number of non-zero eigenvalues/vectors to calculate,
        default is 20, if **None** or ``'all'`` is given, all modes will be
        calculated
    :type n_modes: int

    :arg turbo: when available, use a memory intensive but faster way to
        calculate modes, default is **True**
    :type turbo: bool"""

    if self._cov is None:
        raise ValueError('covariance matrix is not built or set')
    start = time.time()
    self._clear()
    if str(n_modes).lower() == 'all':
        n_modes = None
    values, vectors, _ = solveEig(self._cov, n_modes=n_modes, zeros=True,
                                  turbo=turbo, reverse=True)
    which = values > ZERO
    self._eigvals = values[which]
    self._array = vectors[:, which]
    self._vars = values
    self._n_modes = len(self._eigvals)
    LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
        self._n_modes, time.time() - start))
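# Hedged usage sketch for calcModes (not part of this module): assumes this method
# belongs to prody.PCA, that ProDy is installed, and that network access is available
# to fetch the multi-model NMR ensemble 2k39 used in the ProDy tutorials.
from prody import PCA, Ensemble, parsePDB

ubi = parsePDB('2k39', subset='calpha')     # multi-model NMR structure of ubiquitin
ensemble = Ensemble(ubi)                    # coordinate sets copied from the Atomic object
ensemble.iterpose()                         # superpose iteratively before PCA
pca = PCA('ubiquitin NMR')
pca.buildCovariance(ensemble)
pca.calcModes(n_modes=20)                   # pass None or 'all' to keep every mode
print(pca.getEigvals()[:5])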
def _sample(self, conf):
    tmp = self._atoms.copy()
    tmp.setCoords(conf)
    cg = tmp[self._idx_cg]

    anm_cg = self._buildANM(cg)
    if not self._checkANM(anm_cg):
        return None

    anm_cg.calcModes(self._n_modes)
    anm_ex = self._extendModel(anm_cg, cg, tmp)
    ens_ex = sampleModes(anm_ex, atoms=tmp, n_confs=self._n_confs,
                         rmsd=self._rmsd[self._cycle])
    coordsets = ens_ex.getCoordsets()

    if self._targeted:
        if self._parallel:
            with Pool(cpu_count()) as p:
                pot_conf = p.map(self._multi_targeted_sim,
                                 [(conf, coords) for coords in coordsets])
        else:
            pot_conf = [self._multi_targeted_sim((conf, coords))
                        for coords in coordsets]

        pots, poses = list(zip(*pot_conf))

        idx = np.logical_not(np.isnan(pots))
        coordsets = np.array(poses)[idx]

        LOGGER.debug('%d/%d sets of coordinates were moved to the target'
                     % (len(coordsets), len(poses)))

    return coordsets
def calcHitTime(model, method='standard'): """Returns the hit and commute times between pairs of nodes calculated based on a :class:`.NMA` object. .. [CB95] Chennubhotla C., Bahar I. Signal Propagation in Proteins and Relation to Equilibrium Fluctuations. *PLoS Comput Biol* **2007** 3(9). :arg model: model to be used to calculate hit times :type model: :class:`.NMA` :arg method: method to be used to calculate hit times. Available options are ``"standard"`` or ``"kirchhoff"``. Default is ``"standard"`` :type method: str :returns: (:class:`~numpy.ndarray`, :class:`~numpy.ndarray`) """ try: K = model.getKirchhoff() except AttributeError: raise TypeError('model must be an NMA instance') if K is None: raise ValueError('model not built') method = method.lower() D = np.diag(K) A = np.diag(D) - K start = time.time() linalg = importLA() if method == 'standard': st = D / sum(D) P = np.dot(np.diag(D**(-1)), A) W = np.ones((len(st), 1)) * st.T Z = linalg.pinv(np.eye(P.shape[0], P.shape[1]) - P + W) H = np.ones((len(st), 1)) * np.diag(Z).T - Z H = H / W H = H.T elif method == 'kirchhoff': K_inv = linalg.pinv(K) sum_D = sum(D) T1 = (sum_D * np.ones((len(D),1)) * np.diag(K_inv)).T T2 = sum_D * K_inv T3_i = np.dot((np.ones((len(D),1)) * D), K_inv) H = T1 - T2 + T3_i - T3_i.T C = H + H.T LOGGER.debug('Hit and commute times are calculated in {0:.2f}s.' .format(time.time()-start)) return H, C
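# Hedged usage sketch for calcHitTime (defined above): assumes prody.GNM is available
# and that 1ubi can be fetched; 'standard' is the default method documented above.
import numpy as np
from prody import GNM, parsePDB

calphas = parsePDB('1ubi', subset='calpha')
gnm = GNM('1ubi')
gnm.buildKirchhoff(calphas)                     # default 10 A cutoff, gamma = 1.0
H, C = calcHitTime(gnm, method='standard')
print(H.shape)                                  # (n_atoms, n_atoms); H is generally asymmetric
print(np.allclose(C, C.T))                      # commute times C = H + H.T are symmetric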
def parseOBO(**kwargs): """Parse a GO OBO file containing the GO itself. See `OBO`_ for more information on the file format. .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html """ try: from goatools import obo_parser except: raise ImportError('GOATools needs to be installed to use parseOBO') go_obo_url = kwargs.get('go_obo_url', None) if go_obo_url is None: go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo' data_folder = kwargs.get('data_folder', None) if data_folder is None: data_folder = os.getcwd() + '/Data' # Check if we have the ./data directory already if (not os.path.isfile(data_folder)): # Emulate mkdir -p (no error if folder exists) try: os.mkdir(data_folder) except OSError as e: if (e.errno != 17): raise e else: raise Exception( 'Data path (' + data_folder + ') exists as a file. ' 'Please rename, remove or change the desired location of the data path.' ) # Check if the file exists already if (not os.path.isfile(data_folder + '/go-basic.obo')): try: handle = openURL(go_obo_url) except Exception as err: LOGGER.warn('{0} download failed ({1}).'.format( go_obo_url, str(err))) else: data = handle.read() if len(data): filename = data_folder + '/go-basic.obo' with open(filename, 'w+b') as obofile: obofile.write(data) LOGGER.debug('{0} downloaded ({1})'.format( go_obo_url, sympath(filename))) else: LOGGER.warn( '{0} download failed, reason unknown.'.format(go_obo_url)) else: go_obo = data_folder + '/go-basic.obo' return obo_parser.GODag(go_obo)
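# Hedged usage sketch for parseOBO (defined above): assumes goatools is installed; the
# returned GODag behaves like a dictionary of GO terms keyed by accession.
go_dag = parseOBO(data_folder='./Data')
term = go_dag.get('GO:0008150')                 # biological_process root term
if term is not None:
    print(term.name, term.namespace)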
def buildReducedHessian(coords, system, cutoff=15., gamma=1.0, **kwargs): r0 = kwargs.pop('r0', 20.) dr = kwargs.pop('dr', 20.) labels = peelr(coords, system, r0, dr) LOGGER.debug('layers: ' + str(np.unique(labels))) H = calcHessianRecursion(coords, labels, 0, cutoff=cutoff, gamma=gamma, **kwargs) return H
def performSVD(self, coordsets):
    """Calculate principal modes using singular value decomposition (SVD).
    *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`,
    or :class:`numpy.ndarray` instance.  If *coordsets* is a numpy array,
    its shape must be ``(n_csets, n_atoms, 3)``.  Note that coordinate
    sets must be aligned prior to SVD calculations.

    This is a considerably faster way of performing PCA calculations
    compared to eigenvalue decomposition of covariance matrix, but is an
    approximate method when heterogeneous datasets are analyzed.
    Covariance method should be preferred over this one for analysis of
    ensembles with missing atomic data.  See :ref:`pca-xray-calculations`
    example for comparison of results from SVD and covariance methods."""

    linalg = importLA()
    start = time.time()
    if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)):
        raise TypeError('coordsets must be an Ensemble, Atomic, or Numpy '
                        'array instance')
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
        deviations = coordsets - coordsets.mean(0)
    else:
        if isinstance(coordsets, Ensemble):
            deviations = coordsets.getDeviations()
        elif isinstance(coordsets, Atomic):
            deviations = (coordsets._getCoordsets() -
                          coordsets._getCoords())

    n_confs = deviations.shape[0]
    if n_confs < 3:
        raise ValueError('coordsets must have at least 3 coordinate sets')
    n_atoms = deviations.shape[1]
    if n_atoms < 3:
        raise ValueError('coordsets must have at least 3 atoms')

    dof = n_atoms * 3
    deviations = deviations.reshape((n_confs, dof)).T

    vectors, values, self._temp = linalg.svd(deviations, full_matrices=False)
    values = (values ** 2) / n_confs
    self._dof = dof
    self._n_atoms = n_atoms
    which = values > 1e-18
    self._eigvals = values[which]
    self._array = vectors[:, which]
    self._vars = self._eigvals
    self._trace = self._vars.sum()
    self._n_modes = len(self._eigvals)
    LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format(
        self._n_modes, time.time() - start))
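# Hedged usage sketch for performSVD with a plain numpy array: assumes this method
# belongs to prody.PCA; the random coordinates below are synthetic, not real data.
import numpy as np
from prody import PCA

coordsets = np.random.default_rng(0).normal(size=(10, 50, 3))   # 10 conformers, 50 atoms
pca = PCA('synthetic example')
pca.performSVD(coordsets)
print(pca.numModes())
print(pca.getVariances()[:3])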
def parseOBO(**kwargs): """Parse a GO OBO file containing the GO itself. See `OBO`_ for more information on the file format. .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html """ try: from goatools import obo_parser except: raise ImportError('GOATools needs to be installed to use parseOBO') go_obo_url = kwargs.get('go_obo_url', None) if go_obo_url is None: go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo' data_folder = kwargs.get('data_folder', None) if data_folder is None: data_folder = os.getcwd() + '/Data' # Check if we have the ./data directory already if(not os.path.isfile(data_folder)): # Emulate mkdir -p (no error if folder exists) try: os.mkdir(data_folder) except OSError as e: if(e.errno != 17): raise e else: raise Exception('Data path (' + data_folder + ') exists as a file. ' 'Please rename, remove or change the desired location of the data path.') # Check if the file exists already if(not os.path.isfile(data_folder+'/go-basic.obo')): try: handle = openURL(go_obo_url) except Exception as err: LOGGER.warn('{0} download failed ({1}).'.format( go_obo_url, str(err))) else: data = handle.read() if len(data): filename = data_folder+'/go-basic.obo' with open(filename, 'w+b') as obofile: obofile.write(data) LOGGER.debug('{0} downloaded ({1})' .format(go_obo_url, sympath(filename))) else: LOGGER.warn('{0} download failed, reason unknown.' .format(go_obo_url)) else: go_obo = data_folder+'/go-basic.obo' return obo_parser.GODag(go_obo)
def searchDali(pdbId, chainId, daliURL=None, subset='fullPDB', **kwargs): """Search Dali server with input of PDB ID and chain ID. Dali server: http://ekhidna2.biocenter.helsinki.fi/dali/ :arg subset: fullPDB, PDB25, PDB50, PDB90 :type subset: str """ LOGGER.timeit('_dali') # timeout = 120 timeout = kwargs.pop('timeout', 120) if daliURL is None: daliURL = "http://ekhidna2.biocenter.helsinki.fi/cgi-bin/sans/dump.cgi" pdbId = pdbId.lower() pdb_chain = pdbId + chainId parameters = { 'cd1': pdb_chain, 'method': 'search', 'title': 'Title_' + pdb_chain, 'address': '' } enc_params = urllib.urlencode(parameters).encode('utf-8') request = urllib2.Request(daliURL, enc_params) try_error = 3 while try_error >= 0: try: url = urllib2.urlopen(request).url break except: try_error -= 1 if try_error >= 0: LOGGER.sleep( 2, '. Connection error happened. Trying to reconnect...') continue else: url = urllib2.urlopen(request).url break if url.split('.')[-1].lower() in ['html', 'php']: # print('test -1: '+url) url = url.replace(url.split('/')[-1], '') LOGGER.debug( 'Submitted Dali search for PDB and chain "{0} and {1}".'.format( pdbId, chainId)) LOGGER.info(url) LOGGER.clear() obj = DaliRecord(url, pdbId, chainId, subset=subset, timeout=timeout, **kwargs) #if obj.isSuccess: return obj
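# Hedged usage sketch for searchDali (defined above): assumes network access to the Dali
# server. The returned DaliRecord wraps the submitted job; its accessors are not shown here.
dali_rec = searchDali('1p38', 'A', subset='fullPDB', timeout=300)
print(dali_rec)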
def performSVD(self, coordsets): """Calculate principal modes using singular value decomposition (SVD). *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`, or :class:`numpy.ndarray` instance. If *coordsets* is a numpy array, its shape must be ``(n_csets, n_atoms, 3)``. Note that coordinate sets must be aligned prior to SVD calculations. This is a considerably faster way of performing PCA calculations compared to eigenvalue decomposition of covariance matrix, but is an approximate method when heterogeneous datasets are analyzed. Covariance method should be preferred over this one for analysis of ensembles with missing atomic data. See :ref:`pca-xray-calculations` example for comparison of results from SVD and covariance methods.""" linalg = importLA() start = time.time() if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)): raise TypeError('coordsets must be an Ensemble, Atomic, Numpy ' 'array instance') if isinstance(coordsets, np.ndarray): if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or coordsets.dtype not in (np.float32, float)): raise ValueError('coordsets is not a valid coordinate array') deviations = coordsets - coordsets.mean(0) else: if isinstance(coordsets, Ensemble): deviations = coordsets.getDeviations() elif isinstance(coordsets, Atomic): deviations = (coordsets._getCoordsets() - coordsets._getCoords()) n_confs = deviations.shape[0] if n_confs < 3: raise ValueError('coordsets must have more than 3 coordinate sets') n_atoms = deviations.shape[1] if n_atoms < 3: raise ValueError('coordsets must have more than 3 atoms') dof = n_atoms * 3 deviations = deviations.reshape((n_confs, dof)).T vectors, values, self._temp = linalg.svd(deviations, full_matrices=False) values = (values ** 2) / n_confs self._dof = dof self._n_atoms = n_atoms which = values > 1e-18 self._eigvals = values[which] self._array = vectors[:, which] self._vars = self._eigvals self._trace = self._vars.sum() self._n_modes = len(self._eigvals) LOGGER.debug('{0} modes were calculated in {1:.2f}s.' .format(self._n_modes, time.time()-start))
def SCN(M, **kwargs): """ Performs Sequential Component Normalization on matrix *M*. .. [AC12] Cournac A, Marie-Nelly H, Marbouty M, Koszul R, Mozziconacci J. Normalization of a chromosomal contact map. *BMC Genomics* **2012**. """ total_count = kwargs.pop('total_count', None) max_loops = kwargs.pop('max_loops', 100) tol = kwargs.pop('tol', 1e-5) N = M.copy() n = 0 d0 = None p = 1 last_p = None while True: C = np.diag(div0(1., np.sum(N, axis=0))) N = np.dot(N, C) R = np.diag(div0(1., np.sum(N, axis=1))) N = np.dot(R, N) n += 1 # check convergence of symmetry d = np.mean(np.abs(N - N.T)) if d0 is not None: p = div0(d, d0) dp = np.abs(p - last_p) if dp < tol: break else: d0 = d LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format( str(n), str(d), str(p))) last_p = p if max_loops is not None: if n >= max_loops: LOGGER.warn('The SCN algorithm did not converge after {0} ' 'iterations.'.format(max_loops)) break # guarantee symmetry N = (N + N.T) / 2. if total_count == 'original': total_count = np.sum(M) if total_count is not None: sum_N = np.sum(N) k = total_count / sum_N N = N * k return N
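# Minimal numeric sketch for SCN (defined above): a small symmetric random matrix stands
# in for a Hi-C contact map; only numpy is needed beyond the div0/LOGGER helpers of this
# module.
import numpy as np

rng = np.random.default_rng(0)
M = rng.random((6, 6))
M = M + M.T                                     # symmetric, non-negative entries
N = SCN(M, max_loops=50, tol=1e-6)
print(N.sum(axis=0))                            # column sums become nearly uniform
print(N.sum(axis=1))                            # row sums too, since N is symmetrized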
def _getEigvecs(modes, row_norm=False, dummy_mode=False): la = importLA() if isinstance(modes, (Mode, ModeSet, NMA)): model = modes._model if isinstance(model, MaskedGNM): masked = model.masked model.masked = True V = modes.getArray() model.masked = masked else: V = modes.getArray() elif isinstance(modes, np.ndarray): V = modes else: try: mode0 = modes[0] if isinstance(mode0, Mode): V = np.empty((len(mode0), 0)) for mode in modes: assert isinstance(mode, Mode), 'Modes should be a list of modes.' v = mode.getEigvec() v = np.expand_dims(v, axis=1) V = np.hstack((V, v)) else: V = np.array(modes) except TypeError: raise TypeError('Modes should be a list of modes.') if V.ndim == 1: V = np.expand_dims(V, axis=1) # add a dummy zero mode to the modeset if dummy_mode: v0 = V[:, 0] if np.allclose(v0, np.mean(v0)): dummy_mode = False LOGGER.warn( 'at least one zero mode is detected therefore dummy mode will NOT be added' ) if dummy_mode: n, _ = V.shape v0 = np.ones((n, 1), dtype=V.dtype) v0 /= la.norm(v0) V = np.hstack((v0, V)) LOGGER.debug('a dummy zero mode is added') # normalize the rows so that feature vectors are unit vectors if row_norm: norms = la.norm(V, axis=1) N = np.diag(div0(1., norms)) V = np.dot(N, V) return V
def calcModes(self, n_modes=20, turbo=True): """Calculate principal (or essential) modes. This method uses :func:`scipy.linalg.eigh`, or :func:`numpy.linalg.eigh`, function to diagonalize the covariance matrix. :arg n_modes: number of non-zero eigenvalues/vectors to calculate, default is 20, if **None** or ``'all'`` is given, all modes will be calculated :type n_modes: int :arg turbo: when available, use a memory intensive but faster way to calculate modes, default is **True** :type turbo: bool""" linalg = importLA() if self._cov is None: raise ValueError('covariance matrix is not built or set') start = time.time() dof = self._dof self._clear() if str(n_modes).lower() == 'all': n_modes = None if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = dof else: n_modes = int(n_modes) if n_modes >= self._dof: eigvals = None n_modes = dof else: eigvals = (dof - n_modes, dof - 1) values, vectors = linalg.eigh(self._cov, turbo=turbo, eigvals=eigvals) else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes are calculated.') values, vectors = linalg.eigh(self._cov) # Order by descending SV revert = list(range(len(values) - 1, -1, -1)) values = values[revert] vectors = vectors[:, revert] which = values > 1e-8 self._eigvals = values[which] self._array = vectors[:, which] self._vars = self._eigvals self._n_modes = len(self._eigvals) LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format( self._n_modes, time.time() - start))
def SCN(M, **kwargs):
    la = importLA()

    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))
        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            d0 = d

        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(
            str(n), str(d), str(p)))
        last_p = p

        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break

    # guarantee symmetry
    N = (N + N.T) / 2.

    if total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k

    return N
def calcModes(self, n_modes=20, turbo=True): """Calculate principal (or essential) modes. This method uses :func:`scipy.linalg.eigh`, or :func:`numpy.linalg.eigh`, function to diagonalize the covariance matrix. :arg n_modes: number of non-zero eigenvalues/vectors to calculate, default is 20, if **None** or ``'all'`` is given, all modes will be calculated :type n_modes: int :arg turbo: when available, use a memory intensive but faster way to calculate modes, default is **True** :type turbo: bool""" linalg = importLA() if self._cov is None: raise ValueError('covariance matrix is not built or set') start = time.time() dof = self._dof self._clear() if str(n_modes).lower() == 'all': n_modes = None if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = dof else: n_modes = int(n_modes) if n_modes >= self._dof: eigvals = None n_modes = dof else: eigvals = (dof - n_modes, dof - 1) values, vectors = linalg.eigh(self._cov, turbo=turbo, eigvals=eigvals) else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes are calculated.') values, vectors = linalg.eigh(self._cov) # Order by descending SV revert = list(range(len(values)-1, -1, -1)) values = values[revert] vectors = vectors[:, revert] which = values > 1e-8 self._eigvals = values[which] self._array = vectors[:, which] self._vars = self._eigvals self._n_modes = len(self._eigvals) LOGGER.debug('{0} modes were calculated in {1:.2f}s.' .format(self._n_modes, time.time()-start))
def SCN(M, **kwargs):
    la = importLA()

    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))
        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            d0 = d

        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(
            str(n), str(d), str(p)))
        last_p = p

        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break

    # guarantee symmetry
    N = (N + N.T) / 2.

    if total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k

    return N
def match(self):
    """Matches the modes across mode sets according to the mode overlaps."""

    if self._modesets:
        #LOGGER.debug('Matching {0} modes across {1} modesets...'
        #             .format(self.numModes(), self.numModeSets()))
        start = time.time()
        self._modesets = matchModes(*self._modesets)
        LOGGER.debug('{0} modes across {1} modesets were matched in {2:.2f}s.'
                     .format(self.numModes(), self.numModeSets(),
                             time.time() - start))
    else:
        LOGGER.warn('Mode ensemble has no modesets')
    self._matched = True
    return
def calcHitTime(self, method='Z'): if self._affinity is None: self._buildAffinity() start = time.time() linalg = importLA() if method == 'Z': D = self._diagonal A = self._affinity st = D / sum(D) P = np.dot(np.diag(D**(-1)), A) W = np.ones((len(st),1)) * st.T Z = linalg.pinv(np.eye(P.shape[0], P.shape[1]) - P + W) H = np.ones((len(st),1)) * np.diag(Z).T - Z H = H / W H = H.T elif method == 'K': K = self._kirchhoff D = self._diagonal K_inv = linalg.pinv(K) sum_D = sum(D) T1 = (sum_D * np.ones((len(D),1)) * np.diag(K_inv)).T T2 = sum_D * K_inv T3_i = np.dot((np.ones((len(D),1)) * D), K_inv) H = T1 - T2 + T3_i - T3_i.T self._hitTime = H self._commuteTime = H + H.T LOGGER.debug('Hitting and commute time are calculated in {0:.2f}s.' .format(time.time()-start))
def runManySteps(self, n_steps, **kwargs): LOGGER.timeit('_prody_runManySteps') n_start = self.numSteps while self.numSteps < n_start + n_steps: self.runStep(structA=self.structA, structB=self.structB, **kwargs) LOGGER.debug( 'Total time so far is %.2f minutes' % ((time.time() - LOGGER._times['_prody_runManySteps']) / 60)) converged = self.checkConvergence() if converged: self.structA.setCoords( self.coordsA ) # That way the original object is back to normal self.structB.setCoords( self.coordsB ) # That way the original object is back to normal LOGGER.debug( 'Process completed in %.2f hours' % ((time.time() - LOGGER._times['_prody_runManySteps']) / 3600)) break
def calcHessianRecursion(coords, layers, layer, cutoff=15., gamma=1.0, **kwargs): if layer == 0: LOGGER.debug('max layer: %d' % max(layers)) LOGGER.debug('layer: %d' % layer) Hss, Hse = buildLayerHessian(coords, layers, layer, cutoff=cutoff, gamma=gamma, **kwargs) if Hse is None: # last layer, Hee=Hss H = Hss else: Hee = calcHessianRecursion(coords, layers, layer + 1, cutoff=cutoff, gamma=gamma, **kwargs) Cee = inv(Hee) #H = Hss - Hse.dot(Cee.dot(Hse.T)) #H = Hss - Hse @ Cee @ Hse.T if PY3K: H = Hss - Hse.__matmul__(Cee).__matmul__(Hse.T) else: H = Hss - Hse.dot(Cee.dot(Hse.T)) LOGGER.debug('layer: %d finished' % layer) return H
def calcKirchhoffRecursion(coords, layers, layer, cutoff=15., gamma=1.0, **kwargs): if layer == 0: LOGGER.debug('max layer: %d' % max(layers)) LOGGER.debug('layer: %d' % layer) Gss, Gse = buildLayerKirchhoff(coords, layers, layer, cutoff=cutoff, gamma=gamma, **kwargs) if Gse is None: # last layer, Gee=Gss G = Gss else: Gee = calcKirchhoffRecursion(coords, layers, layer + 1, cutoff=cutoff, gamma=gamma, **kwargs) Cee = inv(Gee) #G = Gss - Gse.dot(Cee.dot(Gse.T)) #G = Gss - Gse @ Cee @ Gse.T if PY3K: G = Gss - Gse.__matmul__(Cee).__matmul__(Gse.T) else: G = Gss - Gse.dot(Cee.dot(Gse.T)) LOGGER.debug('layer: %d finished' % layer) return G
def runManyStepsAlternating(self, n_steps, **kwargs): LOGGER.timeit('_prody_runManySteps') n_start = self.numSteps while self.numSteps < n_start + n_steps: n_modes = self.n_modes self.runStep(structA=self.structA, structB=self.structB, reduceSelA=self.reduceSelA, reduceSelB=self.reduceSelB, alignSelA=self.alignSelA, alignSelB=self.alignSelB, n_modes=n_modes, **kwargs) LOGGER.debug( 'Total time so far is %.2f minutes' % ((time.time() - LOGGER._times['_prody_runManySteps']) / 60)) self.runStep(structA=self.structB, structB=self.structA, reduceSelA=self.reduceSelB, reduceSelB=self.reduceSelA, alignSelA=self.alignSelB, alignSelB=self.alignSelA, n_modes=n_modes, **kwargs) LOGGER.debug( 'Total time so far is %.2f minutes' % ((time.time() - LOGGER._times['_prody_runManySteps']) / 60)) converged = self.checkConvergence() if converged: self.structA.setCoords( self.coordsA ) # That way the original object is back to normal self.structB.setCoords( self.coordsB ) # That way the original object is back to normal LOGGER.debug( 'Process completed in %.2f hours' % ((time.time() - LOGGER._times['_prody_runManySteps']) / 3600)) break ensemble = Ensemble('combined trajectory') ensemble.setAtoms(self.structA) for coordset in self.ensembleA.getCoordsets(): ensemble.addCoordset(coordset) for coordset in reversed(self.ensembleB.getCoordsets()): ensemble.addCoordset(coordset) if self.outputPDB: writePDB(self.filename, ensemble) if self.outputDCD: writeDCD(self.filename, ensemble) return
def peelr(coords, system, r0=20., dr=20.): n_sys_atoms = int(system.sum()) n_atoms = len(system) labels = np.zeros(n_atoms, dtype=int) # identify system beads sys_coords = coords[system, :2] sys_norms = norm(sys_coords, axis=1) sys_r = max(sys_norms) r0 += sys_r # label environment beads env_coords = coords[~system, :2] env_norms = norm(env_coords, axis=1) L = (env_norms - r0) // dr + 1 L = np.clip(L, 0, None) + 1 labels[n_sys_atoms:] = L uniq_labels = np.unique(labels) if len(uniq_labels) >= 3: uniq_labels.sort() lbl_last = uniq_labels[-1] lbl_2nd_last = uniq_labels[-2] n_last = np.sum(labels == lbl_last) n_2nd_last = np.sum(labels == lbl_2nd_last) if n_last < 0.2 * n_2nd_last: LOGGER.debug('edge nodes detected (%d/%d)' % (n_2nd_last, n_last)) labels[labels == lbl_last] = lbl_2nd_last if len(uniq_labels) >= 3: uniq_labels.sort() lbl_first = uniq_labels[1] lbl_2nd = uniq_labels[2] n_first = np.sum(labels == lbl_first) n_2nd = np.sum(labels == lbl_2nd) if n_first < 0.2 * n_2nd: LOGGER.debug('inner nodes detected (%d/%d)' % (n_2nd, n_first)) labels[labels == lbl_first] = lbl_2nd if not any(uniq_labels == 1): LOGGER.debug('no layer inside the system') for i in range(len(labels)): if labels[i] > 1: labels[i] -= 1 uniq_labels = np.unique(labels) for i, label in enumerate(uniq_labels): labels[labels == label] = i return labels
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.xfam.org/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.urlencode(parameters) request = urllib.request.Request('http://hmmer.janelia.org/search/hmmscan', enc_params) url = ( urllib.request.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) matches = {} for child in root[0]: if child.tag == 'hits': accession = child.get('acc') pfam_id = accession.split('.')[0] matches[pfam_id]={} matches[pfam_id]['accession']=accession matches[pfam_id]['class']='Domain' matches[pfam_id]['id']=child.get('name') matches[pfam_id]['locations']={} matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto') matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom') matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore') matches[pfam_id]['locations']['end']=child[0].get('alisqto') matches[pfam_id]['locations']['evalue']=child.get('evalue') matches[pfam_id]['locations']['evidence']='hmmer v3.0' matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto') matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom') matches[pfam_id]['locations']['significant']=child[0].get('significant') matches[pfam_id]['locations']['start']=child[0].get('alisqfrom') matches[pfam_id]['type']='Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.' 
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' else: url = ('http://pfam.xfam.org/protein/' + idcode + '?output=xml') else: url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
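# Hedged usage sketch for searchPfam (defined above): assumes network access to the Pfam
# web services used above; 'PIWI_ARCFU' is the UniProt entry name used in the ProDy Evol
# tutorials.
matches = searchPfam('PIWI_ARCFU')
if matches:
    for acc, info in matches.items():
        print(acc, info.get('id'), info.get('type'))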
def _targeted_sim(self, coords0, coords1, tmdk=15., d_steps=100, n_max_steps=10000, ddtol=1e-3, n_conv=5): try: from openmm import CustomExternalForce from openmm.app import StateDataReporter from openmm.unit import nanometer, kelvin, angstrom, kilojoule_per_mole, MOLAR_GAS_CONSTANT_R except ImportError: raise ImportError('Please install PDBFixer and OpenMM 7.6 in order to use ClustENM.') tmdk *= kilojoule_per_mole/angstrom**2 tmdk = tmdk.value_in_unit(kilojoule_per_mole/nanometer**2) # coords1_ca = coords1[self._idx_cg, :] pos1 = coords1 * angstrom # pos1_ca = pos1[self._idx_cg, :] force = CustomExternalForce('tmdk*((x-x0)^2+(y-y0)^2+(z-z0)^2)') force.addGlobalParameter('tmdk', 0.) force.addPerParticleParameter('x0') force.addPerParticleParameter('y0') force.addPerParticleParameter('z0') force.setForceGroup(1) # for i, atm_idx in enumerate(self._idx_cg): # pars = pos1_ca[i, :].value_in_unit(nanometer) # force.addParticle(int(atm_idx), pars) n_atoms = coords0.shape[0] atom_indices = np.arange(n_atoms) for i, atm_idx in enumerate(atom_indices): pars = pos1[i, :].value_in_unit(nanometer) force.addParticle(int(atm_idx), pars) simulation = self._prep_sim([force]) # automatic conversion into nanometer will be carried out. simulation.context.setPositions(coords0 * angstrom) dist = dist0 = calcRMSD(coords0, coords1) m_conv = 0 n_steps = 0 try: simulation.minimizeEnergy(tolerance=self._tolerance*kilojoule_per_mole, maxIterations=self._maxIterations) # update parameters while n_steps < n_max_steps: simulation.context.setParameter('tmdk', tmdk) force.updateParametersInContext(simulation.context) simulation.step(d_steps) n_steps += d_steps # evaluate distance to destination pos = simulation.context.getState(getPositions=True).getPositions(asNumpy=True).value_in_unit(angstrom) d = calcRMSD(pos, coords1) dd = np.abs(dist - d) if dd < ddtol: m_conv += 1 if m_conv >= n_conv: break dist = d LOGGER.debug('RMSD: %4.2f -> %4.2f' % (dist0, dist)) simulation.context.setParameter('tmdk', 0.0) simulation.minimizeEnergy(tolerance=self._tolerance*kilojoule_per_mole, maxIterations=self._maxIterations) pos = simulation.context.getState(getPositions=True).getPositions(asNumpy=True).value_in_unit(angstrom) pot = simulation.context.getState(getEnergy=True).getPotentialEnergy().value_in_unit(kilojoule_per_mole) return pot, pos except BaseException as be: LOGGER.warning('OpenMM exception: ' + be.__str__() + ' so the corresponding conformer will be discarded!') return np.nan, np.full_like(coords0, np.nan)
def buildKirchhoff(self, coords, cutoff=10., gamma=1., **kwargs): """Build Kirchhoff matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` or :class:`.Atomic` :arg cutoff: cutoff distance (Å) for pairwise interactions default is 10.0 Å, , minimum is 4.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float :arg sparse: elect to use sparse matrices, default is **False**. If Scipy is not found, :class:`ImportError` is raised. :type sparse: bool :arg kdtree: elect to use KDTree for building Kirchhoff matrix faster, default is **True** :type kdtree: bool Instances of :class:`Gamma` classes and custom functions are accepted as *gamma* argument. When Scipy is available, user can select to use sparse matrices for efficient usage of memory at the cost of computation speed.""" try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') cutoff, g, gamma = checkENMParameters(cutoff, gamma) self._reset() self._cutoff = cutoff self._gamma = g n_atoms = coords.shape[0] start = time.time() if kwargs.get('sparse', False): try: from scipy import sparse as scipy_sparse except ImportError: raise ImportError('failed to import scipy.sparse, which is ' 'required for sparse matrix calculations') kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms)) else: kirchhoff = np.zeros((n_atoms, n_atoms), 'd') if kwargs.get('kdtree', True): kdtree = KDTree(coords) kdtree.search(cutoff) dist2 = kdtree.getDistances() ** 2 r = 0 for i, j in kdtree.getIndices(): g = gamma(dist2[r], i, j) kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] + g kirchhoff[j, j] = kirchhoff[j, j] + g r += 1 else: LOGGER.info('Using slower method for building the Kirchhoff.') cutoff2 = cutoff * cutoff mul = np.multiply for i in range(n_atoms): xyz_i = coords[i, :] i_p1 = i+1 i2j = coords[i_p1:, :] - xyz_i mul(i2j, i2j, i2j) for j, dist2 in enumerate(i2j.sum(1)): if dist2 > cutoff2: continue j += i_p1 g = gamma(dist2, i, j) kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] + g kirchhoff[j, j] = kirchhoff[j, j] + g LOGGER.debug('Kirchhoff was built in {0:.2f}s.' .format(time.time()-start)) self._kirchhoff = kirchhoff self._n_atoms = n_atoms self._dof = n_atoms
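# Hedged usage sketch for buildKirchhoff: assumes this method belongs to prody.GNM and
# that the 1ubi structure can be fetched; sparse=True would additionally require scipy.
from prody import GNM, parsePDB

calphas = parsePDB('1ubi', subset='calpha')
gnm = GNM('ubiquitin')
gnm.buildKirchhoff(calphas, cutoff=10.0, gamma=1.0, kdtree=True)
print(gnm.getKirchhoff().shape)                 # (n_atoms, n_atoms)
gnm.calcModes(n_modes=20)                       # modes can be computed once the matrix is built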
def fetchPDBfromMirror(*pdb, **kwargs): """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified *pdb* identifier(s). If a *folder* is specified, files will be copied into this folder. If *compressed* is **False**, files will decompressed. *format* argument can be used to get `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_ files: ``format='cif'`` will fetch an mmCIF file, and ``format='xml'`` will fetch a PDBML file. If PDBML header file is desired, ``noatom=True`` argument will do the job.""" mirror = pathPDBMirror() if mirror is None: raise IOError('no mirror path is set') try: mirror, mirror_format = mirror except ValueError: mirror_format = None format = str(kwargs.pop('format', 'pdb')).lower() if kwargs.get('check', True): identifiers = checkIdentifiers(*pdb) else: identifiers = list(pdb) if format == 'pdb': ftp_divided = 'data/structures/divided/pdb' ftp_pdbext = '.ent.gz' ftp_prefix = 'pdb' extension = '.pdb' elif format == 'xml': if bool(kwargs.pop('noatom', False)): ftp_divided = 'data/structures/divided/XML-noatom' ftp_pdbext = '-noatom.xml.gz' extension = '-noatom.xml' else: ftp_divided = 'data/structures/divided/XML' ftp_pdbext = '.xml.gz' extension = '.xml' ftp_prefix = '' elif format == 'cif': ftp_divided = 'data/structures/divided/mmCIF' ftp_pdbext = '.cif.gz' ftp_prefix = '' extension = '.cif' else: if format: raise ValueError('{0} is not a recognized format'.format( repr(format))) else: raise ValueError('please specify a valid format') if mirror_format: if mirror_format.lower() != format: raise IOError('mirror contains only ' + mirror_format + ' files') ftp_divided = '' else: ftp_divided = join(*ftp_divided.split('/')) folder = kwargs.get('folder') compressed = kwargs.get('compressed', True) filenames = [] append = filenames.append success = 0 failure = 0 for pdb in identifiers: if pdb is None: append(None) continue fn = join(mirror, ftp_divided, pdb[1:3], ftp_prefix + pdb + ftp_pdbext) if isfile(fn): if folder or not compressed: if compressed: fn = copyFile(fn, join(folder or '.', pdb + extension + '.gz')) else: fn = gunzip(fn, join(folder or '.', pdb + extension)) append(normpath(fn)) success += 1 else: append(None) failure += 1 if len(identifiers) == 1: fn = filenames[0] if success: LOGGER.debug('PDB file is found in the local mirror ({0}).'.format( sympath(fn))) return fn else: LOGGER.debug('PDB files found in the local mirror ({0} found, ' '{1} missed).'.format(success, failure)) return filenames
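# Hedged usage sketch for fetchPDBfromMirror (defined above): assumes a local PDB mirror
# already exists; '/path/to/pdb/mirror' is a placeholder, registered via pathPDBMirror().
from prody import pathPDBMirror

pathPDBMirror('/path/to/pdb/mirror')            # point ProDy at the existing mirror
fn = fetchPDBfromMirror('1ubi', format='pdb', compressed=True)
print(fn)                                       # e.g. .../divided/pdb/ub/pdb1ubi.ent.gz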
def refineMSA(msa, label=None, rowocc=None, seqid=None, colocc=None, **kwargs): """Refine *msa* by removing sequences (rows) and residues (columns) that contain gaps. :arg msa: multiple sequence alignment :type msa: :class:`.MSA` :arg label: remove columns that are gaps in the sequence matching label, ``msa.getIndex(label)`` must return a sequence index, a PDB identifier is also acceptable :type label: str :arg rowocc: row occupancy, sequences with less occupancy will be removed after *label* refinement is applied :type rowocc: float :arg seqid: keep unique sequences at specified sequence identity level, unique sequences are identified using :func:`.uniqueSequences` :type seqid: float :arg colocc: column occupancy, residue positions with less occupancy will be removed after other refinements are applied :type colocc: float :arg keep: keep columns corresponding to residues not resolved in the PDB structure, default is **False**, applies when *label* is a PDB identifier :arg type: bool For Pfam MSA data, *label* is UniProt entry name for the protein. You may also use PDB structure and chain identifiers, e.g. ``'1p38'`` or ``'1p38A'``, for *label* argument and UniProt entry names will be parsed using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and :class:`.DBRef`). The order of refinements are applied in the order of arguments. If *label* and *unique* is specified is specified, sequence matching *label* will be kept in the refined :class:`.MSA` although it may be similar to some other sequence.""" # if msa is a char array, it will be refined but label won't work try: ndim, dtype_ = msa.ndim, msa.dtype except AttributeError: try: arr = msa._getArray() except AttributeError: raise TypeError('msa must be a character array or an MSA instance') ndim, dtype_ = arr.ndim, arr.dtype else: arr, msa = msa, None if dtype('|S1') != dtype_: raise ValueError('msa must be a character array or an MSA instance') if ndim != 2: raise ValueError('msa must be a 2D array or an MSA instance') title = [] cols = None index = None if label is not None: before = arr.shape[1] LOGGER.timeit('_refine') try: upper, lower = label.upper(), label.lower() except AttributeError: raise TypeError('label must be a string') if msa is None: raise TypeError('msa must be an MSA instance, ' 'label cannot be used') index = msa.getIndex(label) if index is None: index = msa.getIndex(upper) if index is None: index = msa.getIndex(lower) chain = None if index is None and (len(label) == 4 or len(label) == 5): from prody import parsePDB try: structure, header = parsePDB(label[:4], header=True) except Exception as err: raise IOError('failed to parse header for {0} ({1})' .format(label[:4], str(err))) chid = label[4:].upper() for poly in header['polymers']: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if index is None: index = msa.getIndex(dbref.idcode) if index is not None: LOGGER.info('{0} idcode {1} for {2}{3} ' 'is found in chain {3}.'.format( dbref.database, dbref.idcode, label[:4], poly.chid, str(msa))) break if index is None: index = msa.getIndex(dbref.accession) if index is not None: LOGGER.info('{0} accession {1} for {2}{3} ' 'is found in chain {3}.'.format( dbref.database, dbref.accession, label[:4], poly.chid, str(msa))) break if index is not None: chain = structure[poly.chid] if index is None: raise ValueError('label is not in msa, or msa is not indexed') try: len(index) except TypeError: pass else: raise ValueError('label {0} maps onto multiple sequences, ' 'so cannot be used for 
refinement'.format(label)) title.append('label=' + label) cols = char.isalpha(arr[index]).nonzero()[0] arr = arr.take(cols, 1) LOGGER.report('Label refinement reduced number of columns from {0} to ' '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine') if chain is not None and not kwargs.get('keep', False): before = arr.shape[1] LOGGER.timeit('_refine') from prody.proteins.compare import importBioPairwise2 from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY pw2 = importBioPairwise2() chseq = chain.getSequence() algn = pw2.align.localms(arr[index].tostring().upper(), chseq, MATCH_SCORE, MISMATCH_SCORE, GAP_PENALTY, GAP_EXT_PENALTY, one_alignment_only=1) torf = [] for s, c in zip(*algn[0][:2]): if s == '-': continue elif c != '-': torf.append(True) else: torf.append(False) torf = array(torf) tsum = torf.sum() assert tsum <= before, 'problem in mapping sequence to structure' if tsum < before: arr = arr.take(torf.nonzero()[0], 1) LOGGER.report('Structure refinement reduced number of ' 'columns from {0} to {1} in %.2fs.' .format(before, arr.shape[1]), '_refine') else: LOGGER.debug('All residues in the sequence are contained in ' 'PDB structure {0}.'.format(label)) from .analysis import calcMSAOccupancy, uniqueSequences rows = None if rowocc is not None: before = arr.shape[0] LOGGER.timeit('_refine') try: rowocc = float(rowocc) except Exception as err: raise TypeError('rowocc must be a float ({0})'.format(str(err))) assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1' rows = calcMSAOccupancy(arr, 'row') >= rowocc if index is not None: index = rows[:index].sum() rows = (rows).nonzero()[0] arr = arr[rows] title.append('rowocc>=' + str(rowocc)) LOGGER.report('Row occupancy refinement reduced number of rows from ' '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine') if seqid is not None: before = arr.shape[0] LOGGER.timeit('_refine') unique = uniqueSequences(arr, seqid) if index is not None: unique[index] = True unique = unique.nonzero()[0] arr = arr[unique] title.append('seqid>=' + str(seqid)) if rows is not None: rows = rows[unique] else: rows = unique LOGGER.report('Sequence identity refinement reduced number of rows ' 'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine') if colocc is not None: before = arr.shape[1] LOGGER.timeit('_refine') try: colocc = float(colocc) except Exception as err: raise TypeError('colocc must be a float ({0})'.format(str(err))) assert 0. <= colocc <= 1., 'colocc must be between 0 and 1' cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0] arr = arr.take(cols, 1) title.append('colocc>=' + str(colocc)) LOGGER.report('Column occupancy refinement reduced number of columns ' 'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]), '_refine') if not title: raise ValueError('label, rowocc, colocc all cannot be None') # depending on slicing of rows, arr may not have it's own memory if arr.base is not None: arr = arr.copy() if msa is None: return arr else: if rows is None: from copy import copy labels = copy(msa._labels) mapping = copy(msa._mapping) else: labels = msa._labels labels = [labels[i] for i in rows] mapping = None return MSA(arr, title=msa.getTitle() + ' refined ({0})' .format(', '.join(title)), labels=labels, mapping=mapping)
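# Hedged usage sketch for refineMSA (defined above): assumes network access to Pfam;
# PF00074 and the label 'RNAS1_HUMAN' follow the ProDy Evol tutorial and may change with
# Pfam releases.
from prody import fetchPfamMSA, parseMSA

msa = parseMSA(fetchPfamMSA('PF00074'))
refined = refineMSA(msa, label='RNAS1_HUMAN', rowocc=0.8, seqid=0.98)
print(refined)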
def matchChains(atoms1, atoms2, **kwargs): """Return pairs of chains matched based on sequence similarity. Makes an all-to-all comparison of chains in *atoms1* and *atoms2*. Chains are obtained from hierarchical views (:class:`.HierView`) of atom groups. This function returns a list of matching chains in a tuples that contain 4 items: * matching chain from *atoms1* as a :class:`.AtomMap` instance, * matching chain from *atoms2* as a :class:`.AtomMap` instance, * percent sequence identity of the match, * percent sequence overlap of the match. List of matches are sorted in decreasing percent sequence identity order. :class:`.AtomMap` instances can be used to calculate RMSD values and superpose atom groups. :arg atoms1: atoms that contain a chain :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg atoms2: atoms that contain a chain :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :keyword subset: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``), ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"`` :type subset: string :keyword seqid: percent sequence identity, default is 90 :type seqid: float :keyword overlap: percent overlap, default is 90 :type overlap: float :keyword pwalign: perform pairwise sequence alignment :type pwalign: bool If *subset* is set to *calpha* or *backbone*, only alpha carbon atoms or backbone atoms will be paired. If set to *all*, all atoms common to matched residues will be returned. This function tries to match chains based on residue numbers and names. All chains in *atoms1* is compared to all chains in *atoms2*. This works well for different structures of the same protein. When it fails, :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching is performed based on the sequence alignment. User can control, whether sequence alignment is performed or not with *pwalign* keyword. If ``pwalign=True`` is passed, pairwise alignment is enforced.""" if not isinstance(atoms1, (AtomGroup, Chain, Selection)): raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection') if not isinstance(atoms2, (AtomGroup, Chain, Selection)): raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection') subset = kwargs.get('subset', 'calpha') if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument' .format(str(subset))) seqid = kwargs.get('seqid', 90.) assert isinstance(seqid, (float, int)), 'seqid must be float' assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100' coverage = kwargs.get('overlap') if coverage is None: coverage = kwargs.get('coverage', 90.) 
assert isinstance(coverage, (float, int)), 'overlap must be float' assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100' pwalign = kwargs.get('pwalign', None) if isinstance(atoms1, Chain): chains1 = [atoms1] atoms1 = atoms1.getAtomGroup() else: chains1 = list(atoms1.getHierView().iterChains()) if not isinstance(atoms1, AtomGroup): atoms1 = atoms1.getAtomGroup() chains = list() for ch in chains1: simpch = SimpleChain(ch) if len(simpch) > 0: chains.append(simpch) chains1 = chains if not isinstance(atoms1, Chain): LOGGER.debug('Checking {0}: {1} chains are identified' .format(str(atoms1), len(chains1))) if isinstance(atoms2, Chain): chains2 = [atoms2] atoms2 = atoms2.getAtomGroup() else: chains2 = list(atoms2.getHierView().iterChains()) if not isinstance(atoms2, AtomGroup): atoms2 = atoms2.getAtomGroup() chains = list() for ch in chains2: simpch = SimpleChain(ch) if len(simpch) > 0: chains.append(simpch) chains2 = chains if not isinstance(atoms2, Chain): LOGGER.debug('Checking {0}: {1} chains are identified' .format(str(atoms2), len(chains2))) matches = [] unmatched = [] LOGGER.debug('Trying to match chains based on residue numbers and names:') for simpch1 in chains1: for simpch2 in chains2: LOGGER.debug(' Comparing {0} (len={1}) and {2} (len={3}):' .format(simpch1.getTitle(), len(simpch1), simpch2.getTitle(), len(simpch2))) match1, match2, nmatches = getTrivialMatch(simpch1, simpch2) _seqid = nmatches * 100 / min(len(simpch1), len(simpch2)) _cover = len(match2) * 100 / max(len(simpch1), len(simpch2)) if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.' .format(len(match1), _seqid, _cover)) matches.append((match1, match2, _seqid, _cover, simpch1, simpch2)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).'.format(_seqid, _cover)) unmatched.append((simpch1, simpch2)) if pwalign or (not matches and (pwalign is None or pwalign)): pairwise2 = importBioPairwise2() if pairwise2: LOGGER.debug('Trying to match chains based on {0} sequence ' 'alignment:'.format(ALIGNMENT_METHOD)) for simpch1, simpch2 in unmatched: LOGGER.debug(' Comparing {0} (len={1}) and {2} ' '(len={3}):' .format(simpch1.getTitle(), len(simpch1), simpch2.getTitle(), len(simpch2))) match1, match2, nmatches = getAlignedMatch(simpch1, simpch2) _seqid = nmatches * 100 / min(len(simpch1), len(simpch2)) _cover = len(match2) * 100 / max(len(simpch1), len(simpch2)) if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.' .format(len(match1), _seqid, _cover)) matches.append((match1, match2, _seqid, _cover, simpch1, simpch2)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).' 
.format(_seqid, _cover)) else: LOGGER.warning('Pairwise alignment could not be performed.') if not matches: return None if subset == 'calpha': subset = 'ca' elif subset == 'backbone': subset = 'bb' elif subset == 'heavy': subset = 'noh' for mi, result in enumerate(matches): match1, match2, _seqid, _cover, simpch1, simpch2 = result indices1 = [] indices2 = [] for i in range(len(match1)): ares = match1[i] bres = match2[i] if subset == 'ca': try: aid = ares.getNames().tolist().index('CA') except ValueError: aid = None try: bid = bres.getNames().tolist().index('CA') if aid is not None: indices1.append(ares._indices[aid]) indices2.append(bres._indices[bid]) except ValueError: pass elif subset == 'bb': for bban in ('N', 'CA', 'C', 'O'): try: aid = ares.getNames().tolist().index(bban) except ValueError: continue try: bid = bres.getNames().tolist().index(bban) except ValueError: continue else: indices1.append(ares._indices[aid]) indices2.append(bres._indices[bid]) elif subset == 'noh': for han, aid, noh in zip(ares.getNames(), ares._indices, ares.getFlags('noh')): if not noh: continue try: bid = bres.getNames().tolist().index(han) except ValueError: continue else: indices1.append(aid) indices2.append(bres._indices[bid]) elif subset is None or subset is 'all': aans = ares.getNames() bans = bres.getNames().tolist() aids = ares.getIndices() #bids = bres.getIndices() for j in range(len(aans)): try: bid = bres._indices[bans.index(aans[j])] indices1.append(aids[j]) indices2.append(bid) except ValueError: pass indices1 = np.array(indices1, int) indices2 = np.array(indices2, int) match1 = AM(atoms1, indices1, atoms1.getACSIndex(), title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(), intarrays=True) match2 = AM(atoms2, indices2, atoms2.getACSIndex(), title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(), intarrays=True) matches[mi] = (match1, match2, _seqid, _cover) if len(matches) > 1: def compare(m1, m2): return cmp(m1[2], m2[2]) matches.sort(compare, reverse=True) return matches
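# Hedged usage sketch for matchChains (defined above): assumes both structures can be
# fetched; 1p38 and 1zz2 are the unbound/bound p38 MAP kinase pair used in ProDy tutorials.
from prody import parsePDB

unbound = parsePDB('1p38')
bound = parsePDB('1zz2')
results = matchChains(unbound, bound, seqid=90, overlap=90)
if results:
    amap1, amap2, seq_id, cover = results[0]    # best match (highest seqid) comes first
    print(len(amap1), seq_id, cover)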
def refineMSA(msa, index=None, label=None, rowocc=None, seqid=None, colocc=None, **kwargs): """Refine *msa* by removing sequences (rows) and residues (columns) that contain gaps. :arg msa: multiple sequence alignment :type msa: :class:`.MSA` :arg index: remove columns that are gaps in the sequence with that index :type index: int :arg label: remove columns that are gaps in the sequence matching label, ``msa.getIndex(label)`` must return a sequence index, a PDB identifier is also acceptable :type label: str :arg rowocc: row occupancy, sequences with less occupancy will be removed after *label* refinement is applied :type rowocc: float :arg seqid: keep unique sequences at specified sequence identity level, unique sequences are identified using :func:`.uniqueSequences` :type seqid: float :arg colocc: column occupancy, residue positions with less occupancy will be removed after other refinements are applied :type colocc: float :arg keep: keep columns corresponding to residues not resolved in the PDB structure, default is **False**, applies when *label* is a PDB identifier :arg type: bool For Pfam MSA data, *label* is UniProt entry name for the protein. You may also use PDB structure and chain identifiers, e.g. ``'1p38'`` or ``'1p38A'``, for *label* argument and UniProt entry names will be parsed using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and :class:`.DBRef`). The order of refinements are applied in the order of arguments. If *label* and *unique* is specified, sequence matching *label* will be kept in the refined :class:`.MSA` although it may be similar to some other sequence.""" # if msa is a char array, it will be refined but label won't work try: ndim, dtype_ = msa.ndim, msa.dtype except AttributeError: try: arr = msa._getArray() except AttributeError: raise TypeError('msa must be a character array or an MSA instance') ndim, dtype_ = arr.ndim, arr.dtype else: arr, msa = msa, None if dtype('|S1') != dtype_: raise ValueError('msa must be a character array or an MSA instance') if ndim != 2: raise ValueError('msa must be a 2D array or an MSA instance') title = [] cols = None if index is not None: before = arr.shape[1] LOGGER.timeit('_refine') cols = char.isalpha(arr[index]).nonzero()[0] arr = arr.take(cols, 1) title.append('index=' + str(index)) LOGGER.report( 'Index refinement reduced number of columns from {0} to ' '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine') if label is not None: if index is not None: LOGGER.info('An index was provided so the label will be ignored.') else: before = arr.shape[1] LOGGER.timeit('_refine') try: upper, lower = label.upper(), label.lower() except AttributeError: raise TypeError('label must be a string') if msa is None: raise TypeError('msa must be an MSA instance, ' 'label cannot be used') index = msa.getIndex(label) if index is None: index = msa.getIndex(upper) if index is None: index = msa.getIndex(lower) chain = None if index is None and (len(label) == 4 or len(label) == 5): from prody import parsePDB try: structure, header = parsePDB(label[:4], header=True) except Exception as err: raise IOError( 'failed to parse header for {0} ({1})'.format( label[:4], str(err))) chid = label[4:].upper() for poly in header['polymers']: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if index is None: index = msa.getIndex(dbref.idcode) if index is not None: LOGGER.info('{0} idcode {1} for {2}{3} ' 'is found in chain {4}.'.format( dbref.database, dbref.idcode, label[:4], poly.chid, str(msa))) break if index is None: index = 
msa.getIndex(dbref.accession) if index is not None: LOGGER.info('{0} accession {1} for {2}{3} ' 'is found in chain {4}.'.format( dbref.database, dbref.accession, label[:4], poly.chid, str(msa))) break if index is not None: chain = structure[poly.chid] if index is None: raise ValueError('label is not in msa, or msa is not indexed') try: len(index) except TypeError: pass else: raise ValueError( 'label {0} maps onto multiple sequences, ' 'so cannot be used for refinement'.format(label)) title.append('label=' + label) cols = char.isalpha(arr[index]).nonzero()[0] arr = arr.take(cols, 1) LOGGER.report( 'Label refinement reduced number of columns from {0} to ' '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine') if chain is not None and not kwargs.get('keep', False): before = arr.shape[1] LOGGER.timeit('_refine') from prody.proteins.compare import importBioPairwise2 from prody.proteins.compare import MATCH_SCORE, MISMATCH_SCORE from prody.proteins.compare import GAP_PENALTY, GAP_EXT_PENALTY pw2 = importBioPairwise2() chseq = chain.getSequence() algn = pw2.align.localms(arr[index].tostring().upper(), chseq, MATCH_SCORE, MISMATCH_SCORE, GAP_PENALTY, GAP_EXT_PENALTY, one_alignment_only=1) torf = [] for s, c in zip(*algn[0][:2]): if s == '-': continue elif c != '-': torf.append(True) else: torf.append(False) torf = array(torf) tsum = torf.sum() assert tsum <= before, 'problem in mapping sequence to structure' if tsum < before: arr = arr.take(torf.nonzero()[0], 1) LOGGER.report( 'Structure refinement reduced number of ' 'columns from {0} to {1} in %.2fs.'.format( before, arr.shape[1]), '_refine') else: LOGGER.debug( 'All residues in the sequence are contained in ' 'PDB structure {0}.'.format(label)) from .analysis import calcMSAOccupancy, uniqueSequences rows = None if rowocc is not None: before = arr.shape[0] LOGGER.timeit('_refine') try: rowocc = float(rowocc) except Exception as err: raise TypeError('rowocc must be a float ({0})'.format(str(err))) assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1' rows = calcMSAOccupancy(arr, 'row') >= rowocc if index is not None: index = rows[:index].sum() rows = (rows).nonzero()[0] arr = arr[rows] title.append('rowocc>=' + str(rowocc)) LOGGER.report( 'Row occupancy refinement reduced number of rows from ' '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine') if seqid is not None: before = arr.shape[0] LOGGER.timeit('_refine') unique = uniqueSequences(arr, seqid) if index is not None: unique[index] = True unique = unique.nonzero()[0] arr = arr[unique] title.append('seqid>=' + str(seqid)) if rows is not None: rows = rows[unique] else: rows = unique LOGGER.report( 'Sequence identity refinement reduced number of rows ' 'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine') if colocc is not None: before = arr.shape[1] LOGGER.timeit('_refine') try: colocc = float(colocc) except Exception as err: raise TypeError('colocc must be a float ({0})'.format(str(err))) assert 0. 
<= colocc <= 1., 'colocc must be between 0 and 1' cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0] arr = arr.take(cols, 1) title.append('colocc>=' + str(colocc)) LOGGER.report( 'Column occupancy refinement reduced number of columns ' 'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]), '_refine') if not title: raise ValueError( 'label, index, seqid, rowocc, and colocc cannot all be None') # depending on slicing of rows, arr may not have its own memory if arr.base is not None: arr = arr.copy() if msa is None: return arr else: if rows is None: from copy import copy labels = copy(msa._labels) else: labels = msa._labels labels = [labels[i] for i in rows] return MSA(arr, title=msa.getTitle() + ' refined ({0})'.format(', '.join(title)), labels=labels)
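# Usage sketch for refineMSA (not part of the original module): a toy MSA is
# built from a character array and refined by the sequence labeled 'seq1',
# then by row and column occupancy. MSA and refineMSA are assumed to be
# importable from prody; the sequences and labels below are made up purely
# for illustration.
import numpy as np
from prody import MSA, refineMSA

arr = np.array([list('MK-LV-'),
                list('MKALVE'),
                list('M--LVE')], dtype='|S1')
msa = MSA(arr, title='toy', labels=['seq1', 'seq2', 'seq3'])

# Keep only columns that are residues (not gaps) in 'seq1', then drop rows
# and columns with less than 50% occupancy.
refined = refineMSA(msa, label='seq1', rowocc=0.5, colocc=0.5)
print(refined)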
def searchPfam(query, **kwargs): """Returns Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence file. Sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.urlencode(parameters).encode('utf-8') request = urllib2.Request('https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params) results_url = urllib2.urlopen(request).geturl() #res_params = { 'output' : 'xml' } res_params = { 'format' : 'tsv' } enc_res_params = urllib.urlencode(res_params) #modified_res_url = results_url + '?' + enc_res_params modified_res_url = results_url.replace('results','download') + '?' + enc_res_params result_request = urllib2.Request(modified_res_url) # url = ( urllib2.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' 
.format(seq[:MINSEQLEN])) #xml = urllib2.urlopen(result_request).read() tsv = urllib2.urlopen(result_request).read() # openURL(url, timeout=timeout).read() # try: # root = ET.XML(xml) # except Exception as err: # raise ValueError('failed to parse results XML, check URL: ' + modified_res_url) matches = {} #for child in root[0]: #if child.tag == 'hits': # accession = child.get('acc') # pfam_id = accession.split('.')[0] # matches[pfam_id]={} # matches[pfam_id]['accession']=accession # matches[pfam_id]['class']='Domain' # matches[pfam_id]['id']=child.get('name') # matches[pfam_id]['locations']={} # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto') # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom') # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore') # matches[pfam_id]['locations']['end']=child[0].get('alisqto') # matches[pfam_id]['locations']['evalue']=child.get('evalue') # matches[pfam_id]['locations']['evidence']='hmmer v3.0' # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto') # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom') # matches[pfam_id]['locations']['significant']=child[0].get('significant') # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom') # matches[pfam_id]['type']='Pfam-A' # return matches lines = tsv.split('\n') keys = lines[0].split('\t') root = {} for i, line in enumerate(lines[1:-1]): root[i] = {} for j, key in enumerate(keys): root[i][key] = line.split('\t')[j] for child in root.values(): accession = child['Family Accession'] pfam_id = accession.split('.')[0] matches[pfam_id]={} matches[pfam_id]['accession'] = accession matches[pfam_id]['class'] = 'Domain' matches[pfam_id]['id'] = child['Family id'] matches[pfam_id]['locations'] = {} matches[pfam_id]['locations']['ali_end'] = child['Ali. End'] matches[pfam_id]['locations']['ali_start'] = child['Ali. Start'] matches[pfam_id]['locations']['bitscore'] = child['Bit Score'] matches[pfam_id]['locations']['end'] = child['Env. End'] matches[pfam_id]['locations']['cond_evalue'] = child['Cond. E-value'] matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value'] matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0' matches[pfam_id]['locations']['hmm_end'] = child['Model End'] matches[pfam_id]['locations']['hmm_start'] = child['Model Start'] #matches[pfam_id]['locations']['significant'] = child['significant'] matches[pfam_id]['locations']['start'] = child['Env. Start'] matches[pfam_id]['type'] = 'Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode accession = dbref.accession LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.' 
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = prefix + 'protein/' + seq + '?output=xml' else: url = prefix + 'protein/' + idcode + '?output=xml' else: url = prefix + 'protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml not in ['PEND','RUN']: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None elif xml.find(b'No valid UniProt accession or ID') > 0: try: url = prefix + 'protein/' + accession + '?output=xml' xml = openURL(url, timeout=timeout).read() except: try: ag = parsePDB(seq, subset='ca') ag_seq = ag.getSequence() return searchPfam(ag_seq) except: raise ValueError('No valid UniProt accession or ID for: ' + seq) try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: key = '{' + prefix + '}' results = dictElement(root[0], key) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
def searchPfam(query, **kwargs): """Returns Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence file. Sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = {'hmmdb': 'pfam', 'seq': fseq} enc_params = urllib.urlencode(parameters).encode('utf-8') request = urllib2.Request( 'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params) results_url = urllib2.urlopen(request).geturl() #res_params = { 'output' : 'xml' } res_params = {'format': 'tsv'} enc_res_params = urllib.urlencode(res_params) #modified_res_url = results_url + '?' + enc_res_params modified_res_url = results_url.replace( 'results', 'download') + '?' + enc_res_params result_request = urllib2.Request(modified_res_url) # url = ( urllib2.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format( seq[:MINSEQLEN])) try: #xml = urllib2.urlopen(result_request).read() tsv = urllib2.urlopen(result_request).read() # openURL(url, timeout=timeout).read() except: raise ValueError('No matching Pfam domains were found.') # try: # root = ET.XML(xml) # except Exception as err: # raise ValueError('failed to parse results XML, check URL: ' + modified_res_url) matches = {} #for child in root[0]: #if child.tag == 'hits': # accession = child.get('acc') # pfam_id = accession.split('.')[0] # matches[pfam_id]={} # matches[pfam_id]['accession']=accession # matches[pfam_id]['class']='Domain' # matches[pfam_id]['id']=child.get('name') # matches[pfam_id]['locations']={} # matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto') # matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom') # matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore') # matches[pfam_id]['locations']['end']=child[0].get('alisqto') # matches[pfam_id]['locations']['evalue']=child.get('evalue') # matches[pfam_id]['locations']['evidence']='hmmer v3.0' # matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto') # matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom') # matches[pfam_id]['locations']['significant']=child[0].get('significant') # matches[pfam_id]['locations']['start']=child[0].get('alisqfrom') # matches[pfam_id]['type']='Pfam-A' # return matches if PY3K: tsv = tsv.decode() lines = tsv.split('\n') keys = lines[0].split('\t') root = {} for i, line in enumerate(lines[1:-1]): root[i] = {} for j, key in enumerate(keys): root[i][key] = line.split('\t')[j] for child in root.values(): accession = child['Family Accession'] pfam_id = 
accession.split('.')[0] matches[pfam_id] = {} matches[pfam_id]['accession'] = accession matches[pfam_id]['class'] = 'Domain' matches[pfam_id]['id'] = child['Family id'] matches[pfam_id]['locations'] = {} matches[pfam_id]['locations']['ali_end'] = child['Ali. End'] matches[pfam_id]['locations']['ali_start'] = child['Ali. Start'] matches[pfam_id]['locations']['bitscore'] = child['Bit Score'] matches[pfam_id]['locations']['end'] = child['Env. End'] matches[pfam_id]['locations']['cond_evalue'] = child[ 'Cond. E-value'] matches[pfam_id]['locations']['ind_evalue'] = child['Ind. E-value'] matches[pfam_id]['locations']['evidence'] = 'hmmer v3.0' matches[pfam_id]['locations']['hmm_end'] = child['Model End'] matches[pfam_id]['locations']['hmm_start'] = child['Model Start'] #matches[pfam_id]['locations']['significant'] = child['significant'] matches[pfam_id]['locations']['start'] = child['Env. Start'] matches[pfam_id]['type'] = 'Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})'.format( seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode accession = dbref.accession LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.'.format( idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = prefix + 'protein/' + seq + '?output=xml' else: url = prefix + 'protein/' + idcode + '?output=xml' else: url = prefix + 'protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml not in ['PEND', 'RUN']: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None elif xml.find(b'No valid UniProt accession or ID') > 0: try: url = prefix + 'protein/' + accession + '?output=xml' xml = openURL(url, timeout=timeout).read() except: try: ag = parsePDB(seq, subset='ca') ag_seq = ag.getSequence() return searchPfam(ag_seq) except: raise ValueError('No valid UniProt accession or ID for: ' + seq) try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: key = '{' + prefix + '}' results = dictElement(root[0], key) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: 
locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
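# Usage sketch for searchPfam (not part of the original module): querying with
# a UniProt accession (or a raw sequence of at least MINSEQLEN residues)
# requires network access to the Pfam/HMMER web services. 'P19491' is used
# here only as an example accession, and the keys of each match dictionary
# depend on the service response.
matches = searchPfam('P19491')
if matches:
    for acc, info in sorted(matches.items()):
        print(acc, info.get('locations'))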
def mapOntoChain(atoms, chain, **kwargs): """Map *atoms* onto *chain*. This function returns a list of mappings. Each mapping is a tuple that contains 4 items: * Mapped chain as an :class:`.AtomMap` instance, * *chain* as an :class:`.AtomMap` instance, * Percent sequence identitity, * Percent sequence overlap Mappings are returned in decreasing percent sequence identity order. :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms in place of unmapped atoms. :arg atoms: atoms that will be mapped to the target *chain* :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg chain: chain to which atoms will be mapped :type chain: :class:`.Chain` :keyword subset: one of the following well-defined subsets of atoms: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``), ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"`` :type subset: string :keyword seqid: percent sequence identity, default is **90** if sequence alignment is performed, otherwise **0** :type seqid: float :keyword overlap: percent overlap, default is **70** :type overlap: float :keyword mapping: if ``"ce"`` or ``"cealign"``, then the CE algorithm [IS98]_ will be performed. It can also be a list of prealigned sequences, a :class:`.MSA` instance, or a dict of indices such as that derived from a :class:`.DaliRecord`. If set to anything other than the options listed above, including the default value (**None**), a simple mapping will be first attempted and if that failed then sequence alignment with a function from :mod:`~Bio.pairwise2` will be used unless *pwalign* is set to **False**, in which case the mapping will fail. :type mapping: list, str :keyword pwalign: if **True**, then pairwise sequence alignment will be performed. If **False** then a simple mapping will be performed based on residue numbers (as well as insertion codes). This will be overridden by the *mapping* keyword's value. :type pwalign: bool This function tries to map *atoms* to *chain* based on residue numbers and types. Each individual chain in *atoms* is compared to target *chain*. .. [IS98] Shindyalov IN, Bourne PE. Protein structure alignment by incremental combinatorial extension (CE) of the optimal path. *Protein engineering* **1998** 11(9):739-47. """ if not isinstance(atoms, (AtomGroup, AtomSubset)): raise TypeError('atoms must be an AtomGroup or a AtomSubset instance') if not isinstance(chain, Chain): raise TypeError('chain must be Chain instance') subset = str(kwargs.get('subset', 'calpha')).lower() if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument'.format( str(subset))) seqid = kwargs.get('seqid', 90.) coverage = kwargs.get('overlap', 70.) 
coverage = kwargs.get('coverage', coverage) pwalign = kwargs.get('pwalign', None) pwalign = kwargs.get('mapping', pwalign) alignment = None if pwalign is not None: if isinstance(pwalign, basestring): pwalign = str(pwalign).strip().lower() elif not isinstance(pwalign, bool): alignment = pwalign pwalign = True if subset != 'all': chid = chain.getChid() segname = chain.getSegname() chain_subset = chain.select(subset) target_chain = chain_subset.getHierView()[segname, chid] mobile = atoms.select(subset) else: target_chain = chain mobile = atoms if isinstance(mobile, Chain): chains = [mobile] map_ag = mobile.getAtomGroup() else: if isinstance(mobile, AtomGroup): map_ag = mobile else: map_ag = mobile.getAtomGroup() chains = list(mobile.getHierView().iterChains()) LOGGER.debug('Evaluating {0}: {1} chains are identified'.format( str(atoms), len(chains))) mappings = [] unmapped = [] unmapped_chids = [] target_ag = target_chain.getAtomGroup() simple_target = SimpleChain(target_chain, False) LOGGER.debug('Trying to map atoms based on residue numbers and ' 'identities:') for chain in chains: simple_chain = SimpleChain(chain, False) if len(simple_chain) == 0: LOGGER.debug(' Skipping {0}, which does not contain any amino ' 'acid residues.'.format(simple_chain)) continue LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format( simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) # trivial mapping serves as a first simple trial of alignment the two # sequences based on residue number, therefore the sequence identity # (TRIVIAL_SEQID) criterion is strict. _seqid = _cover = -1 target_list, chain_list, n_match, n_mapped = getTrivialMapping( simple_target, simple_chain) if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) trivial_seqid = TRIVIAL_SEQID if pwalign else seqid trivial_cover = TRIVIAL_COVERAGE if pwalign else coverage if _seqid >= trivial_seqid and _cover >= trivial_cover: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.'.format( n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: if not pwalign: LOGGER.debug( '\tFailed to match chains based on residue numbers ' '(seqid={0:.0f}%, overlap={1:.0f}%).'.format( _seqid, _cover)) unmapped.append(simple_chain) unmapped_chids.append(chain.getChid()) if not mappings and pwalign is None: pwalign = True if pwalign and unmapped: if alignment is None: if pwalign in ['ce', 'cealign']: aln_type = 'structure alignment' method = 'CE' if not 'seqid' in kwargs: seqid = 0. else: aln_type = 'sequence alignment' method = ALIGNMENT_METHOD else: aln_type = 'alignment' method = 'predefined' if not 'seqid' in kwargs: seqid = 0. 
LOGGER.debug('Trying to map atoms based on {0} {1}:'.format( method, aln_type)) for chid, simple_chain in zip(unmapped_chids, unmapped): LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format( simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) if method == 'CE': result = getCEAlignMapping(simple_target, simple_chain) else: if isinstance(alignment, dict): result = getDictMapping(simple_target, simple_chain, map_dict=alignment) else: result = getAlignedMapping(simple_target, simple_chain, alignment=alignment) if result is not None: target_list, chain_list, n_match, n_mapped = result if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: LOGGER.debug( '\tMapped: {0} residues match with {1:.0f}%' ' sequence identity and {2:.0f}% overlap.'.format( n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).'.format(_seqid, _cover)) for mi, result in enumerate(mappings): residues_target, residues_chain, _seqid, _cover = result indices_target = [] indices_chain = [] indices_mapping = [] indices_dummies = [] counter = 0 for i in range(len(residues_target)): res_tar = residues_target[i] res_chn = residues_chain[i] for atom_tar in res_tar: indices_target.append(atom_tar.getIndex()) if res_chn is not None: atom_chn = res_chn.getAtom(atom_tar.getName()) if atom_chn is not None: indices_chain.append(atom_chn.getIndex()) indices_mapping.append(counter) else: indices_dummies.append(counter) else: indices_dummies.append(counter) counter += 1 #n_atoms = len(indices_target) ch_tar = next((r for r in residues_target if r is not None)).getChain() ch_chn = next((r for r in residues_chain if r is not None)).getChain() title_tar = 'Chain {0} from {1}'.format( ch_tar.getChid(), ch_tar.getAtomGroup().getTitle()) title_chn = 'Chain {0} from {1}'.format( ch_chn.getChid(), ch_chn.getAtomGroup().getTitle()) # note that chain here is from atoms atommap = AM(map_ag, indices_chain, chain.getACSIndex(), mapping=indices_mapping, dummies=indices_dummies, title=title_chn + ' -> ' + title_tar) selection = AM(target_ag, indices_target, target_chain.getACSIndex(), title=title_tar + ' -> ' + title_chn, intarrays=True) mappings[mi] = (atommap, selection, _seqid, _cover) if len(mappings) > 1: mappings.sort(key=lambda m: m[-2:], reverse=True) return mappings
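# Usage sketch for mapOntoChain (not part of the original module): map the
# CA atoms of one structure onto chain A of another. parsePDB is assumed to
# be available from prody and the PDB identifiers are illustrative.
from prody import parsePDB

mobile = parsePDB('4ake')
target = parsePDB('1ake')
target_chain = target.getHierView()['A']

mappings = mapOntoChain(mobile, target_chain, seqid=50, overlap=50)
if mappings:
    atommap, target_sel, seq_identity, cover = mappings[0]
    print(atommap.numAtoms(), seq_identity, cover)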
def fetchPDBviaFTP(*pdb, **kwargs): """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified *pdb* identifier(s) and return path(s). Downloaded files will be stored in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied into *folder*, if specified by the user. If no destination folder is specified, files will be saved in the current working directory. If *compressed* is **False**, decompressed files will be copied into *folder*. *format* keyword argument can be used to retrieve `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_ and `PDBML <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_ files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will fetch an EMD file, and ``format='xml'`` will fetch a PDBML file. If PDBML header file is desired, ``noatom=True`` argument will do the job.""" if kwargs.get('check', True): identifiers = checkIdentifiers(*pdb) else: identifiers = list(pdb) output_folder = kwargs.pop('folder', None) compressed = bool(kwargs.pop('compressed', True)) format = str(kwargs.pop('format', 'pdb')).lower() noatom = bool(kwargs.pop('noatom', False)) if format == 'pdb': ftp_divided = 'pdb/data/structures/divided/pdb' ftp_pdbext = '.ent.gz' ftp_prefix = 'pdb' extension = '.pdb' elif format == 'xml': if noatom: ftp_divided = 'pdb/data/structures/divided/XML-noatom' ftp_pdbext = '-noatom.xml.gz' extension = '-noatom.xml' else: ftp_divided = 'pdb/data/structures/divided/XML' ftp_pdbext = '.xml.gz' extension = '.xml' ftp_prefix = '' elif format == 'cif': ftp_divided = 'pdb/data/structures/divided/mmCIF' ftp_pdbext = '.cif.gz' ftp_prefix = '' extension = '.cif' elif format == 'emd' or format == 'map': ftp_divided = 'emdb/structures' ftp_pdbext = '.map.gz' ftp_prefix = 'emd_' extension = '.map' else: raise ValueError(repr(format) + ' is not valid format') local_folder = pathPDBFolder() if format == 'pdb' and local_folder: local_folder, is_divided = local_folder if is_divided: getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])), 'pdb' + pdb + '.pdb.gz') else: getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz') if output_folder is None: second = lambda filename, pdb: filename else: if compressed: second = lambda filename, pdb: (copyFile(filename, join(output_folder, pdb + extension + '.gz'))) else: second = lambda filename, pdb: gunzip(filename, join(output_folder, pdb + extension)) else: if output_folder is None: output_folder = getcwd() if compressed: getPath = lambda pdb: join(output_folder, pdb + extension + '.gz') second = lambda filename, pdb: filename else: getPath = lambda pdb: join(output_folder, pdb + extension) second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb)) ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us'] LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name)) from ftplib import FTP try: ftp = FTP(ftp_host) except Exception as error: raise type(error)('FTP connection problem, potential reason: ' 'no internet connectivity') else: success = 0 failure = 0 filenames = [] ftp.login('') for pdb in identifiers: if pdb is None: filenames.append(None) continue data = [] ftp_fn = ftp_prefix + pdb + ftp_pdbext try: ftp.cwd(ftp_path) ftp.cwd(ftp_divided) if format == 'emd': ftp.cwd('EMD-{0}/map'.format(pdb)) else: ftp.cwd(pdb[1:3]) ftp.retrbinary('RETR ' + ftp_fn, data.append) except Exception as error: if ftp_fn in ftp.nlst(): LOGGER.warn('{0} download failed ({1}). 
It is ' 'possible that you do not have rights to ' 'download .gz files in the current network.' .format(pdb, str(error))) else: LOGGER.info('{0} download failed. {1} does not exist ' 'on {2}.'.format(ftp_fn, pdb, ftp_host)) failure += 1 filenames.append(None) else: if len(data): filename = getPath(pdb) with open(filename, 'w+b') as pdbfile: write = pdbfile.write [write(block) for block in data] filename = normpath(relpath(second(filename, pdb))) LOGGER.debug('{0} downloaded ({1})' .format(pdb, sympath(filename))) success += 1 filenames.append(filename) else: LOGGER.warn('{0} download failed, reason unknown.' .format(pdb)) failure += 1 filenames.append(None) ftp.quit() LOGGER.debug('PDB download via FTP completed ({0} downloaded, ' '{1} failed).'.format(success, failure)) if len(identifiers) == 1: return filenames[0] else: return filenames
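# Usage sketch for fetchPDBviaFTP (not part of the original module): fetch a
# decompressed PDB file and then mmCIF files for two entries over the wwPDB
# FTP servers. Network access is required and the identifiers are
# illustrative.
pdb_path = fetchPDBviaFTP('1p38', compressed=False)
cif_paths = fetchPDBviaFTP('1p38', '1mkp', format='cif', folder='.')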
def searchPfam(query, search_b=False, skip_a=False, **kwargs): """Returns Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain gaps and must be at least 16 characters long :type query: str :arg search_b: search Pfam-B families when **True** :type search_b: bool :arg skip_a: do not search Pfam-A families when **True** :type skip_a: bool :arg ga: use gathering threshold when **True** :type ga: bool :arg evalue: user specified e-value cutoff, must be smaller than 10.0 :type evalue: float :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.sanger.ac.uk/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') urlextension = '' if kwargs: ga = int(kwargs.get('ga', 1)) if not (ga == 1 or ga == 0): raise ValueError('ga must be either 0 or 1') evalue = kwargs.get('evalue', None) if evalue: if not float(evalue) <= 10.0: raise ValueError('evalue must be a valid float < 10.0') urlextension = urlextension + '&evalue=' + str(evalue) else: urlextension = urlextension + '&ga=' + str(ga) search_b = int(bool(search_b)) skip_a = int(bool(skip_a)) if skip_a == 1: search_b = 1 urlextension = urlextension + '&searchBs=' + str(search_b) urlextension = urlextension + '&skipAs=' + str(skip_a) url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) + urlextension + '&output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) try: url = dictElement(root[0], prefix)['result_url'] except (IndexError, KeyError): raise ValueError('failed to parse results XML, check URL: ' + url) else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.' 
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml' else: url = ('http://pfam.sanger.ac.uk/protein/' + idcode + '?output=xml') else: url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass #else: # if xml: # break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
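# Usage sketch for this legacy searchPfam variant (not part of the original
# module): submitting a sequence query with an e-value cutoff. The sequence
# below is human ubiquitin, used only as an example; the pfam.sanger.ac.uk
# service targeted by this version has since been retired, so this is
# illustrative rather than expected to succeed today.
ubiquitin = ('MQIFVKTLTGKTITLEVEPSDTIENVKAKIQDKEGIPPDQQRLIFAGKQLEDGRTLSDYNIQ'
             'KESTLHLVLRLRGG')
matches = searchPfam(ubiquitin, search_b=True, evalue=1e-5, timeout=120)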
def mapOntoChain(atoms, chain, **kwargs): """Map *atoms* onto *chain*. This function returns a list of mappings. Each mapping is a tuple that contains 4 items: * Mapped chain as an :class:`.AtomMap` instance, * *chain* as an :class:`.AtomMap` instance, * Percent sequence identitity, * Percent sequence overlap Mappings are returned in decreasing percent sequence identity order. :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms in place of unmapped atoms. :arg atoms: atoms that will be mapped to the target *chain* :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg chain: chain to which atoms will be mapped :type chain: :class:`.Chain` :keyword subset: one of the following well-defined subsets of atoms: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``), ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"`` :type subset: string :keyword seqid: percent sequence identity, default is **90** if sequence alignment is performed, otherwise **0** :type seqid: float :keyword overlap: percent overlap, default is **70** :type overlap: float :keyword mapping: if ``"ce"`` or ``"cealign"``, then the CE algorithm [IS98]_ will be performed. It can also be a list of prealigned sequences, a :class:`.MSA` instance, or a dict of indices such as that derived from a :class:`.DaliRecord`. If set to anything other than the options listed above, including the default value (**None**), a simple mapping will be first attempted and if that failed then sequence alignment with a function from :mod:`~Bio.pairwise2` will be used unless *pwalign* is set to **False**, in which case the mapping will fail. :type mapping: list, str :keyword pwalign: if **True**, then pairwise sequence alignment will be performed. If **False** then a simple mapping will be performed based on residue numbers (as well as insertion codes). This will be overridden by the *mapping* keyword's value. :type pwalign: bool This function tries to map *atoms* to *chain* based on residue numbers and types. Each individual chain in *atoms* is compared to target *chain*. .. [IS98] Shindyalov IN, Bourne PE. Protein structure alignment by incremental combinatorial extension (CE) of the optimal path. *Protein engineering* **1998** 11(9):739-47. """ if not isinstance(atoms, (AtomGroup, AtomSubset)): raise TypeError('atoms must be an AtomGroup or a AtomSubset instance') if not isinstance(chain, Chain): raise TypeError('chain must be Chain instance') subset = str(kwargs.get('subset', 'calpha')).lower() if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument' .format(str(subset))) seqid = kwargs.get('seqid', 90.) coverage = kwargs.get('overlap', 70.) 
coverage = kwargs.get('coverage', coverage) pwalign = kwargs.get('pwalign', None) pwalign = kwargs.get('mapping', pwalign) alignment = None if pwalign is not None: if isinstance(pwalign, basestring): pwalign = str(pwalign).strip().lower() elif not isinstance(pwalign, bool): alignment = pwalign pwalign = True if subset != 'all': chid = chain.getChid() segname = chain.getSegname() chain_subset = chain.select(subset) target_chain = chain_subset.getHierView()[segname, chid] mobile = atoms.select(subset) else: target_chain = chain mobile = atoms if isinstance(mobile, Chain): chains = [mobile] map_ag = mobile.getAtomGroup() else: if isinstance(mobile, AtomGroup): map_ag = mobile else: map_ag = mobile.getAtomGroup() chains = list(mobile.getHierView().iterChains()) LOGGER.debug('Evaluating {0}: {1} chains are identified' .format(str(atoms), len(chains))) mappings = [] unmapped = [] unmapped_chids = [] target_ag = target_chain.getAtomGroup() simple_target = SimpleChain(target_chain, False) LOGGER.debug('Trying to map atoms based on residue numbers and ' 'identities:') for chain in chains: simple_chain = SimpleChain(chain, False) if len(simple_chain) == 0: LOGGER.debug(' Skipping {0}, which does not contain any amino ' 'acid residues.'.format(simple_chain)) continue LOGGER.debug(' Comparing {0} (len={1}) with {2}:' .format(simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) # trivial mapping serves as a first simple trial of alignment the two # sequences based on residue number, therefore the sequence identity # (TRIVIAL_SEQID) criterion is strict. _seqid = _cover = -1 target_list, chain_list, n_match, n_mapped = getTrivialMapping( simple_target, simple_chain) if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) trivial_seqid = TRIVIAL_SEQID if pwalign else seqid trivial_cover = TRIVIAL_COVERAGE if pwalign else coverage if _seqid >= trivial_seqid and _cover >= trivial_cover: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.' .format(n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: if not pwalign: LOGGER.debug('\tFailed to match chains based on residue numbers ' '(seqid={0:.0f}%, overlap={1:.0f}%).' .format(_seqid, _cover)) unmapped.append(simple_chain) unmapped_chids.append(chain.getChid()) if not mappings and pwalign is None: pwalign = True if pwalign and unmapped: if alignment is None: if pwalign in ['ce', 'cealign']: aln_type = 'structure alignment' method = 'CE' if not 'seqid' in kwargs: seqid = 0. else: aln_type = 'sequence alignment' method = ALIGNMENT_METHOD else: aln_type = 'alignment' method = 'predefined' if not 'seqid' in kwargs: seqid = 0. 
LOGGER.debug('Trying to map atoms based on {0} {1}:' .format(method, aln_type)) for chid, simple_chain in zip(unmapped_chids, unmapped): LOGGER.debug(' Comparing {0} (len={1}) with {2}:' .format(simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) if method == 'CE': result = getCEAlignMapping(simple_target, simple_chain) else: if isinstance(alignment, dict): result = getDictMapping(simple_target, simple_chain, map_dict=alignment) else: result = getAlignedMapping(simple_target, simple_chain, alignment=alignment) if result is not None: target_list, chain_list, n_match, n_mapped = result if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}%' ' sequence identity and {2:.0f}% overlap.' .format(n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).' .format(_seqid, _cover)) for mi, result in enumerate(mappings): residues_target, residues_chain, _seqid, _cover = result indices_target = [] indices_chain = [] indices_mapping = [] indices_dummies = [] counter = 0 for i in range(len(residues_target)): res_tar = residues_target[i] res_chn = residues_chain[i] for atom_tar in res_tar: indices_target.append(atom_tar.getIndex()) if res_chn is not None: atom_chn = res_chn.getAtom(atom_tar.getName()) if atom_chn is not None: indices_chain.append(atom_chn.getIndex()) indices_mapping.append(counter) else: indices_dummies.append(counter) else: indices_dummies.append(counter) counter += 1 #n_atoms = len(indices_target) ch_tar = next((r for r in residues_target if r is not None)).getChain() ch_chn = next((r for r in residues_chain if r is not None)).getChain() title_tar = 'Chain {0} from {1}'.format(ch_tar.getChid(), ch_tar.getAtomGroup().getTitle()) title_chn = 'Chain {0} from {1}'.format(ch_chn.getChid(), ch_chn.getAtomGroup().getTitle()) # note that chain here is from atoms atommap = AM(map_ag, indices_chain, chain.getACSIndex(), mapping=indices_mapping, dummies=indices_dummies, title=title_chn + ' -> ' + title_tar ) selection = AM(target_ag, indices_target, target_chain.getACSIndex(), title=title_tar + ' -> ' + title_chn, intarrays=True) mappings[mi] = (atommap, selection, _seqid, _cover) if len(mappings) > 1: mappings.sort(key=lambda m: m[-2:], reverse=True) return mappings
def fetchPDB(pdb, folder='.', compressed=True, copy=False, **kwargs): """Retrieve PDB, PDBML, or mmCIF file(s) for specified *pdb* identifier(s). *pdb* may be a string or a list. The function will return a filename or a list of filenames depending on input (see :ref:`fetchpdb` for examples). If *compressed* is **False**, all files will be decompressed. If *copy* is **True**, all files from local PDB mirror will copied to the user specified *folder*. *format* keyword argument can be used to retrieve `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_ files: ``format="cif"`` will fetch an mmCIF file (e.g. :file:`1XXX.cif.gz`), similarly ``format="xml"`` will fetch a PDBML file. If PDBML header file is desired, ``format="xml", noatom=True`` will do the job (e.g. :file:`1XXX-noatom.xml.gz`) The order of file search operations are as follows: First, files are sought in *folder*. Second, local PDB mirror will be sought, if one is set by the user (see :func:`setPDBMirrorPath`). Then, local PDB folder will be sought, if one is set by the user (see :func:`setPDBLocalFolder`). Finally, if files are not found locally, they will be downloaded one of wwPDB FTP servers (use :func:`setWWPDBFTPServer` to specify one close to you).""" if isinstance(pdb, str): identifiers = [pdb] elif isinstance(pdb, list): identifiers = pdb else: raise TypeError('pdb may be a string or a list of strings') assert isinstance(folder, str), 'folder must be a string' assert isinstance(compressed, bool), 'compressed must be a boolean' assert isinstance(copy, bool), 'copy must be a boolean' format = kwargs.pop('format', 'pdb') assert isinstance(format, str), 'format must be a string' format = format.lower() assert format in _PDB_FORMATS, '{0:s} is not valid format'.format( repr(format)) noatom = kwargs.pop('noatom', False) assert isinstance(noatom, bool), 'noatom must be a boolean' if kwargs: raise TypeError('{0:s} is not a valid keyword argument for this' 'function'.format(repr(kwargs.iterkeys().next()))) if folder != '.': folder = makePath(folder) if not os.access(folder, os.W_OK): raise IOError('permission to write in {0:s} is denied, please ' 'specify another folder'.format(folder)) filenames = [] exists = 0 success = 0 failure = 0 download = False if format == 'pdb': divided = 'data/structures/divided/pdb' pdbext = '.ent.gz' extensions = ['.ent', '.pdb'] # '.pdb' should be the last item prefix = 'pdb' elif format == 'xml': if noatom: divided = 'data/structures/divided/XML-noatom' pdbext = '-noatom.xml.gz' extensions = ['-noatom.xml'] else: divided = 'data/structures/divided/XML' pdbext = '.xml.gz' extensions = ['.xml'] prefix = '' else: divided = 'data/structures/divided/mmCIF' pdbext = '.cif.gz' extensions = ['.cif'] # '.pdb' should be the last item prefix = '' pdbfnmap = {} for extension in extensions: for pdbfn in glob(os.path.join(folder, '*' + extension + '*')): if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS: pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn for pdbfn in glob(os.path.join(folder, '*' + extension.upper() + '*')): if os.path.splitext(pdbfn)[1] in _PDB_EXTENSIONS: pdbfnmap[os.path.split(pdbfn)[1].split('.')[0].lower()] = pdbfn for i, pdbid in enumerate(identifiers): # Check validity of identifiers if not isinstance(pdbid, str): LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid)) filenames.append(None) failure += 1 continue pdbid = pdbid.strip().lower() if not (len(pdbid) == 4 and pdbid.isalnum()): LOGGER.debug('{0:s} is not a valid identifier.'.format(pdbid)) 
filenames.append(None) failure += 1 continue # Check if file exists in working directory identifiers[i] = pdbid if noatom: fn = pdbfnmap.get(pdbid + '-noatom', None) else: fn = pdbfnmap.get(pdbid, None) or pdbfnmap.get('pdb'+pdbid, None) if fn: fn = relpath(fn) if not compressed: temp, ext = os.path.splitext(fn) if ext == '.gz': fn = gunzip(fn, temp) filenames.append(fn) LOGGER.debug('{0:s} ({1:s}) is found in the working directory.' .format(pdbid, fn)) exists += 1 continue # Check the PDB mirror mirror_path = getPDBMirrorPath() if mirror_path is not None and os.path.isdir(mirror_path): fn = os.path.join(mirror_path, divided, pdbid[1:3], prefix + pdbid + pdbext) if os.path.isfile(fn): if copy or not compressed: if compressed: filename = os.path.join(folder, pdbid + extension + '.gz') shutil.copy(fn, filename) else: filename = os.path.join(folder, pdbid + extension) gunzip(fn, filename) filenames.append(filename) LOGGER.debug('{0:s} copied from local mirror ({1:s})' .format(pdbid, filename)) success += 1 else: filenames.append(fn) LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the local ' 'mirror.'.format(pdbid, fn[:fn[1:].index(os.path.sep)+2], fn[-15:])) exists += 1 continue # Check the PDB mirror local_folder = getPDBLocalFolder() if format and local_folder: local_folder, is_divided = local_folder if is_divided: fn = os.path.join(local_folder, pdbid[1:3], 'pdb' + pdbid + '.pdb.gz') else: fn = os.path.join(local_folder, pdbid + '.pdb.gz') if os.path.isfile(fn): if copy or not compressed: if compressed: filename = os.path.join(folder, pdbid + extension + '.gz') shutil.copy(fn, filename) else: filename = os.path.join(folder, pdbid + extension) gunzip(fn, filename) filenames.append(filename) LOGGER.debug('{0:s} copied from local PDB folder ({1:s})' .format(pdbid, filename)) success += 1 else: filenames.append(fn) LOGGER.debug('{0:s} ({1:s}...{2:s}) is found in the PDB ' 'local folder.'.format(pdbid, fn[:fn[1:].index(os.path.sep)+2], fn[-15:])) exists += 1 continue filenames.append(pdbid) download = True if download: from ftplib import FTP ftp_name, ftp_host, ftp_path = getWWPDBFTPServer() LOGGER.debug('Connecting wwPDB FTP server {0:s}.'.format(ftp_name)) if format == 'pdb' and not copy and local_folder: folder = local_folder compressed = True if is_divided: getfn = lambda folder, pdbid, ext: \ os.path.join(makePath(os.path.join(local_folder, pdbid[1:3])), 'pdb' + pdbid + ext) else: getfn = lambda folder, pdbid, ext: os.path.join(folder, pdbid + ext) else: getfn = lambda folder, pdbid, ext: os.path.join(folder, pdbid + ext) try: ftp = FTP(ftp_host) except Exception as error: raise type(error)('FTP connection problem, potential reason: ' 'no internet connectivity') else: #ftp_path = os.path.join(ftp_path, divided) ftp.login('') for i, pdbid in enumerate(identifiers): if pdbid != filenames[i]: continue filename = getfn(folder, pdbid, extension) if compressed: filename += '.gz' pdbfile = open(filename, 'w+b') fn = prefix + pdbid + pdbext try: ftp.cwd(ftp_path) ftp.cwd(divided) ftp.cwd(pdbid[1:3]) ftp.retrbinary('RETR ' + fn, pdbfile.write) except Exception as error: pdbfile.close() os.remove(filename) if fn in ftp.nlst(): LOGGER.debug('{0:s} download failed ({1:s}). It ' 'is possible that you don\'t have ' 'rights to download .gz files in the ' 'current network.'.format(pdbid, str(error))) else: LOGGER.debug('{0:s} download failed. {1:s} does not ' 'exist on {2:s}.' 
.format(fn, pdbid, ftp_host)) failure += 1 filenames[i] = None else: pdbfile.close() if not compressed: gunzip(filename) filename = relpath(filename) LOGGER.debug('{0:s} downloaded ({1:s})' .format(pdbid, filename)) success += 1 filenames[i] = filename ftp.quit() if len(identifiers) == 1: return filenames[0] else: LOGGER.info('PDB download completed ({2:d} found, ' '{0:d} downloaded, {1:d} failed).' .format(success, failure, exists)) return filenames
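# Usage sketch for fetchPDB (not part of the original module): fetch a single
# structure, then a list of structures as decompressed PDBML header files.
# Network access (or a configured local mirror/folder) is assumed and the
# identifiers are illustrative.
filename = fetchPDB('1p38')
filenames = fetchPDB(['1p38', '1mkp'], compressed=False, format='xml',
                     noatom=True)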
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs): """Returns the UniProt entry name (ID) for *query* by retrieving and parsing the Pfam protein page in XML format. :arg query: UniProt ID or accession :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int The *search_b* and *skip_a* arguments and any other keyword arguments are accepted for compatibility with :func:`searchPfam` but are not used.""" prefix = '{http://pfam.xfam.org/}' query = str(query) seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) result = root[0].get('id') return result
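# Usage sketch for searchUniprotID (not part of the original module): look up
# the UniProt entry name for an accession via the Pfam protein pages.
# Network access is required and 'P19491' is only an example accession.
entry_name = searchUniprotID('P19491')
print(entry_name)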
def buildKirchhoff(self, coords, cutoff=10., gamma=1., **kwargs): """Build Kirchhoff matrix for given coordinate set. :arg coords: a coordinate set or an object with ``getCoords`` method :type coords: :class:`numpy.ndarray` or :class:`.Atomic` :arg cutoff: cutoff distance (Å) for pairwise interactions default is 10.0 Å, , minimum is 4.0 Å :type cutoff: float :arg gamma: spring constant, default is 1.0 :type gamma: float :arg sparse: elect to use sparse matrices, default is **False**. If Scipy is not found, :class:`ImportError` is raised. :type sparse: bool :arg kdtree: elect to use KDTree for building Kirchhoff matrix faster, default is **True** :type kdtree: bool Instances of :class:`Gamma` classes and custom functions are accepted as *gamma* argument. When Scipy is available, user can select to use sparse matrices for efficient usage of memory at the cost of computation speed.""" try: coords = (coords._getCoords() if hasattr(coords, '_getCoords') else coords.getCoords()) except AttributeError: try: checkCoords(coords) except TypeError: raise TypeError('coords must be a Numpy array or an object ' 'with `getCoords` method') cutoff, g, gamma = checkENMParameters(cutoff, gamma) self._reset() self._cutoff = cutoff self._gamma = g n_atoms = coords.shape[0] start = time.time() if kwargs.get('sparse', False): try: from scipy import sparse as scipy_sparse except ImportError: raise ImportError('failed to import scipy.sparse, which is ' 'required for sparse matrix calculations') kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms)) else: kirchhoff = np.zeros((n_atoms, n_atoms), 'd') if kwargs.get('kdtree', True): kdtree = KDTree(coords) kdtree.search(cutoff) dist2 = kdtree.getDistances()**2 r = 0 for i, j in kdtree.getIndices(): g = gamma(dist2[r], i, j) kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] + g kirchhoff[j, j] = kirchhoff[j, j] + g r += 1 else: LOGGER.info('Using slower method for building the Kirchhoff.') cutoff2 = cutoff * cutoff mul = np.multiply for i in range(n_atoms): xyz_i = coords[i, :] i_p1 = i + 1 i2j = coords[i_p1:, :] - xyz_i mul(i2j, i2j, i2j) for j, dist2 in enumerate(i2j.sum(1)): if dist2 > cutoff2: continue j += i_p1 g = gamma(dist2, i, j) kirchhoff[i, j] = -g kirchhoff[j, i] = -g kirchhoff[i, i] = kirchhoff[i, i] + g kirchhoff[j, j] = kirchhoff[j, j] + g LOGGER.debug('Kirchhoff was built in {0:.2f}s.'.format(time.time() - start)) self._kirchhoff = kirchhoff self._n_atoms = n_atoms self._dof = n_atoms
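# Usage sketch for buildKirchhoff (not part of the original module): build a
# GNM Kirchhoff matrix from the CA atoms of a structure. GNM and parsePDB
# are assumed to be available from prody; the PDB code is illustrative.
from prody import GNM, parsePDB

calphas = parsePDB('1p38', subset='ca')
gnm = GNM('p38 GNM')
gnm.buildKirchhoff(calphas, cutoff=10.0, gamma=1.0, kdtree=True)
print(gnm.getKirchhoff().shape)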
def calcModes(self, n_modes=20, zeros=False, turbo=True): """Calculate normal modes. This method uses :func:`scipy.linalg.eigh` function to diagonalize the Hessian matrix. When Scipy is not found, :func:`numpy.linalg.eigh` is used. :arg n_modes: number of non-zero eigenvalues/vectors to calculate. If ``None`` is given, all modes will be calculated. :type n_modes: int or None, default is 20 :arg zeros: If ``True``, modes with zero eigenvalues will be kept. :type zeros: bool, default is ``False`` :arg turbo: Use a memory intensive, but faster way to calculate modes. :type turbo: bool, default is ``True`` """ if self._hessian is None: raise ValueError("Hessian matrix is not built or set") assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, "n_modes must be a positive integer" assert isinstance(zeros, bool), "zeros must be a boolean" assert isinstance(turbo, bool), "turbo must be a boolean" linalg = importLA() start = time.time() shift = 5 if linalg.__package__.startswith("scipy"): if n_modes is None: eigvals = None n_modes = self._dof else: if n_modes >= self._dof: eigvals = None n_modes = self._dof else: eigvals = (0, n_modes + shift) if eigvals: turbo = False if isinstance(self._hessian, np.ndarray): values, vectors = linalg.eigh(self._hessian, turbo=turbo, eigvals=eigvals) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError( "failed to import scipy.sparse.linalg, " "which is required for sparse matrix " "decomposition" ) try: values, vectors = scipy_sparse_la.eigsh(self._hessian, k=n_modes + 6, which="SA") except: values, vectors = scipy_sparse_la.eigen_symmetric(self._hessian, k=n_modes + 6, which="SA") else: if n_modes is not None: LOGGER.info("Scipy is not found, all modes are calculated.") values, vectors = linalg.eigh(self._hessian) n_zeros = sum(values < ZERO) if n_zeros < 6: LOGGER.warning("Less than 6 zero eigenvalues are calculated.") shift = n_zeros - 1 elif n_zeros > 6: LOGGER.warning("More than 6 zero eigenvalues are calculated.") shift = n_zeros - 1 if zeros: shift = -1 self._eigvals = values[1 + shift :] self._vars = 1 / self._eigvals self._trace = self._vars.sum() self._array = vectors[:, 1 + shift :] self._n_modes = len(self._eigvals) LOGGER.debug("{0} modes were calculated in {1:.2f}s.".format(self._n_modes, time.time() - start))
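# Usage sketch for calcModes on an ANM (not part of the original module):
# build a Hessian from CA atoms and compute the 10 lowest-frequency non-zero
# modes. ANM and parsePDB are assumed to be available from prody; the PDB
# code is illustrative.
from prody import ANM, parsePDB

calphas = parsePDB('1p38', subset='ca')
anm = ANM('p38 ANM')
anm.buildHessian(calphas, cutoff=15.0)
anm.calcModes(n_modes=10, zeros=False, turbo=True)
print(anm.getEigvals()[:3])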
def calcModes(self, n_modes=20, zeros=False, turbo=True): """Calculate normal modes. This method uses :func:`scipy.linalg.eigh` function to diagonalize the Kirchhoff matrix. When Scipy is not found, :func:`numpy.linalg.eigh` is used. :arg n_modes: number of non-zero eigenvalues/vectors to calculate. If ``None`` is given, all modes will be calculated. :type n_modes: int or None, default is 20 :arg zeros: If ``True``, modes with zero eigenvalues will be kept. :type zeros: bool, default is ``False`` :arg turbo: Use a memory intensive, but faster way to calculate modes. :type turbo: bool, default is ``True`` """ if self._kirchhoff is None: raise ValueError('Kirchhoff matrix is not built or set') assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \ 'n_modes must be a positive integer' assert isinstance(zeros, bool), 'zeros must be a boolean' assert isinstance(turbo, bool), 'turbo must be a boolean' linalg = importLA() start = time.time() shift = 0 if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = self._dof else: if n_modes >= self._dof: eigvals = None n_modes = self._dof else: eigvals = (0, n_modes + shift) if eigvals: turbo = False if isinstance(self._kirchhoff, np.ndarray): values, vectors = linalg.eigh(self._kirchhoff, turbo=turbo, eigvals=eigvals) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError('failed to import scipy.sparse.linalg, ' 'which is required for sparse matrix ' 'decomposition') try: values, vectors = (scipy_sparse_la.eigsh(self._kirchhoff, k=n_modes + 1, which='SA')) except: values, vectors = (scipy_sparse_la.eigen_symmetric( self._kirchhoff, k=n_modes + 1, which='SA')) else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes are calculated.') values, vectors = linalg.eigh(self._kirchhoff) n_zeros = sum(values < ZERO) if n_zeros < 1: LOGGER.warning('Less than 1 zero eigenvalues are calculated.') shift = n_zeros - 1 elif n_zeros > 1: LOGGER.warning('More than 1 zero eigenvalues are calculated.') shift = n_zeros - 1 if zeros: shift = -1 self._eigvals = values[1 + shift:] self._vars = 1 / self._eigvals self._trace = self._vars.sum() self._array = vectors[:, 1 + shift:] self._n_modes = len(self._eigvals) LOGGER.debug('{0} modes were calculated in {1:.2f}s.'.format( self._n_modes, time.time() - start))
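# Usage sketch for calcModes on a GNM (not part of the original module):
# compute all modes and keep the trivial zero mode as well. GNM and parsePDB
# are assumed to be available from prody; the PDB code is illustrative.
from prody import GNM, parsePDB

gnm = GNM('p38 GNM')
gnm.buildKirchhoff(parsePDB('1p38', subset='ca'), cutoff=10.0)
gnm.calcModes(n_modes=None, zeros=True)   # keep the zero eigenvalue mode
print(gnm.getEigvals()[:5])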
def mapOntoChain(atoms, chain, **kwargs): """Map *atoms* onto *chain*. This function returns a list of mappings. Each mapping is a tuple that contains 4 items: * Mapped chain as an :class:`.AtomMap` instance, * *chain* as an :class:`.AtomMap` instance, * Percent sequence identitity, * Percent sequence overlap Mappings are returned in decreasing percent sequence identity order. :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms in place of unmapped atoms. :arg atoms: atoms that will be mapped to the target *chain* :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg chain: chain to which atoms will be mapped :type chain: :class:`.Chain` :keyword seqid: percent sequence identity, default is 90 :type seqid: float :keyword overlap: percent overlap, default is 90 :type overlap: float :keyword pwalign: perform pairwise sequence alignment :type pwalign: bool This function tries to map *atoms* to *chain* based on residue numbers and types. Each individual chain in *atoms* is compared to target *chain*. This works well for different structures of the same protein. When it fails, :mod:`Bio.pairwise2` is used for sequence alignment, and mapping is performed based on the sequence alignment. User can control, whether sequence alignment is performed or not with *pwalign* keyword. If ``pwalign=True`` is passed, pairwise alignment is enforced.""" """ :keyword subset: "calpha" (or "ca"), "backbone" (or "bb"), or "all", default is "calpha" :type subset: string """ target_chain = chain if not isinstance(atoms, (AtomGroup, Chain, Selection)): raise TypeError('atoms must be an AtomGroup, a Chain, or a ' 'Selection instance') if not isinstance(target_chain, Chain): raise TypeError('chain must be Chain instance') subset = str(kwargs.get('subset', 'calpha')).lower() if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument' .format(str(subset))) seqid = kwargs.get('seqid', 90.) coverage = kwargs.get('overlap') if coverage is None: coverage = kwargs.get('coverage', 90.) pwalign = kwargs.get('pwalign', None) if isinstance(atoms, Chain): chains = [atoms] map_ag = atoms.getAtomGroup() else: if isinstance(atoms, AtomGroup): map_ag = atoms else: map_ag = atoms.getAtomGroup() chains = list(atoms.getHierView().iterChains()) LOGGER.debug('Evaluating {0}: {1} chains are identified' .format(str(atoms), len(chains))) if subset != 'all': target_chain = target_chain.select(subset ).getHierView()[target_chain.getChid()] mappings = [] unmapped = [] target_ag = target_chain.getAtomGroup() simple_target = SimpleChain(target_chain, True) LOGGER.debug('Trying to map atoms based on residue numbers and ' 'identities:') for chain in chains: simple_chain = SimpleChain(True) simple_chain.buildFromChain(chain) if len(simple_chain) == 0: LOGGER.debug(' Skipping {0}, which does not contain any amino ' 'acid residues.'.format(simple_chain)) continue LOGGER.debug(' Comparing {0} (len={1}) with {2}:' .format(simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) target_list, chain_list, n_match, n_mapped = getTrivialMapping( simple_target, simple_chain) if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.' 
.format(n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: LOGGER.debug('\tFailed to match chains based on residue numbers ' '(seqid={0:.0f}%, overlap={1:.0f}%).' .format(_seqid, _cover)) unmapped.append(simple_chain) if pwalign or (not mappings and (pwalign is None or pwalign)): LOGGER.debug('Trying to map atoms based on {0} sequence alignment:' .format(ALIGNMENT_METHOD)) for simple_chain in unmapped: LOGGER.debug(' Comparing {0} (len={1}) with {2}:' .format(simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) result = getAlignedMapping(simple_target, simple_chain) if result is not None: target_list, chain_list, n_match, n_mapped = result if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}%' ' sequence identity and {2:.0f}% overlap.' .format(n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).' .format(_seqid, _cover)) for mi, result in enumerate(mappings): residues_target, residues_chain, _seqid, _cover = result indices_target = [] indices_chain = [] indices_mapping = [] indices_dummies = [] counter = 0 for i in range(len(residues_target)): res_tar = residues_target[i] res_chn = residues_chain[i] for atom_tar in res_tar: indices_target.append(atom_tar.getIndex()) if res_chn is not None: atom_chn = res_chn.getAtom(atom_tar.getName()) if atom_chn is not None: indices_chain.append(atom_chn.getIndex()) indices_mapping.append(counter) else: indices_dummies.append(counter) else: indices_dummies.append(counter) counter += 1 #n_atoms = len(indices_target) atommap = AM(map_ag, indices_chain, chain.getACSIndex(), mapping=indices_mapping, dummies=indices_dummies, title=simple_chain.getTitle() + ' -> ' + simple_target.getTitle()) selection = AM(target_ag, indices_target, target_chain.getACSIndex(), title=simple_target.getTitle() + ' -> ' + simple_chain.getTitle(), intarrays=True) mappings[mi] = (atommap, selection, _seqid, _cover) if len(mappings) > 1: def compare(m1, m2): return cmp(m1[2], m2[2]) mappings.sort(compare, reverse=True) return mappings
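A usage sketch for mapOntoChain, assuming it and parsePDB are importable at package level as in ProDy; the identifiers '1p38' and '1zz2' are only illustrative structures of the same protein.

from prody import parsePDB, mapOntoChain

target_chain = parsePDB('1p38').getHierView()['A']   # chain to map onto (illustrative)
mobile = parsePDB('1zz2')                            # structure providing the atoms to be mapped
mappings = mapOntoChain(mobile, target_chain, seqid=90, overlap=90)
for atommap, selection, seq_id, cover in mappings:   # sorted by decreasing sequence identity
    print(atommap.getTitle(), seq_id, cover)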
def fetchPDBfromMirror(*pdb, **kwargs): """Returns path(s) to PDB (default), PDBML, or mmCIF file(s) for specified *pdb* identifier(s). If a *folder* is specified, files will be copied into this folder. If *compressed* is **False**, files will be decompressed. The *format* argument can be used to get `PDBML <http://pdbml.pdb.org/>`_ and `mmCIF <http://mmcif.pdb.org/>`_ files: ``format='cif'`` will fetch an mmCIF file, and ``format='xml'`` will fetch a PDBML file. If a PDBML header file is desired, ``noatom=True`` argument will do the job.""" mirror = pathPDBMirror() if mirror is None: raise IOError('no mirror path is set') try: mirror, mirror_format = mirror except ValueError: mirror_format = None format = str(kwargs.pop('format', 'pdb')).lower() if kwargs.get('check', True): identifiers = checkIdentifiers(*pdb) else: identifiers = list(pdb) if format == 'pdb': ftp_divided = 'data/structures/divided/pdb' ftp_pdbext = '.ent.gz' ftp_prefix = 'pdb' extension = '.pdb' elif format == 'xml': if bool(kwargs.pop('noatom', False)): ftp_divided = 'data/structures/divided/XML-noatom' ftp_pdbext = '-noatom.xml.gz' extension = '-noatom.xml' else: ftp_divided = 'data/structures/divided/XML' ftp_pdbext = '.xml.gz' extension = '.xml' ftp_prefix = '' elif format == 'cif': ftp_divided = 'data/structures/divided/mmCIF' ftp_pdbext = '.cif.gz' ftp_prefix = '' extension = '.cif' else: if format: raise ValueError('{0} is not a recognized format' .format(repr(format))) else: raise ValueError('please specify a valid format') if mirror_format: if mirror_format.lower() != format: raise IOError('mirror contains only ' + mirror_format + ' files') ftp_divided = '' else: ftp_divided = join(*ftp_divided.split('/')) folder = kwargs.get('folder') compressed = kwargs.get('compressed', True) filenames = [] append = filenames.append success = 0 failure = 0 for pdb in identifiers: if pdb is None: append(None) continue fn = join(mirror, ftp_divided, pdb[1:3], ftp_prefix + pdb + ftp_pdbext) if isfile(fn): if folder or not compressed: if compressed: fn = copyFile(fn, join(folder or '.', pdb + extension + '.gz')) else: fn = gunzip(fn, join(folder or '.', pdb + extension)) append(normpath(fn)) success += 1 else: append(None) failure += 1 if len(identifiers) == 1: fn = filenames[0] if kwargs.get('report', True): if success: LOGGER.debug('PDB file is found in the local mirror ({0}).' .format(sympath(fn))) return fn else: if kwargs.get('report', True): LOGGER.debug('PDB files found in the local mirror ({0} found, ' '{1} missed).'.format(success, failure)) return filenames
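A short sketch of mirror retrieval; it assumes a local wwPDB mirror has already been registered via pathPDBMirror, and both the path and the identifier below are hypothetical.

from prody import pathPDBMirror, fetchPDBfromMirror

pathPDBMirror('/data/wwpdb')                   # hypothetical mirror location
fn = fetchPDBfromMirror('1ubi', format='pdb', folder='.', compressed=False)
print(fn)                                      # decompressed copy in the current folder, or None if not in the mirror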
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = "{http://pfam.xfam.org/}" query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = "".join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError("could not parse a sequence without gaps from " + query) else: seq = "".join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit("_pfam") timeout = int(kwargs.get("timeout", 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + " is not a valid sequence") fseq = ">Seq\n" + seq parameters = {"hmmdb": "pfam", "seq": fseq} enc_params = urllib.urlencode(parameters) request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params) url = urllib2.urlopen(request).geturl() + "?output=xml" LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError("failed to parse results XML, check URL: " + url) matches = {} for child in root[0]: if child.tag == "hits": accession = child.get("acc") pfam_id = accession.split(".")[0] matches[pfam_id] = {} matches[pfam_id]["accession"] = accession matches[pfam_id]["class"] = "Domain" matches[pfam_id]["id"] = child.get("name") matches[pfam_id]["locations"] = {} matches[pfam_id]["locations"]["ali_end"] = child[0].get("alisqto") matches[pfam_id]["locations"]["ali_start"] = child[0].get("alisqfrom") matches[pfam_id]["locations"]["bitscore"] = child[0].get("bitscore") matches[pfam_id]["locations"]["end"] = child[0].get("alisqto") matches[pfam_id]["locations"]["evalue"] = child.get("evalue") matches[pfam_id]["locations"]["evidence"] = "hmmer v3.0" matches[pfam_id]["locations"]["hmm_end"] = child[0].get("alihmmto") matches[pfam_id]["locations"]["hmm_start"] = child[0].get("alihmmfrom") matches[pfam_id]["locations"]["significant"] = child[0].get("significant") matches[pfam_id]["locations"]["start"] = child[0].get("alisqfrom") matches[pfam_id]["type"] = "Pfam-A" return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], "polymers") except Exception as err: LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != "UniProt": continue idcode = dbref.idcode LOGGER.info( "UniProt ID code {0} for {1} chain " "{2} will be used.".format(idcode, seq[:4], poly.chid) ) break if idcode is not None: break if idcode is None: LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq))) url = "http://pfam.xfam.org/protein/" + seq + "?output=xml" else: url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml" else: url = 
"http://pfam.xfam.org/protein/" + seq + "?output=xml" LOGGER.debug("Retrieving Pfam search results: " + url) xml = None while LOGGER.timing("_pfam") < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError("Pfam search timed out or failed to parse results " "XML, check URL: " + url) else: LOGGER.report("Pfam search completed in %.2fs.", "_pfam") if xml.find(b"There was a system error on your last request.") > 0: LOGGER.warn("No Pfam matches found for: " + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError("failed to parse results XML, check URL: " + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError("failed to parse results XML, check URL: " + url) else: results = dictElement(root[0], prefix) try: xml_matches = results["matches"] except KeyError: raise ValueError("failed to parse results XML, check URL: " + url) matches = dict() for child in xml_matches: try: accession = child.attrib["accession"][:7] except KeyError: raise ValueError("failed to parse results XML, check URL: " + url) if not re.search("^P(F|B)[0-9]{5}$", accession): raise ValueError("{0} does not match pfam accession" " format".format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault("locations", []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = "Query " + repr(query) else: query = "Query sequence" if matches: LOGGER.info(query + " matched {0} Pfam families.".format(len(matches))) else: LOGGER.info(query + " did not match any Pfam families.") return matches
def matchChains(atoms1, atoms2, **kwargs): """Returns pairs of chains matched based on sequence similarity. Makes an all-to-all comparison of chains in *atoms1* and *atoms2*. Chains are obtained from hierarchical views (:class:`.HierView`) of atom groups. This function returns a list of matching chains in a tuples that contain 4 items: * matching chain from *atoms1* as a :class:`.AtomMap` instance, * matching chain from *atoms2* as a :class:`.AtomMap` instance, * percent sequence identity of the match, * percent sequence overlap of the match. List of matches are sorted in decreasing percent sequence identity order. :class:`.AtomMap` instances can be used to calculate RMSD values and superpose atom groups. :arg atoms1: atoms that contain a chain :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg atoms2: atoms that contain a chain :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :keyword subset: one of the following well-defined subsets of atoms: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``), ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"`` :type subset: string :keyword seqid: percent sequence identity, default is 90 :type seqid: float :keyword overlap: percent overlap, default is 90 :type overlap: float :keyword pwalign: perform pairwise sequence alignment :type pwalign: bool If *subset* is set to *calpha* or *backbone*, only alpha carbon atoms or backbone atoms will be paired. If set to *all*, all atoms common to matched residues will be returned. This function tries to match chains based on residue numbers and names. All chains in *atoms1* is compared to all chains in *atoms2*. This works well for different structures of the same protein. When it fails, :mod:`Bio.pairwise2` is used for pairwise sequence alignment, and matching is performed based on the sequence alignment. User can control, whether sequence alignment is performed or not with *pwalign* keyword. If ``pwalign=True`` is passed, pairwise alignment is enforced.""" if not isinstance(atoms1, (AtomGroup, Chain, Selection)): raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection') if not isinstance(atoms2, (AtomGroup, Chain, Selection)): raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection') subset = kwargs.get('subset', 'calpha') if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument'.format( str(subset))) seqid = kwargs.get('seqid', 90.) assert isinstance(seqid, (float, int)), 'seqid must be float' assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100' coverage = kwargs.get('overlap') if coverage is None: coverage = kwargs.get('coverage', 90.) 
assert isinstance(coverage, (float, int)), 'overlap must be float' assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100' pwalign = kwargs.get('pwalign', None) if isinstance(atoms1, Chain): chains1 = [atoms1] atoms1 = atoms1.getAtomGroup() else: chains1 = list(atoms1.getHierView().iterChains()) if not isinstance(atoms1, AtomGroup): atoms1 = atoms1.getAtomGroup() chains = list() for ch in chains1: simpch = SimpleChain(ch) if len(simpch) > 0: chains.append(simpch) chains1 = chains if not isinstance(atoms1, Chain): LOGGER.debug('Checking {0}: {1} chains are identified'.format( str(atoms1), len(chains1))) if isinstance(atoms2, Chain): chains2 = [atoms2] atoms2 = atoms2.getAtomGroup() else: chains2 = list(atoms2.getHierView().iterChains()) if not isinstance(atoms2, AtomGroup): atoms2 = atoms2.getAtomGroup() chains = list() for ch in chains2: simpch = SimpleChain(ch) if len(simpch) > 0: chains.append(simpch) chains2 = chains if not isinstance(atoms2, Chain): LOGGER.debug('Checking {0}: {1} chains are identified'.format( str(atoms2), len(chains2))) matches = [] unmatched = [] LOGGER.debug('Trying to match chains based on residue numbers and names:') for simpch1 in chains1: for simpch2 in chains2: LOGGER.debug(' Comparing {0} (len={1}) and {2} (len={3}):'.format( simpch1.getTitle(), len(simpch1), simpch2.getTitle(), len(simpch2))) match1, match2, nmatches = getTrivialMatch(simpch1, simpch2) _seqid = nmatches * 100 / min(len(simpch1), len(simpch2)) _cover = len(match2) * 100 / max(len(simpch1), len(simpch2)) if _seqid >= seqid and _cover >= coverage: LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.'.format( len(match1), _seqid, _cover)) matches.append( (match1, match2, _seqid, _cover, simpch1, simpch2)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).'.format(_seqid, _cover)) unmatched.append((simpch1, simpch2)) if pwalign or (not matches and (pwalign is None or pwalign)): pairwise2 = importBioPairwise2() if pairwise2: LOGGER.debug('Trying to match chains based on {0} sequence ' 'alignment:'.format(ALIGNMENT_METHOD)) for simpch1, simpch2 in unmatched: LOGGER.debug(' Comparing {0} (len={1}) and {2} ' '(len={3}):'.format(simpch1.getTitle(), len(simpch1), simpch2.getTitle(), len(simpch2))) match1, match2, nmatches = getAlignedMatch(simpch1, simpch2) _seqid = nmatches * 100 / min(len(simpch1), len(simpch2)) _cover = len(match2) * 100 / max(len(simpch1), len(simpch2)) if _seqid >= seqid and _cover >= coverage: LOGGER.debug( '\tMatch: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.'.format( len(match1), _seqid, _cover)) matches.append( (match1, match2, _seqid, _cover, simpch1, simpch2)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).'.format(_seqid, _cover)) else: LOGGER.warning('Pairwise alignment could not be performed.') if not matches: return None subset = _SUBSETS[subset] for mi, result in enumerate(matches): match1, match2, _seqid, _cover, simpch1, simpch2 = result indices1 = [] indices2 = [] for i in range(len(match1)): ares = match1[i] bres = match2[i] if subset == 'ca': try: aid = ares.getNames().tolist().index('CA') except ValueError: aid = None try: bid = bres.getNames().tolist().index('CA') if aid is not None: indices1.append(ares._indices[aid]) indices2.append(bres._indices[bid]) except ValueError: pass elif subset == 'bb': for bban in ('N', 'CA', 'C', 'O'): try: aid = ares.getNames().tolist().index(bban) except ValueError: 
continue try: bid = bres.getNames().tolist().index(bban) except ValueError: continue else: indices1.append(ares._indices[aid]) indices2.append(bres._indices[bid]) elif subset == 'noh': for han, aid, noh in zip(ares.getNames(), ares._indices, ares.getFlags('noh')): if not noh: continue try: bid = bres.getNames().tolist().index(han) except ValueError: continue else: indices1.append(aid) indices2.append(bres._indices[bid]) elif subset is None or subset == 'all': aans = ares.getNames() bans = bres.getNames().tolist() aids = ares.getIndices() #bids = bres.getIndices() for j in range(len(aans)): try: bid = bres._indices[bans.index(aans[j])] indices1.append(aids[j]) indices2.append(bid) except ValueError: pass indices1 = np.array(indices1, int) indices2 = np.array(indices2, int) match1 = AM(atoms1, indices1, atoms1.getACSIndex(), title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(), intarrays=True) match2 = AM(atoms2, indices2, atoms2.getACSIndex(), title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(), intarrays=True) matches[mi] = (match1, match2, _seqid, _cover) if len(matches) > 1: matches.sort(key=lambda m: m[2], reverse=True) return matches
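A sketch of typical use of matchChains, assuming it, parsePDB and calcRMSD are available at package level as in ProDy; the identifiers are illustrative structures of the same protein.

from prody import parsePDB, matchChains, calcRMSD

struct1 = parsePDB('1p38')                     # illustrative identifiers
struct2 = parsePDB('1zz2')
matches = matchChains(struct1, struct2, subset='calpha', seqid=90, overlap=90)
if matches:
    map1, map2, seq_id, cover = matches[0]     # best pair by sequence identity
    print(map1.numAtoms(), seq_id, cover)
    print(calcRMSD(map1, map2))                # RMSD of matched C-alpha atoms, no superposition applied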
def fetchPDB(*pdb, **kwargs): """Returns path(s) to PDB file(s) for specified *pdb* identifier(s). Files will be sought in user specified *folder* or current working director, and then in local PDB folder and mirror, if they are available. If *copy* is set **True**, files will be copied into *folder*. If *compressed* is **False**, all files will be decompressed. See :func:`pathPDBFolder` and :func:`pathPDBMirror` for managing local resources, :func:`.fetchPDBviaFTP` and :func:`.fetchPDBviaFTP` for downloading files from PDB servers.""" if len(pdb) == 1 and isinstance(pdb[0], list): pdb = pdb[0] if 'format' in kwargs and kwargs.get('format') != 'pdb': return fetchPDBviaFTP(*pdb, **kwargs) identifiers = checkIdentifiers(*pdb) folder = kwargs.get('folder', '.') compressed = kwargs.get('compressed') # check *folder* specified by the user, usually pwd ('.') filedict = findPDBFiles(folder, compressed=compressed) filenames = [] not_found = [] exists = 0 for i, pdb in enumerate(identifiers): if pdb is None: filenames.append(None) elif pdb in filedict: filenames.append(filedict[pdb]) exists += 1 else: filenames.append(None) not_found.append((i, pdb)) if not not_found: if len(filenames) == 1: filenames = filenames[0] if exists: LOGGER.debug( 'PDB file is found in working directory ({0}).'.format( sympath(filenames))) return filenames if not isWritable(folder): raise IOError('permission to write in {0} is denied, please ' 'specify another folder'.format(folder)) if compressed is not None and not compressed: filedict = findPDBFiles(folder, compressed=True) not_found, decompress = [], not_found for i, pdb in decompress: if pdb in filedict: fn = filedict[pdb] filenames[i] = gunzip(fn, splitext(fn)[0]) else: not_found.append((i, pdb)) if not not_found: return filenames[0] if len(identifiers) == 1 else filenames local_folder = pathPDBFolder() copy = kwargs.setdefault('copy', False) if local_folder: local_folder, is_divided = local_folder temp, not_found = not_found, [] for i, pdb in temp: if is_divided: fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz') else: fn = join(local_folder, pdb + '.pdb.gz') if isfile(fn): if copy or not compressed and compressed is not None: if compressed: fn = copyFile(fn, join(folder, pdb + 'pdb.gz')) else: fn = gunzip(fn, join(folder, pdb + '.pdb')) filenames[i] = normpath(fn) else: not_found.append((i, pdb)) if not not_found: if len(identifiers) == 1: fn = filenames[0] items = fn.split(pathsep) if len(items) > 5: fndisp = pathsep.join(items[:3] + ['...'] + items[-1:]) else: fndisp = relpath(fn) LOGGER.debug( 'PDB file is found in the local folder ({0}).'.format(fndisp)) return fn else: return filenames if kwargs['copy'] or (compressed is not None and not compressed): kwargs['folder'] = folder downloads = [pdb for i, pdb in not_found] fns = None try: fns = fetchPDBfromMirror(*downloads, **kwargs) except IOError: pass else: if len(downloads) == 1: fns = [fns] temp, not_found = not_found, [] for i, fn in enumerate(fns): if fn is None: not_found.append(temp[i]) else: i, _ = temp[i] filenames[i] = fn if not not_found: return filenames[0] if len(identifiers) == 1 else filenames if fns: downloads = [pdb for i, pdb in not_found] fns = None tp = kwargs.pop('tp', None) if tp is not None: tp = tp.lower() if tp == 'http': try: fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs) except Exception as err: LOGGER.warn('Downloading PDB files via HTTP failed ' '({0}).'.format(str(err))) elif tp == 'ftp': try: fns = fetchPDBviaFTP(*downloads, check=False, **kwargs) except Exception 
as err: LOGGER.warn('Downloading PDB files via FTP failed ' '({0}).'.format(str(err))) else: tryHTTP = False try: fns = fetchPDBviaFTP(*downloads, check=False, **kwargs) except Exception as err: tryHTTP = True if fns is None or isinstance(fns, list) and None in fns: tryHTTP = True elif isinstance(fns, list): downloads = [ not_found[i][1] for i in range(len(fns)) if fns[i] is None ] if len(downloads) > 0: tryHTTP = True if tryHTTP: LOGGER.info('Downloading PDB files via FTP failed, ' 'trying HTTP.') try: fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs) except Exception as err: LOGGER.warn('Downloading PDB files via HTTP also failed ' '({0}).'.format(str(err))) if len(downloads) == 1: fns = [fns] if fns: for i, fn in zip([i for i, pdb in not_found], fns): filenames[i] = fn return filenames[0] if len(identifiers) == 1 else filenames
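A short sketch of the retrieval cascade above (working directory, then local folder and mirror, then FTP with an HTTP fallback); the identifiers are illustrative and the tp keyword is specific to this variant.

from prody import fetchPDB

fn = fetchPDB('1ubi', compressed=False)                    # single identifier in, single path out
fns = fetchPDB('1p38', '1zz2', folder='pdbs', copy=True)   # several identifiers in, list of paths out
fn_http = fetchPDB('2k39', tp='http')                      # force the HTTP route in this variant
print(fn, fns, fn_http)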
def solveEig(M, n_modes=None, zeros=False, turbo=True, is3d=False): linalg = importLA() dof = M.shape[0] expct_n_zeros = 6 if is3d else 1 if n_modes is None: eigvals = None n_modes = dof else: if n_modes >= dof: eigvals = None n_modes = dof else: eigvals = (0, n_modes+expct_n_zeros-1) def _eigh(M, eigvals=None, turbo=True): if linalg.__package__.startswith('scipy'): from scipy.sparse import issparse if eigvals: turbo = False if not issparse(M): values, vectors = linalg.eigh(M, turbo=turbo, eigvals=eigvals) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError('failed to import scipy.sparse.linalg, ' 'which is required for sparse matrix ' 'decomposition') if eigvals: j = eigvals[0] k = eigvals[-1] + 1 else: j = 0 k = dof if k >= dof: k -= 1 LOGGER.warning('Cannot calculate all eigenvalues for sparse matrices, thus ' 'the last eigenvalue is omitted. See scipy.sparse.linalg.eigsh ' 'for more information') values, vectors = scipy_sparse_la.eigsh(M, k=k, which='SA') values = values[j:k] vectors = vectors[:, j:k] else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes were calculated.') else: n_modes = dof values, vectors = linalg.eigh(M) return values, vectors def _calc_n_zero_modes(M): from scipy.sparse import issparse if not issparse(M): w = linalg.eigvalsh(M) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError('failed to import scipy.sparse.linalg, ' 'which is required for sparse matrix ' 'decomposition') w, _ = scipy_sparse_la.eigsh(M, k=dof-1, which='SA') n_zeros = sum(w < ZERO) return n_zeros values, vectors = _eigh(M, eigvals, turbo) n_zeros = sum(values < ZERO) if n_zeros < n_modes + expct_n_zeros: if n_zeros < expct_n_zeros: LOGGER.warning('Fewer than %d (%d) zero eigenvalues were calculated.'%(expct_n_zeros, n_zeros)) elif n_zeros > expct_n_zeros: LOGGER.warning('More than %d (%d) zero eigenvalues were calculated.'%(expct_n_zeros, n_zeros)) else: LOGGER.warning('More than %d zero eigenvalues were detected.'%expct_n_zeros) if not zeros: if n_zeros > expct_n_zeros: if n_zeros == n_modes + expct_n_zeros and n_modes != dof: LOGGER.debug('Determing the number of zero eigenvalues...') # find the actual number of zero modes n_zeros = _calc_n_zero_modes(M) LOGGER.debug('%d zero eigenvalues detected.'%n_zeros) LOGGER.debug('Solving for additional eigenvalues...') start = min(n_modes+expct_n_zeros, dof-1); end = min(n_modes+n_zeros-1, dof-1) values_, vectors_ = _eigh(M, eigvals=(start, end)) values = np.concatenate((values, values_)) vectors = np.hstack((vectors, vectors_)) # final_n_modes may exceed len(eigvals) - no need to fix for the sake of the simplicity of the code final_n_modes = n_zeros + n_modes eigvals = values[n_zeros:final_n_modes] eigvecs = vectors[:, n_zeros:final_n_modes] vars = 1 / eigvals else: eigvals = values[:n_modes] eigvecs = vectors[:, :n_modes] vars = div0(1, values) vars[:n_zeros] = 0. vars = vars[:n_modes] return eigvals, eigvecs, vars
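A minimal numerical sketch of calling solveEig directly on a small symmetric matrix; it assumes solveEig and its module-level helpers (importLA, ZERO, div0, LOGGER) are in the current namespace, e.g. as defined above. The 4x4 path-graph Laplacian stands in for a GNM Kirchhoff matrix, so one zero eigenvalue is expected with is3d=False.

import numpy as np

# tiny stand-in for a Kirchhoff matrix: Laplacian of a 4-node path graph
K = np.array([[ 1., -1.,  0.,  0.],
              [-1.,  2., -1.,  0.],
              [ 0., -1.,  2., -1.],
              [ 0.,  0., -1.,  1.]])
eigvals, eigvecs, vars = solveEig(K, n_modes=2, zeros=False, turbo=True, is3d=False)
print(eigvals)   # eigenvalues for the requested modes
print(vars)      # variances (inverse eigenvalues; zero modes are assigned 0)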
def calcModes(self, n_modes=20, zeros=False, turbo=True, hinges=True): """Calculate normal modes. This method uses :func:`scipy.linalg.eigh` function to diagonalize the Kirchhoff matrix. When Scipy is not found, :func:`numpy.linalg.eigh` is used. :arg n_modes: number of non-zero eigenvalues/vectors to calculate. If ``None`` is given, all modes will be calculated. :type n_modes: int or None, default is 20 :arg zeros: If ``True``, modes with zero eigenvalues will be kept. :type zeros: bool, default is ``False`` :arg turbo: Use a memory intensive, but faster way to calculate modes. :type turbo: bool, default is ``True`` :arg hinges: Identify hinge sites after modes are computed. :type hinges: bool, default is ``True`` """ if self._kirchhoff is None: raise ValueError('Kirchhoff matrix is not built or set') assert n_modes is None or isinstance(n_modes, int) and n_modes > 0, \ 'n_modes must be a positive integer' assert isinstance(zeros, bool), 'zeros must be a boolean' assert isinstance(turbo, bool), 'turbo must be a boolean' linalg = importLA() start = time.time() shift = 0 if linalg.__package__.startswith('scipy'): if n_modes is None: eigvals = None n_modes = self._dof else: if n_modes >= self._dof: eigvals = None n_modes = self._dof else: eigvals = (0, n_modes + shift) if eigvals: turbo = False if isinstance(self._kirchhoff, np.ndarray): values, vectors = linalg.eigh(self._kirchhoff, turbo=turbo, eigvals=eigvals) else: try: from scipy.sparse import linalg as scipy_sparse_la except ImportError: raise ImportError('failed to import scipy.sparse.linalg, ' 'which is required for sparse matrix ' 'decomposition') try: values, vectors = ( scipy_sparse_la.eigsh(self._kirchhoff, k=n_modes + 1, which='SA')) except: values, vectors = ( scipy_sparse_la.eigen_symmetric(self._kirchhoff, k=n_modes + 1, which='SA')) else: if n_modes is not None: LOGGER.info('Scipy is not found, all modes are calculated.') values, vectors = linalg.eigh(self._kirchhoff) n_zeros = sum(values < ZERO) if n_zeros < 1: LOGGER.warning('Less than 1 zero eigenvalues are calculated.') shift = n_zeros - 1 elif n_zeros > 1: LOGGER.warning('More than 1 zero eigenvalues are calculated.') shift = n_zeros - 1 if zeros: shift = -1 self._eigvals = values[1+shift:] self._vars = 1 / self._eigvals self._trace = self._vars.sum() self._array = vectors[:, 1+shift:] self._n_modes = len(self._eigvals) if hinges: self.calcHinges() LOGGER.debug('{0} modes were calculated in {1:.2f}s.' .format(self._n_modes, time.time()-start))
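The hinge-aware variant only adds a post-processing call. A sketch assuming the class also exposes calcHinges() and a getHinges() accessor, as implied above but not shown in this excerpt; the identifier is illustrative.

from prody import parsePDB, GNM

calphas = parsePDB('1ubi', subset='ca')
gnm = GNM('1ubi')
gnm.buildKirchhoff(calphas)
gnm.calcModes(n_modes=10, hinges=True)    # hinge sites are identified right after diagonalization
print(gnm.getHinges())                    # assumed accessor returning hinge residue indices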
def searchPfam(query, **kwargs): """Return Pfam search results in a dictionary. Matching Pfam accession as keys will map to evalue, alignment start and end residue positions. :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence file, sequence queries must not contain without gaps and must be at least 16 characters long :type query: str :arg timeout: timeout for blocking connection attempt in seconds, default is 60 :type timeout: int *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with chain identifier. UniProt ID of the specified chain, or the first protein chain will be used for searching the Pfam database.""" prefix = '{http://pfam.xfam.org/}' query = str(query) if isfile(query): from prody.sequence import MSAFile try: seq = next(MSAFile(query)) except: with openFile(query) as inp: seq = ''.join(inp.read().split()) else: seq = seq[0][1] if not seq.isalpha(): raise ValueError('could not parse a sequence without gaps from ' + query) else: seq = ''.join(query.split()) import xml.etree.cElementTree as ET LOGGER.timeit('_pfam') timeout = int(kwargs.get('timeout', 60)) if len(seq) >= MINSEQLEN: if not seq.isalpha(): raise ValueError(repr(seq) + ' is not a valid sequence') fseq = '>Seq\n' + seq parameters = { 'hmmdb' : 'pfam', 'seq': fseq } enc_params = urllib.urlencode(parameters) request = urllib2.Request('http://hmmer.janelia.org/search/hmmscan', enc_params) url = ( urllib2.urlopen(request).geturl() + '?output=xml') LOGGER.debug('Submitted Pfam search for sequence "{0}...".' .format(seq[:MINSEQLEN])) xml = openURL(url, timeout=timeout).read() try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) matches = {} for child in root[0]: if child.tag == 'hits': accession = child.get('acc') pfam_id = accession.split('.')[0] matches[pfam_id]={} matches[pfam_id]['accession']=accession matches[pfam_id]['class']='Domain' matches[pfam_id]['id']=child.get('name') matches[pfam_id]['locations']={} matches[pfam_id]['locations']['ali_end']=child[0].get('alisqto') matches[pfam_id]['locations']['ali_start']=child[0].get('alisqfrom') matches[pfam_id]['locations']['bitscore']=child[0].get('bitscore') matches[pfam_id]['locations']['end']=child[0].get('alisqto') matches[pfam_id]['locations']['evalue']=child.get('evalue') matches[pfam_id]['locations']['evidence']='hmmer v3.0' matches[pfam_id]['locations']['hmm_end']=child[0].get('alihmmto') matches[pfam_id]['locations']['hmm_start']=child[0].get('alihmmfrom') matches[pfam_id]['locations']['significant']=child[0].get('significant') matches[pfam_id]['locations']['start']=child[0].get('alisqfrom') matches[pfam_id]['type']='Pfam-A' return matches else: if len(seq) <= 5: idcode = None from prody import parsePDBHeader try: polymers = parsePDBHeader(seq[:4], 'polymers') except Exception as err: LOGGER.warn('failed to parse header for {0} ({1})' .format(seq[:4], str(err))) else: chid = seq[4:].upper() for poly in polymers: if chid and poly.chid != chid: continue for dbref in poly.dbrefs: if dbref.database != 'UniProt': continue idcode = dbref.idcode LOGGER.info('UniProt ID code {0} for {1} chain ' '{2} will be used.' 
.format(idcode, seq[:4], poly.chid)) break if idcode is not None: break if idcode is None: LOGGER.warn('A UniProt ID code for PDB {0} could not be ' 'parsed.'.format(repr(seq))) url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' else: url = ('http://pfam.xfam.org/protein/' + idcode + '?output=xml') else: url = 'http://pfam.xfam.org/protein/' + seq + '?output=xml' LOGGER.debug('Retrieving Pfam search results: ' + url) xml = None while LOGGER.timing('_pfam') < timeout: try: xml = openURL(url, timeout=timeout).read() except Exception: pass else: if xml: break if not xml: raise IOError('Pfam search timed out or failed to parse results ' 'XML, check URL: ' + url) else: LOGGER.report('Pfam search completed in %.2fs.', '_pfam') if xml.find(b'There was a system error on your last request.') > 0: LOGGER.warn('No Pfam matches found for: ' + seq) return None try: root = ET.XML(xml) except Exception as err: raise ValueError('failed to parse results XML, check URL: ' + url) if len(seq) >= MINSEQLEN: try: xml_matches = root[0][0][0][0] except IndexError: raise ValueError('failed to parse results XML, check URL: ' + url) else: results = dictElement(root[0], prefix) try: xml_matches = results['matches'] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) matches = dict() for child in xml_matches: try: accession = child.attrib['accession'][:7] except KeyError: raise ValueError('failed to parse results XML, check URL: ' + url) if not re.search('^P(F|B)[0-9]{5}$', accession): raise ValueError('{0} does not match pfam accession' ' format'.format(accession)) match = matches.setdefault(accession, dict(child.items())) locations = match.setdefault('locations', []) for loc in child: locations.append(dict(loc.items())) if len(seq) < MINSEQLEN: query = 'Query ' + repr(query) else: query = 'Query sequence' if matches: LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches))) else: LOGGER.info(query + ' did not match any Pfam families.') return matches
def fetchPDB(*pdb, **kwargs): """Returns path(s) to PDB file(s) for specified *pdb* identifier(s). Files will be sought in user specified *folder* or current working director, and then in local PDB folder and mirror, if they are available. If *copy* is set **True**, files will be copied into *folder*. If *compressed* is **False**, all files will be decompressed. See :func:`pathPDBFolder` and :func:`pathPDBMirror` for managing local resources, :func:`.fetchPDBviaFTP` and :func:`.fetchPDBviaFTP` for downloading files from PDB servers.""" if len(pdb) == 1 and isinstance(pdb[0], list): pdb = pdb[0] if 'format' in kwargs and kwargs.get('format') != 'pdb': return fetchPDBviaFTP(*pdb, **kwargs) identifiers = checkIdentifiers(*pdb) folder = kwargs.get('folder', '.') compressed = kwargs.get('compressed') # check *folder* specified by the user, usually pwd ('.') filedict = findPDBFiles(folder, compressed=compressed) filenames = [] not_found = [] exists = 0 for i, pdb in enumerate(identifiers): if pdb is None: filenames.append(None) elif pdb in filedict: filenames.append(filedict[pdb]) exists += 1 else: filenames.append(None) not_found.append((i, pdb)) if not not_found: if len(filenames) == 1: filenames = filenames[0] if exists: LOGGER.debug('PDB file is found in working directory ({0}).' .format(sympath(filenames))) return filenames if not isWritable(folder): raise IOError('permission to write in {0} is denied, please ' 'specify another folder'.format(folder)) if compressed is not None and not compressed: filedict = findPDBFiles(folder, compressed=True) not_found, decompress = [], not_found for i, pdb in decompress: if pdb in filedict: fn = filedict[pdb] filenames[i] = gunzip(fn, splitext(fn)[0]) else: not_found.append((i, pdb)) if not not_found: return filenames[0] if len(identifiers) == 1 else filenames local_folder = pathPDBFolder() copy = kwargs.setdefault('copy', False) if local_folder: local_folder, is_divided = local_folder temp, not_found = not_found, [] for i, pdb in temp: if is_divided: fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz') else: fn = join(local_folder, pdb + '.pdb.gz') if isfile(fn): if copy or not compressed and compressed is not None: if compressed: fn = copyFile(fn, join(folder, pdb + 'pdb.gz')) else: fn = gunzip(fn, join(folder, pdb + '.pdb')) filenames[i] = normpath(fn) else: not_found.append((i, pdb)) if not not_found: if len(identifiers) == 1: fn = filenames[0] if kwargs.get('report', True): items = fn.split(pathsep) if len(items) > 5: fndisp = pathsep.join(items[:3] + ['...'] + items[-1:]) else: fndisp = relpath(fn) LOGGER.debug('PDB file is found in the local folder ({0}).' 
.format(fndisp)) return fn else: return filenames if kwargs['copy'] or (compressed is not None and not compressed): kwargs['folder'] = folder downloads = [pdb for i, pdb in not_found] fns = None try: fns = fetchPDBfromMirror(*downloads, **kwargs) except IOError: pass else: if len(downloads) == 1: fns = [fns] temp, not_found = not_found, [] for i, fn in enumerate(fns): if fn is None: not_found.append(temp[i]) else: i, _ = temp[i] filenames[i] = fn if not not_found: return filenames[0] if len(identifiers) == 1 else filenames if fns: downloads = [pdb for i, pdb in not_found] fns = None try: fns = fetchPDBviaFTP(*downloads, check=False, **kwargs) except Exception as err: LOGGER.warn('Downloading PDB files via FTP failed ({0}), ' 'trying HTTP.'.format(str(err))) try: fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs) except Exception as err: LOGGER.warn('Downloading PDB files via HTTP also failed ' '({0}).'.format(str(err))) if len(downloads) == 1: fns = [fns] if fns: for i, fn in zip([i for i, pdb in not_found], fns): filenames[i] = fn return filenames[0] if len(identifiers) == 1 else filenames
def fetchPDBviaHTTP(*pdb, **kwargs): """Retrieve PDB file(s) for specified *pdb* identifier(s) and return path(s). Downloaded files will be stored in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and copied into *folder*, if specified by the user. If no destination folder is specified, files will be saved in the current working directory. If *compressed* is **False**, decompressed files will be copied into *folder*.""" if kwargs.get('check', True): identifiers = checkIdentifiers(*pdb) else: identifiers = list(pdb) output_folder = kwargs.pop('folder', None) compressed = bool(kwargs.pop('compressed', True)) extension = '.pdb' local_folder = pathPDBFolder() if local_folder: local_folder, is_divided = local_folder if is_divided: getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])), 'pdb' + pdb + '.pdb.gz') else: getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz') if output_folder is None: second = lambda filename, pdb: filename else: if compressed: second = lambda filename, pdb: (copyFile(filename, join(output_folder, pdb + extension + '.gz'))) else: second = lambda filename, pdb: gunzip(filename, join(output_folder, pdb + extension)) else: if output_folder is None: output_folder = getcwd() if compressed: getPath = lambda pdb: join(output_folder, pdb + extension + '.gz') second = lambda filename, pdb: filename else: getPath = lambda pdb: join(output_folder, pdb + extension) second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb)) getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us'] success = 0 failure = 0 filenames = [] for pdb in identifiers: if pdb is None: filenames.append(None) continue try: handle = openURL(getURL(pdb)) except Exception as err: LOGGER.warn('{0} download failed ({1}).'.format(pdb, str(err))) failure += 1 filenames.append(None) else: data = handle.read() if len(data): filename = getPath(pdb) with open(filename, 'w+b') as pdbfile: pdbfile.write(data) filename = normpath(relpath(second(filename, pdb))) LOGGER.debug('{0} downloaded ({1})' .format(pdb, sympath(filename))) success += 1 filenames.append(filename) else: LOGGER.warn('{0} download failed, reason unknown.' .format(pdb)) failure += 1 filenames.append(None) LOGGER.debug('PDB download via HTTP completed ({0} downloaded, ' '{1} failed).'.format(success, failure)) if len(identifiers) == 1: return filenames[0] else: return filenames
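A brief sketch of direct HTTP retrieval; it assumes a wwPDB server has been selected with wwPDBServer (otherwise the 'us' fallback above is used), and the identifiers and folder are illustrative.

from prody import wwPDBServer, fetchPDBviaHTTP

wwPDBServer('us')                          # optional; matches the default used above
paths = fetchPDBviaHTTP('1ubi', '2k39', folder='pdbs', compressed=False)
print(paths)                               # decompressed local paths, with None for failed downloads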
def mapOntoChain(atoms, chain, **kwargs): """Map *atoms* onto *chain*. This function returns a list of mappings. Each mapping is a tuple that contains 4 items: * Mapped chain as an :class:`.AtomMap` instance, * *chain* as an :class:`.AtomMap` instance, * Percent sequence identitity, * Percent sequence overlap Mappings are returned in decreasing percent sequence identity order. :class:`.AtomMap` that keeps mapped atom indices contains dummy atoms in place of unmapped atoms. :arg atoms: atoms that will be mapped to the target *chain* :type atoms: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection` :arg chain: chain to which atoms will be mapped :type chain: :class:`.Chain` :keyword subset: one of the following well-defined subsets of atoms: ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``), ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"`` :type subset: string :keyword seqid: percent sequence identity, default is 90 :type seqid: float :keyword overlap: percent overlap, default is 90 :type overlap: float :keyword pwalign: perform pairwise sequence alignment :type pwalign: bool :keyword fast: get rid of verbosity and just returns sequence identity. :type fast: bool This function tries to map *atoms* to *chain* based on residue numbers and types. Each individual chain in *atoms* is compared to target *chain*. This works well for different structures of the same protein. When it fails, :mod:`Bio.pairwise2` is used for sequence alignment, and mapping is performed based on the sequence alignment. User can control, whether sequence alignment is performed or not with *pwalign* keyword. If ``pwalign=True`` is passed, pairwise alignment is enforced.""" target_chain = chain if not isinstance(atoms, (AtomGroup, Chain, Selection)): raise TypeError('atoms must be an AtomGroup, a Chain, or a ' 'Selection instance') if not isinstance(target_chain, Chain): raise TypeError('chain must be Chain instance') subset = str(kwargs.get('subset', 'calpha')).lower() if subset not in _SUBSETS: raise ValueError('{0} is not a valid subset argument'.format( str(subset))) seqid = kwargs.get('seqid', 90.) coverage = kwargs.get('overlap') if coverage is None: coverage = kwargs.get('coverage', 70.) 
pwalign = kwargs.get('pwalign', None) fast = kwargs.get('fast', False) if isinstance(atoms, Chain): chains = [atoms] map_ag = atoms.getAtomGroup() else: if isinstance(atoms, AtomGroup): map_ag = atoms else: map_ag = atoms.getAtomGroup() chains = list(atoms.getHierView().iterChains()) LOGGER.debug('Evaluating {0}: {1} chains are identified'.format( str(atoms), len(chains))) if subset != 'all': target_chain = target_chain.select(subset).getHierView()[ target_chain.getChid()] mappings = [] unmapped = [] target_ag = target_chain.getAtomGroup() simple_target = SimpleChain(target_chain, True) if fast is False: LOGGER.debug('Trying to map atoms based on residue numbers and ' 'identities:') for chain in chains: simple_chain = SimpleChain(True) simple_chain.buildFromChain(chain) if len(simple_chain) == 0: if fast is False: LOGGER.debug( ' Skipping {0}, which does not contain any amino ' 'acid residues.'.format(simple_chain)) continue if fast is False: LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format( simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) target_list, chain_list, n_match, n_mapped = getTrivialMapping( simple_target, simple_chain) if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: if fast is False: LOGGER.debug('\tMapped: {0} residues match with {1:.0f}% ' 'sequence identity and {2:.0f}% overlap.'.format( n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: if fast is False: LOGGER.debug( '\tFailed to match chains based on residue numbers ' '(seqid={0:.0f}%, overlap={1:.0f}%).'.format( _seqid, _cover)) unmapped.append(simple_chain) if pwalign or (not mappings and (pwalign is None or pwalign)): LOGGER.debug( 'Trying to map atoms based on {0} sequence alignment:'.format( ALIGNMENT_METHOD)) for simple_chain in unmapped: LOGGER.debug(' Comparing {0} (len={1}) with {2}:'.format( simple_chain.getTitle(), len(simple_chain), simple_target.getTitle())) result = getAlignedMapping(simple_target, simple_chain) if result is not None: target_list, chain_list, n_match, n_mapped = result if n_mapped > 0: _seqid = n_match * 100 / n_mapped _cover = n_mapped * 100 / max(len(simple_target), len(simple_chain)) else: _seqid = 0 _cover = 0 if _seqid >= seqid and _cover >= coverage: LOGGER.debug( '\tMapped: {0} residues match with {1:.0f}%' ' sequence identity and {2:.0f}% overlap.'.format( n_mapped, _seqid, _cover)) mappings.append((target_list, chain_list, _seqid, _cover)) else: LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, ' 'overlap={1:.0f}%).'.format(_seqid, _cover)) for mi, result in enumerate(mappings): residues_target, residues_chain, _seqid, _cover = result indices_target = [] indices_chain = [] indices_mapping = [] indices_dummies = [] counter = 0 for i in range(len(residues_target)): res_tar = residues_target[i] res_chn = residues_chain[i] for atom_tar in res_tar: indices_target.append(atom_tar.getIndex()) if res_chn is not None: atom_chn = res_chn.getAtom(atom_tar.getName()) if atom_chn is not None: indices_chain.append(atom_chn.getIndex()) indices_mapping.append(counter) else: indices_dummies.append(counter) else: indices_dummies.append(counter) counter += 1 #n_atoms = len(indices_target) ch_tar = next((r for r in residues_target if r is not None)).getChain() ch_chn = next((r for r in residues_chain if r is not None)).getChain() title_tar = 'Chain {0} from {1}'.format( 
ch_tar.getChid(), ch_tar.getAtomGroup().getTitle()) title_chn = 'Chain {0} from {1}'.format( ch_chn.getChid(), ch_chn.getAtomGroup().getTitle()) atommap = AM(map_ag, indices_chain, chain.getACSIndex(), mapping=indices_mapping, dummies=indices_dummies, title=title_chn + ' -> ' + title_tar) selection = AM(target_ag, indices_target, target_chain.getACSIndex(), title=title_tar + ' -> ' + title_chn, intarrays=True) mappings[mi] = (atommap, selection, _seqid, _cover) if len(mappings) > 1: mappings.sort(key=lambda m: m[2], reverse=True) return mappings
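This variant differs mainly in the fast keyword, which suppresses the per-chain debug output. A brief sketch, with the same assumed package-level imports and illustrative identifiers as in the earlier mapOntoChain example.

from prody import parsePDB, mapOntoChain

target_chain = parsePDB('1p38').getHierView()['A']
mobile = parsePDB('1zz2')
mappings = mapOntoChain(mobile, target_chain, seqid=90, overlap=70, fast=True)   # quiet mapping
best = mappings[0] if mappings else None    # (AtomMap, AtomMap, seqid, overlap) or None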