def parsePDB(pdb, **kwargs):
    """Return an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`; all keyword arguments are
    forwarded to it.

    |example| See :ref:`parsepdb` for a detailed example.

    :arg pdb: a PDB identifier or a filename
        If needed, PDB files are downloaded using :func:`.fetchPDB()` function.

    :raises IOError: if *pdb* is neither an existing file nor a
        four-character alphanumeric identifier, or if the download fails
    """
    title = kwargs.get('title', None)
    if not os.path.isfile(pdb):
        if len(pdb) == 4 and pdb.isalnum():
            # Use the identifier as the title unless the caller supplied one.
            if title is None:
                title = pdb
                kwargs['title'] = title
            filename = fetchPDB(pdb)
            if filename is None:
                raise IOError('PDB file for {0:s} could not be downloaded.'
                              .format(pdb))
            pdb = filename
        else:
            raise IOError('{0:s} is not a valid filename or a valid PDB '
                          'identifier.'.format(pdb))
    if title is None:
        # Derive the title from the file name, dropping '.gz' and then the
        # remaining extension.
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        kwargs['title'] = title
    stream = openFile(pdb)
    try:
        result = parsePDBStream(stream, **kwargs)
    finally:
        # Close the file even when parsing raises.
        stream.close()
    return result
def _parsePDB(pdb, **kwargs):
    """Resolve *pdb* to a local file, open it and parse it with
    :func:`parsePDBStream`.

    When *pdb* is not an existing file it is split into an identifier and a
    chain by :func:`_getPDBid` and the file is obtained with
    :func:`fetchPDB`.  A non-empty chain is forwarded to the parser as the
    *chain* keyword argument.

    :raises IOError: if the file could not be downloaded
    """
    title = kwargs.get('title', None)
    chain = ''
    if not os.path.isfile(pdb):
        pdb, chain = _getPDBid(pdb)
        if title is None:
            title = pdb
            kwargs['title'] = title
        filename = fetchPDB(pdb, **kwargs)
        if filename is None:
            raise IOError(
                'PDB file for {0} could not be downloaded.'.format(pdb))
        pdb = filename
    if title is None:
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        # Names such as 'pdb1abc' are reduced to the bare 4-character part.
        if len(title) == 7 and title.startswith('pdb'):
            title = title[3:]
        kwargs['title'] = title
    if chain != '':
        kwargs['chain'] = chain
    stream = openFile(pdb, 'rt')
    try:
        result = parsePDBStream(stream, **kwargs)
    finally:
        # Close the file even when parsing raises.
        stream.close()
    return result
def loadPDBClusters(sqid=None):
    """Load previously fetched PDB sequence clusters from disk to memory.

    :arg sqid: sequence identity level to load; when **None**, every level
        listed in ``PDB_CLUSTERS`` is loaded
    :type sqid: int

    Missing cluster files are fetched with :func:`fetchPDBClusters` first.
    The raw content of each file is stored in ``PDB_CLUSTERS[sqid]``.

    :raises ValueError: if *sqid* is not a key of ``PDB_CLUSTERS``
    """
    global PDB_CLUSTERS_UPDATE_WARNING
    import time

    seconds_per_week = 604800.
    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if sqid is None:
        sqid_list = list(PDB_CLUSTERS)
        LOGGER.info('Loading all PDB sequence clusters.')
    else:
        assert isinstance(sqid, int), 'sqid must be an integer'
        if sqid not in PDB_CLUSTERS:
            raise ValueError('PDB cluster data is not available for sequence '
                             'identity {0}%, try one of {1}'
                             .format(sqid, PDB_CLUSTERS_SQID_STR))
        LOGGER.info('Loading PDB sequence clusters for sequence identity '
                    '{0}.'.format(sqid))
        sqid_list = [sqid]
    # A distinct loop name keeps the *sqid* argument from being shadowed.
    for level in sqid_list:
        filename = os.path.join(PDB_CLUSTERS_PATH,
                                'bc-{0}.out.gz'.format(level))
        if not os.path.isfile(filename):
            fetchPDBClusters(level)
        if PDB_CLUSTERS_UPDATE_WARNING:
            # Age of the cached file, in weeks.
            diff = (time.time() - os.path.getmtime(filename)) / seconds_per_week
            if diff > 1.:
                LOGGER.warning('PDB sequence clusters are {0:.1f} week(s) old,'
                               ' call `fetchPDBClusters` to receive updates.'
                               .format(diff))
                # Warn only once per session.
                PDB_CLUSTERS_UPDATE_WARNING = False
        inp = openFile(filename)
        try:
            PDB_CLUSTERS[level] = inp.read()
        finally:
            # Close the file even when reading raises.
            inp.close()
def _parsePDB(pdb, **kwargs):
    """Locate (or download) the PDB file for *pdb* and parse it using
    :func:`parsePDBStream`.

    If *pdb* is not an existing file, :func:`_getPDBid` is used to split it
    into an identifier and a chain, and :func:`fetchPDB` to obtain the file.
    The chain, when not empty, is passed on as the *chain* keyword argument.

    :raises IOError: if the file could not be downloaded
    """
    title = kwargs.get('title', None)
    chain = ''
    if not os.path.isfile(pdb):
        pdb, chain = _getPDBid(pdb)
        if title is None:
            title = pdb
            kwargs['title'] = title
        filename = fetchPDB(pdb, **kwargs)
        if filename is None:
            raise IOError('PDB file for {0} could not be downloaded.'
                          .format(pdb))
        pdb = filename
    if title is None:
        # Title defaults to the file name without '.gz' and extension.
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        # Strip the 'pdb' prefix from names such as 'pdb1abc'.
        if len(title) == 7 and title.startswith('pdb'):
            title = title[3:]
        kwargs['title'] = title
    if chain != '':
        kwargs['chain'] = chain
    stream = openFile(pdb, 'rt')
    try:
        result = parsePDBStream(stream, **kwargs)
    finally:
        # Guarantee the file is released if the parser fails.
        stream.close()
    return result
def loadPDBClusters(sqid=None):
    """Load previously fetched PDB sequence clusters from disk to memory.

    :arg sqid: sequence identity level to load, all levels in
        ``PDB_CLUSTERS`` are loaded when it is **None**
    :type sqid: int

    Cluster files that are not found on disk are retrieved using
    :func:`fetchPDBClusters`.  File contents are stored, unparsed, in
    ``PDB_CLUSTERS``.

    :raises ValueError: if *sqid* is not a key of ``PDB_CLUSTERS``
    """
    global PDB_CLUSTERS_UPDATE_WARNING
    import time

    seconds_per_week = 604800.
    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if sqid is None:
        sqid_list = list(PDB_CLUSTERS)
        LOGGER.info('Loading all PDB sequence clusters.')
    else:
        assert isinstance(sqid, int), 'sqid must be an integer'
        if sqid not in PDB_CLUSTERS:
            raise ValueError('PDB cluster data is not available for sequence '
                             'identity {0}%, try one of {1}'.format(
                                 sqid, PDB_CLUSTERS_SQID_STR))
        LOGGER.info('Loading PDB sequence clusters for sequence identity '
                    '{0}.'.format(sqid))
        sqid_list = [sqid]
    # Do not reuse the name of the *sqid* argument as the loop variable.
    for level in sqid_list:
        filename = os.path.join(PDB_CLUSTERS_PATH,
                                'bc-{0}.out.gz'.format(level))
        if not os.path.isfile(filename):
            fetchPDBClusters(level)
        if PDB_CLUSTERS_UPDATE_WARNING:
            # File age expressed in weeks.
            diff = (time.time() - os.path.getmtime(filename)) / seconds_per_week
            if diff > 1.:
                LOGGER.warning(
                    'PDB sequence clusters are {0:.1f} week(s) old,'
                    ' call `fetchPDBClusters` to receive updates.'.format(
                        diff))
                # Emit the staleness warning a single time.
                PDB_CLUSTERS_UPDATE_WARNING = False
        inp = openFile(filename)
        try:
            PDB_CLUSTERS[level] = inp.read()
        finally:
            # Release the file handle even if reading fails.
            inp.close()
def saveEnsemble(ensemble, filename=None, **kwargs):
    """Save *ensemble* model data as :file:`filename.ens.npz`.

    If *filename* is ``None``, title of the *ensemble* will be used as the
    filename, after white spaces in the title are replaced with underscores.
    Extension is :file:`.ens.npz`.  Upon successful completion of saving,
    filename is returned.  This function makes use of :func:`numpy.savez`
    function.  Keyword arguments are passed to :func:`openFile`.

    :raises TypeError: if *ensemble* is not an :class:`Ensemble` instance
    :raises ValueError: if *ensemble* is empty
    """
    if not isinstance(ensemble, Ensemble):
        raise TypeError('invalid type for ensemble, {0}'
                        .format(type(ensemble)))
    if len(ensemble) == 0:
        raise ValueError('ensemble instance does not contain data')

    dict_ = ensemble.__dict__
    attr_list = ['_title', '_confs', '_weights', '_coords']
    if isinstance(ensemble, PDBEnsemble):
        attr_list.append('_labels')
        attr_list.append('_trans')
    if filename is None:
        filename = ensemble.getTitle().replace(' ', '_')

    # Only attributes that are set are written.
    attr_dict = {}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value
    # The atoms object is wrapped in a two-element object array; an explicit
    # object dtype avoids NumPy rejecting the ragged [sequence, scalar] input.
    attr_dict['_atoms'] = np.array([dict_['_atoms'], 0], dtype=object)

    filename += '.ens.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def saveEnsemble(ensemble, filename=None, **kwargs):
    """Save *ensemble* model data as :file:`filename.ens.npz`.

    If *filename* is ``None``, title of the *ensemble* will be used as the
    filename, after white spaces in the title are replaced with underscores.
    Extension is :file:`.ens.npz`.  Upon successful completion of saving,
    filename is returned.  This function makes use of :func:`numpy.savez`
    function.  Keyword arguments are passed to :func:`openFile`.

    :raises TypeError: if *ensemble* is not an :class:`Ensemble` instance
    :raises ValueError: if *ensemble* is empty
    """
    if not isinstance(ensemble, Ensemble):
        raise TypeError('invalid type for ensemble, {0}'.format(
            type(ensemble)))
    if len(ensemble) == 0:
        raise ValueError('ensemble instance does not contain data')

    dict_ = ensemble.__dict__
    attr_list = ['_title', '_confs', '_weights', '_coords']
    if isinstance(ensemble, PDBEnsemble):
        attr_list.append('_labels')
        attr_list.append('_trans')
    if filename is None:
        filename = ensemble.getTitle().replace(' ', '_')

    # Attributes whose value is None are left out of the archive.
    attr_dict = {}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value
    # Explicit object dtype: the [atoms, 0] pair is ragged and recent NumPy
    # versions refuse to build such an array implicitly.
    attr_dict['_atoms'] = np.array([dict_['_atoms'], 0], dtype=object)

    filename += '.ens.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Make sure the stream is closed if saving fails.
        ostream.close()
    return filename
def saveHiC(hic, filename=None, map=True, **kwargs):
    """Saves *HiC* model data as :file:`filename.hic.npz`.

    If *map* is **False**, Hi-C contact map will not be saved and it can be
    loaded from raw data file later.  If *filename* is **None**, name of the
    Hi-C instance will be used as the filename, after ``" "`` (white spaces)
    in the name are replaced with ``"_"`` (underscores).  A :file:`.hic`
    filename gets :file:`.npz` appended; any other filename not ending with
    :file:`.hic.npz` gets :file:`.hic.npz` appended.  Upon successful
    completion of saving, filename is returned.  This function makes use of
    :func:`numpy.savez` function.  Keyword arguments are passed to
    :func:`openFile`.
    """
    assert isinstance(hic, HiC), 'hic must be a HiC instance.'

    if filename is None:
        filename = hic.getTitle().replace(' ', '_')
    if filename.endswith('.hic'):
        filename += '.npz'
    elif not filename.endswith('.hic.npz'):
        filename += '.hic.npz'

    # Work on a copy so that dropping the map does not alter *hic* itself.
    attr_dict = hic.__dict__.copy()
    if not map:
        attr_dict.pop('_map')

    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from a PQR file,
    or **None** when no atoms were parsed.

    :arg filename: a PQR filename
    :type filename: str

    :arg title: title of the new atom group, *name* is accepted as an alias;
        defaults to the lower-cased file name without extension
    :arg chain: chain identifier(s) to parse, a non-empty string
    :arg subset: a key of ``_PDBSubsets`` (case insensitive)
    :arg ag: an existing :class:`.AtomGroup` to parse the data into

    :raises IOError: if *filename* does not exist
    :raises TypeError: if *subset* or *chain* is not a string, or *ag* is not
        an :class:`.AtomGroup`
    :raises ValueError: if *subset* is unknown or *chain* is empty
    """
    title = kwargs.get('title', kwargs.get('name'))
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()

    # The title suffix records chain and subset as '_<chain>_<subset>'.
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'.format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix

    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        # Remember how many coordinate sets existed before parsing.
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    try:
        lines = pqr.readlines()
    finally:
        # Close the file even when reading raises.
        pqr.close()

    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr')
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                                                ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from a PQR file,
    or **None** when no atoms were parsed.

    :arg filename: a PQR filename
    :type filename: str

    :arg title: title of the new atom group, *name* is accepted as an alias;
        defaults to the lower-cased file name without extension
    :arg chain: chain identifier(s) to parse, a non-empty string
    :arg subset: a key of ``_PDBSubsets`` (case insensitive)
    :arg ag: an existing :class:`.AtomGroup` to parse the data into
    :arg max_n_atoms: forwarded to the line parser, default is ``1e5``

    :raises IOError: if *filename* does not exist
    :raises TypeError: if *subset* or *chain* is not a string, or *ag* is not
        an :class:`.AtomGroup`
    :raises ValueError: if *subset* is unknown or *chain* is empty
    """
    title = kwargs.get('title', kwargs.get('name'))
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    max_n_atoms = kwargs.get('max_n_atoms', 1e5)
    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            fn, ext = os.path.splitext(fn)
        title = fn.lower()

    # Chain and subset are appended to the title as '_<chain>_<subset>'.
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix

    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        # Number of coordinate sets present before parsing, for the report.
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    try:
        lines = pqr.readlines()
    finally:
        # Close the file even when reading raises.
        pqr.close()

    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr',
                        max_n_atoms=max_n_atoms)
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                                                ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
def saveEnsemble(ensemble, filename=None, **kwargs):
    """Save *ensemble* model data as :file:`filename.ens.npz`.

    If *filename* is **None**, title of the *ensemble* will be used as the
    filename, after white spaces in the title are replaced with underscores.
    A filename ending with :file:`.ens` gets :file:`.npz` appended; any other
    filename not ending with :file:`.npz` gets :file:`.ens.npz` appended.
    Upon successful completion of saving, filename is returned.  This
    function makes use of :func:`~numpy.savez` function.  Keyword arguments
    are passed to :func:`openFile`.

    :raises TypeError: if *ensemble* is not an :class:`Ensemble` instance
    :raises ValueError: if *ensemble* is empty
    """
    if not isinstance(ensemble, Ensemble):
        raise TypeError('invalid type for ensemble, {0}'.format(
            type(ensemble)))
    if len(ensemble) == 0:
        raise ValueError('ensemble instance does not contain data')

    dict_ = ensemble.__dict__
    attr_list = ['_title', '_confs', '_weights', '_coords', '_indices']
    if isinstance(ensemble, PDBEnsemble):
        attr_list.append('_labels')
        attr_list.append('_trans')
    elif isinstance(ensemble, ClustENM):
        attr_list.extend([
            '_ph', '_cutoff', '_gamma', '_n_modes', '_n_confs', '_rmsd',
            '_n_gens', '_maxclust', '_threshold', '_sol', '_padding',
            '_ionicStrength', '_force_field', '_tolerance', '_maxIterations',
            '_sim', '_temp', '_t_steps', '_outlier', '_mzscore', '_v1',
            '_parallel', '_idx_cg', '_n_cg', '_cycle', '_time', '_targeted',
            '_tmdk'
        ])
    if filename is None:
        filename = ensemble.getTitle().replace(' ', '_')

    # Attributes that are None are not written.
    attr_dict = {}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value

    # Non-array objects are wrapped in two-element object arrays.
    atoms = dict_['_atoms']
    if atoms is not None:
        attr_dict['_atoms'] = np.array([atoms, None], dtype=object)
    data = dict_['_data']
    if len(data):
        attr_dict['_data'] = np.array([data, None], dtype=object)
    if isinstance(ensemble, PDBEnsemble):
        msa = dict_['_msa']
        if msa is not None:
            attr_dict['_msa'] = np.array([msa, None], dtype=object)
    # Class name is stored alongside the data.
    attr_dict['_type'] = ensemble.__class__.__name__

    if filename.endswith('.ens'):
        filename += '.npz'
    if not filename.endswith('.npz'):
        filename += '.ens.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def parseEMD(emd, **kwargs):
    """Returns an :class:`.AtomGroup` containing the information parsed from
    EMD file.

    This function extends :func:`.parseEMDStream`; keyword arguments are
    forwarded to it.

    See :ref:`parseEMD` for a detailed usage example.

    :arg emd: an EMD identifier or a file name.  A 4-digit
        EMDataBank identifier can be provided to download it via FTP.
    :type emd: str

    :arg cutoff: density cutoff to read EMD map.  The regions with
        lower density than given cutoff are discarded.
    :type cutoff: float or None

    :arg n_nodes: A bead based network will be constructed over provided
        density map.  n_nodes parameter will show the number of beads that
        will fit to density map.
    :type n_nodes: integer

    :arg num_iter: After reading density map, coordinates are predicted with
        topological domain reconstruction method.  This parameter is the
        total number of iterations of this algorithm.
    :type num_iter: integer

    :arg return_map: Return the density map itself.  Default is False in
        line with previous behaviour.  This value is reset to True if
        make_nodes is False as something must be returned.
    :type return_map: bool

    :arg make_nodes: Use the topology representing network algorithm to fit
        pseudoatom nodes to the map.  Default is False and sets return_map
        to True.
    :type make_nodes: bool

    :raises IOError: if *emd* is neither an existing file nor a 4-digit
        identifier, or if the map could not be downloaded
    """
    title = kwargs.get('title', None)
    if not os.path.isfile(emd):
        if len(emd) == 4 and emd.isdigit():
            if title is None:
                title = emd
                kwargs['title'] = title
            # Prefer a map already present in the working directory.
            if os.path.isfile(emd + '.map'):
                filename = emd + '.map'
            elif os.path.isfile(emd + '.map.gz'):
                filename = emd + '.map.gz'
            else:
                filename = fetchPDB(emd, report=True, format='emd',
                                    compressed=False)
                if filename is None:
                    raise IOError('EMD map file for {0} could not be downloaded.'
                                  .format(emd))
            emd = filename
        else:
            raise IOError('EMD file {0} is not available in the directory {1}'
                          .format(emd, os.getcwd()))
    if title is None:
        kwargs['title'], ext = os.path.splitext(os.path.split(emd)[1])

    emdStream = openFile(emd, 'rb')
    try:
        result = parseEMDStream(emdStream, **kwargs)
    finally:
        # Close the file even when parsing raises.
        emdStream.close()
    return result
def parseMMCIF(pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or a :class:`.StarDict` containing
    header data parsed from an mmCIF file.  If not found, the mmCIF file will
    be downloaded from the PDB.  It will be downloaded in uncompressed format
    regardless of the compressed keyword.

    This function extends :func:`.parseMMCIFStream`.

    :arg pdb: a PDB identifier or a filename
        If needed, mmCIF files are downloaded using :func:`.fetchPDB()`
        function.  A five-character identifier is read as a PDB identifier
        followed by a one-letter chain ID.
    :type pdb: str

    :arg chain: comma separated string or list-like of chain IDs
    :type chain: str, tuple, list, :class:`~numpy.ndarray`

    :raises ValueError: if a chain is given both in *pdb* and as *chain*
    :raises IOError: if *pdb* is neither a file nor a valid identifier, or
        if the download fails
    """
    chain = kwargs.pop('chain', None)
    title = kwargs.get('title', None)
    if not os.path.isfile(pdb):
        if len(pdb) == 5 and pdb.isalnum():
            if chain is None:
                # Split '1abcA' into identifier '1abc' and chain 'A'.
                chain = pdb[-1]
                pdb = pdb[:4]
            else:
                raise ValueError('Please provide chain as a keyword argument '
                                 'or part of the PDB ID, not both')
        if len(pdb) == 4 and pdb.isalnum():
            if title is None:
                title = pdb
                kwargs['title'] = title
            # Prefer a file already present in the working directory.
            if os.path.isfile(pdb + '.cif'):
                filename = pdb + '.cif'
            elif os.path.isfile(pdb + '.cif.gz'):
                filename = pdb + '.cif.gz'
            else:
                filename = fetchPDB(pdb, report=True, format='cif',
                                    compressed=False)
                if filename is None:
                    raise IOError('mmCIF file for {0} could not be downloaded.'
                                  .format(pdb))
            pdb = filename
        else:
            raise IOError('{0} is not a valid filename or a valid PDB '
                          'identifier.'.format(pdb))
    if title is None:
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        if len(title) == 7 and title.startswith('pdb'):
            title = title[3:]
        kwargs['title'] = title

    cif = openFile(pdb, 'rt')
    try:
        result = parseMMCIFStream(cif, chain=chain, **kwargs)
    finally:
        # Close the file even when parsing raises.
        cif.close()
    return result
def writePSF(filename, atoms):
    """Write atoms in X-PLOR format PSF file with name *filename* and return
    *filename*.  This function will write available atom and bond information
    only.

    Missing segment names fall back to chain identifiers and then to
    ``'UNK'``; missing residue numbers to ``1``; missing residue names to
    ``'UNK'``; missing atom types to empty strings; missing charges and
    masses to ``0``.

    :raises TypeError: if *atoms* lacks the accessor methods of an Atomic
        instance
    :raises ValueError: if atom names are not set
    """
    try:
        n_atoms, segments, rnums, rnames, names, types, charges, masses = (
            atoms.numAtoms(), atoms._getSegnames(), atoms._getResnums(),
            atoms._getResnames(), atoms._getNames(), atoms._getTypes(),
            atoms._getCharges(), atoms._getMasses())
    except AttributeError:
        raise TypeError('atoms must be an Atomic instance')

    if segments is None:
        segments = atoms._getChids()
        if segments is None:
            segments = ['UNK'] * n_atoms
    if rnums is None:
        rnums = ones(n_atoms, int)
    if rnames is None:
        rnames = ['UNK'] * n_atoms
    if names is None:
        raise ValueError('atom names are not set')
    if types is None:
        # One-character string array filled with empty strings.  This must
        # be bound to *types*, which is the name used below.
        types = zeros(n_atoms, array(['a']).dtype.char + '1')
    # NOTE(review): PSFLINE is defined elsewhere; zeros are assumed to be
    # acceptable for its charge and mass fields -- confirm.
    if charges is None:
        charges = zeros(n_atoms, float)
    if masses is None:
        masses = zeros(n_atoms, float)

    # Type names longer than 4 characters are flagged in the header line;
    # any() also copes with zero atoms.
    long_fields = any(len(tp) > 4 for tp in types)

    out = openFile(filename, 'w')
    try:
        write = out.write
        write('PSF{0}\n'.format(' NAMD' if long_fields else ''))
        write('\n')
        write('{0:8d} !NTITLE\n'.format(1))
        write(' REMARKS {0}\n'.format(str(atoms)))
        write('\n')
        write('{0:8d} !NATOM\n'.format(n_atoms))
        for i in range(n_atoms):
            write(PSFLINE % (i + 1, segments[i], rnums[i], rnames[i],
                             names[i], types[i], charges[i], masses[i], 0))
        bonds = list(atoms._iterBonds())
        if bonds:
            # PSF atom indices are 1-based.
            bonds = array(bonds, int) + 1
            write('\n')
            write('{0:8d} !NBOND: bonds\n'.format(len(bonds)))
            # Four bonds per line.
            for i, bond in enumerate(bonds):
                write('%8s%8s' % (bond[0], bond[1]))
                if i % 4 == 3:
                    write('\n')
            # Terminate a partially filled last line.
            if i % 4 != 3:
                write('\n')
    finally:
        # Close the file even when writing raises.
        out.close()
    return filename
def writePSF(filename, atoms):
    """Write atoms in X-PLOR format PSF file with name *filename* and return
    *filename*.  This function will write available atom and bond information
    only.

    Defaults used for data that is not set: chain identifiers, then
    ``'UNK'``, for segment names; ``1`` for residue numbers; ``'UNK'`` for
    residue names; empty strings for atom types; ``0`` for charges and
    masses.

    :raises TypeError: if *atoms* lacks the accessor methods of an Atomic
        instance
    :raises ValueError: if atom names are not set
    """
    try:
        n_atoms, segments, rnums, rnames, names, types, charges, masses = (
            atoms.numAtoms(), atoms._getSegnames(), atoms._getResnums(),
            atoms._getResnames(), atoms._getNames(), atoms._getTypes(),
            atoms._getCharges(), atoms._getMasses())
    except AttributeError:
        raise TypeError('atoms must be an Atomic instance')

    if segments is None:
        segments = atoms._getChids()
        if segments is None:
            segments = ['UNK'] * n_atoms
    if rnums is None:
        rnums = ones(n_atoms, int)
    if rnames is None:
        rnames = ['UNK'] * n_atoms
    if names is None:
        raise ValueError('atom names are not set')
    if types is None:
        # The default has to be assigned to *types* itself, otherwise the
        # loops below iterate over None.
        types = zeros(n_atoms, array(['a']).dtype.char + '1')
    # NOTE(review): PSFLINE is defined elsewhere; zeros are assumed to be
    # acceptable for its charge and mass fields -- confirm.
    if charges is None:
        charges = zeros(n_atoms, float)
    if masses is None:
        masses = zeros(n_atoms, float)

    # Header carries ' NAMD' when any type name exceeds 4 characters; any()
    # returns False for an empty selection instead of raising.
    long_fields = any(len(tp) > 4 for tp in types)

    out = openFile(filename, 'w')
    try:
        write = out.write
        write('PSF{0}\n'.format(' NAMD' if long_fields else ''))
        write('\n')
        write('{0:8d} !NTITLE\n'.format(1))
        write(' REMARKS {0}\n'.format(str(atoms)))
        write('\n')
        write('{0:8d} !NATOM\n'.format(n_atoms))
        for i in range(n_atoms):
            write(PSFLINE % (i + 1, segments[i], rnums[i], rnames[i],
                             names[i], types[i], charges[i], masses[i], 0))
        bonds = list(atoms._iterBonds())
        if bonds:
            # Convert 0-based indices to the 1-based numbering of the file.
            bonds = array(bonds, int) + 1
            write('\n')
            write('{0:8d} !NBOND: bonds\n'.format(len(bonds)))
            # Bonds are written four pairs to a line.
            for i, bond in enumerate(bonds):
                write('%8s%8s' % (bond[0], bond[1]))
                if i % 4 == 3:
                    write('\n')
            # End the last line when it holds fewer than four pairs.
            if i % 4 != 3:
                write('\n')
    finally:
        # Release the file handle even if writing fails.
        out.close()
    return filename
def writePDB(filename, atoms, model=None):
    """Write *atoms* in PDB format to a file with name *filename* and return
    *filename*.  If *filename* ends with :file:`.gz`, a compressed file will
    be written.  *model* is passed to :func:`writePDBStream`."""
    out = openFile(filename, 'w')
    try:
        writePDBStream(out, atoms, model)
    finally:
        # Close the file even when writing raises.
        out.close()
    return filename
def writePQR(filename, atoms, **kwargs):
    """Write *atoms* in PQR format to a file with name *filename*.  Only
    current coordinate set is written.  Returns *filename* upon success.  If
    *filename* ends with :file:`.gz`, a compressed file will be written.
    Keyword arguments are passed to :func:`writePQRStream`."""
    stream = openFile(filename, 'w')
    try:
        writePQRStream(stream, atoms, **kwargs)
    finally:
        # Close the file even when writing raises.
        stream.close()
    return filename
def writeOverlapTable(filename, rows, cols):
    """Write table of overlaps (correlations) between two sets of modes to a
    file.  *rows* and *cols* are sets of normal modes, and correspond to rows
    and columns of the overlap table.  Returns *filename*.  See also
    :func:`.printOverlapTable`."""
    assert isinstance(filename, str), 'filename must be a string'
    # Build the table first so that a failure there neither truncates an
    # existing file nor leaves an open handle behind.
    table = getOverlapTable(rows, cols)
    out = openFile(filename, 'w')
    try:
        out.write(table)
    finally:
        out.close()
    return filename
def saveModel(nma, filename=None, matrices=False, **kwargs):
    """Save *nma* model data as :file:`filename.nma.npz`.  By default,
    eigenvalues, eigenvectors, variances, trace of covariance matrix,
    and name of the model will be saved.  If *matrices* is ``True``,
    covariance, Hessian or Kirchhoff matrices are saved too, whichever are
    available.  If *filename* is ``None``, name of the NMA instance will be
    used as the filename, after ``" "`` (white spaces) in the name are
    replaced with ``"_"`` (underscores).  Extension may differ based on
    the type of the NMA model.  For ANM models, it is :file:`.anm.npz`.
    Upon successful completion of saving, filename is returned.  This
    function makes use of :func:`numpy.savez` function.  Keyword arguments
    are passed to :func:`openFile`.

    :raises TypeError: if *nma* is not an :class:`NMA` instance
    :raises ValueError: if *nma* is empty
    """
    if not isinstance(nma, NMA):
        raise TypeError('invalid type for nma, {0}'.format(type(nma)))
    if len(nma) == 0:
        raise ValueError('nma instance does not contain data')

    dict_ = nma.__dict__
    attr_list = [
        '_title', '_trace', '_array', '_eigvals', '_vars', '_n_atoms',
        '_dof', '_n_modes'
    ]
    if filename is None:
        filename = nma.getTitle().replace(' ', '_')

    # The model type selects extra attributes and the file extension.
    if isinstance(nma, GNMBase):
        attr_list.append('_cutoff')
        attr_list.append('_gamma')
        if matrices:
            attr_list.append('_kirchhoff')
            if isinstance(nma, ANM):
                attr_list.append('_hessian')
        if isinstance(nma, ANM):
            type_ = 'ANM'
        else:
            type_ = 'GNM'
    elif isinstance(nma, EDA):
        type_ = 'EDA'
    elif isinstance(nma, PCA):
        type_ = 'PCA'
    else:
        type_ = 'NMA'
    if matrices:
        attr_list.append('_cov')

    # Attributes that are None are not written.
    attr_dict = {'type': type_}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value

    filename += '.' + type_.lower() + '.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def saveAtoms(atoms, filename=None, **kwargs):
    """Save *atoms* in ProDy internal format.  All atomic classes are accepted
    as *atoms* argument.  This function saves user set atomic data as well.
    Note that title of the AtomGroup instance is used as the filename when
    *filename* is not given, also when *atoms* is not an AtomGroup.  To avoid
    overwriting an existing file with the same name, specify a *filename*.
    Extension :file:`.ag.npz` is appended and the resulting filename is
    returned.  Keyword arguments are passed to :func:`openFile`.

    :raises TypeError: if *atoms* is not an :class:`Atomic` instance
    """
    if not isinstance(atoms, Atomic):
        # A plain '{0}' field is used: the 's' format code is rejected for
        # type objects on Python 3.
        raise TypeError('atoms must be Atomic instance, not {0}'
                        .format(type(atoms)))
    if isinstance(atoms, AtomGroup):
        ag = atoms
        title = ag.getTitle()
        SKIP = SAVE_SKIP_ATOMGROUP
    else:
        ag = atoms.getAtomGroup()
        title = str(atoms)
        SKIP = SAVE_SKIP_POINTER

    if filename is None:
        filename = ag.getTitle().replace(' ', '_')
    filename += '.ag.npz'

    attr_dict = {'title': title}
    attr_dict['n_atoms'] = atoms.numAtoms()
    attr_dict['n_csets'] = atoms.numCoordsets()
    attr_dict['cslabels'] = atoms.getCSLabels()
    coords = atoms._getCoordsets()
    if coords is not None:
        attr_dict['coordinates'] = coords

    bonds = ag._bonds
    bmap = ag._bmap
    if bonds is not None and bmap is not None:
        if isinstance(atoms, AtomGroup):
            attr_dict['bonds'] = bonds
            attr_dict['bmap'] = bmap
            attr_dict['numbonds'] = ag._data['numbonds']
            frags = ag._data['fragindices']
            if frags is not None:
                attr_dict['fragindices'] = frags
        else:
            # For a subset, keep only its bonds and re-evaluate the maps.
            bonds = trimBonds(bonds, atoms._getIndices())
            attr_dict['bonds'] = bonds
            attr_dict['bmap'], attr_dict['numbonds'] = \
                evalBonds(bonds, len(atoms))

    # items() works on both Python 2 and 3, unlike iteritems().
    for key, data in ag._data.items():
        if key in SKIP:
            continue
        if data is not None:
            attr_dict[key] = data

    ostream = openFile(filename, 'wb', **kwargs)
    try:
        savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def setUp(self):
    """Write a gzip-compressed text file under ``TEMPDIR`` and keep its
    path, text and byte content on the instance."""
    self.pref = join(TEMPDIR, 'compressed.txt')
    self.gzfn = self.pref + '.gz'
    self.text = 'some random text ' * 100
    try:
        self.bytes = bytes(self.text, encoding='utf-8')
    except TypeError:
        # bytes() rejected the encoding argument; keep the text as is.
        self.bytes = self.text
    out = openFile(self.gzfn, 'wt')
    try:
        out.write(self.text)
    finally:
        # Close the file even when writing raises.
        out.close()
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in the :file:`pdbclusters` folder of the ProDy package path.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: one sequence identity level, or a list-like of levels, to
        download; all levels in ``PDB_CLUSTERS`` when **None**

    :raises ValueError: if a requested level is not in ``PDB_CLUSTERS``
    """
    if sqid is not None:
        if isListLike(sqid):
            for s in sqid:
                if s not in PDB_CLUSTERS:
                    raise ValueError('sqid must be one or more of ' +
                                     PDB_CLUSTERS_SQID_STR)
            keys = list(sqid)
        else:
            if sqid not in PDB_CLUSTERS:
                raise ValueError('sqid must be one or more of ' +
                                 PDB_CLUSTERS_SQID_STR)
            keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        # Both handles are closed even if opening the output or copying the
        # data fails.
        try:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            try:
                out.write(inp.read())
            finally:
                out.close()
        finally:
            inp.close()
        count += 1
        LOGGER.update(i, label='_prody_fetchPDBClusters')
    LOGGER.finish()

    if len(keys) == count:
        LOGGER.info('All selected PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def writePDB(filename, atoms, csets=None, autoext=True, **kwargs):
    """Write *atoms* in PDB format to a file with name *filename* and return
    *filename*.  If *filename* ends with :file:`.gz`, a compressed file will
    be written.

    :arg csets: passed to :func:`writePDBStream`

    :arg autoext: when **True** (default), :file:`.pdb` is appended to
        *filename* unless it already contains :file:`.pdb` or :file:`.ent`
    :type autoext: bool

    Other keyword arguments are passed to :func:`writePDBStream`.
    """
    # '.pdb.gz' and '.ent.gz' names are covered by the two substring tests.
    if autoext and not ('.pdb' in filename or '.ent' in filename):
        filename += '.pdb'
    out = openFile(filename, 'wt')
    try:
        writePDBStream(out, atoms, csets, **kwargs)
    finally:
        # Close the file even when writing raises.
        out.close()
    return filename
def saveModel(nma, filename=None, matrices=False, **kwargs):
    """Save *nma* model data as :file:`filename.nma.npz`.  By default,
    eigenvalues, eigenvectors, variances, trace of covariance matrix,
    and name of the model will be saved.  If *matrices* is ``True``,
    covariance, Hessian or Kirchhoff matrices are saved too, whichever are
    available.  If *filename* is ``None``, name of the NMA instance will be
    used as the filename, after ``" "`` (white spaces) in the name are
    replaced with ``"_"`` (underscores).  Extension may differ based on
    the type of the NMA model.  For ANM models, it is :file:`.anm.npz`.
    Upon successful completion of saving, filename is returned.  This
    function makes use of :func:`numpy.savez` function.  Keyword arguments
    are passed to :func:`openFile`.

    :raises TypeError: if *nma* is not an :class:`NMA` instance
    :raises ValueError: if *nma* is empty
    """
    if not isinstance(nma, NMA):
        raise TypeError('invalid type for nma, {0}'.format(type(nma)))
    if len(nma) == 0:
        raise ValueError('nma instance does not contain data')

    dict_ = nma.__dict__
    attr_list = ['_title', '_trace', '_array', '_eigvals', '_vars',
                 '_n_atoms', '_dof', '_n_modes']
    if filename is None:
        filename = nma.getTitle().replace(' ', '_')

    # Extra attributes and the extension depend on the model class.
    if isinstance(nma, GNMBase):
        attr_list.append('_cutoff')
        attr_list.append('_gamma')
        if matrices:
            attr_list.append('_kirchhoff')
            if isinstance(nma, ANM):
                attr_list.append('_hessian')
        if isinstance(nma, ANM):
            type_ = 'ANM'
        else:
            type_ = 'GNM'
    elif isinstance(nma, EDA):
        type_ = 'EDA'
    elif isinstance(nma, PCA):
        type_ = 'PCA'
    else:
        type_ = 'NMA'
    if matrices:
        attr_list.append('_cov')

    # Skip attributes that are not set.
    attr_dict = {'type': type_}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value

    filename += '.' + type_.lower() + '.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Release the stream even if saving fails.
        ostream.close()
    return filename
def saveModel(nma, filename=None, matrices=False, **kwargs):
    """Save *nma* model data as :file:`filename.nma.npz`.  By default,
    eigenvalues, eigenvectors, variances, trace of covariance matrix,
    and name of the model will be saved.  If *matrices* is ``True``,
    covariance, Hessian or Kirchhoff matrices are saved too, whichever are
    available.  If *filename* is ``None``, name of the NMA instance will be
    used as the filename, after ``" "`` (white spaces) in the name are
    replaced with ``"_"`` (underscores).  Extension may differ based on
    the type of the NMA model.  For ANM models, it is :file:`.anm.npz`.
    Upon successful completion of saving, filename is returned.  This
    function makes use of :func:`numpy.savez` function.  Keyword arguments
    are passed to :func:`openFile`.

    :raises TypeError: if *nma* is not an :class:`NMA` instance
    :raises ValueError: if *nma* is empty
    """
    if not isinstance(nma, NMA):
        # Plain '{0}' field: the 's' format code is rejected for type
        # objects on Python 3, which would mask the intended message.
        raise TypeError("invalid type for nma, {0}".format(type(nma)))
    if len(nma) == 0:
        raise ValueError("nma instance does not contain data")

    dict_ = nma.__dict__
    attr_list = ["_title", "_trace", "_array", "_eigvals", "_vars",
                 "_n_atoms", "_dof", "_n_modes"]
    if filename is None:
        filename = nma.getTitle().replace(" ", "_")

    # The model class determines extra attributes and the extension.
    if isinstance(nma, GNMBase):
        attr_list.append("_cutoff")
        attr_list.append("_gamma")
        if matrices:
            attr_list.append("_kirchhoff")
            if isinstance(nma, ANM):
                attr_list.append("_hessian")
        if isinstance(nma, ANM):
            type_ = "ANM"
        else:
            type_ = "GNM"
    elif isinstance(nma, EDA):
        type_ = "EDA"
    elif isinstance(nma, PCA):
        type_ = "PCA"
    else:
        type_ = "NMA"
    if matrices:
        attr_list.append("_cov")

    # Attributes that are None are not written.
    attr_dict = {"type": type_}
    for attr in attr_list:
        value = dict_[attr]
        if value is not None:
            attr_dict[attr] = value

    filename += "." + type_.lower() + ".npz"
    ostream = openFile(filename, "wb", **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def parseCIF(pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from an mmCIF file.  If not found, the mmCIF file will be
    downloaded from the PDB.  It will be downloaded in uncompressed format
    regardless of the compressed keyword.

    This function extends :func:`.parseCIFStream`; keyword arguments are
    forwarded to it.

    See :ref:`parsecif` for a detailed usage example.

    :arg pdb: a PDB identifier or a filename
        If needed, mmCIF files are downloaded using :func:`.fetchPDB()`
        function.
    :type pdb: str

    :raises IOError: if *pdb* is neither a file nor a four-character
        alphanumeric identifier, or if the download fails
    """
    title = kwargs.get('title', None)
    if not os.path.isfile(pdb):
        if len(pdb) == 4 and pdb.isalnum():
            if title is None:
                title = pdb
                kwargs['title'] = title
            # Prefer a file already present in the working directory.
            if os.path.isfile(pdb + '.cif'):
                filename = pdb + '.cif'
            elif os.path.isfile(pdb + '.cif.gz'):
                filename = pdb + '.cif.gz'
            else:
                filename = fetchPDB(pdb, report=True, format='cif',
                                    compressed=False)
                if filename is None:
                    raise IOError(
                        'mmCIF file for {0} could not be downloaded.'.format(
                            pdb))
            pdb = filename
        else:
            raise IOError('{0} is not a valid filename or a valid PDB '
                          'identifier.'.format(pdb))
    if title is None:
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        if len(title) == 7 and title.startswith('pdb'):
            title = title[3:]
        kwargs['title'] = title

    cif = openFile(pdb, 'rt')
    try:
        result = parseCIFStream(cif, **kwargs)
    finally:
        # Close the file even when parsing raises.
        cif.close()
    return result
def saveVector(vector, filename, **kwargs):
    """Save *vector* data as :file:`filename.vec.npz`.  Title, array and the
    3-d flag of the vector are stored.  Upon successful completion of saving,
    filename is returned.  This function makes use of :func:`numpy.savez`
    function.  Keyword arguments are passed to :func:`openFile`.

    :raises TypeError: if *vector* is not a :class:`Vector` instance
    """
    if not isinstance(vector, Vector):
        raise TypeError('invalid type for vector, {0}'.format(type(vector)))
    attr_dict = {
        'title': vector.getTitle(),
        'array': vector._getArray(),
        'is3d': vector.is3d(),
    }
    filename += '.vec.npz'
    ostream = openFile(filename, 'wb', **kwargs)
    try:
        np.savez(ostream, **attr_dict)
    finally:
        # Close the stream even when saving raises.
        ostream.close()
    return filename
def writePDB(filename, atoms, csets=None, autoext=True, **kwargs):
    """Write *atoms* in PDB format to a file with name *filename* and return
    *filename*.  If *filename* ends with :file:`.gz`, a compressed file will
    be written.

    :arg filename: output file name
    :arg atoms: atoms to write, passed on to :func:`writePDBStream`
    :arg csets: coordinate set selection, passed on to
        :func:`writePDBStream`
    :arg autoext: when **True** (default), append :file:`.pdb` to *filename*
        unless it already contains :file:`.pdb` or :file:`.ent`
    :type autoext: bool

    :arg renumber: whether to renumber atoms with serial indices
        Default is **True**
    :type renumber: bool

    Remaining keyword arguments are forwarded to :func:`writePDBStream`.
    """
    # '.pdb.gz' / '.ent.gz' are covered by the '.pdb' / '.ent' substring test.
    has_ext = '.pdb' in filename or '.ent' in filename
    if autoext and not has_ext:
        filename += '.pdb'

    out = openFile(filename, 'wt')
    try:
        writePDBStream(out, atoms, csets, **kwargs)
    finally:
        # Do not leak the handle when writing fails half way.
        out.close()
    return filename
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: sequence identity level to download, one of the keys of
        ``PDB_CLUSTERS``; when **None** (default) all levels are downloaded

    :raises ValueError: when *sqid* is not a known identity level"""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    # Progress covers only the levels actually requested.
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            try:
                out = openFile(filename+'.gz', 'w', folder=PDB_CLUSTERS_PATH)
                try:
                    out.write(inp.read())
                finally:
                    out.close()
            finally:
                inp.close()
            count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()

    if len(keys) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warning('PDB clusters could not be downloaded.')
def parseCIF(pdb, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from an mmCIF file.  If not found, the mmCIF file will be
    downloaded from the PDB.  It will be downloaded in uncompressed format
    regardless of the compressed keyword.

    This function extends :func:`.parseCIFStream`.

    See :ref:`parsecif` for a detailed usage example.

    :arg pdb: a PDB identifier or a filename
        If needed, mmCIF files are downloaded using :func:`.fetchPDB()`
        function.
    :type pdb: str

    :raises IOError: when *pdb* is neither an existing file nor a
        four-character alphanumeric identifier, or the download fails
    """
    title = kwargs.get('title', None)

    on_disk = os.path.isfile(pdb)
    if not on_disk:
        looks_like_id = len(pdb) == 4 and pdb.isalnum()
        if not looks_like_id:
            raise IOError('{0} is not a valid filename or a valid PDB '
                          'identifier.'.format(pdb))

        # The identifier doubles as the title unless one was supplied.
        if title is None:
            title = pdb
            kwargs['title'] = title

        plain = pdb + '.cif'
        gzipped = pdb + '.cif.gz'
        if os.path.isfile(plain):
            source = plain
        elif os.path.isfile(gzipped):
            source = gzipped
        else:
            source = fetchPDB(pdb, report=True, format='cif',
                              compressed=False)
            if source is None:
                raise IOError('mmCIF file for {0} could not be downloaded.'
                              .format(pdb))
        pdb = source

    if title is None:
        # Title comes from the base name without '.gz', without the
        # remaining extension and without a 'pdb' prefix ('pdb1abc').
        base = os.path.split(pdb)[1]
        title, ext = os.path.splitext(base)
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        if title.startswith('pdb') and len(title) == 7:
            title = title[3:]
        kwargs['title'] = title

    cif = openFile(pdb, 'rt')
    result = parseCIFStream(cif, **kwargs)
    cif.close()
    return result
def saveModeEnsemble(mode_ensemble, filename=None, atoms=False, **kwargs):
    """Save *mode_ensemble* as :file:`filename.modeens.npz`.

    If *filename* is **None**, title of the ModeEnsemble instance will be
    used as the filename, after ``" "`` (white spaces) in the title are
    replaced with ``"_"`` (underscores).  A *filename* that already ends
    with :file:`.npz` is used as is.  Upon successful completion of saving,
    filename is returned.  This function makes use of
    :func:`~numpy.savez_compressed` function.

    :arg atoms: also store the ``_atoms`` attribute, default is **False**
    :arg kwargs: forwarded to :func:`openFile`

    :raises TypeError: when *mode_ensemble* is not a :class:`ModeEnsemble`
    :raises ValueError: when *mode_ensemble* is empty"""

    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('invalid type for mode_ensemble, {0}'.format(
            type(mode_ensemble)))
    if len(mode_ensemble) == 0:
        raise ValueError('mode_ensemble instance does not contain data')

    names = ['_modesets', '_title', '_labels', '_weights', '_matched']
    if atoms:
        names.append('_atoms')

    payload = {}
    for name in names:
        stored = getattr(mode_ensemble, name)
        if stored is None:
            continue
        # A trailing None is attached to these two values before saving
        # NOTE(review): presumably to force an object array -- confirm.
        if name == '_atoms':
            stored = [stored, None]
        elif name == '_modesets':
            stored = list(stored) + [None]
        payload[name] = stored

    if filename is None:
        filename = mode_ensemble.getTitle().replace(' ', '_')

    lowered = filename.lower()
    if not lowered.endswith('.npz'):
        if lowered.endswith('.modeens'):
            filename += '.npz'
        else:
            filename += '.modeens' + '.npz'

    out = openFile(filename, 'wb', **kwargs)
    np.savez_compressed(out, **payload)
    out.close()
    return filename
def parseSTAR(filename, **kwargs):
    """Returns a dictionary containing data parsed from a Relion STAR file.

    :arg filename: a filename
        The .star extension can be omitted.
    :type filename: str

    :arg start: line number for starting
        Default is **None**, meaning start at the beginning
    :type start: int, None

    :arg stop: line number for stopping
        Default is **None**, meaning don't stop.
    :type stop: int, None

    :arg shlex: whether to use shlex for splitting lines so as to preserve
        quoted substrings
        Default is **False**
    :type shlex: bool

    :raises IOError: when neither *filename* nor *filename* + ``.star``
        exists
    :raises TypeError: when *start*, *stop* or *shlex* has a wrong type
    """
    if not os.path.isfile(filename):
        # Honor the documented shortcut: resolve the name with the '.star'
        # extension so that the file that exists is the one being opened.
        if os.path.isfile(filename + '.star'):
            filename = filename + '.star'
        else:
            raise IOError('There is no file called {0}.'.format(filename))

    start = kwargs.get('start', None)
    if start is not None and not isinstance(start, Integral):
        raise TypeError('start should be an integer or None')

    stop = kwargs.get('stop', None)
    if stop is not None and not isinstance(stop, Integral):
        raise TypeError('stop should be an integer or None')

    shlex = kwargs.get('shlex', False)
    if not isinstance(shlex, bool):
        raise TypeError('shlex should be a boolean')

    starfile = openFile(filename, 'r')
    try:
        lines = [pystr(line) for line in starfile.readlines()]
    finally:
        starfile.close()

    parsingDict, prog = parseSTARLines(lines, **kwargs)
    return StarDict(parsingDict, prog, filename)
def parseEMD(emd, **kwargs):
    """Returns an :class:`.AtomGroup` containing the information parsed from
    EMD file.

    This function extends :func:`.parseEMDStream`.

    See :ref:`parseEMD` for a detailed usage example.

    :arg emd: an EMD identifier or a file name, EMD files should be locally
        available.

    :arg cutoff: density cutoff to read EMD map. The regions with lower
        density than given cutoff are discarded.
    :type cutoff: float

    :arg n_nodes: A bead based network will be constructed over provided
        density map. n_nodes parameter will show the number of beads that
        will fit to density map.
    :type n_nodes: integer

    :arg num_iter: After reading density map, coordinates are predicted with
        topological domain reconstruction method. This parameter is the total
        number of iterations of this algorithm.
    :type num_iter: integer

    :raises IOError: when *emd* is not an existing file
    """
    title = kwargs.get('title', None)
    cutoff = float(kwargs.get('cutoff', 1.20))
    n_nodes = int(kwargs.get('n_nodes', 1000))
    # Validate num_iter up front; the default of 20 is only used for that.
    num_iter = int(kwargs.get('num_iter', 20))

    if not os.path.isfile(emd):
        # Both values belong inside format(); the misplaced parenthesis used
        # to turn this into an IndexError.
        raise IOError('EMD file {0} is not available in the directory {1}'
                      .format(emd, os.getcwd()))

    if title is None:
        title, ext = os.path.splitext(os.path.split(emd)[1])
        kwargs['title'] = title
    kwargs['cutoff'] = cutoff
    kwargs['n_nodes'] = n_nodes
    if 'num_iter' in kwargs:
        # Forward the coerced value only when the caller supplied one, so
        # the stream parser keeps its own default otherwise.
        kwargs['num_iter'] = num_iter

    emdStream = openFile(emd, 'rb')
    try:
        result = parseEMDStream(emdStream, **kwargs)
    finally:
        emdStream.close()
    return result
def fetchPDBClusters():
    """Downloads PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by blastclust.
    They are available at FTP site: ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`getPDBCluster`."""

    # urllib2 exists on Python 2 only; fall back to its Python 3 location.
    try:
        from urllib2 import urlopen, URLError
    except ImportError:
        from urllib.request import urlopen
        from urllib.error import URLError

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    LOGGER.progress('Downloading sequence clusters', len(PDB_CLUSTERS))
    count = 0
    for i, x in enumerate(PDB_CLUSTERS.keys()):
        filename = 'bc-{0:d}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = urlopen(url)
        except URLError:
            # FTP failures surface as URLError (HTTPError is a subclass and
            # is never raised for an ftp:// URL).
            LOGGER.warning('Clusters at {0:d}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        else:
            try:
                out = openFile(filename+'.gz', 'w', folder=PDB_CLUSTERS_PATH)
                try:
                    out.write(inp.read())
                finally:
                    out.close()
            finally:
                inp.close()
            count += 1
        LOGGER.update(i)
    LOGGER.clear()

    if len(PDB_CLUSTERS) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warning('PDB clusters could not be downloaded.')
def parsePDB(pdb, **kwargs):
    """Return an :class:`.AtomGroup` and/or dictionary containing header data
    parsed from a PDB file.

    This function extends :func:`.parsePDBStream`.

    See :ref:`parsepdb` for a detailed usage example.

    :arg pdb: a PDB identifier or a filename
        If needed, PDB files are downloaded using :func:`.fetchPDB()`
        function.

    :raises IOError: when *pdb* is neither an existing file nor a
        four-character alphanumeric identifier, or the download fails
    """
    title = kwargs.get('title', None)

    if not os.path.isfile(pdb):
        if len(pdb) == 4 and pdb.isalnum():
            # The identifier doubles as the title unless one was supplied.
            if title is None:
                title = pdb
                kwargs['title'] = title
            filename = fetchPDB(pdb, report=True)
            if filename is None:
                raise IOError(
                    'PDB file for {0} could not be downloaded.'.format(pdb))
            pdb = filename
        else:
            raise IOError('{0} is not a valid filename or a valid PDB '
                          'identifier.'.format(pdb))

    if title is None:
        # Title from the file name: drop '.gz', the remaining extension and
        # a leading 'pdb' as in 'pdb1abc'.
        title, ext = os.path.splitext(os.path.split(pdb)[1])
        if ext == '.gz':
            title, ext = os.path.splitext(title)
        if len(title) == 7 and title.startswith('pdb'):
            title = title[3:]
        kwargs['title'] = title

    stream = openFile(pdb, 'rt')
    try:
        result = parsePDBStream(stream, **kwargs)
    finally:
        stream.close()
    return result
def prody_fetch(*pdb, **kwargs):
    """Fetch PDB files from PDB FTP server.

    :arg pdbs: PDB identifier(s), or the name of a single existing file
        listing identifiers separated by white space and/or commas

    :arg folder: target directory for saving PDB file(s),
        default is ``'.'``

    :arg gzip: gzip fetched files or not, default is ``False``"""

    import prody

    # *pdb arrives as a tuple; work on a list so it can be rebuilt.
    pdblist = list(pdb)
    if len(pdblist) == 1 and os.path.isfile(pdblist[0]):
        from prody.utilities import openFile
        with openFile(pdblist[0]) as inp:
            tokens = inp.read().strip().split()
        # Replace the file name by the identifiers it lists; tokens that do
        # not look like four-character identifiers are skipped.
        pdblist = [pdbid
                   for token in tokens
                   for pdbid in token.split(',')
                   if len(pdbid) == 4 and pdbid.isalnum()]

    prody.fetchPDB(*pdblist, folder=kwargs.get('folder', '.'),
                   compressed=kwargs.get('gzip', False),
                   copy=True)
def parseHeatmap(heatmap, **kwargs):
    """Returns a two dimensional array and a dictionary with information
    parsed from *heatmap*, which may be an input stream or an :file:`.hm`
    file in VMD plugin Heat Mapper format.

    Lines starting with ``-`` are read as ``-label "value"`` metadata and
    converted with the matching entry of ``HMTYPES``; a label without an
    entry is kept as the raw string.  All other non-blank lines are data
    rows of ``;`` separated numbers, optionally preceded by ``:`` separated
    numbering columns named by the ``numbering`` metadata entry.

    :arg heatmap: an object with a ``readline`` method, or a file name
    :returns: tuple of heatmap array and metadata dictionary"""

    try:
        readline, close = heatmap.readline, lambda: None
    except AttributeError:
        # Not a stream: treat it as a file name that this function owns.
        heatmap = openFile(heatmap)
        readline, close = heatmap.readline, heatmap.close

    meta = {}
    arrs = []
    try:
        line = readline()
        while line:
            if line.startswith('-'):
                label, data = line[1:].split(None, 1)
                data = data.strip()
                if data[0] == data[-1] == '"':
                    data = data[1:-1]
                label = label.strip()
                try:
                    meta[label] = HMTYPES[label](data)
                except KeyError:
                    LOGGER.warn('Unrecognized label encountered: {0}'.format(
                        repr(label)))
                    # No converter is known, keep the unparsed text instead
                    # of repeating the lookup that has just failed.
                    meta[label] = data
                except (TypeError, ValueError):
                    LOGGER.warn('Could not parse data with label {0}.'.format(
                        repr(label)))
            elif line.strip():
                # Blank lines carry no data and would yield ragged rows.
                arrs.append(line.rstrip())
            line = readline()
    finally:
        close()

    nnums = len(meta.get('numbering', ''))
    rows = []
    numbers = []
    for arr in arrs:
        if nnums:
            # Leading nnums fields are numbering, the last one is the data.
            items = arr.split(':', nnums + 1)
            numbers.append(items[:nnums])
        else:
            items = [arr]
        rows.append(fromstring(items[-1], float, sep=';'))
    heatmap = array(rows)

    if nnums:
        numbering = meta['numbering']
        # Numbering columns are integers when possible, floats otherwise.
        try:
            numbers = array(numbers, int)
        except ValueError:
            try:
                numbers = array(numbers, float)
            except ValueError:
                LOGGER.warn('Numbering for y-axis could not be parsed.')
                numbering = []
        for i, label in enumerate(numbering):
            meta[label] = numbers[:, i].copy()

    return heatmap, meta
def writeHeatmap(filename, heatmap, **kwargs):
    """Returns *filename* that contains *heatmap* in Heat Mapper :file:`.hm`
    file (extension is automatically added when not found).  *filename* may
    also be an output stream.

    :arg title: title of the heatmap
    :type title: str

    :arg xlabel: x-axis lab, default is ``'unknown'``
    :type xlabel: str

    :arg ylabel: y-axis lab, default is ``'unknown'``
    :type ylabel: str

    :arg xorigin: x-axis origin, default is 0
    :type xorigin: float

    :arg xstep: x-axis step, default is 1
    :type xstep: float

    :arg min: minimum value, default is minimum in *heatmap*
    :type min: float

    :arg max: maximum value, default is maximum in *heatmap*
    :type max: float

    :arg format: number format, default is ``'%f'``
    :type format: str

    Other keyword arguments that are arrays with length equal to the number
    of rows of *heatmap* (its first dimension) will be considered as
    *numbering*.  When there is none, rows are numbered starting from 1.

    :raises TypeError: when *heatmap* is not a two dimensional array"""

    try:
        ndim, shape = heatmap.ndim, heatmap.shape
    except AttributeError:
        raise TypeError('heatmap must be an array object')
    if ndim != 2:
        raise TypeError('heatmap must be a 2D array')

    try:
        write, close, stream = filename.write, lambda: None, filename
    except AttributeError:
        # A file name was given: this function owns (and closes) the file.
        out = openFile(addext(filename, '.hm'), 'w')
        write, close, stream = out.write, out.close, out

    try:
        numformat = kwargs.pop('format', '%f')
        write('-min "{0}"\n'.format(kwargs.pop('min', heatmap.min())))
        write('-max "{0}"\n'.format(kwargs.pop('max', heatmap.max())))
        for label, default in [
            ('title', 'unknown'),
            ('xlabel', 'unknown'),
            ('xorigin', 0),
            ('xstep', 1),
            ('ylabel', 'unknown'),
        ]:
            write('-{0} "{1}"\n'.format(label, kwargs.pop(label, default)))

        # Whatever is left in kwargs is a candidate numbering column.
        numbering = []
        numlabels = []
        for key, val in kwargs.items():
            try:
                length = len(val)
            except TypeError:
                LOGGER.warn('Keyword argument {0} is not used.'.format(key))
            else:
                if length == shape[0]:
                    numlabels.append(key)
                    numbering.append(val)
        if not numbering:
            numlabels.append('unknown')
            numbering.append(arange(1, shape[0] + 1))

        write('-numbering "{0}"\n'.format(':'.join(numlabels)))

        for i, row in enumerate(heatmap):
            write(':'.join(str(nums[i]) for nums in numbering) + ':')
            row.tofile(stream, sep=';', format=numformat)
            write(';\n')
    finally:
        close()
    return filename
def parsePSF(filename, title=None, ag=None):
    """Return an :class:`.AtomGroup` instance storing data parsed from X-PLOR
    format PSF file *filename*.  Atom and bond information is parsed from the
    file.  If *title* is not given, *filename* will be set as the title of the
    :class:`.AtomGroup` instance.  An :class:`.AtomGroup` instance may be
    provided as *ag* argument.  When provided, *ag* must have the same number
    of atoms in the same order as the file.  Data from PSF file will be added
    to the *ag*.  This may overwrite present data if it overlaps with PSF file
    content.  Note that this function does not evaluate angles, dihedrals, and
    impropers sections.

    :raises TypeError: when *ag* is not an :class:`.AtomGroup`
    :raises ValueError: when *ag* and the file differ in number of atoms
    :raises IOError: when the ``!NATOM`` or ``!NBOND:`` section is missing,
        or fewer atom lines / bond indices are found than announced"""

    if ag is not None:
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')

    # Text mode: the parsing below relies on str methods and str literals.
    psf = openFile(filename, 'rt')
    try:
        # Skip the header up to the line announcing the number of atoms.
        n_atoms = None
        line = psf.readline()
        while line:
            line = line.strip()
            if line.endswith('!NATOM'):
                n_atoms = int(line.split('!')[0])
                break
            line = psf.readline()
        if n_atoms is None:
            raise IOError('!NATOM section could not be found in PSF file')

        if title is None:
            title = os.path.splitext(os.path.split(filename)[1])[0]
        else:
            title = str(title)
        if ag is None:
            ag = AtomGroup(title)
        else:
            if n_atoms != ag.numAtoms():
                raise ValueError('ag and PSF file must have same number of '
                                 'atoms')

        serials = zeros(n_atoms, ATOMIC_FIELDS['serial'].dtype)
        segnames = zeros(n_atoms, ATOMIC_FIELDS['segment'].dtype)
        resnums = zeros(n_atoms, ATOMIC_FIELDS['resnum'].dtype)
        resnames = zeros(n_atoms, ATOMIC_FIELDS['resname'].dtype)
        atomnames = zeros(n_atoms, ATOMIC_FIELDS['name'].dtype)
        atomtypes = zeros(n_atoms, ATOMIC_FIELDS['type'].dtype)
        charges = zeros(n_atoms, ATOMIC_FIELDS['charge'].dtype)
        masses = zeros(n_atoms, ATOMIC_FIELDS['mass'].dtype)

        # Size hint: about 71 characters per atom line plus a few extra
        # lines so that the bond header is usually buffered as well.
        lines = psf.readlines(71 * (n_atoms + 5))
        if len(lines) < n_atoms:
            raise IOError('number of lines in PSF is less than the number of '
                          'atoms')

        for i, line in enumerate(lines[:n_atoms]):
            if len(line) <= 71:
                # Short lines are read by fixed columns.
                serials[i] = line[:8]
                segnames[i] = line[9:13].strip()
                resnums[i] = line[14:19]
                resnames[i] = line[19:23].strip()
                atomnames[i] = line[24:28].strip()
                atomtypes[i] = line[29:35].strip()
                charges[i] = line[35:44]
                masses[i] = line[50:60]
            else:
                # Longer lines are read as white space separated fields.
                items = line.split()
                serials[i] = items[0]
                segnames[i] = items[1]
                resnums[i] = items[2]
                resnames[i] = items[3]
                atomnames[i] = items[4]
                atomtypes[i] = items[5]
                charges[i] = items[6]
                masses[i] = items[7]

        # Locate the bond header among the buffered lines after the atoms.
        n_bonds = None
        for i in range(n_atoms, len(lines)):
            items = lines[i].split()
            if len(items) >= 2 and items[1] == '!NBOND:':
                n_bonds = int(items[0])
                break
        if n_bonds is None:
            raise IOError('!NBOND section could not be found in PSF file')

        # Four bonds per line; round up so a partial last line is read too.
        # Floor division keeps the size an integer on Python 3.
        text = ''.join(lines[i+1:]) + psf.read((n_bonds + 3) // 4 * 71)
        bonds = fromstring(text, count=n_bonds*2, dtype=int, sep=' ')
        if len(bonds) != n_bonds*2:
            raise IOError('number of bonds expected and parsed do not match')
    finally:
        psf.close()

    ag.setSerials(serials)
    ag.setSegnames(segnames)
    ag.setResnums(resnums)
    ag.setResnames(resnames)
    ag.setNames(atomnames)
    ag.setTypes(atomtypes)
    ag.setCharges(charges)
    ag.setMasses(masses)

    # PSF atom numbers start at 1, atom indices start at 0.
    bonds = bonds - 1
    ag.setBonds(bonds.reshape((n_bonds, 2)))

    return ag
def parseEMD(emd, **kwargs):
    """Returns an :class:`.AtomGroup` containing the information parsed from
    EMD file.

    This function extends :func:`.parseEMDStream`.

    See :ref:`parseEMD` for a detailed usage example.

    :arg emd: an EMD identifier or a file name. A 4-digit
        EMDataBank identifier can be provided to download
        it via FTP.
    :type emd: str

    :arg cutoff: density cutoff to read EMD map. The regions with
        lower density than given cutoff are discarded.
    :type cutoff: float

    :arg n_nodes: A bead based network will be constructed over provided
        density map. n_nodes parameter will show the number of beads that
        will fit to density map.
    :type n_nodes: int

    :arg num_iter: After reading density map, coordinates are predicted with
        topological domain reconstruction method. This parameter is the total
        number of iterations of this algorithm.
    :type num_iter: int

    :arg map: Return the density map itself. Default is **False** in line
        with previous behaviour. This value is reset to **True** if
        make_nodes is **False** as something must be returned.
    :type map: bool

    :arg make_nodes: Use the topology representing network algorithm to fit
        pseudoatom nodes to the map. Default is **False** and sets map to
        **True**.
    :type make_nodes: bool

    :raises IOError: when *emd* is neither an existing file nor a 4-digit
        identifier, or the download fails
    """
    title = kwargs.get('title', None)

    if not os.path.isfile(emd):
        if not (len(emd) == 4 and emd.isdigit()):
            raise IOError('EMD file {0} is not available in the directory {1}'
                          .format(emd, os.getcwd()))

        # The identifier doubles as the title unless one was supplied.
        if title is None:
            title = emd
            kwargs['title'] = title

        # Prefer a map in the working directory over a download.
        plain = emd + '.map'
        gzipped = emd + '.map.gz'
        if os.path.isfile(plain):
            source = plain
        elif os.path.isfile(gzipped):
            source = gzipped
        else:
            source = fetchPDB(emd, report=True,
                              format='emd', compressed=False)
            if source is None:
                raise IOError('EMD map file for {0} could not be downloaded.'
                              .format(emd))
        emd = source

    if title is None:
        kwargs['title'], ext = os.path.splitext(os.path.split(emd)[1])

    emdStream = openFile(emd, 'rb')
    if emd.endswith('.stk'):
        # The STK parser is handed the file name, not the opened stream.
        result = parseSTKStream(emd, **kwargs)
    else:
        result = parseEMDStream(emdStream, **kwargs)
    emdStream.close()
    return result
def writePSF(filename, atoms):
    """Write atoms in X-PLOR format PSF file with name *filename* and return
    *filename*.  :file:`.psf` is appended to *filename* when missing.

    Atom records are always written.  Bonds, angles, dihedrals, impropers
    and cross-terms are written when *atoms* has any; donor, acceptor and
    non-bonded exclusion sections are always written, with a zero count when
    empty.  Missing segment names fall back to chain identifiers and then to
    ``'UNK'``; missing residue names default to ``'UNK'``, residue numbers
    to 1, and types, charges and masses to zeros.

    :raises TypeError: when *atoms* lacks the accessors of an Atomic
        instance
    :raises ValueError: when atom names are not set"""

    if not filename.endswith('.psf'):
        filename = filename + '.psf'

    try:
        n_atoms, segments, rnums, rnames, names, types, charges, masses = (
            atoms.numAtoms(), atoms._getSegnames(), atoms._getResnums(),
            atoms._getResnames(), atoms._getNames(), atoms._getTypes(),
            atoms._getCharges(), atoms._getMasses())
    except AttributeError:
        raise TypeError('atoms must be an Atomic instance')

    if segments is None:
        segments = atoms._getChids()
        if segments is None:
            segments = ['UNK'] * n_atoms
    if rnums is None:
        rnums = ones(n_atoms, int)
    if rnames is None:
        rnames = ['UNK'] * n_atoms
    if names is None:
        raise ValueError('atom names are not set')
    if types is None:
        types = zeros(n_atoms, ATOMIC_FIELDS['type'].dtype)
    if charges is None:
        charges = zeros(n_atoms, ATOMIC_FIELDS['charge'].dtype)
    if masses is None:
        masses = zeros(n_atoms, ATOMIC_FIELDS['mass'].dtype)

    # Types longer than four characters get the ' NAMD' tag in the header.
    # any() also copes with zero atoms, where max() has nothing to reduce.
    long_fields = any(len(tp) > 4 for tp in types)

    out = openFile(filename, 'w')
    try:
        write = out.write

        def write_section(header, entries, width, per_line, required=False):
            """Write one index section: a count line followed by *entries*
            (converted to one-based indices), *per_line* entries per line
            and the first *width* columns of each entry.  An empty section
            is skipped unless *required*, in which case a zero count is
            written."""
            entries = list(entries)
            if not entries:
                if required:
                    write('{0:8d} {1}\n'.format(0, header))
                    write('\n')
                return
            entries = array(entries, int) + 1
            write('\n')
            write('{0:8d} {1}\n'.format(len(entries), header))
            for i, entry in enumerate(entries):
                write(''.join('%8s' % entry[k] for k in range(width)))
                if i % per_line == per_line - 1:
                    write('\n')
            # Terminate a partially filled last line.
            if len(entries) % per_line:
                write('\n')

        write('PSF{0}\n'.format(' NAMD' if long_fields else ''))
        write('\n')
        write('{0:8d} !NTITLE\n'.format(1))
        write(' REMARKS {0}\n'.format(str(atoms)))
        write('\n')
        write('{0:8d} !NATOM\n'.format(n_atoms))
        for i in range(n_atoms):
            write(PSFLINE % (i + 1, segments[i], rnums[i],
                             rnames[i], names[i], types[i],
                             charges[i], masses[i], 0))

        write_section('!NBOND: bonds', atoms._iterBonds(), 2, 4)
        write_section('!NTHETA: angles', atoms._iterAngles(), 3, 3)
        write_section('!NPHI: dihedrals', atoms._iterDihedrals(), 4, 4)
        write_section('!NIMPHI: impropers', atoms._iterImpropers(), 4, 2)
        write('\n')

        write_section('!NDON: donors', atoms._iterDonors(), 2, 4,
                      required=True)
        write('\n')

        write_section('!NACC: acceptors', atoms._iterAcceptors(), 2, 4,
                      required=True)
        write_section('!NNB', atoms._iterNBExclusions(), 2, 4,
                      required=True)

        # Only the first four indices of each cross-term are written.
        write_section('!NCRTERM: crossterms', atoms._iterCrossterms(), 4, 2)
        write('\n')
    finally:
        out.close()
    return filename
def writePQR(filename, atoms):
    """Write *atoms* in PQR format to a file with name *filename*.  Only
    current coordinate set is written.  Returns *filename* upon success.  If
    *filename* ends with :file:`.gz`, a compressed file will be written.

    Fields that are not set on *atoms* get defaults: ``'UNK'`` residue
    names, residue number 1, empty chain identifiers, insertion codes and
    alternate locations, and zero charges and radii.

    :raises TypeError: when *atoms* is not an :class:`Atomic` instance
    :raises RuntimeError: when atom names are not set"""

    if not isinstance(atoms, Atomic):
        raise TypeError('atoms does not have a valid type')
    if isinstance(atoms, Atom):
        # Wrap a single atom in a one-atom selection.
        atoms = Selection(atoms.getAtomGroup(), [atoms.getIndex()],
                          atoms.getACSIndex(),
                          'index ' + str(atoms.getIndex()))

    stream = openFile(filename, 'w')
    n_atoms = atoms.numAtoms()

    names = atoms.getNames()
    if names is None:
        raise RuntimeError('atom names are not set')
    # Names shorter than four characters get a leading space, longer ones
    # are cut to four characters.
    for idx, name in enumerate(names):
        length = len(name)
        if length < 4:
            names[idx] = ' ' + name
        elif length > 4:
            names[idx] = name[:4]

    # dtype character numpy assigns to a string array on this interpreter.
    s_or_u = np.array(['a']).dtype.char
    one_char = s_or_u + '1'

    resnames = atoms._getResnames()
    if resnames is None:
        resnames = ['UNK'] * n_atoms

    resnums = atoms._getResnums()
    if resnums is None:
        resnums = np.ones(n_atoms, int)

    chainids = atoms._getChids()
    if chainids is None:
        chainids = np.zeros(n_atoms, one_char)

    charges = atoms._getCharges()
    if charges is None:
        charges = np.zeros(n_atoms, float)

    radii = atoms._getRadii()
    if radii is None:
        radii = np.zeros(n_atoms, float)

    icodes = atoms._getIcodes()
    if icodes is None:
        icodes = np.zeros(n_atoms, one_char)

    # Record name is 'ATOM' unless the atom is flagged as hetero.
    records = ['ATOM'] * n_atoms
    flags = atoms._getFlags('hetatm')
    if flags is None:
        flags = atoms._getFlags('hetero')
    if flags is not None:
        records = np.array(records, s_or_u + '6')
        records[flags] = 'HETATM'

    altlocs = atoms._getAltlocs()
    if altlocs is None:
        altlocs = np.zeros(n_atoms, one_char)

    pqr_line = ('{0:6s}{1:5d} {2:4s}{3:1s}' +
                '{4:4s}{5:1s}{6:4d}{7:1s} ' +
                '{8:8.3f}{9:8.3f}{10:8.3f}' +
                '{11:8.4f}{12:7.4f}\n').format

    coords = atoms._getCoords()
    write = stream.write
    for idx, xyz in enumerate(coords):
        write(pqr_line(records[idx], idx + 1, names[idx], altlocs[idx],
                       resnames[idx], chainids[idx], int(resnums[idx]),
                       icodes[idx], xyz[0], xyz[1], xyz[2],
                       charges[idx], radii[idx]))
    write('TER\nEND')
    stream.close()
    return filename
def prody_anm(pdb, **kwargs):
    """Perform ANM calculations for *pdb*.

    Options missing from *kwargs* are taken from ``DEFAULTS``.  The
    structure is parsed, atoms are selected with the ``select`` option, an
    ANM is built and its modes are calculated; an NMD file is always written
    to ``outdir`` and the remaining numerical and graphical outputs are
    written as requested by the ``out*`` and ``fig*`` options.

    :raises IOError: when the ``outdir`` option is not an existing
        directory"""

    for key in DEFAULTS:
        kwargs.setdefault(key, DEFAULTS[key])

    from os.path import isdir, join
    outdir = kwargs.get('outdir')
    if not isdir(outdir):
        raise IOError('{0} is not a valid path'.format(repr(outdir)))

    import numpy as np
    import prody
    LOGGER = prody.LOGGER

    prefix = kwargs.get('prefix')
    cutoff = kwargs.get('cutoff')
    gamma = kwargs.get('gamma')
    nmodes = kwargs.get('nmodes')
    selstr = kwargs.get('select')
    model = kwargs.get('model')

    pdb = prody.parsePDB(pdb, model=model)
    if prefix == '_anm':
        # Default prefix: prepend the structure title.
        prefix = pdb.getTitle() + '_anm'

    select = pdb.select(selstr)
    if select is None:
        LOGGER.warn('Selection {0} did not match any atoms.'
                    .format(repr(selstr)))
        return
    LOGGER.info('{0} atoms will be used for ANM calculations.'
                .format(len(select)))

    anm = prody.ANM(pdb.getTitle())
    anm.buildHessian(select, cutoff, gamma)
    anm.calcModes(nmodes)

    LOGGER.info('Writing numerical output.')
    if kwargs.get('outnpz'):
        prody.saveModel(anm, join(outdir, prefix))
    prody.writeNMD(join(outdir, prefix + '.nmd'), anm, select)

    extend = kwargs.get('extend')
    if extend:
        if extend == 'all':
            target = pdb
        else:
            target = select | pdb.bb
        extended = prody.extendModel(anm, select, target)
        prody.writeNMD(join(outdir, prefix + '_extended_' +
                            extend + '.nmd'), *extended)

    outall = kwargs.get('outall')
    delim = kwargs.get('numdelim')
    ext = kwargs.get('numext')
    numformat = kwargs.get('numformat')

    def wanted(option):
        """Return a true value when *option* or ``outall`` is set."""
        return outall or kwargs.get(option)

    def dump(suffix, data):
        """Write *data* to ``<outdir>/<prefix><suffix><numext>``."""
        prody.writeArray(join(outdir, prefix + suffix + ext), data,
                         delimiter=delim, format=numformat)

    if wanted('outeig'):
        dump('_evectors', anm.getArray())
        dump('_evalues', anm.getEigvals())

    if wanted('outbeta'):
        from prody.utilities import openFile
        fout = openFile(prefix + '_beta.txt', 'w', folder=outdir)
        fout.write('{0[0]:1s} {0[1]:4s} {0[2]:4s} {0[3]:5s} {0[4]:5s}\n'
                   .format(['C', 'RES', '####', 'Exp.', 'The.']))
        rows = zip(select.getChids(), select.getResnames(),
                   select.getResnums(), select.getBetas(),
                   prody.calcTempFactors(anm, select))
        for data in rows:
            fout.write('{0[0]:1s} {0[1]:4s} {0[2]:4d} {0[3]:5.2f} {0[4]:5.2f}\n'
                       .format(data))
        fout.close()

    if wanted('outcov'):
        dump('_covariance', anm.getCovariance())

    if outall or kwargs.get('outcc') or kwargs.get('outhm'):
        # Cross-correlations are computed once for both output kinds.
        cc = prody.calcCrossCorr(anm)
        if wanted('outcc'):
            dump('_cross-correlations', cc)
        if wanted('outhm'):
            prody.writeHeatmap(join(outdir, prefix + '_cross-correlations.hm'),
                               cc, resnum=select.getResnums(),
                               xlabel='Residue', ylabel='Residue',
                               title=anm.getTitle() + ' cross-correlations')

    if wanted('hessian'):
        dump('_hessian', anm.getHessian())

    if wanted('kirchhoff'):
        dump('_kirchhoff', anm.getKirchhoff())

    if wanted('outsf'):
        dump('_sqflucts', prody.calcSqFlucts(anm))

    figall = kwargs.get('figall')
    figcc = kwargs.get('figcc')
    figsf = kwargs.get('figsf')
    figbf = kwargs.get('figbeta')
    figcm = kwargs.get('figcmap')

    if not (figall or figcc or figsf or figbf or figcm):
        return

    try:
        import matplotlib.pyplot as plt
    except ImportError:
        LOGGER.warning('Matplotlib could not be imported. '
                       'Figures are not saved.')
        return

    prody.SETTINGS['auto_show'] = False
    LOGGER.info('Saving graphical output.')
    figformat = kwargs.get('figformat')
    width = kwargs.get('figwidth')
    height = kwargs.get('figheight')
    dpi = kwargs.get('figdpi')
    figformat = figformat.lower()

    def save_figure(tag, draw):
        """Draw on a new figure and save it as ``<prefix>_<tag>.<format>``."""
        plt.figure(figsize=(width, height))
        draw()
        plt.savefig(join(outdir, prefix + '_' + tag + '.' + figformat),
                    dpi=dpi, format=figformat)
        plt.close('all')

    def draw_bfactors():
        """Plot experimental against calculated temperature factors."""
        bexp = select.getBetas()
        bcal = prody.calcTempFactors(anm, select)
        plt.plot(bexp, label='Experimental')
        plt.plot(bcal, label=('Theoretical (R={0:.2f})'
                              .format(np.corrcoef(bcal, bexp)[0,1])))
        plt.legend(prop={'size': 10})
        plt.xlabel('Node index')
        plt.ylabel('Experimental B-factors')
        plt.title(pdb.getTitle() + ' B-factors')

    if figall or figcc:
        save_figure('cc', lambda: prody.showCrossCorr(anm))
    if figall or figcm:
        save_figure('cm', lambda: prody.showContactMap(anm))
    if figall or figsf:
        save_figure('sf', lambda: prody.showSqFlucts(anm))
    if figall or figbf:
        save_figure('bf', draw_bfactors)
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Return a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
        ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
        or ``'rp75'`` where rp stands for representative proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or
        ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``

    :raises ValueError: when *acc* does not resolve to a Pfam accession, or
        when *alignment* or one of the alignment options is not recognized
    """

    # Resolve the user supplied ID/accession to an accession code.  The
    # handle is closed as soon as the first line has been read.
    url = 'http://pfam.sanger.ac.uk/family/acc?id=' + acc
    handle = openURL(url)
    orig_acc = acc
    try:
        acc = handle.readline().strip()
    finally:
        handle.close()
    if PY3K:
        acc = acc.decode()

    # url_flag is True when the server sends a gzipped Stockholm file.
    url_flag = False

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'
                         .format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi or'
                         ' metagenomics')

    # NOTE(review): *any* keyword argument, including only ``timeout``,
    # ``outname`` or ``folder``, routes the request to the formatted
    # download below instead of the gzipped Stockholm download.
    if alignment == 'ncbi' or alignment == 'metagenomics' or not kwargs:
        url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
               alignment + '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        align_format = kwargs.get('format', 'selex').lower()
        if align_format not in FORMAT_OPTIONS['format']:
            raise ValueError('alignment format must be of type selex'
                             ' stockholm or fasta. MSF not supported')

        if align_format == SELEX:
            align_format, extension = 'pfam', '.slx'
        elif align_format == FASTA:
            extension = '.fasta'
        else:
            extension = '.sth'

        gaps = str(kwargs.get('gaps', 'dashes')).lower()
        if gaps not in FORMAT_OPTIONS['gaps']:
            raise ValueError('gaps must be of type mixed, dots, dashes, '
                             'or None')

        inserts = kwargs.get('inserts', 'upper').lower()
        if inserts not in FORMAT_OPTIONS['inserts']:
            raise ValueError('inserts must be of type lower or upper')

        order = kwargs.get('order', 'tree').lower()
        if order not in FORMAT_OPTIONS['order']:
            raise ValueError('order must be of type tree or alphabetical')

        url = ('http://pfam.sanger.ac.uk/family/' + acc + '/alignment/' +
               alignment + '/format?format=' + align_format +
               '&alnType=' + alignment + '&order=' + order[0] +
               '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')

    response = openURL(url, timeout=int(kwargs.get('timeout', 60)))
    try:
        outname = kwargs.get('outname', None)
        if not outname:
            outname = orig_acc
        folder = str(kwargs.get('folder', '.'))
        filepath = join(makePath(folder),
                        outname + '_' + alignment + extension)
        if compressed:
            filepath = filepath + '.gz'
            # Data that is already gzipped is written verbatim; plain data
            # goes through openFile with the '.gz' file name.
            if url_flag:
                f_out = open(filepath, 'wb')
            else:
                f_out = openFile(filepath, 'wb')
            try:
                f_out.write(response.read())
            finally:
                f_out.close()
        elif url_flag:
            gunzip(response.read(), filepath)
        else:
            with open(filepath, 'wb') as f_out:
                f_out.write(response.read())
    finally:
        response.close()

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'
                .format(orig_acc, filepath))

    return filepath
def writeNMD(filename, modes, atoms):
    """Write *modes* and *atoms* data in NMD format described in
    :ref:`nmd-format` and return *filename*.  :file:`.nmd` extension is
    appended to filename, if it does not have an extension.

    .. note::
       #. This function skips modes with zero eigenvalues.
       #. If a :class:`.Vector` instance is given, it will be normalized
          before it is written. Its length before normalization will be
          written as the scaling factor of the vector.

    :arg filename: output file name
    :arg modes: an NMA, ModeSet, Mode, or Vector instance
    :arg atoms: object providing ``numAtoms`` and ``getCoords``; per-atom
        data (names, residue names and numbers, chain ids, segment names,
        B-factors) is written on a best-effort basis

    :raises TypeError: when *modes* is not one of the accepted types
    :raises Exception: when *modes* and *atoms* differ in number of atoms
    :raises ValueError: when coordinates cannot be retrieved or are not set
    """

    if not isinstance(modes, (NMA, ModeSet, Mode, Vector)):
        raise TypeError('modes must be NMA, ModeSet, Mode, or Vector, '
                        'not {0}'.format(type(modes)))
    if modes.numAtoms() != atoms.numAtoms():
        raise Exception('number of atoms do not match')

    # Validate coordinates before the output file is created, so that a
    # failure does not leave a truncated file behind.
    try:
        coords = atoms.getCoords()
    except Exception:
        raise ValueError('coordinates could not be retrieved from atoms')
    if coords is None:
        raise ValueError('atom coordinates are not set')

    # Pick a short alphanumeric name: mode title, then str(atoms), then the
    # file name stem.
    name = modes.getTitle().replace(' ', '_').replace('.', '_')
    if not name.replace('_', '').isalnum() or len(name) > 30:
        name = str(atoms).replace(' ', '_').replace('.', '_')
    if not name.replace('_', '').isalnum() or len(name) > 30:
        name = splitext(split(filename)[1])[0]

    # (NMD label, getter name, tofile format); a format of None means the
    # values are joined with spaces instead of being written with tofile.
    optional_fields = (
        ('atomnames', 'getNames', None),
        ('resnames', 'getResnames', None),
        ('resids', 'getResnums', '%s'),
        ('chainids', 'getChids', None),
        ('segnames', 'getSegnames', None),
        ('bfactors', 'getBetas', '%.2f'),
    )

    out = openFile(addext(filename, '.nmd'), 'w')
    try:
        # NOTE(review): the path written here is built from *filename* as
        # given, not from the name with the appended extension; confirm.
        out.write('nmwiz_load {0}\n'.format(abspath(filename)))
        out.write('name {0}\n'.format(name))

        for label, getter, fmt in optional_fields:
            try:
                data = getattr(atoms, getter)()
                if data is not None:
                    if fmt is None:
                        out.write('{0} {1}\n'.format(label, ' '.join(data)))
                    else:
                        out.write(label + ' ')
                        data.tofile(out, ' ', fmt)
                        out.write('\n')
            except Exception:
                # Per-atom data is optional; a field that cannot be
                # retrieved or written is skipped.
                pass

        out.write('coordinates ')
        coords.tofile(out, ' ', '%.3f')
        out.write('\n')

        count = 0
        if isinstance(modes, Vector):
            out.write('mode 1 {0:.2f} '.format(abs(modes)))
            modes.getNormed()._getArray().tofile(out, ' ', '%.3f')
            out.write('\n')
            count += 1
        else:
            if isinstance(modes, Mode):
                modes = [modes]
            for mode in modes:
                if mode.getEigval() < ZERO:
                    continue
                out.write('mode {0} {1:.2f} '.format(
                    mode.getIndex() + 1, mode.getVariance()**0.5))
                mode._getArray().tofile(out, ' ', '%.3f')
                out.write('\n')
                count += 1
        if count == 0:
            LOGGER.warning('No normal mode data was written. '
                           'Given modes might have 0 eigenvalues.')
    finally:
        out.close()
    return filename
def parseEMD(emd, **kwargs):
    """Parses an EM density map in EMD/MRC2015 format and
    optionally returns an :class:`.AtomGroup` containing beads
    built in the density using the TRN algorithm [_TM94].

    This function extends :func:`.parseEMDStream`.

    See :ref:`cryoem_analysis` for a usage example.

    :arg emd: an EMD identifier or a file name. A 4-digit
        EMDataBank identifier can be provided to download
        it via FTP.
    :type emd: str

    :arg min_cutoff: minimum density cutoff to read EMD map. The regions
        with lower density than this cutoff are discarded.
        This corresponds to the previous cutoff and take values from it.
    :type min_cutoff: float

    :arg max_cutoff: maximum density cutoff to read EMD map. The regions
        with higher density than this cutoff are discarded.
    :type max_cutoff: float

    :arg n_nodes: A bead based network will be constructed into the provided
        density map. This parameter will set the number of beads to fit to
        density map. Default is 0. Please change it to some number to run
        the TRN algorithm.
    :type n_nodes: int

    :arg num_iter: After reading density map, coordinates are predicted with
        the topology representing network method. This parameter is the
        total number of iterations of this algorithm.
    :type num_iter: int

    :arg map: Return the density map itself. Default is **False** in line
        with previous behaviour. This value is reset to **True** if n_nodes
        is 0 or less.
    :type map: bool

    :raises IOError: when *emd* is neither an existing file nor a 4 or 5
        digit identifier, or when the map could not be downloaded
    """

    title = kwargs.get('title', None)
    if not os.path.isfile(emd):
        # Accept identifiers written as 'EMD-1234' as well as '1234'.
        if emd.startswith('EMD-') and len(emd[4:]) in [4, 5]:
            emd = emd[4:]

        if len(emd) in [4, 5] and emd.isdigit():
            if title is None:
                title = emd
                kwargs['title'] = title

            # Prefer a local copy in the working directory over a download.
            if os.path.isfile(emd + '.map'):
                filename = emd + '.map'
            elif os.path.isfile(emd + '.map.gz'):
                filename = emd + '.map.gz'
            else:
                filename = fetchPDB(emd, report=True,
                                    format='emd', compressed=False)
                if filename is None:
                    raise IOError(
                        'EMD map file for {0} could not be downloaded.'.format(
                            emd))
            emd = filename
        else:
            raise IOError(
                'EMD file {0} is not available in the directory {1}'.format(
                    emd, os.getcwd()))

    if title is None:
        title, ext = os.path.splitext(os.path.split(emd)[1])
        # Drop the map extension of gzipped files too, so that 'x.map' and
        # 'x.map.gz' get the same title.
        if ext == '.gz':
            title = os.path.splitext(title)[0]
        kwargs['title'] = title

    emdStream = openFile(emd, 'rb')
    try:
        result = parseEMDStream(emdStream, **kwargs)
    finally:
        # Close the stream even when parsing fails.
        emdStream.close()

    return result
def searchPfam(query, search_b=False, skip_a=False, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg search_b: search Pfam-B families when **True**
    :type search_b: bool

    :arg skip_a: do not search Pfam-A families when **True**
    :type skip_a: bool

    :arg ga: use gathering threshold when **True**
    :type ga: bool

    :arg evalue: user specified e-value cutoff, must be smaller than 10.0
    :type evalue: float

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    :returns: dictionary of matches keyed by accession, or **None** when the
        server reports a system error
    :raises ValueError: when the query or the results XML cannot be parsed
    :raises IOError: when no result could be retrieved within *timeout*
    """

    prefix = '{http://pfam.sanger.ac.uk/}'
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # Not a readable MSA file: treat the file as a plain sequence.
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9; fall back to ElementTree.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    if len(seq) >= MINSEQLEN:
        # Sequence query: submit a search and obtain the results URL.
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        urlextension = ''
        if kwargs:
            ga = int(kwargs.get('ga', 1))
            if not (ga == 1 or ga == 0):
                raise ValueError('ga must be either 0 or 1')

            evalue = kwargs.get('evalue', None)
            if evalue:
                if not float(evalue) <= 10.0:
                    raise ValueError('evalue must be a valid float < 10.0')
                urlextension = urlextension + '&evalue=' + str(evalue)
            else:
                urlextension = urlextension + '&ga=' + str(ga)

        search_b = int(bool(search_b))
        skip_a = int(bool(skip_a))
        if skip_a == 1:
            # Skipping Pfam-A forces a Pfam-B search.
            search_b = 1

        urlextension = urlextension + '&searchBs=' + str(search_b)
        urlextension = urlextension + '&skipAs=' + str(skip_a)

        url = ('http://pfam.sanger.ac.uk/search/sequence?seq=' + str(seq) +
               urlextension + '&output=xml')
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'
                     .format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        try:
            url = dictElement(root[0], prefix)['result_url']
        except (IndexError, KeyError):
            raise ValueError('failed to parse results XML, check URL: ' + url)

    else:
        if len(seq) <= 5:
            # PDB identifier, optionally followed by a chain identifier:
            # look up a UniProt ID code in the PDB header.
            idcode = None
            from prody import parsePDBHeader
            try:
                polymers = parsePDBHeader(seq[:4], 'polymers')
            except Exception as err:
                LOGGER.warn('failed to parse header for {0} ({1})'
                            .format(seq[:4], str(err)))
            else:
                chid = seq[4:].upper()
                for poly in polymers:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        if dbref.database != 'UniProt':
                            continue
                        idcode = dbref.idcode
                        LOGGER.info('UniProt ID code {0} for {1} chain '
                                    '{2} will be used.'
                                    .format(idcode, seq[:4], poly.chid))
                        break
                    if idcode is not None:
                        break
            if idcode is None:
                LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                            'parsed.'.format(repr(seq)))
                url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'
            else:
                url = ('http://pfam.sanger.ac.uk/protein/' +
                       idcode + '?output=xml')
        else:
            url = 'http://pfam.sanger.ac.uk/protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # NOTE(review): this loop has no break, so the URL is requested
    # repeatedly until *timeout* seconds have elapsed and the last response
    # is kept.  Presumably this waits for a pending search to complete;
    # confirm before adding an early exit.
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    if len(seq) >= MINSEQLEN:
        try:
            xml_matches = root[0][0][0][0]
        except IndexError:
            raise ValueError('failed to parse results XML, check URL: ' + url)
    else:
        results = dictElement(root[0], prefix)
        try:
            xml_matches = results['matches']
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:

        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        # Several matches to one family share an entry; their locations are
        # accumulated in a list.
        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    if len(seq) < MINSEQLEN:
        query = 'Query ' + repr(query)
    else:
        query = 'Query sequence'

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')
    return matches
def _readPSFSection(psf, marker, parse_count=True):
    """Read *psf* up to the header line containing *marker*; return the
    decoded non-blank lines before it and the count leading that header."""

    count = 0
    lines = []
    for line in psf:
        if line.strip() == b'':
            continue
        if marker in line:
            if parse_count:
                count = int(line.split()[0])
            break
        lines.append(line.decode(encoding='UTF-8'))
    return ''.join(lines), count


def _parsePSFIndices(text, count, label):
    """Parse *count* whitespace separated integers from *text*, raising
    :exc:`IOError` naming *label* when fewer are found."""

    indices = fromstring(text, count=count, dtype=int, sep=' ')
    if len(indices) != count:
        raise IOError('number of {0} expected and parsed do not match'
                      .format(label))
    return indices


def parsePSF(filename, title=None, ag=None):
    """Returns an :class:`.AtomGroup` instance storing data parsed from X-PLOR
    format PSF file *filename*.  Atom and bond information is parsed from the
    file.  If *title* is not given, *filename* will be set as the title of the
    :class:`.AtomGroup` instance.  An :class:`.AtomGroup` instance may be
    provided as *ag* argument.  When provided, *ag* must have the same number
    of atoms in the same order as the file.  Data from PSF file will be added
    to the *ag*.  This may overwrite present data if it overlaps with PSF file
    content.

    This function now includes the angles, dihedrals, and impropers sections
    as well as donors, acceptors and crossterms!

    :raises TypeError: when *ag* is given and is not an AtomGroup
    :raises ValueError: when *ag* and the file differ in number of atoms
    :raises IOError: when the atoms header is missing, or a section holds
        fewer entries than its header declares
    """

    if ag is not None:
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')

    psf = openFile(filename, 'rb')
    try:
        # Locate the atoms header, which carries the number of atoms.
        n_atoms = None
        line = psf.readline()
        while line:
            line = line.strip()
            if line.endswith(b'!NATOM'):
                n_atoms = int(line.split(b'!')[0])
                break
            line = psf.readline()
        if n_atoms is None:
            raise IOError('!NATOM header could not be found in PSF file')

        if title is None:
            title = os.path.splitext(os.path.split(filename)[1])[0]
        else:
            title = str(title)
        if ag is None:
            ag = AtomGroup(title)
        elif n_atoms != ag.numAtoms():
            raise ValueError('ag and PSF file must have same number of atoms')

        serials = zeros(n_atoms, ATOMIC_FIELDS['serial'].dtype)
        segnames = zeros(n_atoms, ATOMIC_FIELDS['segment'].dtype)
        resnums = zeros(n_atoms, ATOMIC_FIELDS['resnum'].dtype)
        resnames = zeros(n_atoms, ATOMIC_FIELDS['resname'].dtype)
        atomnames = zeros(n_atoms, ATOMIC_FIELDS['name'].dtype)
        atomtypes = zeros(n_atoms, ATOMIC_FIELDS['type'].dtype)
        charges = zeros(n_atoms, ATOMIC_FIELDS['charge'].dtype)
        masses = zeros(n_atoms, ATOMIC_FIELDS['mass'].dtype)

        # Atom records, terminated by the bonds header.
        n = 0
        n_bonds = 0
        for line in psf:
            if line.strip() == b'':
                continue
            if b'!NBOND:' in line.upper():
                n_bonds = int(line.split()[0])
                break
            if n + 1 > n_atoms:
                continue

            if len(line) <= 71:
                # Short lines are read by fixed column positions.
                serials[n] = line[:8]
                segnames[n] = line[9:13].strip()
                resnums[n] = line[14:19]
                resnames[n] = line[19:23].strip()
                atomnames[n] = line[24:28].strip()
                atomtypes[n] = line[29:35].strip()
                charges[n] = line[35:44]
                masses[n] = line[50:60]
            else:
                # Longer lines are split on whitespace.
                items = line.split()
                serials[n] = items[0]
                segnames[n] = items[1]
                resnums[n] = items[2]
                resnames[n] = items[3]
                atomnames[n] = items[4]
                atomtypes[n] = items[5]
                charges[n] = items[6]
                masses[n] = items[7]
            n += 1

        if n < n_atoms:
            raise IOError(
                'number of lines in PSF atoms block is less than the number of '
                'atoms')

        # Each section's body ends at the next section's header, and that
        # header carries the entry count of the *next* section.
        text, n_angles = _readPSFSection(psf, b'!NTHETA')
        b_array = _parsePSFIndices(text, n_bonds * 2, 'bonds')

        text, n_dihedrals = _readPSFSection(psf, b'!NPHI')
        a_array = _parsePSFIndices(text, n_angles * 3, 'angles')

        text, n_impropers = _readPSFSection(psf, b'!NIMPHI')
        d_array = _parsePSFIndices(text, n_dihedrals * 4, 'dihedrals')

        text, n_donors = _readPSFSection(psf, b'!NDON')
        i_array = _parsePSFIndices(text, n_impropers * 4, 'impropers')

        text, n_acceptors = _readPSFSection(psf, b'!NACC')
        do_array = _parsePSFIndices(text, n_donors * 2, 'donors')

        text, n_exclusions = _readPSFSection(psf, b'!NNB')
        ac_array = _parsePSFIndices(text, n_acceptors * 2, 'acceptors')

        # The exclusions block ends at the next header of any kind.
        text, _ = _readPSFSection(psf, b'!', parse_count=False)
        nbe_array = _parsePSFIndices(text, n_exclusions * 2,
                                     'nonbonded exclusions')

        # Skip ahead to the crossterms header; its body runs to end of file.
        n_crossterms = 0
        for line in psf:
            if b'!NCRTERM' in line:
                n_crossterms = int(line.split()[0])
                break
        text = ''.join([line.decode(encoding='UTF-8') for line in psf])
        c_array = _parsePSFIndices(text, n_crossterms * 4, 'crossterms')
    finally:
        # Close the file on success and on every error path.
        psf.close()

    ag.setSerials(serials)
    ag.setSegnames(segnames)
    ag.setResnums(resnums)
    ag.setResnames(resnames)
    ag.setNames(atomnames)
    ag.setTypes(atomtypes)
    ag.setCharges(charges)
    ag.setMasses(masses)

    # Each index array is decremented by one in place before being stored.
    if n_bonds > 0:
        b_array = add(b_array, -1, b_array)
        ag.setBonds(b_array.reshape((n_bonds, 2)))

    if n_angles > 0:
        a_array = add(a_array, -1, a_array)
        ag.setAngles(a_array.reshape((n_angles, 3)))

    if n_dihedrals > 0:
        d_array = add(d_array, -1, d_array)
        ag.setDihedrals(d_array.reshape((n_dihedrals, 4)))

    if n_impropers > 0:
        i_array = add(i_array, -1, i_array)
        ag.setImpropers(i_array.reshape((n_impropers, 4)))

    if n_donors > 0:
        do_array = add(do_array, -1, do_array)
        ag.setDonors(do_array.reshape((n_donors, 2)))

    if n_acceptors > 0:
        ac_array = add(ac_array, -1, ac_array)
        ag.setAcceptors(ac_array.reshape((n_acceptors, 2)))

    if n_exclusions > 0:
        nbe_array = add(nbe_array, -1, nbe_array)
        ag.setNBExclusions(nbe_array.reshape((n_exclusions, 2)))

    if n_crossterms > 0:
        c_array = add(c_array, -1, c_array)
        ag.setCrossterms(c_array.reshape((n_crossterms, 4)))

    return ag
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, XML file will be saved with that name.

    If you query ligand data frequently, you may configure ProDy to save XML
    files in your computer.  Set ``ligand_xml_save`` option **True**, i.e.
    ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be save
    to ProDy package folder, e.g. :file:`/home/user/.prody/pdbligands`.  Each
    file is around 5Kb when compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data with
    *model* and *ideal* coordinate sets are also stored in this dictionary.
    Note that this dictionary will contain data that is present in the XML
    file and all Ligand Expo XML files do not contain every possible data
    field.  So, it may be better if you use :meth:`dict.get` instead of
    indexing the dictionary, e.g. to retrieve formula weight (or relative
    molar mass) of the chemical component use ``data.get('formula_weight')``
    instead of ``data['formula_weight']`` to avoid exceptions when this data
    field is not found in the XML file.  URL and/or path of the XML file are
    returned in the dictionary with keys ``url`` and ``path``, respectively.

    Following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from prody import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)

    :raises TypeError: when *cci* is not a string
    :raises ValueError: when *cci* is neither a file nor a short alphanumeric
        identifier
    :raises IOError: when the XML file cannot be retrieved online
    """

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')
    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        # folder stays None unless the local cache is enabled.
        folder = None
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            path = None
        url = 'http://www.pdb.org/pdb/files/ligand/{0}.xml'.format(cci.upper())
        if not xml:
            try:
                inp = openURL(url)
            except IOError:
                raise IOError('XML file for ligand {0} is not found online'
                              .format(cci))
            else:
                xml = inp.read()
                inp.close()
            if filename:
                # The folder keyword is passed only when the cache folder is
                # known; otherwise *filename* is used as given.
                if folder is None:
                    out = openFile(filename, mode='w')
                else:
                    out = openFile(filename, mode='w', folder=folder)
                try:
                    out.write(xml)
                finally:
                    out.close()
            if SETTINGS.get('ligand_xml_save'):
                with openFile(xmlgz, 'w') as out:
                    out.write(xml)

    # cElementTree was removed in Python 3.9; fall back to ElementTree.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}')+1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    # Copy chemical component fields, dropping the namespace and the
    # 'pdbx_' prefix from tag names.
    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    # The field is converted only when present in the XML file.
    formula_weight = dict_.get('formula_weight')
    if formula_weight is not None:
        dict_['formula_weight'] = float(formula_weight)

    # Element truth value depends on the number of children, so absence is
    # tested explicitly with ``is not None``.
    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    audit_category = root.find(ns + 'pdbx_chem_comp_auditCategory')
    if audit_category is None:
        audit_category = []
    dict_['audits'] = [(audit.get('action_type'), audit.get('date'))
                       for audit in list(audit_category)]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)
    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['resnum'].dtype)

    alternate_atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    # The builtin bool replaces the np.bool alias removed from NumPy.
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text) for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))

        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        # NOTE(review): this flag is True only for the value 'Y'; confirm
        # which values the schema uses for stereo configuration.
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    bond_category = root.find(ns + 'chem_comp_bondCategory')
    if bond_category is None:
        bond_category = []
    for bond in list(bond_category):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            # Warn once per atom name that is not among the parsed atoms.
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))
    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
def __init__(self, msa, mode='r', format=None, aligned=True, **kwargs):
    """Set up reading from or writing to *msa*, a filename or a stream.

    Multiple sequence alignments can be read from or written in FASTA
    (:file:`.fasta`), Stockholm (:file:`.sth`), or SELEX (:file:`.slx`)
    *format*.  For specified extensions, *format* argument is not needed.
    If *aligned* is **True**, unaligned sequences in the file or stream will
    cause an :exc:`IOError` exception.  *filter*, a function that returns a
    boolean, can be used for filtering sequences, see :meth:`setFilter`
    for details.  *slice* can be used to slice sequences, and is applied
    after filtering, see :meth:`setSlice` for details."""

    if mode[0] not in 'rwa':
        raise ValueError("mode string must be one of 'r', 'w', or 'a', "
                         "not {0}".format(repr(mode)))

    # The file is always handled in text mode.
    mode = mode.replace('b', '')
    if 't' not in mode:
        mode += 't'
    reading = mode.startswith('r')

    self._format = None
    if format is not None:
        try:
            format = MSAFORMATS[format.lower()]
        except AttributeError:
            raise TypeError('format argument must be a string')
        except KeyError:
            raise ValueError('format argument is not recognized')
        self._format = format

    # Decide whether *msa* names a file: when reading it must be an existing
    # file, otherwise it only needs to have string methods.
    self._filename = filename = None
    if reading:
        try:
            exists = isfile(msa)
        except:
            exists = False
        if exists:
            self._filename = filename = msa
    else:
        try:
            msa.lower, msa.strip
        except AttributeError:
            pass
        else:
            self._filename = filename = msa

    if filename is None:
        # *msa* is treated as an open stream.
        if self._format is None:
            raise ValueError('format must be specified when msa is a '
                             'stream')
        self._stream = msa
        self._title = 'stream'
        try:
            closed = self._stream.closed
        except AttributeError:
            closed = self._stream.myfileobj.closed
        if closed:
            raise ValueError('msa stream must not be closed')
    else:
        self._filename = filename
        title, ext = splitext(split(filename)[1])
        if ext.lower() == '.gz':
            title, ext = splitext(split(title)[1])
        if format is None:
            # Fall back to the file extension to determine the format.
            try:
                self._format = format = MSAEXTMAP[ext.lower()]
            except KeyError:
                raise TypeError('format is not specified and could not be '
                                'determined from file extension')
        self._title = title
        self._stream = openFile(msa, mode)

    self._lenseq = None
    self._closed = False
    self._readline = None
    self._aligned = bool(aligned)

    if reading:
        self._readline = self._stream.readline
        try:
            self._readlines = self._stream.readlines
        except AttributeError:
            pass

        self.setFilter(kwargs.get('filter', None),
                       kwargs.get('filter_full', False))
        self.setSlice(kwargs.get('slice', None))
        self._iter = self._itermap[format](self)
    else:
        try:
            self._write = write = self._stream.write
        except AttributeError:
            raise TypeError('msa must be a filename or a stream with '
                            'write method')
        # A header is written only when a new Stockholm file is started.
        if mode.startswith('w') and format == STOCKHOLM:
            write('# STOCKHOLM 1.0\n')
        if format.startswith('S'):
            self._selex_line = '{0:' + str(LEN_SELEX_LABEL) + 's} {1}\n'

    self._mode = mode
def fetchPfamMSA(acc, alignment="full", compressed=False, **kwargs):
    """Return a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
        ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
        or ``'rp75'`` where rp stands for representative proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or
        ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``

    .. note:: When no keyword argument is given, or *alignment* is
       ``'ncbi'`` or ``'metagenomics'``, the pre-built gzipped Stockholm
       file is downloaded.  Passing *any* keyword argument (including
       *timeout*, *outname* or *folder*) with another alignment type
       switches to the formatted download, whose default format is SELEX.

    :raises ValueError: when *acc* cannot be resolved to a Pfam accession
        or when an option has an unsupported value"""

    # Resolve a Pfam ID (or accession) to the canonical accession code.
    url = "http://pfam.sanger.ac.uk/family/acc?id=" + acc
    orig_acc = acc
    handle = openURL(url)
    try:
        acc = handle.readline().strip()
    finally:
        # The lookup connection used to be left open.
        handle.close()
    if PY3K:
        acc = acc.decode()

    if not re.search(r"(?<=PF)[0-9]{5}$", acc):
        raise ValueError("{0} is not a valid Pfam ID or Accession Code".format(repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError("alignment must be one of full, seed, ncbi or" " metagenomics")

    # url_flag marks the pre-built download, which always arrives gzipped.
    url_flag = alignment in ("ncbi", "metagenomics") or not kwargs
    if url_flag:
        url = "http://pfam.sanger.ac.uk/family/" + acc + "/alignment/" + alignment + "/gzipped"
        extension = ".sth"
    else:
        align_format = kwargs.get("format", "selex").lower()
        if align_format not in FORMAT_OPTIONS["format"]:
            raise ValueError(
                "alignment format must be of type selex" " stockholm or fasta. MSF not supported"
            )
        if align_format == SELEX:
            # Pfam calls the SELEX format "pfam" in its query string.
            align_format, extension = "pfam", ".slx"
        elif align_format == FASTA:
            extension = ".fasta"
        else:
            extension = ".sth"

        gaps = str(kwargs.get("gaps", "dashes")).lower()
        if gaps not in FORMAT_OPTIONS["gaps"]:
            raise ValueError("gaps must be of type mixed, dots, dashes, " "or None")

        inserts = kwargs.get("inserts", "upper").lower()
        if inserts not in FORMAT_OPTIONS["inserts"]:
            raise ValueError("inserts must be of type lower or upper")

        order = kwargs.get("order", "tree").lower()
        if order not in FORMAT_OPTIONS["order"]:
            raise ValueError("order must be of type tree or alphabetical")

        # Only the first letter of *order* and *inserts* is sent.
        url = (
            "http://pfam.sanger.ac.uk/family/" + acc + "/alignment/" + alignment
            + "/format?format=" + align_format
            + "&alnType=" + alignment
            + "&order=" + order[0]
            + "&case=" + inserts[0]
            + "&gaps=" + gaps
            + "&download=1"
        )

    response = openURL(url, timeout=int(kwargs.get("timeout", 60)))
    try:
        data = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    outname = kwargs.get("outname", None)
    if not outname:
        outname = orig_acc
    folder = str(kwargs.get("folder", "."))
    filepath = join(makePath(folder), outname + "_" + alignment + extension)

    if compressed:
        filepath = filepath + ".gz"
        # Pre-built downloads are already gzipped and are written verbatim;
        # formatted downloads go through openFile, as in the original code.
        if url_flag:
            f_out = open(filepath, "wb")
        else:
            f_out = openFile(filepath, "wb")
        try:
            f_out.write(data)
        finally:
            f_out.close()
    elif url_flag:
        gunzip(data, filepath)
    else:
        with open(filepath, "wb") as f_out:
            f_out.write(data)

    filepath = relpath(filepath)
    LOGGER.info("Pfam MSA for {0} is written as {1}.".format(orig_acc, filepath))
    return filepath
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    For sequence queries each match maps ``'locations'`` to a single
    dictionary built from the first child of the hit; for identifier
    queries ``'locations'`` is a list of dictionaries.  **None** is returned
    when the server answers an identifier query with its system error page.

    :raises ValueError: when the sequence is not alphabetic or the results
        XML cannot be parsed
    :raises IOError: when no results are retrieved before *timeout*"""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except Exception:
            # Not readable as an MSA: treat the file content as a bare
            # sequence with whitespace removed.
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    # cElementTree was removed in Python 3.9; fall back to ElementTree.
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))

    if len(seq) >= MINSEQLEN:
        # Sequence query: submit to the HMMER web service and return directly.
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")
        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        # POST data must be bytes on Python 3; the url-encoded string is
        # plain ASCII so encoding is a no-op on Python 2.
        enc_params = urllib.urlencode(parameters).encode("utf-8")
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)
        submitted = urllib2.urlopen(request)
        try:
            url = submitted.geturl() + "?output=xml"
        finally:
            submitted.close()
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        handle = openURL(url, timeout=timeout)
        try:
            xml_data = handle.read()
        finally:
            handle.close()

        try:
            root = ET.XML(xml_data)
        except Exception:
            raise ValueError("failed to parse results XML, check URL: " + url)

        matches = {}
        for child in root[0]:
            if child.tag != "hits":
                continue
            accession = child.get("acc")
            pfam_id = accession.split(".")[0]
            domain = child[0]
            matches[pfam_id] = {
                "accession": accession,
                "class": "Domain",
                "id": child.get("name"),
                "locations": {
                    "ali_end": domain.get("alisqto"),
                    "ali_start": domain.get("alisqfrom"),
                    "bitscore": domain.get("bitscore"),
                    "end": domain.get("alisqto"),
                    "evalue": child.get("evalue"),
                    "evidence": "hmmer v3.0",
                    "hmm_end": domain.get("alihmmto"),
                    "hmm_start": domain.get("alihmmfrom"),
                    "significant": domain.get("significant"),
                    "start": domain.get("alisqfrom"),
                },
                "type": "Pfam-A",
            }
        return matches

    # Identifier query: short inputs are tried as a PDB identifier (with an
    # optional chain) and mapped to a UniProt ID code when possible.
    lookup = seq
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader

        try:
            polymers = parsePDBHeader(seq[:4], "polymers")
        except Exception as err:
            LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != "UniProt":
                        continue
                    idcode = dbref.idcode
                    LOGGER.info(
                        "UniProt ID code {0} for {1} chain "
                        "{2} will be used.".format(idcode, seq[:4], poly.chid)
                    )
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn("A UniProt ID code for PDB {0} could not be " "parsed.".format(repr(seq)))
        else:
            lookup = idcode
    url = "http://pfam.xfam.org/protein/" + lookup + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml_data = None
    # Keep retrying until a non-empty response arrives or time runs out.
    while LOGGER.timing("_pfam") < timeout:
        try:
            handle = openURL(url, timeout=timeout)
            try:
                xml_data = handle.read()
            finally:
                handle.close()
        except Exception:
            pass
        else:
            if xml_data:
                break

    if not xml_data:
        raise IOError(
            "Pfam search timed out or failed to parse results " "XML, check URL: " + url
        )
    LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    # ``in`` also catches the message at offset 0, which ``find(...) > 0``
    # silently missed.
    if b"There was a system error on your last request." in xml_data:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml_data)
    except Exception:
        raise ValueError("failed to parse results XML, check URL: " + url)

    # Sequence queries returned above, so only the identifier layout of the
    # results XML has to be handled here.
    results = dictElement(root[0], prefix)
    try:
        xml_matches = results["matches"]
    except KeyError:
        raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession" " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = "Query " + repr(query)
    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
def writeNMD(filename, modes, atoms):
    """Writes an NMD file for given *modes* and includes applicable data from
    *atoms*.  Returns *filename*, if file is successfully written.  NMD file
    format is described at :ref:`nmd-format`.

    .. note::
       #. This function skips modes with zero eigenvalues.
       #. If a :class:`~.Vector` instance is given, it will be normalized
          before it is written. It's length before normalization will be
          written as the scaling factor of the vector.

    :raises TypeError: when *modes* is not an NMA, ModeSet, Mode, or Vector
    :raises ValueError: when the number of atoms does not match or when
        coordinates cannot be retrieved from *atoms*"""

    if not isinstance(modes, (NMA, ModeSet, Mode, Vector)):
        # Formatting a type object with ``{0:s}`` raises its own TypeError on
        # Python 3 and hides this message, so the type name is used instead.
        raise TypeError('modes must be NMA, ModeSet, Mode, or Vector, '
                        'not {0}'.format(type(modes).__name__))
    if modes.numAtoms() != atoms.numAtoms():
        raise ValueError('number of atoms do not match')

    # Validate coordinates before touching the file, so that a failure does
    # not leave a truncated file or an open handle behind.
    try:
        coords = atoms.getCoords()
    except Exception:
        raise ValueError('coordinates could not be retrieved from atoms')
    if coords is None:
        raise ValueError('atom coordinates are not set')
    if coords.dtype != float:
        coords = coords.astype(float)

    # Pick a short alphanumeric title: modes title, then str(atoms), then
    # the file name without extension.
    name = modes.getTitle()
    name = name.replace(' ', '_').replace('.', '_')
    if not name.replace('_', '').isalnum() or len(name) > 30:
        name = str(atoms)
        name = name.replace(' ', '_').replace('.', '_')
    if not name.replace('_', '').isalnum() or len(name) > 30:
        name = os.path.splitext(os.path.split(filename)[1])[0]

    # Optional per-atom records: (label, getter on *atoms*, converter that
    # turns the returned data into an iterable of strings).
    optional_fields = (
        ('atomnames', 'getNames', lambda data: data),
        ('resnames', 'getResnames', lambda data: data),
        # str() replaces ``astype('|S5')``, which yields bytes on Python 3;
        # joining those failed and the line was silently dropped.
        ('resids', 'getResnums', lambda data: [str(x) for x in data]),
        ('chainids', 'getChids', lambda data: data),
        ('bfactors', 'getBetas',
         lambda data: ['{0:.3f}'.format(x) for x in data.flatten()]),
    )

    count = 0
    out = openFile(filename, 'w')
    try:
        out.write('nmwiz_load {0:s}\n'.format(os.path.abspath(filename)))
        out.write('name {0:s}\n'.format(name))

        for label, getter, as_text in optional_fields:
            try:
                data = getattr(atoms, getter)()
                if data is not None:
                    out.write('{0} {1}\n'.format(label,
                                                 ' '.join(as_text(data))))
            except Exception:
                # Best effort: a record that is unavailable is left out.
                pass

        out.write('coordinates {0:s}\n'.format(
            ' '.join(['{0:.3f}'.format(x) for x in coords.flatten()])))

        if isinstance(modes, Vector):
            # abs(modes) is written as the scaling factor of the vector.
            out.write('mode 1 {0:.2f} {1:s}\n'.format(abs(modes), ' '.join(
                ['{0:.3f}'.format(x)
                 for x in modes.getNormed()._getArray()])))
            count += 1
        else:
            if isinstance(modes, Mode):
                modes = [modes]
            for mode in modes:
                if mode.getEigval() < ZERO:
                    continue
                out.write('mode {0:d} {1:.2f} {2:s}\n'.format(
                    mode.getIndex() + 1, mode.getVariance() ** 0.5,
                    ' '.join(['{0:.3f}'.format(x)
                              for x in mode._getArray()])))
                count += 1
    finally:
        out.close()

    if count == 0:
        LOGGER.warning('No normal mode data was written. '
                       'Given modes might have 0 eigenvalues.')
    return filename