def backupFile(filename, backup=None, backup_ext='.BAK', **kwargs):
    """Rename *filename* with *backup_ext* appended to its name for backup
    purposes, if *backup* is **True** or if automatic backups are turned on
    using :func:`.confProDy`.  Default extension :file:`.BAK` is used when
    one is not set using :func:`.confProDy`.  If *filename* does not exist,
    no action will be taken and *filename* will be returned.  If the file is
    successfully renamed, the new filename will be returned."""

    try:
        exists = isfile(filename)
    except Exception as err:
        raise TypeError('filename must be a string ({0})'.format(str(err)))

    from prody import SETTINGS
    if exists and (backup or SETTINGS.get('backup', False)):
        if backup_ext == '.BAK':
            backup_ext = SETTINGS.get('backup_ext', '.BAK')
        bak = filename + backup_ext
        if isfile(bak):
            try:
                os.remove(bak)
            except Exception:
                pass
        try:
            os.rename(filename, bak)
        except Exception:
            pass
        return bak
    else:
        return filename
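
# Usage sketch for backupFile (the file name below is hypothetical, not part
# of the original module): with backup=True an existing 'results.pdb' would
# be renamed to 'results.pdb.BAK' and the new name returned, while a missing
# file is returned unchanged.
def _demo_backupFile():
    renamed = backupFile('results.pdb', backup=True)  # hypothetical file
    print(renamed)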
def pathPDBMirror(path=None, format=None):
    """Returns or specifies the PDB mirror path to be used by
    :func:`.fetchPDB`.  To release the current mirror, pass an invalid path,
    e.g. ``path=''``.  If you are keeping a partial mirror, such as PDB
    files in the :file:`/data/structures/divided/pdb/` folder, specify
    *format*, which is ``'pdb'`` in this case."""

    if path is None:
        path = SETTINGS.get('pdb_mirror_path')
        format = SETTINGS.get('pdb_mirror_format', None)
        if path:
            if isdir(path):
                if format is None:
                    return path
                else:
                    return path, format
            else:
                LOGGER.warning('PDB mirror path {0} is not accessible.'
                               .format(repr(path)))
    else:
        if isdir(path):
            path = abspath(path)
            LOGGER.info('Local PDB mirror path is set: {0}'
                        .format(repr(path)))
            SETTINGS['pdb_mirror_path'] = path
            SETTINGS['pdb_mirror_format'] = format
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_mirror_path')
            if current:
                LOGGER.info('PDB mirror {0} is released.'
                            .format(repr(current)))
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'.format(repr(path)))
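
# Usage sketch for pathPDBMirror (hypothetical mirror location): setting a
# partial mirror of the wwPDB divided folder, querying it, and releasing it.
def _demo_pathPDBMirror():
    pathPDBMirror('/data/structures/divided/pdb/', format='pdb')  # set
    print(pathPDBMirror())  # query: returns (path, 'pdb')
    pathPDBMirror('')       # release the current mirror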
def getPDBLocalFolder():
    """Return the path to a local PDB folder and folder structure specifier.
    If a local folder is not set, **None** will be returned."""

    folder = SETTINGS.get('pdb_local_folder')
    if folder:
        if isdir(folder):
            return folder, SETTINGS.get('pdb_local_divided', True)
        else:
            LOGGER.warning('PDB local folder {0:s} is not accessible.'
                           .format(repr(folder)))
def wwPDBServer(*key):
    """Set/get `wwPDB`_ FTP/HTTP server location used for downloading PDB
    structures.  Use one of the following keywords for setting a server:

    +---------------------------+-----------------------------+
    | wwPDB FTP server          | *Key* (case insensitive)    |
    +===========================+=============================+
    | RCSB PDB (USA) (default)  | RCSB, USA, US               |
    +---------------------------+-----------------------------+
    | PDBe (Europe)             | PDBe, Europe, Euro, EU      |
    +---------------------------+-----------------------------+
    | PDBj (Japan)              | PDBj, Japan, Jp             |
    +---------------------------+-----------------------------+

    .. _wwPDB: http://www.wwpdb.org/"""

    if not key:
        return SETTINGS.get('wwpdb', None)
    elif len(key) == 1:
        try:
            key = key[0].lower()
        except AttributeError:
            raise TypeError('key must be a string')
        if key in WWPDB_FTP_SERVERS:
            SETTINGS['wwpdb'] = key
            SETTINGS.save()
            LOGGER.info('wwPDB server is set to {}.'
                        .format(WWPDB_FTP_SERVERS[key][0]))
        else:
            raise ValueError('{0} is not a valid wwPDB server identifier'
                             .format(repr(key)))
    else:
        raise TypeError('one wwPDB server identifier is expected, {0} given'
                        .format(len(key)))
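
# Usage sketch for wwPDBServer: keys are case insensitive per the table
# above, so 'eu' selects the European server; calling with no argument
# returns the current setting.
def _demo_wwPDBServer():
    wwPDBServer('eu')     # switch downloads to PDBe
    print(wwPDBServer())  # query the stored key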
def pathRhapsodyFolder(folder=None):
    """Returns or sets path of local folder where files and pickles
    necessary to run Rhapsody will be stored.  To release the current
    folder, pass an invalid path, e.g. ``folder=''``.
    """
    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder')
        if folder:
            if isdir(folder):
                return folder
            else:
                LOGGER.warn('Local folder {} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local Rhapsody folder is set: {}'
                        .format(repr(folder)))
            SETTINGS['rhapsody_local_folder'] = folder
            SETTINGS.save()
        else:
            current = SETTINGS.pop('rhapsody_local_folder')
            if current:
                LOGGER.info('Rhapsody folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.save()
            else:
                raise IOError('{} is not a valid path.'.format(repr(folder)))
def pathVMD(*path):
    """Returns the VMD path, or sets it to a user specified *path*."""

    if not path:
        path = SETTINGS.get('vmd', None)
        if isExecutable(path):
            return path
        else:
            LOGGER.warning('VMD path is not set by user, looking for it.')
            vmdbin = None
            vmddir = None
            if PLATFORM == 'Windows':
                if PY3K:
                    import winreg
                else:
                    import _winreg as winreg  # PY3K: OK
                for vmdversion in ('1.8.7', '1.9', '1.9.1'):
                    try:
                        key = winreg.OpenKey(
                            winreg.HKEY_LOCAL_MACHINE,
                            'Software\\University of Illinois\\VMD\\' +
                            vmdversion)
                        vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                        vmdbin = join(vmddir, 'vmd.exe')
                    except:
                        pass
                    try:
                        key = winreg.OpenKey(
                            winreg.HKEY_LOCAL_MACHINE,
                            'Software\\WOW6432node\\University of Illinois'
                            '\\VMD\\' + vmdversion)
                        vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                        vmdbin = join(vmddir, 'vmd.exe')
                    except:
                        pass
            else:
                vmdbin = which('vmd')
            if isExecutable(vmdbin):
                setVMDpath(vmdbin)
                return vmdbin
    elif len(path) == 1:
        path = path[0]
        if isExecutable(path):
            SETTINGS['vmd'] = path
            SETTINGS.save()
            LOGGER.info("VMD path is set to '{0}'.".format(path))
        else:
            raise OSError('{0} is not executable.'.format(str(path)))
    else:
        raise ValueError('specify a single path string')
def pathEVmutationFolder(folder=None):
    """Returns or sets path of local folder where EVmutation data are
    stored.  To release the current folder, pass an invalid path, e.g.
    ``folder=''``.
    """
    if folder is None:
        folder = SETTINGS.get('EVmutation_local_folder')
        if folder:
            if isdir(folder):
                return folder
            else:
                LOGGER.warn('Local folder {} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local EVmutation folder is set: {}'
                        .format(repr(folder)))
            SETTINGS['EVmutation_local_folder'] = folder
            SETTINGS.save()
        else:
            current = SETTINGS.pop('EVmutation_local_folder')
            if current:
                LOGGER.info('EVmutation folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.save()
            else:
                raise IOError('{} is not a valid path.'.format(repr(folder)))
def changeDefinitions(**kwargs):
    defs = SETTINGS.get(DEFINITIONS_KEY, {})
    defs.update(kwargs)
    SETTINGS[DEFINITIONS_KEY] = defs
    SETTINGS[TIMESTAMP_KEY] = int(time())
    SETTINGS.save()
    updateDefinitions()
def pathPDBFolder(folder=None, divided=False):
    """Returns or specifies the local PDB folder for storing PDB files
    downloaded from `wwPDB <http://www.wwpdb.org/>`_ servers.  Files stored
    in this folder can be accessed via :func:`.fetchPDB` from any working
    directory.  To release the current folder, pass an invalid path, e.g.
    ``folder=''``.

    If *divided* is **True**, the divided folder structure of wwPDB servers
    will be assumed when reading from and writing to the local folder.  For
    example, a structure with identifier **1XYZ** will be present as
    :file:`pdblocalfolder/yz/pdb1xyz.pdb.gz`.

    If *divided* is **False**, a plain folder structure will be expected and
    adopted when saving files.  For example, the same structure will be
    present as :file:`pdblocalfolder/1xyz.pdb.gz`.

    Finally, in either case, lower case letters will be used and compressed
    files will be stored."""

    if folder is None:
        folder = SETTINGS.get('pdb_local_folder')
        if folder:
            if isdir(folder):
                return folder, SETTINGS.get('pdb_local_divided', True)
            else:
                LOGGER.warn('PDB local folder {0} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local PDB folder is set: {0}'.format(repr(folder)))
            if divided:
                LOGGER.info('wwPDB divided folder structure will be '
                            'assumed.')
            else:
                LOGGER.info('A plain folder structure will be assumed.')
            SETTINGS['pdb_local_folder'] = folder
            SETTINGS['pdb_local_divided'] = bool(divided)
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_local_folder')
            if current:
                LOGGER.info('PDB folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.pop('pdb_local_divided')
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'
                              .format(repr(folder)))
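
# Usage sketch for pathPDBFolder (hypothetical folder, which must already
# exist): with the default divided=False, a fetched structure 1XYZ would be
# stored as '<folder>/1xyz.pdb.gz'; pass divided=True to mimic the wwPDB
# divided layout instead.
def _demo_pathPDBFolder():
    pathPDBFolder('/home/user/pdbfiles')  # hypothetical existing folder
    folder, divided = pathPDBFolder()     # query the current setting
    pathPDBFolder('')                     # release the folder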
def getVMDpath():
    """Return VMD path set by user or one identified automatically."""

    path = SETTINGS.get("vmd", None)
    if isExecutable(path):
        return path
    else:
        LOGGER.warning("VMD path is not set by user, looking for it.")
        vmdbin = None
        vmddir = None
        if PLATFORM == "Windows":
            import _winreg  # Python 2 only; see pathVMD for the PY3K variant
            for vmdversion in ("1.8.7", "1.9", "1.9.1"):
                try:
                    key = _winreg.OpenKey(
                        _winreg.HKEY_LOCAL_MACHINE,
                        "Software\\University of Illinois\\VMD\\" +
                        vmdversion)
                    vmddir = _winreg.QueryValueEx(key, "VMDDIR")[0]
                    vmdbin = os.path.join(vmddir, "vmd.exe")
                except:
                    pass
                try:
                    key = _winreg.OpenKey(
                        _winreg.HKEY_LOCAL_MACHINE,
                        "Software\\WOW6432node\\University of Illinois"
                        "\\VMD\\" + vmdversion)
                    vmddir = _winreg.QueryValueEx(key, "VMDDIR")[0]
                    vmdbin = os.path.join(vmddir, "vmd.exe")
                except:
                    pass
        else:
            vmdbin = which("vmd")
        if isExecutable(vmdbin):
            setVMDpath(vmdbin)
            return vmdbin
def getPDBMirrorPath():
    """Return the path to a local PDB mirror, or **None** if a mirror path
    is not set."""

    path = SETTINGS.get('pdb_mirror_path')
    if path:
        if isdir(path):
            return path
        else:
            LOGGER.warning('PDB mirror path {0:s} is not accessible.'
                           .format(repr(path)))
def recoverPickle(self, filename=None, folder=None, days=30, **kwargs):
    acc = self.uniq_acc
    if acc is None:
        # assume acc is equal to uniq_acc
        acc = self.acc
    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder')
        if folder is None:
            folder = '.'
        else:
            folder = os.path.join(folder, 'pickles')
    if filename is None:
        # assume acc is equal to uniq_acc
        acc = self.acc
        filename = 'UniprotMap-' + acc + '.pkl'
        pickle_path = os.path.join(folder, filename)
        if not os.path.isfile(pickle_path):
            # import unique accession number
            acc = pd.queryUniprot(self.acc)['accession 0']
            filename = 'UniprotMap-' + acc + '.pkl'
            pickle_path = os.path.join(folder, filename)
    else:
        pickle_path = os.path.join(folder, filename)
    # check if pickle exists
    if not os.path.isfile(pickle_path):
        raise IOError("File '{}' not found".format(filename))
    # load pickle
    recovered_self = pickle.load(open(pickle_path, "rb"))
    if acc not in [recovered_self.acc, recovered_self.uniq_acc]:
        raise ValueError('Accession number in recovered pickle ({}) does '
                         'not match.'.format(recovered_self.uniq_acc))
    # check timestamp and ignore pickles that are too old
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    t_old = datetime.datetime.strptime(recovered_self.timestamp, date_format)
    t_now = datetime.datetime.utcnow()
    Delta_t = datetime.timedelta(days=days)
    if t_old + Delta_t < t_now:
        raise RuntimeError(
            'Pickle {} was too old and was ignored.'.format(filename))
    self.fullRecord = recovered_self.fullRecord
    self.uniq_acc = recovered_self.uniq_acc
    self.sequence = recovered_self.sequence
    self.PDBrecords = recovered_self.PDBrecords
    self.PDBmappings = recovered_self.PDBmappings
    self.customPDBmappings = recovered_self.customPDBmappings
    self._align_algo_args = recovered_self._align_algo_args
    self._align_algo_kwargs = recovered_self._align_algo_kwargs
    self.timestamp = recovered_self.timestamp
    self.Pfam = recovered_self.Pfam
    LOGGER.info("Pickle '{}' recovered.".format(filename))
    return
def getNonstdProperties(resname):
    """Return properties of non-standard amino acid *resname*.

    >>> getNonstdProperties('PTR')
    ['acidic', 'aromatic', 'cyclic', 'large', 'polar', 'surface']"""

    try:
        alist = list(SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)[resname])
    except KeyError:
        raise ValueError('{0:s} is not a non-standard residue name'
                         .format(repr(resname)))
    else:
        alist.sort()
        return alist
def listNonstdAAProps(resname):
    """Returns properties of non-standard amino acid *resname*.

    .. ipython:: python

       listNonstdAAProps('PTR')"""

    try:
        alist = list(SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)[resname])
    except KeyError:
        raise ValueError('{0} is not a non-standard residue name'
                         .format(repr(resname)))
    else:
        alist.sort()
        return alist
def savePickle(self, filename=None, folder=None, store_custom_PDBs=False):
    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder', '.')
    if filename is None:
        filename = 'UniprotMap-' + self.uniq_acc + '.pkl'
    pickle_path = os.path.join(folder, filename)
    cache = self.customPDBmappings
    if store_custom_PDBs is not True:
        # do not store alignments of custom PDBs
        self.customPDBmappings = []
    # save pickle
    pickle.dump(self, open(pickle_path, "wb"))
    self.customPDBmappings = cache
    LOGGER.info("Pickle '{}' saved.".format(filename))
    return pickle_path
def savePickle(self, folder=None, filename=None):
    """Stores a pickle of the current class instance.  The pickle will
    contain all information and precomputed features, but not GNM and ANM
    models.  In case a PDBID is missing, the parsed PDB :class:`AtomGroup`
    is stored as well.

    :arg folder: path of the folder where the pickle will be saved.  If not
        specified, the local Rhapsody installation folder will be used.
    :type folder: str
    :arg filename: name of the pickle.  By default, the pickle will be
        saved as ``'PDBfeatures-[PDBID].pkl'``.  If a PDBID is not defined,
        the user must provide a filename.
    :type filename: str
    :return: pickle path
    :rtype: str
    """
    if folder is None:
        # define folder where to look for pickles
        folder = SETTINGS.get('rhapsody_local_folder')
        if folder is None:
            folder = '.'
        else:
            folder = os.path.join(folder, 'pickles')
    if filename is None:
        # use the default filename, if possible
        if self.PDBID is None:
            # when a custom structure is used, there is no
            # default filename: the user should provide it
            raise ValueError('Please provide a filename.')
        filename = 'PDBfeatures-' + self.PDBID + '.pkl'
    pickle_path = os.path.join(folder, filename)
    # do not store GNM and ANM instances.
    # If a valid PDBID is present, do not store the parsed PDB
    # as well, since it can be easily fetched again
    cache = (self._pdb, self._gnm, self._anm)
    if self.PDBID is not None:
        self._pdb = None
    self._gnm = {}
    self._anm = {}
    for env in ['chain', 'reduced', 'sliced']:
        self._gnm[env] = {chID: None for chID in self.chids}
        self._anm[env] = {chID: None for chID in self.chids}
    # write pickle
    pickle.dump(self, open(pickle_path, "wb"))
    # restore temporarily cached data
    self._pdb, self._gnm, self._anm = cache
    LOGGER.info("Pickle '{}' saved.".format(filename))
    return pickle_path
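
# Usage sketch for the savePickle/recoverPickle round trip (`feats` is a
# hypothetical instance of the class defining these methods, with a valid
# PDBID): the pickle lands in the 'pickles' subfolder of the configured
# Rhapsody local folder, or in the current directory as a fallback.
def _demo_savePickle(feats):
    path = feats.savePickle()     # writes 'PDBfeatures-<PDBID>.pkl'
    feats.recoverPickle(days=30)  # reload it later, if recent enough
    return path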
def getWWPDBFTPServer():
    """Return a tuple containing name, host, and path of the currently
    set `wwPDB <http://www.wwpdb.org/>`_ FTP server."""

    server = SETTINGS.get('wwpdb_ftp', None)
    if server is None:
        LOGGER.warning('A wwPDB FTP server is not set, default FTP server '
                       'RCSB PDB is used. Use `setWWPDBFTPServer` function '
                       'to set a server close to your location.')
        return _WWPDB_RCSB
    else:
        if server[2].endswith('data/structures/divided/pdb/'):
            return (server[0], server[1],
                    server[2][:-len('data/structures/divided/pdb/')])
        else:
            return server
def delNonstdAminoacid(resname):
    """Delete non-standard amino acid *resname*.

    .. ipython:: python

       delNonstdAminoacid('PTR')
       flagDefinition('nonstdaa')

    Default set of non-standard amino acids can be restored as follows:

    .. ipython:: python

       flagDefinition(reset='nonstdaa')"""

    nonstd = SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)
    try:
        nonstd.pop(resname)
    except KeyError:
        raise ValueError('{0} is not a non-standard residue name'
                         .format(repr(resname)))
    else:
        updateNonstandard(nonstd)
def addNonstdAminoacid(resname, *properties):
    """Add non-standard amino acid *resname* with *properties* selected
    from:

     * {props}

    .. ipython:: python

       addNonstdAminoacid('PTR', 'acidic', 'aromatic', 'cyclic', 'large',
           'polar', 'surface')

    Default set of non-standard amino acids can be restored as follows:

    .. ipython:: python

       flagDefinition(reset='nonstdaa')"""

    resname = str(resname)
    if len(resname) > 4:
        LOGGER.warn('Residue name {0} is unusually long.'
                    .format(repr(resname)))
    propset = set(properties)
    for cat, val in CATEGORIES.items():
        intersection = val.intersection(propset)
        if intersection:
            if len(intersection) > 1:
                raise ValueError('amino acid properties {0} cannot be '
                                 'present together'
                                 .format(', '.join([repr(prp) for prp in
                                                    intersection])))
            for prop in intersection:
                propset.remove(prop)
    if propset:
        raise ValueError('amino acid property {0} is not valid'
                         .format(repr(propset.pop())))
    nonstd = SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)
    nonstd[resname] = set(properties)
    updateNonstandard(nonstd)
def parsePDBStream(stream, **kwargs):
    """Returns an :class:`.AtomGroup` and/or dictionary containing header
    data parsed from a stream of PDB lines.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)"""

    model = kwargs.get('model')
    header = kwargs.get('header', False)
    assert isinstance(header, bool), 'header must be a boolean'
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    altloc = kwargs.get('altloc', 'A')
    if model is not None:
        if isinstance(model, Integral):
            if model < 0:
                raise ValueError('model must be a non-negative integer')
        else:
            raise TypeError('model must be an integer, {0} is invalid'
                            .format(str(model)))
    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = chain + title_suffix
    ag = kwargs.pop('ag', None)
    if ag is not None:
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    elif model != 0:
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0

    biomol = kwargs.get('biomol', False)
    auto_secondary = None
    secondary = kwargs.get('secondary')
    if not secondary:
        auto_secondary = SETTINGS.get('auto_secondary')
        secondary = auto_secondary
    split = 0
    hd = None
    if model != 0:
        LOGGER.timeit()
        try:
            lines = stream.readlines()
        except AttributeError as err:
            try:
                lines = stream.read().split('\n')
            except AttributeError:
                raise err
        if not len(lines):
            raise ValueError('empty PDB file or stream')
        if header or biomol or secondary:
            hd, split = getHeaderDict(lines)
        _parsePDBLines(ag, lines, split, model, chain, subset, altloc)
        if ag.numAtoms() > 0:
            LOGGER.report('{0} atoms and {1} coordinate set(s) were '
                          'parsed in %.2fs.'.format(
                              ag.numAtoms(), ag.numCoordsets() - n_csets))
        else:
            ag = None
            LOGGER.warn('Atomic data could not be parsed, please '
                        'check the input file.')
    elif header:
        hd, split = getHeaderDict(stream)
    if ag is not None and isinstance(hd, dict):
        if secondary:
            if auto_secondary:
                try:
                    ag = assignSecstr(hd, ag)
                except ValueError:
                    pass
            else:
                ag = assignSecstr(hd, ag)
        if biomol:
            ag = buildBiomolecules(hd, ag)
            if isinstance(ag, list):
                LOGGER.info('Biomolecular transformations were applied, '
                            '{0} biomolecule(s) are returned.'
                            .format(len(ag)))
            else:
                LOGGER.info('Biomolecular transformations were applied to '
                            'the coordinate data.')
    if model != 0:
        if header:
            return ag, hd
        else:
            return ag
    else:
        return hd
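
# Usage sketch for parsePDBStream, assuming a PDB-formatted text buffer:
# any object with a readlines() method (an open file handle, a StringIO,
# stdin) is accepted, and header=True additionally returns the header
# dictionary.  The file name below is hypothetical.
def _demo_parsePDBStream():
    from io import StringIO
    pdb_text = open('1xyz.pdb').read()  # hypothetical local PDB file
    ag, header = parsePDBStream(StringIO(pdb_text), header=True)
    return ag, header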
def calcEVmutPathClasses(EVmut_score):
    # negative EVmutation scores indicate deleterious effects, so the
    # stored (positive) optimal cutoff is negated before comparison
    c = -SETTINGS.get('EVmutation_metrics')['optimal cutoff']
    EVmut_class = np.where(EVmut_score < c, 'deleterious', 'neutral')
    EVmut_class[np.isnan(EVmut_score)] = '?'
    return EVmut_class
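
# Usage sketch for calcEVmutPathClasses: scores below the negated optimal
# cutoff are labeled 'deleterious', NaNs (missing data) become '?'.  The
# score values here are made up for illustration.
def _demo_calcEVmutPathClasses():
    scores = np.array([-7.2, -0.3, np.nan])  # hypothetical EVmutation scores
    print(calcEVmutPathClasses(scores))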
def print_sat_mutagen_figure(filename, rhapsody_obj, res_interval=None,
                             PolyPhen2=True, EVmutation=True,
                             extra_plot=None, fig_height=8, fig_width=None,
                             dpi=300, min_interval_size=15, html=False,
                             main_clsf='main', aux_clsf='aux.'):
    # check inputs
    assert isinstance(filename, str), 'filename must be a string'
    assert isinstance(rhapsody_obj, Rhapsody), 'not a Rhapsody object'
    assert rhapsody_obj._isColSet('main score'), 'predictions not found'
    assert rhapsody_obj._isSaturationMutagenesis(), 'unable to create figure'
    if res_interval is not None:
        assert isinstance(res_interval, tuple) and len(res_interval) == 2, \
            'res_interval must be a tuple of 2 values'
        assert res_interval[1] >= res_interval[0], 'invalid res_interval'
    if extra_plot is not None:
        assert len(extra_plot) == rhapsody_obj.numSAVs, \
            'length of additional predictions array is incorrect'
    assert isinstance(fig_height, (int, float))
    assert isinstance(dpi, int)

    matplotlib = _try_import_matplotlib()
    if matplotlib is None:
        return

    # delete extension from filename
    filename = os.path.splitext(filename)[0]

    # make sure that all variants belong to the same Uniprot sequence
    accs = [s.split()[0] for s in rhapsody_obj.data['SAV coords']]
    if len(set(accs)) != 1:
        m = 'Only variants from a single Uniprot sequence can be accepted'
        raise ValueError(m)

    # select an appropriate interval, based on available predictions
    seq_pos = [int(s.split()[1]) for s in rhapsody_obj.data['SAV coords']]
    res_min = np.min(seq_pos)
    res_max = np.max(seq_pos)
    upper_lim = res_max + min_interval_size

    # create empty (20 x num_res) mutagenesis tables
    table_best = np.zeros((20, upper_lim), dtype=float)
    table_best[:] = 'nan'
    table_main = table_best.copy()
    if extra_plot is not None:
        table_other = table_best.copy()
    if PolyPhen2:
        table_PP2 = table_best.copy()
    if EVmutation:
        table_EVmut = table_best.copy()

    # import pathogenicity probabilities from Rhapsody object
    p_best = rhapsody_obj.getPredictions(classifier='best')['path. prob.']
    p_main = rhapsody_obj.data['main path. prob.']
    if PolyPhen2:
        rhapsody_obj._calcPolyPhen2Predictions()
        p_PP2 = rhapsody_obj.data['PolyPhen-2 score']
    if EVmutation:
        rhapsody_obj._calcEVmutationPredictions()
        EVmut_score = np.array(rhapsody_obj.data['EVmutation score'])
        EVmut_cutoff = SETTINGS.get('EVmutation_metrics')['optimal cutoff']
        p_EVmut = -EVmut_score / EVmut_cutoff * 0.5

    # fill tables with predicted probability
    #  1:    deleterious
    #  0:    neutral
    # 'nan': no prediction/wt
    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
    aa_map = {aa: i for i, aa in enumerate(aa_list)}
    for i, SAV in enumerate(rhapsody_obj.data['SAV coords']):
        aa_mut = SAV.split()[3]
        index = int(SAV.split()[1]) - 1
        table_best[aa_map[aa_mut], index] = p_best[i]
        table_main[aa_map[aa_mut], index] = p_main[i]
        if extra_plot is not None:
            table_other[aa_map[aa_mut], index] = extra_plot[i]
        if PolyPhen2:
            table_PP2[aa_map[aa_mut], index] = p_PP2[i]
        if EVmutation:
            table_EVmut[aa_map[aa_mut], index] = p_EVmut[i]

    # compute average pathogenicity profiles
    # NB: I expect to see RuntimeWarnings in this block
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        avg_p_best = np.nanmean(table_best, axis=0)
        avg_p_main = np.nanmean(table_main, axis=0)
        min_p = np.nanmin(table_best, axis=0)
        max_p = np.nanmax(table_best, axis=0)
        if extra_plot is not None:
            avg_p_other = np.nanmean(table_other, axis=0)
        if PolyPhen2:
            avg_p_PP2 = np.nanmean(table_PP2, axis=0)
        if EVmutation:
            avg_p_EVmut = np.nanmean(table_EVmut, axis=0)

    # use upper strip for showing additional info, such as PDB lengths
    upper_strip = np.zeros((1, upper_lim))
    upper_strip[:] = 'nan'
    PDB_sizes = np.zeros(upper_lim, dtype=int)
    PDB_coords = [''] * upper_lim
    for s in rhapsody_obj.data:
        index = int(s['SAV coords'].split()[1]) - 1
        if s['PDB size'] != 0:
            PDB_length = int(s['PDB size'])
            PDBID_chain = ':'.join(s['PDB SAV coords'][0].split()[:2])
            upper_strip[0, index] = PDB_length
            PDB_sizes[index] = PDB_length
            PDB_coords[index] = PDBID_chain
    max_PDB_size = max(PDB_sizes)
    if max_PDB_size != 0:
        upper_strip[0, :] /= max_PDB_size

    # PLOT FIGURE
    from matplotlib import pyplot as plt
    from matplotlib import gridspec as gridspec

    # portion of the sequence to display
    if res_interval is None:
        res_interval = (res_min, res_max)
    # adjust interval
    res_i, res_f = _adjust_res_interval(res_interval, upper_lim,
                                        min_interval_size)
    nres_shown = res_f - res_i + 1

    # figure proportions
    if fig_width is None:
        fig_width = fig_height / 2  # inches
        fig_width *= nres_shown / 20
    fig, ax = plt.subplots(3, 2, figsize=(fig_width, fig_height))
    wspace = 0.5  # inches
    plt.subplots_adjust(wspace=wspace / fig_width, hspace=0.15)

    # figure structure
    gs = gridspec.GridSpec(3, 2, width_ratios=[nres_shown, 1],
                           height_ratios=[1, 20, 10])
    ax0 = plt.subplot(gs[0, 0])   # secondary structure strip
    ax1 = plt.subplot(gs[1, 0])   # mutagenesis table
    axcb = plt.subplot(gs[1, 1])  # colorbar
    ax2 = plt.subplot(gs[2, 0])   # average profile

    # padding for tick labels
    pad = 0.2 / fig_width

    # top strip
    matplotlib.cm.YlGn.set_bad(color='antiquewhite')
    ax0.imshow(upper_strip[0:1, res_i - 1:res_f], aspect='auto',
               cmap='YlGn', vmin=0, vmax=1)
    ax0.set_ylim((-0.45, .45))
    ax0.set_yticks([])
    ax0.set_ylabel(f'PDB size \n[0-{max_PDB_size} res] ', fontsize=14,
                   ha='right', va='center', rotation=0)
    ax0.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax0.set_xticklabels([])
    # add white grid
    ax0.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax0.tick_params(axis='both', which='minor', length=0)
    ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # mutagenesis table (heatmap)
    matplotlib.cm.coolwarm.set_bad(color='#F9E79F')
    im = ax1.imshow(table_best[:, res_i - 1:res_f], aspect='auto',
                    cmap='coolwarm', vmin=0, vmax=1)
    axcb.figure.colorbar(im, cax=axcb)
    ax1.set_yticks(np.arange(len(aa_list)))
    ax1.set_yticklabels(aa_list, ha='center', position=(-pad, 0),
                        fontsize=14)
    ax1.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax1.set_xticklabels([])
    ax1.set_ylabel('pathog. probability', labelpad=10)
    # add white grid
    ax1.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax1.set_yticks(np.arange(-.5, 20, 1), minor=True)
    ax1.tick_params(axis='both', which='minor', length=0)
    ax1.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # average pathogenicity profile
    x_resids = np.arange(1, upper_lim + 1)
    # shading showing range of values
    # NB: a bug in pyplot.fill_between() arises when selecting a region
    # with set_xlim() in a large plot (e.g. > 1000), causing the shaded
    # area to be plotted even though it's outside the selected region.
    # As a workaround, here I slice the plot to fit the selected region.
    sl = slice(max(0, res_i - 2), min(res_f + 2, upper_lim + 1))
    ax2.fill_between(x_resids[sl], min_p[sl], max_p[sl], alpha=0.5,
                     edgecolor='salmon', facecolor='salmon')
    # plot average profile for other predictions, if available
    if extra_plot is not None:
        ax2.plot(x_resids, avg_p_other, color='gray', lw=1)
    if PolyPhen2:
        ax2.plot(x_resids, avg_p_PP2, color='blue', lw=1)
    if EVmutation:
        ax2.plot(x_resids, avg_p_EVmut, color='green', lw=1)
    # solid line for predictions obtained with full classifier
    ax2.plot(x_resids, avg_p_main, 'ro-')
    # dotted line for predictions obtained with auxiliary classifier
    ax2.plot(x_resids, avg_p_best, 'ro-', markerfacecolor='none',
             ls='dotted')
    # cutoff line
    ax2.axhline(y=0.5, color='grey', lw=.8, linestyle='dashed')
    ax2.set_xlim((res_i - .5, res_f + .5))
    ax2.set_xlabel('residue number')
    ax2.set_ylim((-0.05, 1.05))
    ax2.set_ylabel('average', rotation=90, labelpad=10)
    ax2.set_yticklabels([])
    ax2r = ax2.twinx()
    ax2r.set_ylim((-0.05, 1.05))
    ax2r.set_yticks([0, .5, 1])
    ax2r.set_yticklabels(['0', '0.5', '1'])
    ax2r.tick_params(axis='both', which='major', pad=15)

    tight_padding = 0.1
    fig.savefig(filename + '.png', format='png', bbox_inches='tight',
                pad_inches=tight_padding, dpi=dpi)
    plt.close()
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Saturation mutagenesis figure saved to {filename}.png')

    # write a map in html format, to make figure clickable
    if html:
        all_axis = {'strip': ax0, 'table': ax1, 'bplot': ax2}

        # precompute some useful quantities for html code
        html_data = {}
        # dpi of printed figure
        html_data["dpi"] = dpi
        # figure size *before* tight
        html_data["fig_size"] = fig.get_size_inches()
        # tight bbox as used by fig.savefig()
        html_data["tight_bbox"] = fig.get_tightbbox(
            fig.canvas.get_renderer())
        # compute new origin and height, based on tight box and padding
        html_data["new_orig"] = html_data["tight_bbox"].min - tight_padding
        html_data["new_height"] = (html_data["tight_bbox"].height
                                   + 2 * tight_padding)

        def get_area_coords(ax, d):
            assert ax_type in ("strip", "table", "bplot")
            # get bbox coordinates (x0, y0, x1, y1)
            bbox = ax.get_position().get_points()
            # get bbox coordinates in inches
            b_inch = bbox * d["fig_size"]
            # adjust bbox coordinates based on tight bbox
            b_adj = b_inch - d["new_orig"]
            # use html reference system (y = 1 - y)
            b_html = b_adj * np.array([1, -1]) + \
                np.array([0, d["new_height"]])
            # convert to pixels
            b_px = (d["dpi"] * b_html).astype(int)
            b_px = np.sort(b_px, axis=0)
            # put in html format
            coords = '{},{},{},{}'.format(*b_px.flatten())
            # output
            return coords

        # html templates
        area_html = Template('<area shape="rect" coords="$coords" '
                             'id="{{map_id}}_$areaid" {{area_attrs}}> \n')

        # write html
        with open(filename + '.html', 'w') as f:
            f.write('<div>\n')
            f.write('<map name="{{map_id}}" id="{{map_id}}" '
                    '{{map_attrs}}>\n')
            for ax_type, ax in all_axis.items():
                fields = {'areaid': ax_type}
                fields['coords'] = get_area_coords(ax, html_data)
                f.write(area_html.substitute(fields))
            f.write('</map>\n')
            f.write('</div>\n')

        # populate info table that will be passed as a javascript variable
        best_preds = rhapsody_obj.getPredictions()
        best_avg_preds = rhapsody_obj.getResAvgPredictions()
        PDB_coords = rhapsody_obj.getPDBcoords()
        abbrev = {'?': '?', 'deleterious': 'del', 'neutral': 'neu',
                  'prob.delet.': 'p.del', 'prob.neutral': 'p.neu'}
        info = {}
        for k in ['strip', 'table', 'bplot']:
            n_cols = 20 if k == 'table' else 1
            info[k] = [[''] * nres_shown for i in range(n_cols)]

        for i, row in enumerate(rhapsody_obj.data):
            SAV = row['SAV coords']
            acc, resid, aa_wt, aa_mut = SAV.split()
            resid = int(resid)
            # consider only residues shown in figure
            if not (res_i <= resid <= res_f):
                continue
            # SAV coordinates
            SAV_code = f'{aa_wt}{resid}{aa_mut}'
            # coordinates on table
            t_i = aa_map[aa_mut]
            t_j = resid - 1
            # coordinates on *shown* table
            ts_i = t_i
            ts_j = resid - res_i
            # compose message for table
            bp = best_preds[i]
            pprob = bp['path. prob.']
            pclass = bp['path. class']
            clsf = main_clsf if row['best classifier'] == 'main' \
                else aux_clsf
            m = f'{SAV_code}: Rhapsody-{clsf} = {pprob:<3.2f} ({pclass})'
            if PolyPhen2:
                score = bp['PolyPhen-2 score']
                pclass = abbrev[bp['PolyPhen-2 path. class']]
                m += f', PolyPhen-2 = {score:<3.2f} ({pclass})'
            if EVmutation:
                score = bp['EVmutation score']
                pclass = abbrev[bp['EVmutation path. class']]
                m += f', EVmutation = {score:<3.2f} ({pclass})'
            if extra_plot is not None:
                score = table_other[t_i, t_j]
                m += f', other = {score:<3.2f}'
            info['table'][ts_i][ts_j] = m
            info['table'][aa_map[aa_wt]][ts_j] = \
                f'{SAV_code[:-1]}: wild-type'
            if i % 19 == 0:
                # compose message for upper strip
                PDBID, ch, resid, aa, size = PDB_coords[i][[
                    'PDBID', 'chain', 'resid', 'resname', 'PDB size']]
                if size > 0:
                    m = f'{PDBID}:{ch}, resid {resid}, aa {aa}, size {size}'
                else:
                    m = 'no PDB found'
                info['strip'][0][ts_j] = m
                # compose message for bottom plot (residue-averages)
                bap = best_avg_preds[int(i / 19)]
                pprob = bap['path. prob.']
                pcl = bap['path. class']
                m = f'{SAV_code[:-1]}: Rhapsody-{clsf} = ' \
                    f'{pprob:<3.2f} ({pcl})'
                if PolyPhen2:
                    score = bap['PolyPhen-2 score']
                    pcl = abbrev[bap['PolyPhen-2 path. class']]
                    m += f', PolyPhen-2 = {score:<3.2f} ({pcl})'
                if EVmutation:
                    score = bap['EVmutation score']
                    pcl = abbrev[bap['EVmutation path. class']]
                    m += f', EVmutation = {score:<3.2f} ({pcl})'
                if extra_plot is not None:
                    score = avg_p_other[t_j]
                    m += f', other = {score:<3.2f}'
                info['bplot'][0][ts_j] = m

        def create_info_msg(ax_type, d):
            text = '[ \n'
            for row in d:
                text += '  ['
                for m in row:
                    text += f'"{m}",'
                text += '], \n'
            text += ']'
            return text

        area_js = Template('{{map_data}}["{{map_id}}_$areaid"] = {{ \n'
                           '  "img_id": "{{img_id}}", \n'
                           '  "map_id": "{{map_id}}", \n'
                           '  "coords": [$coords], \n'
                           '  "num_rows": $num_rows, \n'
                           '  "num_cols": $num_cols, \n'
                           '  "info_msg": $info_msg, \n'
                           '}; \n')

        # dump info in javascript format
        with open(filename + '.js', 'w') as f:
            f.write('var {{map_data}} = {}; \n')
            for ax_type, d in info.items():
                vars = {'areaid': ax_type}
                vars['coords'] = get_area_coords(all_axis[ax_type],
                                                 html_data)
                vars['num_rows'] = 20 if ax_type == 'table' else 1
                vars['num_cols'] = nres_shown
                vars['info_msg'] = create_info_msg(ax_type, d)
                f.write(area_js.substitute(vars))

        return info
    return
def updateDefinitions():
    """Update definitions and set some global variables.  This function
    must be called at the end of the module."""

    global DEFINITIONS, AMINOACIDS, BACKBONE, TIMESTAMP
    DEFINITIONS = {}
    user = SETTINGS.get('flag_definitions', {})

    # nucleics
    nucleic = set()
    for key in ['nucleobase', 'nucleoside', 'nucleotide']:
        aset = set(user.get(key, DEFAULTS[key]))
        nucleic.update(aset)
        DEFINITIONS[key] = aset
    DEFINITIONS['nucleic'] = nucleic

    # heteros
    for key in ['water', 'lipid', 'ion', 'sugar', 'heme',
                'at', 'cg', 'purine', 'pyrimidine']:
        DEFINITIONS[key] = set(user.get(key, DEFAULTS[key]))

    # backbone definitions come from the 'bb'/'bbfull' user settings
    DEFINITIONS['backbone'] = DEFINITIONS['bb'] = set(
        user.get('bb', DEFAULTS['bb']))
    DEFINITIONS['backbonefull'] = DEFINITIONS['bbfull'] = set(
        user.get('bbfull', DEFAULTS['bbfull']))

    # element regex
    for key in ['hydrogen', 'carbon', 'nitrogen', 'oxygen', 'sulfur']:
        DEFINITIONS[key] = recompile(user.get(key, DEFAULTS[key]))

    try:
        nonstd = SETTINGS[NONSTANDARD_KEY]
    except KeyError:
        nonstd = NONSTANDARD
        DEFINITIONS.update(CATEGORIZED)
    else:
        for cat in CATEGORIES:
            for key in CATEGORIES[cat]:
                DEFINITIONS[key] = set(DEFAULTS[key])
        DEFINITIONS['charged'] = set(DEFINITIONS['acidic'])
        DEFINITIONS['charged'].update(DEFINITIONS['basic'])

        for resi, props in nonstd.items():
            for prop in props:
                DEFINITIONS[prop].add(resi)

    DEFINITIONS['stdaa'] = DEFAULTS['stdaa']
    DEFINITIONS['nonstdaa'] = set(nonstd)
    AMINOACIDS = set(DEFINITIONS['stdaa'])
    AMINOACIDS.update(DEFINITIONS['nonstdaa'])
    DEFINITIONS['protein'] = DEFINITIONS['aminoacid'] = AMINOACIDS
    BACKBONE = DEFINITIONS['bb']

    TIMESTAMP = SETTINGS.get(TIMESTAMP_KEY, 0)
def recoverPickle(self, folder=None, filename=None, days=30, **kwargs):
    """Looks for precomputed pickle for the current PDB structure.

    :arg folder: path of folder where pickles are stored.  If not
        specified, pickles will be searched for in the local Rhapsody
        installation folder.
    :type folder: str
    :arg filename: name of the pickle.  If not specified, the default
        filename ``'PDBfeatures-[PDBID].pkl'`` will be used.  If a PDBID is
        not found, the user must specify a valid filename.
    :type filename: str
    :arg days: number of days after which a pickle will be considered too
        old and won't be recovered.
    :type days: int
    """
    if folder is None:
        # define folder where to look for pickles
        folder = SETTINGS.get('rhapsody_local_folder')
        if folder is None:
            folder = '.'
        else:
            folder = os.path.join(folder, 'pickles')
    if filename is None:
        # use the default filename, if possible
        if self.PDBID is not None:
            filename = 'PDBfeatures-' + self.PDBID + '.pkl'
        else:
            # when a custom structure is used, there is no
            # default filename: the user should provide it
            raise ValueError('Please provide a filename.')
    pickle_path = os.path.join(folder, filename)
    if not os.path.isfile(pickle_path):
        raise IOError("File '{}' not found".format(filename))
    recovered_self = pickle.load(open(pickle_path, "rb"))
    # check consistency of recovered data
    if self.PDBID is None:
        if self._pdb != recovered_self._pdb:
            raise ValueError(
                'Incompatible PDB structure in recovered pickle.')
    elif self.PDBID != recovered_self.PDBID:
        raise ValueError(
            'PDBID in recovered pickle ({}) does not match.'.format(
                recovered_self.PDBID))
    if self.n_modes != recovered_self.n_modes:
        raise ValueError(
            'Num. of modes in recovered pickle ({}) does not match.'.format(
                recovered_self.n_modes))
    # check timestamp and ignore pickles that are too old
    date_format = "%Y-%m-%d %H:%M:%S.%f"
    t_old = datetime.datetime.strptime(recovered_self.timestamp,
                                       date_format)
    t_now = datetime.datetime.utcnow()
    Delta_t = datetime.timedelta(days=days)
    if t_old + Delta_t < t_now:
        raise RuntimeError('Pickle was too old and was ignored.')
    # import recovered data
    self.chids = recovered_self.chids
    self.resids = recovered_self.resids
    self.feats = recovered_self.feats
    self._gnm = recovered_self._gnm
    self._anm = recovered_self._anm
    self.timestamp = recovered_self.timestamp
    LOGGER.info("Pickle '{}' recovered.".format(filename))
    return
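
# Usage sketch for recoverPickle (`feats` is a hypothetical instance of the
# class defining this method): a missing or stale pickle raises IOError or
# RuntimeError, which callers typically catch to trigger recomputation.
def _demo_recoverPickle(feats):
    try:
        feats.recoverPickle(days=30)
    except (IOError, RuntimeError):
        pass  # no pickle, or too old: recompute features instead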
def recoverEVmutFeatures(SAVs):
    """Compute EVmutation features by fetching precomputed scores from the
    downloaded local folder.  If multiple values are found for a given
    variant, the average will be taken.

    :arg SAVs: list of SAV coordinates, e.g. ``'P17516 135 G E'``.
    :type SAVs: list or tuple of strings
    :return: an array of EVmutation features for each SAV
    :rtype: NumPy structured array
    """
    LOGGER.timeit('_EVmut')
    LOGGER.info('Recovering EVmutation data...')

    # extracts precomputed EVmutation scores for given mutants
    # NB:
    # negative DeltaE_epist --> deleterious effect
    # DeltaE_epist == 0     --> neutral effect (wild-type)
    # positive DeltaE_epist --> neutral/benign effect

    def find_matching_files(file_list, acc, pos):
        match_files = []
        for fname in [f for f in file_list if f.startswith(acc)]:
            base = splitext(fname)[0]
            res_range = base.split("_")[-1]
            res_i = int(res_range.split("-")[0])
            res_f = int(res_range.split("-")[1])
            if res_i <= int(pos) <= res_f:
                match_files.append(fname)
        return match_files

    feat_dtype = np.dtype([(f, 'f') for f in EVMUT_FEATS])
    features = np.zeros(len(SAVs), dtype=feat_dtype)
    features[:] = np.nan

    # recover EVmutation data
    EVmut_dir = SETTINGS.get('EVmutation_local_folder')
    if EVmut_dir is None:
        raise RuntimeError('EVmutation folder not set')
    file_list = [basename(f) for f in glob(join(EVmut_dir, '*.csv'))]
    if not file_list:
        raise RuntimeError('EVmutation folder does not contain any '
                           '.csv files')

    for i, SAV in enumerate(SAVs):
        acc, pos, wt_aa, mut_aa = SAV.split()
        pos = int(pos)
        # LOGGER.info('Recovering EVmutation data for {}.'.format(SAV))
        # find files containing given SAV coordinates
        match_files = find_matching_files(file_list, acc, pos)
        # recover data and average them if multiple values are found
        mutant = f'{wt_aa}{pos}{mut_aa}'
        data = []
        for fname in match_files:
            with open(join(EVmut_dir, fname), 'r') as f:
                for line in f:
                    if line.startswith(mutant):
                        ll = line.strip().split(';')[4:8]
                        data.append(ll)
                        break
        data = np.array(data, dtype=float)
        if len(data) == 0:
            # LOGGER.warn(f"EVmutation data not found for '{SAV}'")
            continue
        else:
            features[i] = tuple(np.mean(data, axis=0))

    LOGGER.report('EVmutation scores recovered in %.1fs.', '_EVmut')
    return features
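
# Usage sketch for recoverEVmutFeatures (SAV string taken from the docstring
# above): requires the EVmutation folder to be configured first, e.g. via
# pathEVmutationFolder.  One structured-array row is returned per SAV.
def _demo_recoverEVmutFeatures():
    feats = recoverEVmutFeatures(['P17516 135 G E'])
    print(feats.dtype.names, feats[0])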
def viewNMDinVMD(filename):
    """Start VMD in the current Python session and load NMD data."""

    vmd = SETTINGS.get("vmd")
    if vmd:
        os.system("{0:s} -e {1:s}".format(vmd, os.path.abspath(filename)))
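
# Usage sketch for viewNMDinVMD (hypothetical NMD file name): the VMD path
# must be configured first, e.g. with pathVMD() or setVMDpath().
def _demo_viewNMDinVMD():
    viewNMDinVMD('p38_anm.nmd')  # hypothetical file, e.g. from writeNMD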
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be a 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, the XML file will be saved with that
    name.

    If you query ligand data frequently, you may configure ProDy to save
    XML files in your computer.  Set the ``ligand_xml_save`` option
    **True**, i.e. ``confProDy(ligand_xml_save=True)``.  Compressed XML
    files will be saved to the ProDy package folder, e.g.
    :file:`/home/user/.prody/pdbligands`.  Each file is around 5Kb when
    compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data
    with *model* and *ideal* coordinate sets are also stored in this
    dictionary.  Note that this dictionary will contain data that is
    present in the XML file, and not all Ligand Expo XML files contain
    every possible data field.  So it may be better to use :meth:`dict.get`
    instead of indexing the dictionary, e.g. to retrieve the formula weight
    (or relative molar mass) of the chemical component use
    ``data.get('formula_weight')`` instead of ``data['formula_weight']`` to
    avoid exceptions when this data field is not found in the XML file.
    The URL and/or path of the XML file are returned in the dictionary with
    keys ``url`` and ``path``, respectively.

    The following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates the RMSD between model (X-ray structure 1IEP)
    and ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from prody import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')
    folder = None
    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            path = None
        #url = ('http://ligand-expo.rcsb.org/reports/{0[0]}/{0}/{0}'
        #       '.xml'.format(cci.upper()))
        url = 'http://files.rcsb.org/ligands/download/{0}.xml'.format(
            cci.upper())
        if not xml:
            try:
                inp = openURL(url)
            except IOError:
                raise IOError('XML file for ligand {0} is not found online'
                              .format(cci))
            else:
                xml = inp.read()
                inp.close()
            if filename:
                out = openFile(filename, mode='w', folder=folder)
                out.write(xml)
                out.close()
            if SETTINGS.get('ligand_xml_save'):
                with openFile(xmlgz, 'w') as out:
                    out.write(xml)

    import xml.etree.cElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}') + 1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    # guard against a missing formula_weight field (see docstring)
    if dict_.get('formula_weight') is not None:
        dict_['formula_weight'] = float(dict_.get('formula_weight'))

    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    dict_['audits'] = [(audit.get('action_type'), audit.get('date'))
                       for audit in list(root.find(
                           ns + 'pdbx_chem_comp_auditCategory'))]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)
    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)
    alternate_atomnames = np.zeros(n_atoms,
                                   dtype=ATOMIC_FIELDS['name'].dtype)
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text)
                     for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))
        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    for bond in list(root.find(ns + 'chem_comp_bondCategory') or bonds):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))

    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
def getResAvgPredictions(self, resid=None, classifier='best',
                         PolyPhen2=True, EVmutation=True, refresh=False):
    """Return predictions averaged over the 19 possible substitutions at
    each residue, as a NumPy structured array.  If *resid* is given, return
    the single record for that residue.  Return **None** if the analysis is
    not a saturation mutagenesis scan."""
    if not self._isSaturationMutagenesis():
        return None
    # initialize output array
    cols = [('sequence index', 'i4'),
            ('PDB SAV coords', 'U100'),
            ('PDBID', 'U100'),
            ('chain', 'U1'),
            ('resid', 'i4'),
            ('resname', 'U1'),
            ('PDB size', 'i4'),
            ('score', 'f4'),
            ('path. prob.', 'f4'),
            ('path. class', 'U12')]
    if PolyPhen2:
        cols.extend([('PolyPhen-2 score', 'f4'),
                     ('PolyPhen-2 path. class', 'U12')])
    if EVmutation:
        cols.extend([('EVmutation score', 'f4'),
                     ('EVmutation path. class', 'U12')])
    # one record per residue: 19 substitutions per saturated position
    output = np.empty(int(self.numSAVs / 19), dtype=np.dtype(cols))
    # fetch unique SAV coords, PDB coords and predictions
    uSAVc = self.getUniqueSAVcoords()
    PDBc = self.getPDBcoords()
    preds = self.getPredictions(classifier=classifier, PolyPhen2=PolyPhen2,
                                EVmutation=EVmutation, refresh=refresh)
    # compute residue-averaged quantities
    output['sequence index'] = self._calcResAvg(uSAVc['position'])
    for field in ['PDB SAV coords', 'PDBID', 'chain', 'resid',
                  'resname', 'PDB size']:
        output[field] = self._calcResAvg(PDBc[field])
    # NB: RuntimeWarnings are expected in this block when averaging over
    # NaNs, hence they are suppressed
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', category=RuntimeWarning)
        output['score'] = self._calcResAvg(preds['score'])
        pp = self._calcResAvg(preds['path. prob.'])
        pc = np.where(pp > 0.5, 'deleterious', 'neutral')
        pc = np.where(np.isnan(pp), '?', pc)
        output['path. prob.'] = pp
        output['path. class'] = pc
        if PolyPhen2:
            ps = self._calcResAvg(preds['PolyPhen-2 score'])
            pc = np.where(ps > 0.5, 'deleterious', 'neutral')
            pc = np.where(np.isnan(ps), '?', pc)
            output['PolyPhen-2 score'] = ps
            output['PolyPhen-2 path. class'] = pc
        if EVmutation:
            ps = self._calcResAvg(preds['EVmutation score'])
            cutoff = -SETTINGS.get('EVmutation_metrics')['optimal cutoff']
            pc = np.where(ps < cutoff, 'deleterious', 'neutral')
            pc = np.where(np.isnan(ps), '?', pc)
            output['EVmutation score'] = ps
            output['EVmutation path. class'] = pc
    if resid is None:
        return output
    elif isinstance(resid, int):
        return output[output['resid'] == resid][0]
    else:
        raise ValueError('Invalid resid.')
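# A minimal usage sketch for getResAvgPredictions.  Hedged: `rh` stands for
# an already-configured instance of the class this method belongs to, on
# which a saturation mutagenesis scan has been completed; the variable name
# and setup are hypothetical, only the method signature comes from above.
res_avg = rh.getResAvgPredictions()              # structured array with one
if res_avg is not None:                          # record per residue
    record = rh.getResAvgPredictions(resid=42)   # single residue record
    print(record['score'], record['path. prob.'], record['path. class'])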
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be a 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, the XML file will be saved with that
    name.

    If you query ligand data frequently, you may configure ProDy to save XML
    files on your computer.  Set the ``ligand_xml_save`` option to **True**,
    i.e. ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be
    saved to the ProDy package folder, e.g.
    :file:`/home/user/.prody/pdbligands`.  Each file is around 5Kb when
    compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data
    with *model* and *ideal* coordinate sets are also stored in this
    dictionary.  Note that this dictionary contains only the data present in
    the XML file, and not all Ligand Expo XML files contain every possible
    data field.  It is therefore safer to use :meth:`dict.get` instead of
    indexing the dictionary, e.g. to retrieve the formula weight (or relative
    molar mass) of the chemical component use ``data.get('formula_weight')``
    instead of ``data['formula_weight']`` to avoid exceptions when this data
    field is not found in the XML file.  URL and/or path of the XML file are
    returned in the dictionary with keys ``url`` and ``path``, respectively.

    The following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from prody import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')
    if isfile(cci):
        inp = openFile(cci)
        xml = inp.read()
        inp.close()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be a 3-letter alphanumeric identifier or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        folder = None  # set only when local caching of XML files is enabled
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            path = None
        url = 'http://files.rcsb.org/ligands/download/{0}.xml'.format(
            cci.upper())

    if not xml:
        try:
            inp = openURL(url)
        except IOError:
            raise IOError('XML file for ligand {0} is not found online'
                          .format(cci))
        else:
            xml = inp.read()
            inp.close()
        if filename:
            # folder is None unless ligand_xml_save is enabled
            out = openFile(filename, mode='w', folder=folder)
            out.write(xml)
            out.close()
        if SETTINGS.get('ligand_xml_save'):
            with openFile(xmlgz, 'w') as out:
                out.write(xml)

    # cElementTree was removed in Python 3.9; ElementTree is equivalent
    import xml.etree.ElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, the resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}') + 1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    # formula_weight may be absent from some entries; guard the conversion
    if dict_.get('formula_weight') is not None:
        dict_['formula_weight'] = float(dict_['formula_weight'])

    identifiers_and_descriptors = []
    # truth-testing an Element is deprecated, so compare against None
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')
    dict_['audits'] = [(audit.get('action_type'), audit.get('date'))
                       for audit in list(root.find(
                           ns + 'pdbx_chem_comp_auditCategory'))]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)
    # residue numbers default to 1 and use the resnum dtype
    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['resnum'].dtype)

    alternate_atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    # np.bool was removed in NumPy 1.24; the builtin bool is equivalent here
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text)
                     for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))
        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        # pdbx_stereo_config takes the values 'R', 'S', or 'N'
        stereo_configs[i] = data.get('pdbx_stereo_config', 'N') != 'N'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()
    # the bond category may be absent, in which case nothing is iterated
    for bond in list(root.find(ns + 'chem_comp_bondCategory') or bonds):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified in {1} bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified in {1} bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))
    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
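# A short sketch of defensive access to the dictionary returned by
# fetchPDBLigand, following the docstring's advice to prefer dict.get over
# indexing.  Which optional fields are present depends on the XML file;
# 'formula' below is only an example of a field that may be missing.
from prody import fetchPDBLigand

data = fetchPDBLigand('STI')
weight = data.get('formula_weight')    # None if absent from the XML file
print(data.get('formula'), weight)
model = data['model']                  # AtomGroup with model coordinates
ideal = data['ideal']                  # AtomGroup with ideal coordinates
print(model.numAtoms() == ideal.numAtoms())   # same atoms, different coords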
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    """Compute Evol properties, i.e. Shannon entropy, mutual information
    and direct information, from Pfam multiple sequence alignments, for a
    given residue."""
    assert isinstance(refresh, bool)

    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)

    if resid == 'all':
        PF_list = list(self.Pfam)
    else:
        # get the list of Pfam domains containing resid
        PF_list = [k for k in self.Pfam if any(
            int(segment['start']) <= resid <= int(segment['end'])
            for segment in self.Pfam[k]['locations'])]
        if len(PF_list) == 0:
            raise RuntimeError('No Pfam domain for resid {}.'.format(resid))
        if len(PF_list) > 1:
            LOGGER.warn('Residue {} is found in multiple ({}) '
                        'Pfam domains.'.format(resid, len(PF_list)))

    if folder is None:
        folder = SETTINGS.get('rhapsody_local_folder', './')

    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch and parse the MSA without saving the download to disk
            f = fetchPfamMSA(PF)
            msa = parseMSA(f, **kwargs)
            os.remove(f)
            # slice the MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # if max_cols is not None and sliced_msa.numResidues() > max_cols:
            #     raise Exception('Unable to compute DI: MSA has too many '
            #                     'columns (max: {}).'.format(max_cols))
            # get the mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            d['mapping'] = str(e)
            continue
        try:
            # refine the MSA ('seqid' parameter is set as in PolyPhen-2):
            # raise the row-occupancy threshold until the alignment is small
            # enough, then remove redundant sequences
            rowocc = 0.6
            while True:
                sliced_msa = refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = calcShannonEntropy(ref_msa)
            d['MutInfo'] = buildMutinfoMatrix(ref_msa)
            # d['DirInfo'] = buildDirectInfoMatrix(ref_msa)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))

    return {k: self.Pfam[k] for k in PF_list}
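# A hedged usage sketch for calcEvolProperties.  Again, `rh` stands for a
# hypothetical instance whose Uniprot sequence maps onto Pfam domains.  The
# returned dictionary is keyed by Pfam accession; for families that failed,
# the error message string is stored in place of the mapping.
props = rh.calcEvolProperties(resid=42)
for pf_acc, d in props.items():
    if isinstance(d['mapping'], str):
        print(pf_acc, 'failed:', d['mapping'])
    elif d['ref_MSA'] is not None:
        print(pf_acc, 'refined MSA sequences:', d['ref_MSA'].numSequences())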