def _superpose(self, **kwargs):
    """Superpose conformations and update coordinates."""

    calcT = getTransformation
    if kwargs.get('trans', False):
        if self._trans is not None:
            LOGGER.info('Existing transformations will be overwritten.')
        trans = np.zeros((self._n_csets, 4, 4))
    else:
        trans = None

    indices = self._indices
    if indices is None:
        weights = self._weights
        coords = self._coords
        confs = self._confs
        confs_selected = self._confs
    else:
        weights = self._weights[:, indices]
        coords = self._coords[indices]
        confs = self._confs
        confs_selected = self._confs[:, indices]

    for i, conf in enumerate(confs_selected):
        rmat, tvec = calcT(conf, coords, weights[i])
        if trans is not None:
            trans[i][:3, :3] = rmat
            trans[i][:3, 3] = tvec
        confs[i] = tvec + np.dot(confs[i], rmat.T)
    self._trans = trans
def wwPDBServer(*key):
    """Set/get the `wwPDB`_ FTP/HTTP server location used for downloading PDB
    structures.  Use one of the following keywords for setting a server:

    +---------------------------+-----------------------------+
    | wwPDB FTP server          | *Key* (case insensitive)    |
    +===========================+=============================+
    | RCSB PDB (USA) (default)  | RCSB, USA, US               |
    +---------------------------+-----------------------------+
    | PDBe (Europe)             | PDBe, Europe, Euro, EU      |
    +---------------------------+-----------------------------+
    | PDBj (Japan)              | PDBj, Japan, Jp             |
    +---------------------------+-----------------------------+

    .. _wwPDB: http://www.wwpdb.org/"""

    if not key:
        return SETTINGS.get('wwpdb', None)
    elif len(key) == 1:
        try:
            key = key[0].lower()
        except AttributeError:
            raise TypeError('key must be a string')
        if key in WWPDB_FTP_SERVERS:
            SETTINGS['wwpdb'] = key
            SETTINGS.save()
            LOGGER.info('wwPDB server is set to {0}.'
                        .format(WWPDB_FTP_SERVERS[key][0]))
        else:
            raise ValueError('{0} is not a valid wwPDB server identifier'
                             .format(repr(key)))
    else:
        raise TypeError('one wwPDB server identifier is expected, {0} given'
                        .format(len(key)))
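
# Usage sketch (illustrative, not part of the module): assuming this function
# is imported from the package namespace, selecting and querying the server
# could look like:
#
#     wwPDBServer('PDBe')    # select the European server (case insensitive)
#     wwPDBServer()          # returns the stored key, e.g. 'pdbe'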
def pathPDBMirror(path=None, format=None):
    """Returns or specifies the PDB mirror path used by :func:`.fetchPDB`.
    To release the current mirror, pass an invalid path, e.g. ``path=''``.

    If you are keeping a partial mirror, such as PDB files in the
    :file:`/data/structures/divided/pdb/` folder, specify *format*, which is
    ``'pdb'`` in this case."""

    if path is None:
        path = SETTINGS.get('pdb_mirror_path')
        format = SETTINGS.get('pdb_mirror_format', None)
        if path:
            if isdir(path):
                if format is None:
                    return path
                else:
                    return path, format
            else:
                LOGGER.warning('PDB mirror path {0} is not accessible.'
                               .format(repr(path)))
    else:
        if isdir(path):
            path = abspath(path)
            LOGGER.info('Local PDB mirror path is set: {0}'
                        .format(repr(path)))
            SETTINGS['pdb_mirror_path'] = path
            SETTINGS['pdb_mirror_format'] = format
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_mirror_path')
            if current:
                LOGGER.info('PDB mirror {0} is released.'
                            .format(repr(current)))
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'.format(repr(path)))
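
# Usage sketch (hypothetical paths): pointing fetchPDB at a partial local
# mirror of the divided PDB tree, then releasing it:
#
#     pathPDBMirror('/data/structures/divided/pdb/', format='pdb')
#     pathPDBMirror()      # -> ('/data/structures/divided/pdb/', 'pdb')
#     pathPDBMirror('')    # an invalid path releases the current mirror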
def __add__(self, other):
    """Concatenate ensembles. The reference coordinates, atoms, and weights
    of *self* are used in the resulting ensemble."""

    if not isinstance(other, Ensemble):
        raise TypeError('an Ensemble instance cannot be added to an {0} '
                        'instance'.format(type(other)))
    elif self._n_atoms != other._n_atoms:
        raise ValueError('ensembles must have the same number of atoms')

    ensemble = Ensemble('{0} + {1}'.format(self.getTitle(),
                                           other.getTitle()))
    if self._coords is not None:
        ensemble.setCoords(self._coords.copy())
    if self._confs is not None:
        ensemble.addCoordset(self._confs.copy())
    if other._confs is not None:
        ensemble.addCoordset(other._confs.copy())

    all_keys = list(self._data.keys()) + list(other._data.keys())
    for key in all_keys:
        if key in self._data and key in other._data:
            self_data = self._data[key]
            other_data = other._data[key]
        elif key in self._data:
            self_data = self._data[key]
            other_data = zeros(other.numConfs(), dtype=self_data.dtype)
        elif key in other._data:
            other_data = other._data[key]
            # pad with zeros for the conformations of *self* lacking this data
            self_data = zeros(self.numConfs(), dtype=other_data.dtype)
        ensemble._data[key] = concatenate((self_data, other_data), axis=0)

    if self._weights is not None:
        LOGGER.info('Atom weights from {0} are used in {1}.'.format(
            repr(self._title), repr(ensemble.getTitle())))
        ensemble.setWeights(self._weights.copy())
    elif other._weights is not None:
        ensemble.setWeights(other._weights.copy())

    if self._atoms is not None:
        ensemble.setAtoms(self._atoms)
        ensemble._indices = self._indices
    else:
        ensemble.setAtoms(other._atoms)
        ensemble._indices = other._indices

    return ensemble
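
# Usage sketch: concatenation keeps the reference coordinates, atoms, and
# weights of the left operand; both ensembles must have the same number of
# atoms. Assuming ens1 and ens2 are compatible Ensemble instances:
#
#     combined = ens1 + ens2
#     assert combined.numConfs() == ens1.numConfs() + ens2.numConfs()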
def iterpose(self, rmsd=0.0001):
    """Iteratively superpose the ensemble until convergence. Initially, all
    conformations are aligned with the reference coordinates. Then mean
    coordinates are calculated and set as the new reference coordinates.
    This is repeated until the reference coordinates converge, as measured
    by the RMSD between the new and old reference coordinates. Note that at
    the end of the iterative procedure the reference coordinate set will be
    the average of the conformations in the ensemble.

    :arg rmsd: change in reference coordinates used to determine
        convergence, default is 0.0001 Å RMSD
    :type rmsd: float"""

    if self._coords is None:
        raise AttributeError('coordinates are not set, use `setCoords`')
    if self._confs is None or len(self._confs) == 0:
        raise AttributeError('conformations are not set, use `addCoordset`')
    LOGGER.info('Starting iterative superposition:')
    LOGGER.timeit('_prody_ensemble')
    rmsdif = 1
    step = 0
    weights = self._weights
    length = len(self)
    if weights is not None:
        if weights.ndim == 3:
            weightsum = weights.sum(axis=0)
            weightsum[weightsum == 0.] = 1.  # add pseudocount to avoid nan
        else:
            weightsum = length

    while rmsdif > rmsd:
        self._superpose()
        if weights is None:
            newxyz = self._confs.sum(0) / length
        else:
            newxyz = (self._confs * weights).sum(0) / weightsum
        rmsdif = getRMSD(self._coords, newxyz)
        self._coords = newxyz
        step += 1
        LOGGER.info('Step #{0}: RMSD difference = {1:.4e}'.format(
            step, rmsdif))
    LOGGER.report('Iterative superposition completed in %.2fs.',
                  '_prody_ensemble')
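
# Usage sketch: iterpose() works in place; after convergence the reference
# coordinates equal the (weighted) average of the superposed conformations.
# Assuming `ens` has reference coordinates and conformations set:
#
#     ens.iterpose(rmsd=1e-4)    # iterate until the reference moves by
#     avg = ens.getCoords()      # less than 1e-4 A RMSD between rounds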
def run(self, tmax=200, li=0.2, lf=0.01, ei=0.3, ef=0.05, Ti=0.1, Tf=2,
        c=0, calcC=False):
    """Fit the pseudoatoms to the map by running ``tmax * N`` training
    steps, annealing the parameters exponentially from their initial to
    their final values (*li* to *lf*, *ei* to *ef*, and *Ti* to *Tf* when
    *calcC* is set)."""

    LOGGER.info('Building coordinates from electron density map. '
                'This may take a while.')
    LOGGER.timeit('_prody_make_nodes')
    tmax = int(tmax * self.N)
    li = li * self.N
    if calcC:
        Ti = Ti * self.N
        Tf = Tf * self.N
    for t in range(1, tmax + 1):
        # calculate the annealed parameters for this step
        tt = float(t) / tmax
        l = li * np.power(lf / li, tt)
        ep = ei * np.power(ef / ei, tt)
        if calcC:
            T = Ti * np.power(Tf / Ti, tt)
        else:
            T = -1
        self.runOnce(t, l, ep, T, c)
    LOGGER.report('{0} pseudoatoms were fitted in %.2fs.'.format(self.N),
                  '_prody_make_nodes')
    return
def pathPDBFolder(folder=None, divided=False):
    """Returns or specifies the local PDB folder for storing PDB files
    downloaded from `wwPDB <http://www.wwpdb.org/>`_ servers.  Files stored
    in this folder can be accessed via :func:`.fetchPDB` from any working
    directory.  To release the current folder, pass an invalid path, e.g.
    ``folder=''``.

    If *divided* is **True**, the divided folder structure of the wwPDB
    servers will be assumed when reading from and writing to the local
    folder.  For example, a structure with identifier **1XYZ** will be
    present as :file:`pdblocalfolder/xy/pdb1xyz.pdb.gz` (the code uses the
    middle two characters of the identifier).

    If *divided* is **False**, a plain folder structure will be expected and
    adopted when saving files.  For example, the same structure will be
    present as :file:`pdblocalfolder/1xyz.pdb.gz`.

    Finally, in either case, lower case letters will be used and compressed
    files will be stored."""

    if folder is None:
        folder = SETTINGS.get('pdb_local_folder')
        if folder:
            if isdir(folder):
                return folder, SETTINGS.get('pdb_local_divided', True)
            else:
                LOGGER.warn('PDB local folder {0} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local PDB folder is set: {0}'.format(repr(folder)))
            if divided:
                LOGGER.info('wwPDB divided folder structure will be '
                            'assumed.')
            else:
                LOGGER.info('A plain folder structure will be assumed.')
            SETTINGS['pdb_local_folder'] = folder
            SETTINGS['pdb_local_divided'] = bool(divided)
            SETTINGS.save()
        else:
            current = SETTINGS.pop('pdb_local_folder')
            if current:
                LOGGER.info('PDB folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.pop('pdb_local_divided')
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'
                              .format(repr(folder)))
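
# Usage sketch (hypothetical folder): caching downloads locally so fetchPDB
# can find them from any working directory:
#
#     pathPDBFolder('/home/user/pdb')                # /home/user/pdb/1xyz.pdb.gz
#     pathPDBFolder('/home/user/pdb', divided=True)  # /home/user/pdb/xy/pdb1xyz.pdb.gz
#     pathPDBFolder('')                              # release the current folder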
def parseDCD(filename, start=None, stop=None, step=None, astype=None):
    """Parse CHARMM format DCD files (also NAMD 2.1 and later). Returns an
    :class:`Ensemble` instance. Conformations in the ensemble will be
    ordered as they appear in the trajectory file. Use the :class:`DCDFile`
    class for parsing coordinates of a subset of atoms.

    :arg filename: DCD filename
    :type filename: str

    :arg start: index of the first frame to read
    :type start: int

    :arg stop: index of the frame at which reading stops (exclusive)
    :type stop: int

    :arg step: step between frames, default is 1, meaning every frame is read
    :type step: int

    :arg astype: cast coordinate array to specified type
    :type astype: type"""

    dcd = DCDFile(filename, astype=astype)
    time_ = time()
    n_frames = dcd.numFrames()
    LOGGER.info('DCD file contains {0} coordinate sets for {1} atoms.'
                .format(n_frames, dcd.numAtoms()))
    ensemble = dcd[slice(start, stop, step)]
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * dcd.numFrames() * dcd._bytes_per_frame / (1024 * 1024)
    LOGGER.info('DCD file was parsed in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB parsed at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info('{0} coordinate sets parsed at input rate {1} frame/s.'
                .format(n_frames, int(n_frames / time_)))
    return ensemble
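
# Usage sketch (hypothetical filename): reading every other frame of a
# trajectory into an Ensemble in single precision:
#
#     import numpy as np
#     ens = parseDCD('mdm2.dcd', step=2, astype=np.float32)
#     print(ens.numConfs())    # number of frames actually parsed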
def _parseMMCIFLines(atomgroup, lines, model, chain, subset, altloc_torf,
                     header):
    """Returns an AtomGroup. See also :func:`.parsePDBStream()`.

    :arg lines: mmCIF lines
    """

    if subset is not None:
        if subset == 'ca':
            subset = set(('CA',))
        elif subset in 'bb':
            subset = flags.BACKBONE
        protein_resnames = flags.AMINOACIDS

    asize = 0
    i = 0
    models = []
    nModels = 0
    fields = dict()
    fieldCounter = -1
    foundAtomBlock = False
    doneAtomBlock = False
    start = 0
    stop = 0
    while not doneAtomBlock:
        line = lines[i]
        if line[:11] == '_atom_site.':
            fieldCounter += 1
            fields[line.split('.')[1].strip()] = fieldCounter

        if line.startswith('ATOM') or line.startswith('HETATM'):
            if not foundAtomBlock:
                foundAtomBlock = True
                start = i
            models.append(line.split()[fields['pdbx_PDB_model_num']])
            if models[asize] != models[asize - 1]:
                nModels += 1
            asize += 1
        else:
            if foundAtomBlock:
                doneAtomBlock = True
                stop = i
        i += 1

    if nModels == 0:
        nModels = 1

    if model is not None and model != 1:
        for i in range(start, stop):
            if str(models[i]) != model and str(models[i + 1]) == model:
                start = i + 1
            if str(models[i]) == model and str(models[i + 1]) != model:
                stop = i + 1
                break
        if not str(model) in models:
            raise mmCIFParseError('model {0} is not found'.format(model))

    addcoords = False
    if atomgroup.numCoordsets() > 0:
        addcoords = True

    if isinstance(altloc_torf, str):
        if altloc_torf.strip() != 'A':
            LOGGER.info('Parsing alternate locations {0}.'
                        .format(altloc_torf))
            which_altlocs = '.' + ''.join(altloc_torf.split())
        else:
            which_altlocs = '.A'
        altloc_torf = False
    else:
        which_altlocs = '.A'
        altloc_torf = True

    coordinates = np.zeros((asize, 3), dtype=float)
    atomnames = np.zeros(asize, dtype=ATOMIC_FIELDS['name'].dtype)
    resnames = np.zeros(asize, dtype=ATOMIC_FIELDS['resname'].dtype)
    resnums = np.zeros(asize, dtype=ATOMIC_FIELDS['resnum'].dtype)
    chainids = np.zeros(asize, dtype=ATOMIC_FIELDS['chain'].dtype)
    segnames = np.zeros(asize, dtype=ATOMIC_FIELDS['segment'].dtype)
    hetero = np.zeros(asize, dtype=bool)
    termini = np.zeros(asize, dtype=bool)
    altlocs = np.zeros(asize, dtype=ATOMIC_FIELDS['altloc'].dtype)
    icodes = np.zeros(asize, dtype=ATOMIC_FIELDS['icode'].dtype)
    serials = np.zeros(asize, dtype=ATOMIC_FIELDS['serial'].dtype)
    elements = np.zeros(asize, dtype=ATOMIC_FIELDS['element'].dtype)
    bfactors = np.zeros(asize, dtype=ATOMIC_FIELDS['beta'].dtype)
    occupancies = np.zeros(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype)

    n_atoms = atomgroup.numAtoms()
    if n_atoms > 0:
        asize = n_atoms

    acount = 0
    for line in lines[start:stop]:
        tokens = line.split()  # split each data line only once
        startswith = tokens[fields['group_PDB']]
        atomname = tokens[fields['auth_atom_id']]
        resname = tokens[fields['auth_comp_id']]

        if subset is not None:
            if not (atomname in subset and resname in protein_resnames):
                continue

        chID = tokens[fields['auth_asym_id']]
        if chain is not None:
            if isinstance(chain, str):
                chain = chain.split(',')
            if not chID in chain:
                continue

        segID = tokens[fields['label_asym_id']]

        alt = tokens[fields['label_alt_id']]
        if alt not in which_altlocs:
            continue

        if model is not None:
            if int(models[acount]) < model:
                continue
            elif int(models[acount]) > model:
                break

        coordinates[acount] = [tokens[fields['Cartn_x']],
                               tokens[fields['Cartn_y']],
                               tokens[fields['Cartn_z']]]
        atomnames[acount] = atomname
        resnames[acount] = resname
        resnums[acount] = tokens[fields['auth_seq_id']]
        chainids[acount] = chID
        segnames[acount] = segID
        hetero[acount] = startswith == 'HETATM'  # True or False

        if chainids[acount] != chainids[acount - 1]:
            termini[acount - 1] = True

        altlocs[acount] = alt
        icodes[acount] = tokens[fields['pdbx_PDB_ins_code']]
        if icodes[acount] == '?':
            icodes[acount] = ''
        serials[acount] = tokens[fields['id']]
        elements[acount] = tokens[fields['type_symbol']]
        bfactors[acount] = tokens[fields['B_iso_or_equiv']]
        occupancies[acount] = tokens[fields['occupancy']]
        acount += 1

    if model is not None:
        nModels = 1

    modelSize = acount // nModels

    if addcoords:
        atomgroup.addCoordset(coordinates[:modelSize])
    else:
        atomgroup._setCoords(coordinates[:modelSize])

    atomgroup.setNames(atomnames[:modelSize])
    atomgroup.setResnames(resnames[:modelSize])
    atomgroup.setResnums(resnums[:modelSize])
    atomgroup.setSegnames(segnames[:modelSize])
    atomgroup.setChids(chainids[:modelSize])
    atomgroup.setFlags('hetatm', hetero[:modelSize])
    atomgroup.setFlags('pdbter', termini[:modelSize])
    atomgroup.setAltlocs(altlocs[:modelSize])
    atomgroup.setIcodes(icodes[:modelSize])
    atomgroup.setSerials(serials[:modelSize])
    atomgroup.setElements(elements[:modelSize])
    from caviar.prody_parser.utilities.misctools import getMasses
    atomgroup.setMasses(getMasses(elements[:modelSize]))
    atomgroup.setBetas(bfactors[:modelSize])
    atomgroup.setOccupancies(occupancies[:modelSize])

    for n in range(1, nModels):
        atomgroup.addCoordset(coordinates[n * modelSize:(n + 1) * modelSize])

    if header:
        header = parseSTARLines(lines[:start - fieldCounter - 2] +
                                lines[stop:], shlex=True)
        return atomgroup, header

    return atomgroup
def parseEMDStream(stream, **kwargs):
    """Parse lines of data stream from an EMD/MRC2014 file and optionally
    return an :class:`.AtomGroup` containing TRN nodes based on it.

    :arg stream: any object with the method ``readlines`` (e.g.
        :class:`file`, buffer, stdin)
    """

    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = kwargs.get('n_nodes', 0)
    num_iter = int(kwargs.get('num_iter', 20))
    map = kwargs.get('map', False)

    if not isinstance(n_nodes, int):
        raise TypeError('n_nodes should be an integer')

    if n_nodes > 0:
        make_nodes = True
    else:
        make_nodes = False
        map = True
        LOGGER.info('As n_nodes is less than or equal to 0, no nodes will '
                    'be made and the raw map will be returned')

    emd = EMDMAP(stream, cutoff)

    if make_nodes:
        title_suffix = kwargs.get('title_suffix', '')
        atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) +
                              title_suffix)
        atomgroup._n_atoms = n_nodes

        coordinates = np.zeros((n_nodes, 3), dtype=float)
        atomnames = np.zeros(n_nodes, dtype=ATOMIC_FIELDS['name'].dtype)
        resnames = np.zeros(n_nodes, dtype=ATOMIC_FIELDS['resname'].dtype)
        resnums = np.zeros(n_nodes, dtype=ATOMIC_FIELDS['resnum'].dtype)
        chainids = np.zeros(n_nodes, dtype=ATOMIC_FIELDS['chain'].dtype)

        trn = TRNET(n_nodes=n_nodes)
        trn.inputMap(emd, sample='density')
        trn.run(tmax=num_iter)
        for i in range(n_nodes):
            coordinates[i, :] = trn.W[i, :]
            atomnames[i] = 'B'
            resnames[i] = 'CGB'
            resnums[i] = i + 1
            chainids[i] = 'X'

        atomgroup.setCoords(coordinates)
        atomgroup.setNames(atomnames)
        atomgroup.setResnames(resnames)
        atomgroup.setResnums(resnums)
        atomgroup.setChids(chainids)

    if make_nodes:
        if map:
            return atomgroup, emd
        else:
            return atomgroup
    else:
        return emd
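
# Usage sketch (hypothetical file): fitting 100 TRN pseudoatoms to a density
# map; with map=True both the nodes and the raw map object are returned:
#
#     with open('emd_1234.map', 'rb') as stream:
#         nodes, emd = parseEMDStream(stream, cutoff=1.2, n_nodes=100,
#                                     num_iter=20, map=True)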
def __init__(self, coords, **kwargs):
    """
    :arg coords: coordinate array with shape ``(N, 3)``, where N is the
        number of atoms
    :type coords: :class:`numpy.ndarray`, :class:`.Atomic`, :class:`.Frame`

    :arg unitcell: orthorhombic unitcell dimension array with shape ``(3,)``
    :type unitcell: :class:`numpy.ndarray`

    :arg bucketsize: number of points per tree node, default is 10
    :type bucketsize: int"""

    unitcell = kwargs.get('unitcell')
    if not isinstance(coords, ndarray):
        if unitcell is None:
            try:
                unitcell = coords.getUnitcell()
            except AttributeError:
                pass
            else:
                if unitcell is not None:
                    LOGGER.info('Unitcell information from {0} will be '
                                'used.'.format(str(coords)))
        try:
            # using getCoords() because coords will be stored internally
            # and reused when needed, this will avoid unexpected results
            # due to changes made to coordinates externally
            coords = coords.getCoords()
        except AttributeError:
            raise TypeError('coords must be a Numpy array or must have '
                            'getCoords attribute')
    else:
        coords = coords.copy()

    if coords.ndim != 2:
        raise Exception('coords.ndim must be 2')
    if coords.shape[-1] != 3:
        raise Exception('coords.shape must be (N,3)')
    if coords.min() <= -1e6 or coords.max() >= 1e6:
        raise Exception('coords must be between -1e6 and 1e6')

    self._bucketsize = kwargs.get('bucketsize', 10)

    if not isinstance(self._bucketsize, int):
        raise TypeError('bucketsize must be an integer')
    if self._bucketsize < 1:
        raise ValueError('bucketsize must be a positive integer')

    self._coords = None
    self._unitcell = None
    self._neighbors = None
    if unitcell is None:
        self._kdtree = CKDTree(coords, self._bucketsize)
    else:
        if not isinstance(unitcell, ndarray):
            raise TypeError('unitcell must be a Numpy array')
        if unitcell.shape != (3,):
            raise ValueError('unitcell.shape must be (3,)')
        self._kdtree = CKDTree(coords, self._bucketsize)
        self._coords = coords
        self._unitcell = unitcell
        self._replicate = REPLICATE * unitcell
        self._kdtree2 = None
        self._pbcdict = {}
        self._pbckeys = []

    self._n_atoms = coords.shape[0]
    self._none = kwargs.pop('none', lambda: None)
    try:
        self._none()
    except TypeError:
        raise TypeError('none argument must be callable')

    self._oncall = kwargs.pop('oncall', 'both')
    assert self._oncall in ('both', 'dist'), 'oncall must be both or dist'
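
# Usage sketch: building a tree from an (N, 3) array; passing a unitcell
# enables the periodic boundary condition (PBC) code path:
#
#     import numpy as np
#     coords = np.random.rand(100, 3) * 10
#     kdtree = KDTree(coords, unitcell=np.array([10., 10., 10.]),
#                     bucketsize=10)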
def _parseHeader(self):
    """Read the header information from a dcd file.
    Input: fd - a file struct opened for binary reading.
    Output: 0 on success, negative error code on failure.
    Side effects: *natoms set to number of atoms per frame
                  *nsets set to number of frames in dcd file
                  *istart set to starting timestep of dcd file
                  *nsavc set to timesteps between dcd saves
                  *delta set to value of trajectory timestep
                  *nfixed set to number of fixed atoms
                  *freeind may be set to heap-allocated space
                  *reverse set to one if reverse-endian, zero if not.
                  *charmm set to internal code for handling charmm data.
    """

    dcd = self._file
    endian = b''  # native endian
    rec_scale = RECSCALE32BIT
    charmm = None
    dcdcordmagic = unpack(endian + b'i', b'CORD')[0]
    # Check magic number in file header and determine byte order
    bits = dcd.read(calcsize('ii'))

    temp = unpack(endian + b'ii', bits)

    if temp[0] + temp[1] == 84:
        LOGGER.info('Detected CHARMM -i8 64-bit DCD file of native '
                    'endianness.')
        rec_scale = RECSCALE64BIT
    elif temp[0] == 84 and temp[1] == dcdcordmagic:
        pass
        #LOGGER.info('Detected standard 32-bit DCD file of native '
        #            'endianness.')
    else:
        # endianness markers must be bytes so they can be concatenated
        # with the bytes format strings below
        if unpack(b'>ii', bits) == temp:
            endian = b'>'
        else:
            endian = b'<'
        temp = unpack(endian + b'ii', bits)
        if temp[0] + temp[1] == 84:
            rec_scale = RECSCALE64BIT
            LOGGER.info('Detected CHARMM -i8 64-bit DCD file of opposite '
                        'endianness.')
        else:
            endian = b''
            temp = unpack(endian + b'ii', bits)
            if temp[0] == 84 and temp[1] == dcdcordmagic:
                LOGGER.info('Detected standard 32-bit DCD file of '
                            'opposite endianness.')
            else:
                raise IOError('Unrecognized DCD header or unsupported '
                              'DCD format.')

    # check for magic string, in case of long record markers
    if rec_scale == RECSCALE64BIT:
        raise IOError('CHARMM 64-bit DCD files are not yet supported.')
        temp = unpack(b'I', dcd.read(calcsize('I')))
        if temp[0] != dcdcordmagic:
            raise IOError('Failed to find CORD magic in CHARMM -i8 64-bit '
                          'DCD file.')

    # Buffer the entire header for random access
    bits = dcd.read(80)

    # CHARMM-generated DCD files set the last integer in the header, which
    # is unused by X-PLOR, to its version number. Checking if this is
    # nonzero tells us this is a CHARMM file and to look for other CHARMM
    # flags.
    temp = unpack(endian + b'i' * 20, bits)

    if temp[-1] != 0:
        charmm = True

    if charmm:
        #LOGGER.info('CHARMM format DCD file (also NAMD 2.1 and later).')
        temp = unpack(endian + b'i' * 9 + b'f' + b'i' * 10, bits)
    else:
        LOGGER.info('X-PLOR format DCD file (also NAMD 2.0 and earlier) '
                    'is not supported.')
        return None

    # Store the number of sets of coordinates (NSET)
    self._n_csets = temp[0]
    # Store ISTART, the starting timestep
    self._first_ts = temp[1]
    # Store NSAVC, the number of timesteps between dcd saves
    self._framefreq = temp[2]
    # Store NAMNF, the number of fixed atoms
    self._n_fixed = temp[8]

    if self._n_fixed > 0:
        raise IOError('DCD files with fixed atoms are not yet supported.')

    # Read in the timestep, DELTA
    # Note: DELTA is stored as a double with X-PLOR but as a float with CHARMM
    self._timestep = temp[9]
    self._unitcell = temp[10] == 1

    # Get the end size of the first block
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 84:
        raise IOError('Unrecognized DCD format.')

    # Read in the size of the next block
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

    if (temp[0] - 4) % 80 != 0:
        raise IOError('Unrecognized DCD format.')
    noremarks = temp[0] == 84

    # Read NTITLE, the number of 80 character title strings there are
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

    self._dcdtitle = dcd.read(80)

    if not noremarks:
        self._remarks = dcd.read(80)

    # Get the ending size for this block
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

    if (temp[0] - 4) % 80 != 0:
        raise IOError('Unrecognized DCD format.')

    # Read in an integer '4'
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
        raise IOError('Unrecognized DCD format.')

    # Read in the number of atoms
    self._n_atoms = unpack(endian + b'i',
                           dcd.read(rec_scale * calcsize('i')))[0]

    # Read in an integer '4'
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
        raise IOError('Bad DCD format.')

    self._is64bit = rec_scale == RECSCALE64BIT
    self._endian = endian
    self._n_floats = (self._n_atoms + 2) * 3

    if self._is64bit:
        if self._unitcell:
            self._bytes_per_frame = 56 + self._n_floats * 8
        else:
            self._bytes_per_frame = self._n_floats * 8
        LOGGER.warning('Reading of 64 bit DCD files has not been tested. '
                       'Please report any problems that you may find.')
        self._dtype = np.float64
        self._itemsize = 8
    else:
        if self._unitcell:
            self._bytes_per_frame = 56 + self._n_floats * 4
        else:
            self._bytes_per_frame = self._n_floats * 4
        self._dtype = np.float32
        self._itemsize = 4

    self._first_byte = self._file.tell()
    n_csets = (getsize(self._filename) -
               self._first_byte) // self._bytes_per_frame
    if n_csets != self._n_csets:
        LOGGER.warning('DCD header claims {0} frames, file size '
                       'indicates there are actually {1} frames.'
                       .format(self._n_csets, n_csets))
        self._n_csets = n_csets

    self._coords = self.nextCoordset()
    self._file.seek(self._first_byte)
    self._nfi = 0
def refineEnsemble(ensemble, lower=.5, upper=10., **kwargs):
    """Refine a :class:`.PDBEnsemble` based on RMSD criteria.

    :arg ensemble: the ensemble to be refined
    :type ensemble: :class:`.Ensemble`, :class:`.PDBEnsemble`

    :arg lower: the smallest allowed RMSD between two conformations, with
        the exception of **protected**
    :type lower: float

    :arg upper: the highest allowed RMSD between two conformations, with
        the exception of **protected**
    :type upper: float

    :keyword protected: a list of either the indices or labels of the
        conformations that need to be kept in the refined ensemble
    :type protected: list

    :arg ref: the index or label of the reference conformation, which will
        also be kept. Default is 0
    :type ref: int or str
    """

    protected = kwargs.pop('protected', [])

    P = []
    if len(protected):
        labels = ensemble.getLabels()
        for p in protected:
            if isinstance(p, Integral):
                i = p
            else:
                if p in labels:
                    i = labels.index(p)
                else:
                    LOGGER.warn('could not find any conformation with the '
                                'label %s in the ensemble' % str(p))
                    # skip unknown labels instead of appending a stale index
                    continue
            P.append(i)

    LOGGER.timeit('_prody_refineEnsemble')

    from numpy import argsort

    ### obtain reference index
    # rmsd = ensemble.getRMSDs()
    # ref_i = np.argmin(rmsd)
    ref_i = kwargs.pop('ref', 0)
    if isinstance(ref_i, Integral):
        pass
    elif isinstance(ref_i, str):
        labels = ensemble.getLabels()
        ref_i = labels.index(ref_i)
    else:
        LOGGER.warn('could not find any conformation with the label %s in '
                    'the ensemble' % str(ref_i))
    if not ref_i in P:
        P = [ref_i] + P

    ### calculate pairwise RMSDs ###
    RMSDs = ensemble.getRMSDs(pairwise=True)

    def getRefinedIndices(A):
        deg = A.sum(axis=0)
        sorted_indices = list(argsort(deg))
        # sorted_indices = P + [x for x in sorted_indices if x not in P]
        sorted_indices.remove(ref_i)
        sorted_indices.insert(0, ref_i)

        n_confs = ensemble.numConfs()
        isdel_temp = np.zeros(n_confs)
        for a in range(n_confs):
            i = sorted_indices[a]
            for b in range(n_confs):
                if a >= b:
                    continue
                j = sorted_indices[b]
                if isdel_temp[i] or isdel_temp[j]:
                    continue
                else:
                    if A[i, j]:
                        # isdel_temp[j] = 1
                        if not j in P:
                            isdel_temp[j] = 1
                        elif not i in P:
                            isdel_temp[i] = 1
        temp_list = isdel_temp.tolist()
        ind_list = []
        for i in range(n_confs):
            if not temp_list[i]:
                ind_list.append(i)
        return ind_list

    L = list(range(len(ensemble)))
    U = list(range(len(ensemble)))
    if lower is not None:
        A = RMSDs < lower
        L = getRefinedIndices(A)

    if upper is not None:
        B = RMSDs > upper
        U = getRefinedIndices(B)

    # find common indices from L and U
    I = list(set(L) - (set(L) - set(U)))

    # for p in P:
    #     if p not in I:
    #         I.append(p)
    I.sort()
    reens = ensemble[I]

    LOGGER.report('Ensemble was refined in %.2fs.', '_prody_refineEnsemble')
    LOGGER.info('%d conformations were removed from ensemble.'
                % (len(ensemble) - len(I)))

    return reens
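
# Usage sketch: dropping near-duplicates (pairwise RMSD < 0.5 A) and outliers
# (RMSD > 10 A) while keeping the reference and a protected conformation;
# `ens` is assumed to be a PDBEnsemble and '1xyz_A' a hypothetical label:
#
#     refined = refineEnsemble(ens, lower=0.5, upper=10.,
#                              protected=['1xyz_A'], ref=0)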
def buildPDBEnsemble(atomics, ref=None, title='Unknown', labels=None,
                     unmapped=None, **kwargs):
    """Builds a :class:`.PDBEnsemble` from a given reference structure and a
    list of structures (:class:`.Atomic` instances). Note that the reference
    should be included in the list as well.

    :arg atomics: a list of :class:`.Atomic` instances
    :type atomics: list

    :arg ref: reference structure or the index to the reference in
        *atomics*. If **None**, then the first item in *atomics* will be
        considered as the reference. If it is a :class:`.PDBEnsemble`
        instance, then *atomics* will be appended to the existing ensemble.
        Default is **None**
    :type ref: int, :class:`.Chain`, :class:`.Selection`, or
        :class:`.AtomGroup`

    :arg title: the title of the ensemble
    :type title: str

    :arg labels: labels of the conformations
    :type labels: list

    :arg degeneracy: whether only the active coordinate set (**True**) or
        all the coordinate sets (**False**) of each structure should be
        added to the ensemble. Default is **True**
    :type degeneracy: bool

    :arg occupancy: minimal occupancy of columns (range from 0 to 1).
        Columns whose occupancy is below this value will be trimmed
    :type occupancy: float

    :arg unmapped: labels of *atomics* that cannot be included in the
        ensemble. This is an output argument
    :type unmapped: list

    :arg subset: a subset for selecting particular atoms from the input
        structures. Default is ``"all"``
    :type subset: str

    :arg superpose: if set to ``'iter'``, :func:`.PDBEnsemble.iterpose` will
        be used to superpose the structures, otherwise conformations will be
        superposed with respect to the reference specified by *ref* unless
        set to ``False``. Default is ``'iter'``
    :type superpose: str, bool
    """

    occupancy = kwargs.pop('occupancy', None)
    degeneracy = kwargs.pop('degeneracy', True)
    subset = str(kwargs.get('subset', 'all')).lower()
    superpose = kwargs.pop('superpose', 'iter')
    superpose = kwargs.pop('iterpose', superpose)
    debug = kwargs.pop('debug', {})

    if 'mapping_func' in kwargs:
        raise DeprecationWarning(
            'mapping_func is deprecated. Please see release notes for '
            'more details: '
            'http://prody.csb.pitt.edu/manual/release/v1.11_series.html')
    start = time.time()

    if not isListLike(atomics):
        raise TypeError('atomics should be list-like')

    if len(atomics) == 1 and degeneracy is True:
        raise ValueError('atomics should have at least two items')

    if labels is not None:
        if len(labels) != len(atomics):
            raise TypeError('labels and atomics must have the same length')
    else:
        labels = []
        for atoms in atomics:
            if atoms is None:
                labels.append(None)
            else:
                labels.append(atoms.getTitle())

    if ref is None:
        target = atomics[0]
    elif isinstance(ref, Integral):
        target = atomics[ref]
    elif isinstance(ref, PDBEnsemble):
        target = ref._atoms
    else:
        target = ref

    # initialize a PDBEnsemble with reference atoms and coordinates
    isrefset = False
    if isinstance(ref, PDBEnsemble):
        ensemble = ref
    else:
        # select the subset of the reference beforehand for efficiency
        if subset != 'all':
            target = target.select(subset)

        ensemble = PDBEnsemble(title)
        if isinstance(target, Atomic):
            ensemble.setAtoms(target)
            ensemble.setCoords(target.getCoords())
            isrefset = True
        else:
            ensemble._n_atoms = len(target)
            isrefset = False

    # build the ensemble
    if unmapped is None:
        unmapped = []

    LOGGER.progress('Building the ensemble...', len(atomics),
                    '_prody_buildPDBEnsemble')
    for i, atoms in enumerate(atomics):
        if atoms is None:
            unmapped.append(labels[i])
            continue

        LOGGER.update(i, 'Mapping %s to the reference...' % atoms.getTitle(),
                      label='_prody_buildPDBEnsemble')
        try:
            atoms.getHierView()
        except AttributeError:
            raise TypeError('atomics must be a list of instances having '
                            'the access to getHierView')

        if subset != 'all':
            atoms = atoms.select(subset)

        # find the mapping of chains of atoms to those of target
        debug[labels[i]] = {}
        atommaps = alignChains(atoms, target, debug=debug[labels[i]],
                               **kwargs)

        if len(atommaps) == 0:
            unmapped.append(labels[i])
            continue

        # add the atommaps to the ensemble
        for atommap in atommaps:
            lbl = pystr(labels[i])
            if len(atommaps) > 1:
                chids = np.unique(atommap.getChids())
                strchids = ''.join(chids)
                lbl += '_%s' % strchids
            ensemble.addCoordset(atommap,
                                 weights=atommap.getFlags('mapped'),
                                 label=lbl, degeneracy=degeneracy)

            if not isrefset:
                ensemble.setCoords(atommap.getCoords())
                isrefset = True

    LOGGER.finish()

    if occupancy is not None:
        ensemble = trimPDBEnsemble(ensemble, occupancy=occupancy)

    if superpose == 'iter':
        ensemble.iterpose()
    elif superpose is not False:
        ensemble.superpose()

    LOGGER.info('Ensemble ({0} conformations) was built in {1:.2f}s.'.format(
        ensemble.numConfs(), time.time() - start))

    if unmapped:
        LOGGER.warn('{0} structures cannot be mapped.'.format(len(unmapped)))

    return ensemble
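
# Usage sketch: assuming `structures` is a list of Atomic instances (e.g.
# from parsePDB), building a CA-only ensemble against the first structure:
#
#     unmapped = []
#     ens = buildPDBEnsemble(structures, subset='ca', occupancy=0.9,
#                            superpose='iter', unmapped=unmapped)
#     # `unmapped` now holds the labels that could not be aligned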
def fetchPDB(*pdb, **kwargs):
    """Returns path(s) to PDB file(s) for specified *pdb* identifier(s).
    Files will be sought in the user specified *folder* or current working
    directory, and then in the local PDB folder and mirror, if they are
    available. If *copy* is set **True**, files will be copied into
    *folder*. If *compressed* is **False**, all files will be decompressed
    into *folder*. See :func:`pathPDBFolder` and :func:`pathPDBMirror` for
    managing local resources, and :func:`.fetchPDBviaFTP` and
    :func:`.fetchPDBviaHTTP` for downloading files from PDB servers."""

    if len(pdb) == 1 and isinstance(pdb[0], list):
        pdb = pdb[0]

    if 'format' in kwargs and kwargs.get('format') != 'pdb':
        return fetchPDBviaFTP(*pdb, **kwargs)

    identifiers = checkIdentifiers(*pdb)

    folder = kwargs.get('folder', '.')
    compressed = kwargs.get('compressed')

    # check *folder* specified by the user, usually pwd ('.')
    filedict = findPDBFiles(folder, compressed=compressed)

    filenames = []
    not_found = []
    exists = 0
    for i, pdb in enumerate(identifiers):
        if pdb is None:
            filenames.append(None)
        elif pdb in filedict:
            filenames.append(filedict[pdb])
            exists += 1
        else:
            filenames.append(None)
            not_found.append((i, pdb))

    if not not_found:
        if len(filenames) == 1:
            filenames = filenames[0]
            if exists:
                LOGGER.debug('PDB file is found in working directory ({0}).'
                             .format(sympath(filenames)))
        return filenames

    if not isWritable(folder):
        raise IOError('permission to write in {0} is denied, please '
                      'specify another folder'.format(folder))

    if compressed is not None and not compressed:
        filedict = findPDBFiles(folder, compressed=True)
        not_found, decompress = [], not_found
        for i, pdb in decompress:
            if pdb in filedict:
                fn = filedict[pdb]
                filenames[i] = gunzip(fn, splitext(fn)[0])
            else:
                not_found.append((i, pdb))

        if not not_found:
            return filenames[0] if len(identifiers) == 1 else filenames

    local_folder = pathPDBFolder()
    copy = kwargs.setdefault('copy', False)
    if local_folder:
        local_folder, is_divided = local_folder
        temp, not_found = not_found, []
        for i, pdb in temp:
            if is_divided:
                fn = join(local_folder, pdb[1:3], 'pdb' + pdb + '.pdb.gz')
            else:
                fn = join(local_folder, pdb + '.pdb.gz')

            if isfile(fn):
                if copy or not compressed and compressed is not None:
                    if compressed:
                        # bug fix: the destination was missing the dot
                        # before the extension ('pdb.gz' -> '.pdb.gz')
                        fn = copyFile(fn, join(folder, pdb + '.pdb.gz'))
                    else:
                        fn = gunzip(fn, join(folder, pdb + '.pdb'))
                filenames[i] = normpath(fn)
            else:
                not_found.append((i, pdb))

        if not not_found:
            if len(identifiers) == 1:
                fn = filenames[0]
                items = fn.split(pathsep)
                if len(items) > 5:
                    fndisp = pathsep.join(items[:3] + ['...'] + items[-1:])
                else:
                    fndisp = relpath(fn)
                LOGGER.debug('PDB file is found in the local folder ({0}).'
                             .format(fndisp))
                return fn
            else:
                return filenames

    if kwargs['copy'] or (compressed is not None and not compressed):
        kwargs['folder'] = folder

    downloads = [pdb for i, pdb in not_found]
    fns = None
    try:
        fns = fetchPDBfromMirror(*downloads, **kwargs)
    except IOError:
        pass
    else:
        if len(downloads) == 1:
            fns = [fns]
        temp, not_found = not_found, []
        for i, fn in enumerate(fns):
            if fn is None:
                not_found.append(temp[i])
            else:
                i, _ = temp[i]
                filenames[i] = fn

    if not not_found:
        return filenames[0] if len(identifiers) == 1 else filenames

    if fns:
        downloads = [pdb for i, pdb in not_found]
    fns = None

    tp = kwargs.pop('tp', None)
    if tp is not None:
        tp = tp.lower()

    if tp == 'http':
        try:
            fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via HTTP failed '
                        '({0}).'.format(str(err)))
    elif tp == 'ftp':
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            LOGGER.warn('Downloading PDB files via FTP failed '
                        '({0}).'.format(str(err)))
    else:
        tryHTTP = False
        try:
            fns = fetchPDBviaFTP(*downloads, check=False, **kwargs)
        except Exception as err:
            tryHTTP = True

        if fns is None or isinstance(fns, list) and None in fns:
            tryHTTP = True
        elif isinstance(fns, list):
            downloads = [not_found[i][1] for i in range(len(fns))
                         if fns[i] is None]
            if len(downloads) > 0:
                tryHTTP = True
        if tryHTTP:
            LOGGER.info('Downloading PDB files via FTP failed, '
                        'trying HTTP.')
            try:
                fns = fetchPDBviaHTTP(*downloads, check=False, **kwargs)
            except Exception as err:
                LOGGER.warn('Downloading PDB files via HTTP also failed '
                            '({0}).'.format(str(err)))

    if len(downloads) == 1:
        fns = [fns]
    if fns:
        for i, fn in zip([i for i, pdb in not_found], fns):
            filenames[i] = fn

    return filenames[0] if len(identifiers) == 1 else filenames
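
# Usage sketch (hypothetical identifiers): files are resolved from the
# working directory first, then the local folder/mirror, then the servers;
# compressed=False additionally decompresses into *folder*:
#
#     fn = fetchPDB('1xyz', compressed=False)          # e.g. './1xyz.pdb'
#     fns = fetchPDB(['1xyz', '2abc'], folder='pdbs')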
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified
    *pdb* identifier(s) and return path(s). Downloaded files will be stored
    in the local PDB folder, if one is set using :meth:`.pathPDBFolder`, and
    copied into *folder*, if specified by the user. If no destination folder
    is specified, files will be saved in the current working directory. If
    *compressed* is **False**, decompressed files will be copied into
    *folder*. The *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_ and
    `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` will
    fetch an EMD file, and ``format='xml'`` will fetch a PDBML file. If the
    PDBML header file is desired, ``noatom=True`` will do the job."""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))

    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not a valid format')

    local_folder = pathPDBFolder()
    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder,
                                                     pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (
                    copyFile(filename,
                             join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(
                    filename, join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder,
                                       pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb),
                                                  getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if format == 'emd':
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn('{0} download failed ({1}). It is '
                                'possible that you do not have rights to '
                                'download .gz files in the current network.'
                                .format(pdb, str(error)))
                else:
                    LOGGER.info('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        write = pdbfile.write
                        [write(block) for block in data]

                    filename = normpath(relpath(second(filename, pdb)))
                    LOGGER.debug('{0} downloaded ({1})'
                                 .format(pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn('{0} download failed, reason unknown.'
                                .format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()

    LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                 '{1} failed).'.format(success, failure))
    if len(identifiers) == 1:
        return filenames[0]
    else:
        return filenames
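
# Usage sketch (hypothetical identifier): fetching a decompressed mmCIF file
# into the current directory; a failed download yields None:
#
#     fn = fetchPDBviaFTP('1xyz', format='cif', compressed=False)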
def iterpose(self, rmsd=0.0001):

    # keep a copy of the original conformations; iterative superposition
    # should only refine the reference coordinates, after which a final
    # superpose() maps the originals onto them and records transformations
    confs = copy(self._confs)
    Ensemble.iterpose(self, rmsd)
    self._confs = confs
    LOGGER.info('Final superposition to calculate transformations.')
    self.superpose()
def assignSecstr(header, atoms, coil=False):
    """Assign secondary structure from the *header* dictionary to *atoms*.

    *header* must be a dictionary parsed using :func:`.parsePDB`. *atoms*
    may be an instance of :class:`.AtomGroup`, :class:`.Selection`,
    :class:`.Chain` or :class:`.Residue`.  ProDy can be configured to
    automatically parse and assign secondary structure information using
    the ``confProDy(auto_secondary=True)`` command.  See also the
    :func:`.confProDy` function.

    The Dictionary of Protein Secondary Structure (DSSP) single letter code
    assignments are used:

    * **G** = 3-turn helix (3-10 helix). Min length 3 residues.
    * **H** = 4-turn helix (alpha helix). Min length 4 residues.
    * **I** = 5-turn helix (pi helix). Min length 5 residues.
    * **T** = hydrogen bonded turn (3, 4 or 5 turn)
    * **E** = extended strand in parallel and/or anti-parallel beta-sheet
      conformation. Min length 2 residues.
    * **B** = residue in isolated beta-bridge (single pair beta-sheet
      hydrogen bond formation)
    * **S** = bend (the only non-hydrogen-bond based assignment).
    * **C** = residues not in one of the above conformations.

    See http://en.wikipedia.org/wiki/Protein_secondary_structure#The_DSSP_code
    for more details.

    The following PDB helix classes are omitted:

    * Right-handed omega (class number 2)
    * Right-handed gamma (4)
    * Left-handed alpha (6)
    * Left-handed omega (7)
    * Left-handed gamma (8)
    * 2 - 7 ribbon/helix (9)
    * Polyproline (10)

    Secondary structures are assigned to all atoms in a residue.  Amino acid
    residues without any secondary structure assignment in the header
    section will be assigned coil (C) conformation.  This can be prevented
    by passing the ``coil=False`` argument."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    helix = header.get('helix', {})
    sheet = header.get('sheet', {})
    if len(helix) == 0 and len(sheet) == 0:
        LOGGER.warn('header does not contain secondary structure data')
        return atoms

    ssa = atoms.getSecstrs()
    if ssa is None:
        if isinstance(atoms, AtomGroup):
            ag = atoms
        else:
            ag = atoms.getAtomGroup()
        ag.setSecstrs(np.zeros(ag.numAtoms(),
                               ATOMIC_FIELDS['secondary'].dtype))
        ag.setSecids(np.zeros(ag.numAtoms(),
                              ATOMIC_FIELDS['secid'].dtype))
        ag.setSecclasses(np.zeros(ag.numAtoms(),
                                  ATOMIC_FIELDS['secclass'].dtype))
        ag.setSecindices(np.zeros(ag.numAtoms(),
                                  ATOMIC_FIELDS['secindex'].dtype))

    prot = atoms.select('protein')
    if prot is not None:
        prot.setSecstrs('C')

    hierview = atoms.getHierView()
    count = 0
    getResidue = hierview.getResidue
    for key, value in helix.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs(mapHelix[value[0]])
        count += 1
    for key, value in sheet.items():  # PY3K: OK
        res = getResidue(*key)
        if res is None:
            continue
        res.setSecids(value[2])
        res.setSecclasses(value[0])
        res.setSecindices(value[1])
        res.setSecstrs('E')
        count += 1

    LOGGER.info('Secondary structures were assigned to {0} residues.'
                .format(count))
    return atoms
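
# Usage sketch: the header dictionary comes from parsing with header=True
# (confProDy(auto_secondary=True) automates this); '1xyz' is hypothetical:
#
#     structure, header = parsePDB('1xyz', header=True)
#     assignSecstr(header, structure)
#     structure.getSecstrs()    # per-atom single-letter codes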
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write a 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be a :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of the file."""

    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'
                        .format(type(trajectory)))

    irange = list(range(*slice(start, stop,
                               step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')
    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        nfi = trajectory.nextIndex()
        trajectory.reset()
        pack_i_48 = pack('i', 48)
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
            n_fixed = trajectory.numFixed()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
            n_fixed = trajectory.numFixed()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1
        n_fixed = 0

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    time_ = time()
    for j, i in enumerate(irange):
        diff = i - prev
        prev = i
        if isTrajectory:
            if diff > 1:
                trajectory.skip(diff - 1)
            frame = next(trajectory)
            if frame is None:
                break
            if unitcell:
                uc = frame._getUnitcell()
                uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                uc = uc[[0, 3, 1, 4, 5, 2]]
        elif isEnsemble:
            frame._index = i
        else:
            frame.setACSIndex(i)
        if align:
            frame.superpose()
        if j == 0:
            dcd.write(frame._getCoords(), uc, timestep=timestep,
                      firsttimestep=first_ts, framefreq=framefreq)
        else:
            dcd.write(frame._getCoords(), uc)
        LOGGER.update(i, label='_prody_writeDCD')
    if isAtomic:
        trajectory.setACSIndex(acsi)
    j += 1
    LOGGER.finish()
    dcd.close()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info('{0} coordinate sets written at output rate {1} frame/s.'
                .format(n_csets, int(n_csets / time_)))
    if j != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'.format(
            n_csets, j))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
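
# Usage sketch: writing an ensemble back out as DCD; align=True superposes
# each frame onto the reference coordinates before it is written, assuming
# a reference is set:
#
#     ens = parseDCD('input.dcd')
#     writeDCD('aligned.dcd', ens, align=True)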