def resetTicks(x, y=None):
    """Reset X (and Y) axis ticks using values in given *array*.

    Current tick positions of the active matplotlib figure are truncated to
    integers and used as indices into *x* (and *y*); the values found at
    those indices become the new tick labels.  Ticks in the current figure
    should not be fractional values for this function to work as expected.

    :arg x: labels for X axis ticks, **None** leaves the X axis untouched
    :arg y: labels for Y axis ticks, **None** leaves the Y axis untouched

    Failures are not raised; they are reported as a logger warning that
    names the affected axis."""

    import matplotlib.pyplot as plt

    def _relabel(values, ticker, axis):
        """Relabel ticks accessed through *ticker* with items of *values*."""

        try:
            ticks = ticker()[0]
            indices = list(ticks.astype(int))
            # drop the last tick when it points past the end of the data
            if indices[-1] > len(values):
                indices.pop()
            if indices:
                labels = list(values[indices])
                # ticks without a matching value get an empty label
                ticker(ticks, labels + [''] * (len(ticks) - len(labels)))
        except Exception:
            LOGGER.warning('{0}ticks could not be reset.'.format(axis))

    if x is not None:
        _relabel(x, plt.xticks, 'x')
    if y is not None:
        _relabel(y, plt.yticks, 'y')
def loadAtoms(filename):
    """Return :class:`AtomGroup` instance from *filename*.  This function makes
    use of :func:`numpy.load` function.  See also :func:`saveAtoms`.

    :arg filename: name of the file to load

    :raises ValueError: when the loaded archive has no ``n_atoms`` entry"""

    LOGGER.timeit()
    attr_dict = load(filename)
    files = set(attr_dict.files)
    if 'n_atoms' not in files:
        raise ValueError("'{0:s}' is not a valid atomic data file"
                         .format(filename))

    # the atom group is created whether or not coordinates were saved, so
    # that the remaining attributes can always be assigned
    ag = AtomGroup(str(attr_dict['title']))
    if 'coordinates' in files:
        ag._n_csets = int(attr_dict['n_csets'])
        ag._coords = attr_dict['coordinates']
    ag._n_atoms = int(attr_dict['n_atoms'])
    ag._setTimeStamp()

    if 'bonds' in files and 'bmap' in files and 'numbonds' in files:
        ag._bonds = attr_dict['bonds']
        ag._bmap = attr_dict['bmap']
        ag._data['numbonds'] = attr_dict['numbonds']

    # iterate over member names, which works on both Python 2 and 3
    # (``iteritems`` is not available on Python 3)
    for key in attr_dict.files:
        if key in SKIPLOAD:
            continue
        data = attr_dict[key]
        if key in ATOMIC_ATTRIBUTES:
            ag._data[key] = data
        else:
            ag.setData(key, data)

    if ag.numCoordsets() > 0:
        ag._acsi = 0
    if 'cslabels' in files:
        ag.setCSLabels(list(attr_dict['cslabels']))
    LOGGER.timing('Atom group was loaded in %.2fs.')
    return ag
def _superpose(self, **kwargs):
    """Superpose conformations and update coordinates.

    Every conformation is fitted onto the reference coordinates using its
    own row of weights, and is overwritten with the fitted coordinates.
    When :attr:`_indices` is set, the fit is calculated for that subset of
    atoms but applied to all atoms.  If *trans* keyword argument is true,
    rotation matrix and translation vector of each conformation are stored
    in a 4x4 array; otherwise stored transformations are reset to **None**."""

    if kwargs.get('trans', False):
        if self._trans is not None:
            LOGGER.info('Existing transformations will be overwritten.')
        trans = np.zeros((self._n_csets, 4, 4))
    else:
        trans = None

    confs = self._confs
    selected = self._indices
    if selected is None:
        weights, reference, subset = self._weights, self._coords, confs
    else:
        weights = self._weights[:, selected]
        reference = self._coords[selected]
        subset = confs[:, selected]

    for i, conf in enumerate(subset):
        rmat, tvec = getTransformation(conf, reference, weights[i])
        if trans is not None:
            matrix = trans[i]
            matrix[:3, :3] = rmat
            matrix[:3, 3] = tvec
        # the transformation is applied to all atoms of the conformation
        confs[i] = tvec + np.dot(confs[i], rmat.T)
    self._trans = trans
def psiBlastRun(sequence, cycles=2, filename=None, **kwargs):
    """Returns the results from a full PSI-BLAST run (multiple cycles).
    All arguments are the same as psiBlastCycle and are passed to it
    except for cycles.

    :arg cycles: the number of cycles to run
        default is 2
    :type cycles: int

    :returns: a tuple of job ID list, result list and the sequence returned
        by the last cycle (the input *sequence* when no cycle is run)
    """

    # these two are forwarded explicitly below, so they are removed from
    # kwargs to avoid passing the same keyword argument twice
    job_id = kwargs.pop('previousjobid', '')
    selectedHits = kwargs.pop('selectedHits', '')

    cycles_done = 0
    results_list = []
    job_ids = []
    while cycles_done < cycles:
        if cycles_done > 0:
            # later cycles start from the hits preselected by the last job
            selectedHits = ('http://www.ebi.ac.uk/Tools/services/rest/'
                            'psiblast/result/' + job_id + '/preselected_seq')
            sequence = None
        job_id, results, sequence = psiBlastCycle(
            sequence, filename, previousjobid=job_id,
            selectedHits=selectedHits, cycle=cycles_done, **kwargs)
        results_list.append(results)
        job_ids.append(job_id)
        cycles_done += 1
        LOGGER.info('Finished cycle {0} with job ID {1}.'
                    .format(cycles_done, job_id))
    return job_ids, results_list, sequence
def alignCoordsets(atoms, weights=None):
    """Return *atoms* after superposing coordinate sets onto its active
    coordinate set.  Transformations will be calculated for *atoms* and
    applied to its :class:`.AtomGroup`, when applicable.  Optionally,
    atomic *weights* can be passed for weighted superposition.

    When *atoms* has fewer than two coordinate sets, a warning is logged
    and **None** is returned.

    :raises TypeError: when *atoms* does not provide ``getACSIndex`` and
        ``numCoordsets`` methods"""

    try:
        acsi, n_csets = atoms.getACSIndex(), atoms.numCoordsets()
    except AttributeError:
        # a type object does not accept the 's' format specification, so
        # the default formatting is used
        raise TypeError('atoms must have type Atomic, not {0}'
                        .format(type(atoms)))
    if n_csets < 2:
        LOGGER.warning('{0:s} contains fewer than two coordinate sets, '
                       'alignment was not performed.'.format(str(atoms)))
        return

    try:
        ag = atoms.getAtomGroup()
    except AttributeError:
        ag = atoms
    agacsi = ag.getACSIndex()

    tar = atoms._getCoords()
    try:
        for i in range(n_csets):
            if i == acsi:
                continue
            atoms.setACSIndex(i)
            ag.setACSIndex(i)
            calcTransformation(atoms, tar, weights).apply(ag)
    finally:
        # active coordinate set indices are restored even on failure
        atoms.setACSIndex(acsi)
        ag.setACSIndex(agacsi)
    return atoms
def deformAtoms(atoms, mode, rmsd=None):
    """Append to *atoms* a coordinate set deformed along *mode*.

    *atoms* must be a :class:`.AtomGroup` instance and *mode* must come
    from a 3-dimensional model with the same number of atoms.  The new
    coordinate set is the active coordinate set displaced by *mode*.  When
    *rmsd* is provided, the displacement is scaled so that the new
    coordinate set has the given RMSD to the active coordinate set."""

    if not isinstance(atoms, AtomGroup):
        raise TypeError('atoms must be an AtomGroup, not {0}'
                        .format(type(atoms)))
    if not isinstance(mode, VectorBase):
        raise TypeError('mode must be a Mode or Vector instance, '
                        'not {0}'.format(type(mode)))
    if not mode.is3d():
        raise ValueError('mode must be from a 3-dimensional model.')
    if atoms.numAtoms() != mode.numAtoms():
        raise ValueError('number of atoms do not match')

    displacement = mode.getArrayNx3()
    if rmsd is None:
        atoms.addCoordset(atoms.getCoords() + displacement)
        return

    rmsd = float(rmsd)
    # from rmsd = (((scalar * displacement)**2).sum() / n_atoms)**0.5
    scalar = (atoms.numAtoms() * rmsd**2 / (displacement**2).sum())**0.5
    LOGGER.info('Mode is scaled by {0}.'.format(scalar))
    atoms.addCoordset(atoms.getCoords() + displacement * scalar)
def __and__(self, other):
    """Return a :class:`Selection` of atoms found both in this instance and
    in *other*, or **None** when they share no atoms.

    Both operands must point to the same atom group.  When their active
    coordinate set indices differ, a warning is logged and the result uses
    index zero.

    :raises TypeError: when *other* is not an :class:`AtomPointer`
    :raises ValueError: when operands point to different atom groups"""

    if self is other:
        return self

    if not isinstance(other, AtomPointer):
        raise TypeError('other must be an AtomPointer')

    if self._ag != other.getAtomGroup():
        raise ValueError('both selections must be from the same AtomGroup')

    # checked (and warned about) once only
    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    common = set(self._getIndices()).intersection(other.getIndices())
    if common:
        # the set is converted to a list first, since unique would treat
        # a set as a single object instead of a sequence of indices
        indices = np.unique(list(common))
        return Selection(self._ag, indices, '({0:s}) and ({1:s})'.format(
                         self.getSelstr(), other.getSelstr()), acsi)
def calcMeff(msa, seqid=0.8, refine=False, weight=False, **kwargs):
    """Returns the Meff for *msa*, which may be an :class:`.MSA` instance or
    a 2D Numpy character array.

    Similar sequences decrease the diversity of an MSA, so each sequence is
    given a weight: a sequence that has 5 similar sequences in the MSA
    (itself included) weighs 1/5 = 0.2.  Meff is the sum of all sequence
    weights, i.e. the effective number of independent sequences.  Sequences
    sharing *seqid* or more sequence identity are regarded as similar.

    When *refine* is **True**, refinement is requested from the underlying
    routine (by the first sequence, according to the original notes).  When
    *weight* is **True**, an array for sequence weights is handed to the
    routine and the weighted result is requested."""

    msa = getMSA(msa)
    from .msatools import msameff
    LOGGER.timeit('_meff')

    theta = 1.0 - seqid
    refine_flag = 1 if refine else 0
    # the underlying routine takes the inverse of *weight* as a flag
    meff_only = 0 if weight else 1
    if meff_only:
        meff = msameff(msa, theta=theta, meff_only=meff_only,
                       refine=refine_flag)
    else:
        w = zeros((msa.shape[0]), float)
        meff = msameff(msa, theta=theta, meff_only=meff_only,
                       refine=refine_flag, w=w)
    LOGGER.report('Meff was calculated in %.2fs.', '_meff')
    return meff
def range2selstr(rangestr):
    """Return a selection string for residue ranges given in *rangestr*.

    *rangestr* is a comma separated list of ``from-to`` fragments, each of
    which becomes ``resnum FROM to TO``; fragments are joined with ``or``.
    A fragment that is not made of exactly two parts, or whose start is
    larger than its end, is reported as irregular and is interpreted as a
    range starting from 1.  Fragments that cannot be parsed are reported
    and skipped.  **None** is returned for a blank *rangestr*."""

    if rangestr.strip() == '':
        return None

    sels = []
    for frag in rangestr.split(','):
        try:
            fromtos = frag.split('-')
            if len(fromtos) == 2:
                fro, to = fromtos
            else:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                fro = '1'
                to = fromtos[-1]
            fro_num = intResnum(fro)
            to_num = intResnum(to)
            if fro_num > to_num:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                to_num = fro_num
                fro_num = 1
            fro = str(fro_num)
            to = str(to_num)
        except ValueError:
            # reported through the logger like other problems found here
            LOGGER.warn('error occurred when parsing "%s"'%rangestr)
            continue
        sels.append('resnum %s to %s'%(fro, to))
    return ' or '.join(sels)
def calcMinBranchLength(go_id1, go_id2, go):
    '''Find the minimum branch length between two terms in the GO DAG.

    The length is the sum of the depth differences between each term and
    the deepest common ancestor of the two terms.  **None** is returned,
    after a warning, when the terms have no common ancestor.

    :arg go_id1: the first GO ID
    :type go_id1: str

    :arg go_id2: the second GO ID
    :type go_id2: str

    :arg go: object containing a gene ontology (GO) directed acyclic graph
        (DAG)
    :type go: `~goatools.obo_parser.GODag`
    '''
    ancestor = findDeepestCommonAncestor([go_id1, go_id2], go)
    if ancestor is None:
        LOGGER.warn('There are no common ancestors between {0} and {1} so no meaningful distance can be calculated.'.format(
            go_id1, go_id2))
        return None

    # distance from the common ancestor down to each term, added up
    base_depth = go[ancestor].depth
    return sum(go[term].depth - base_depth for term in (go_id1, go_id2))
def iterpose(self, rmsd=0.0001):
    """Run :meth:`Ensemble.iterpose` with *rmsd*, put back the
    conformations as they were before that call, and finish with a call to
    :meth:`superpose`."""

    # copy is taken before the base class implementation changes them
    original_confs = self._confs.copy()
    Ensemble.iterpose(self, rmsd)
    self._confs = original_confs

    LOGGER.info('Final superposition to calculate transformations.')
    self.superpose()
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for members of an ensemble using
    :func:`calcDeepFunctionOverlaps`.

    Members are looked up with :func:`queryGOA`, and those without any
    molecular functions are left out after a warning.  Keyword arguments
    are passed to both functions.

    :arg ens: an ensemble with labels, of which the first five characters
        are used as identifiers, or a list-like of identifiers
    :type ens: :class:`Ensemble`
    """

    is_ensemble = isinstance(ens, Ensemble)
    if not is_ensemble and not isListLike(ens):
        raise TypeError('ens should be an ensemble or list-like')

    ids = [label[:5] for label in ens.getLabels()] if is_ensemble else ens
    if not isinstance(ids[0], str):
        raise TypeError('ens should have labels')

    annotated = []
    for entry in queryGOA(ids, **kwargs):
        if len(entry._molecular) == 0:
            LOGGER.warn(
                'ensemble member {0} has no molecular functions and was omitted'.format(entry._title))
        else:
            annotated.append(entry)

    return calcDeepFunctionOverlaps(*annotated, **kwargs)
def calcDeepFunctionOverlaps(*goa_data, **kwargs):
    """Calculate function overlaps between the deep (most detailed)
    molecular functions of the given sets of GO terms.

    Deep functions are found with :func:`findDeepestFunctions` for each
    item of *goa_data*; items without any are left out after a warning.
    Overlaps are calculated with :func:`calcGoOverlap`.  Keyword arguments
    other than *return_funcs* are passed to both functions.

    :arg return_funcs: whether to also return the deep functions that were
        used, default is **False**
    :type return_funcs: bool
    """

    return_funcs = kwargs.pop('return_funcs', False)

    candidates = [findDeepestFunctions(entry, **kwargs)
                  for entry in goa_data]
    deepFuncs = []
    for source, funcs in zip(goa_data, candidates):
        if len(funcs) == 0:
            LOGGER.warn(
                'ensemble member {0} has no deep molecular functions and was omitted'
                .format(source._title))
        else:
            deepFuncs.append(funcs)

    overlaps = calcGoOverlap(*deepFuncs, **kwargs)
    return (overlaps, deepFuncs) if return_funcs else overlaps
def showContactMap(enm, *args, **kwargs):
    """Show Kirchhoff matrix using :func:`~matplotlib.pyplot.spy`.

    Additional arguments are passed to the plotting function, and the
    object it returns is returned.  When Kirchhoff matrix of *enm* is not
    set, a warning is logged and **None** is returned.

    .. plot::
       :context:
       :include-source:

       p38_gnm = GNM('p38')
       p38_gnm.buildKirchhoff( p38_structure )
       plt.figure(figsize=(4,4))
       showContactMap( p38_gnm )

    .. plot::
       :context:
       :nofigs:

       plt.close('all')"""

    import matplotlib.pyplot as plt
    if not isinstance(enm, GNMBase):
        raise TypeError('model argument must be an ENM instance')

    matrix = enm.getKirchhoff()
    if matrix is None:
        LOGGER.warning('kirchhoff matrix is not set')
        return None

    show = plt.spy(matrix, *args, **kwargs)
    plt.title('{0:s} contact map'.format(enm.getTitle()))
    for set_label in (plt.xlabel, plt.ylabel):
        set_label('Residue index')
    return show
def mapOntoChainByAlignment(atoms, chain, **kwargs):
    """This function is similar to :func:`.mapOntoChain` but correspondence
    of chains is found by alignment provided.

    The last sequence of the alignment, with gap characters removed, is
    compared to sequences of chains in *atoms*; the first chain with an
    identical sequence is mapped onto *chain* using the alignment.  When no
    chain matches, a warning is logged and an empty list is returned.
    Without *alignments*, the call is passed on to :func:`.mapOntoChain`.

    :arg alignments: A list of predefined alignments. It can be also a
        dictionary or :class:`MSA` instance where the keys or
        labels are the title of *atoms* or *chains*.
    :type alignments: list, dict, :class:`MSA`

    :arg index: index of the alignment to use when *alignments* is a list,
        default is 0
    :type index: int
    """

    alignments = kwargs.pop('alignments', None)
    if alignments is None:
        return mapOntoChain(atoms, chain, **kwargs)

    if isinstance(alignments, (MSA, dict)):
        # reference sequence comes first, target sequence comes last
        alignment = [str(alignments[chain.getTitle()]),
                     str(alignments[atoms.getTitle()])]
    else:
        alignment = alignments[kwargs.pop('index', 0)]

    tar_aligned_seq = alignment[-1]
    for char in GAPCHARS:
        tar_aligned_seq = tar_aligned_seq.replace(char, '').upper()

    for target_chain in atoms.getHierView().iterChains():
        if target_chain.getSequence().upper() == tar_aligned_seq:
            return mapOntoChain(target_chain, chain, pwalign=alignment,
                                **kwargs)

    LOGGER.warn('The sequence of chain does not match that in alignment (%s).'%atoms.getTitle())
    return []
def __add__(self, other):
    """Returns an :class:`.AtomMap` instance. Order of pointed atoms are
    preserved.

    Indices of *other* follow indices of this instance, and dummy atom
    counts of the operands that have any are added up.  The result has the
    active coordinate set index of this instance; a warning is logged when
    that of *other* is different.

    :raises TypeError: when *other* does not point to an atom group
    :raises ValueError: when operands point to different atom groups"""

    try:
        ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('unsupported operand type(s) for +: {0} and '
                        '{1}'.format(repr(type(self).__name__),
                                     repr(type(other).__name__)))

    if ag != self._ag:
        raise ValueError('AtomPointer instances must point to the same '
                         'AtomGroup instance')

    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warning('Active coordset indices of atoms are not the same.'
                       ' Result will have ACSI {0}.'.format(acsi))

    title = '({0}) + ({1})'.format(str(self), str(other))
    indices = concatenate([self._getIndices(), other._getIndices()])

    dummies = 0
    for pointer in (self, other):
        try:
            dummies += pointer.numDummies()
        except AttributeError:
            # this operand does not keep track of dummy atoms
            pass

    return AtomMap(ag, indices, acsi, title=title, intarrays=True,
                   dummies=dummies)
def __or__(self, other):
    """Return a :class:`Selection` of atoms found in this instance or in
    *other*.

    Both operands must point to the same atom group.  When their active
    coordinate set indices differ, a warning is logged and the result uses
    index zero.  A trailing dummy index is left out of the result.

    :raises TypeError: when *other* does not point to an atom group
    :raises ValueError: when operands point to different atom groups"""

    if self is other:
        return self

    try:
        ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != ag:
        raise ValueError('both selections must be from the same AtomGroup')

    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    merged = unique(concatenate((self._getIndices(),
                                 other._getIndices())))
    if merged[-1] == atommap.DUMMY:
        merged = merged[:-1]

    selstr = '({0:s}) or ({1:s})'.format(self.getSelstr(),
                                         other.getSelstr())
    return Selection(self._ag, merged, selstr, acsi, unique=True)
def getCoordsets(self, indices=None):
    """Returns coordinate sets at given *indices*. *indices* may be an
    integer, a list of integers or ``None``. ``None`` returns all
    coordinate sets.

    When no atom indices are set and all coordinate sets are requested, the
    file is read in one pass and the frame pointer is put back where it
    was.  If fewer values than expected are read, complete frames that
    were read are returned after a warning.  All other requests are handled
    by :meth:`TrajFile.getCoordsets`.

    :raises ValueError: when the file is closed"""

    if self._closed:
        raise ValueError('I/O operation on closed file')

    if (self._indices is None and
            (indices is None or indices == slice(None))):
        nfi = self._nfi
        self.reset()

        n_floats = self._n_floats + self._unitcell * 14
        n_atoms = self._n_atoms
        n_csets = self._n_csets

        data = np.fromfile(self._file, self._dtype, n_floats * n_csets)
        if len(data) < n_floats * n_csets:
            # at most the requested number of items is read, so a short
            # read is what reveals missing data; only whole frames are kept
            n_csets = len(data) // n_floats
            data = data[:n_csets * n_floats]
            LOGGER.warning('DCD is corrupt, {0:d} out of {1:d} frames '
                           'were parsed.'.format(n_csets, self._n_csets))
        data = data.reshape((n_csets, n_floats))
        if self._unitcell:
            # leading values of each frame are skipped when unitcell is set
            data = data[:, 14:]
        data = data.reshape((n_csets, 3, n_atoms+2))
        # first and last value of each coordinate block are left out
        data = data[:, :, 1:-1]
        data = data.transpose(0, 2, 1)

        self.goto(nfi)
        if self._astype is not None and self._astype != data.dtype:
            data = data.astype(self._astype)
        return data
    else:
        return TrajFile.getCoordsets(self, indices)
def __and__(self, other):
    """Return a :class:`Selection` of atoms found both in this instance and
    in *other*, or **None** when they share no atoms.

    Both operands must point to the same atom group.  When their active
    coordinate set indices differ, a warning is logged and the result uses
    index zero.  A trailing dummy index is left out of the result.

    :raises TypeError: when *other* does not point to an atom group
    :raises ValueError: when operands point to different atom groups"""

    if self is other:
        return self

    try:
        ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != ag:
        raise ValueError('both selections must be from the same AtomGroup')

    # checked (and warned about) once only
    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    common = set(self._getIndices()).intersection(other.getIndices())
    if common:
        # the set is converted to a list first, since unique would treat
        # a set as a single object instead of a sequence of indices
        indices = unique(list(common))
        if indices[-1] == atommap.DUMMY:
            indices = indices[:-1]
        return Selection(self._ag, indices, '({0}) and ({1})'
                         .format(self.getSelstr(), other.getSelstr()),
                         acsi)
def calcShannonEntropy(msa, ambiguity=True, omitgaps=True, **kwargs):
    """Returns Shannon entropy array calculated for *msa*, which may be
    an :class:`.MSA` instance or a 2D Numpy character array.  Implementation
    is case insensitive and handles ambiguous amino acids as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  When *ambiguity* is set **False**, all alphabet
    characters as considered as distinct types.

    All non-alphabet characters are considered as gaps, and they are handled
    in two ways:

      * non-existent, the probability of observing amino acids in a given
        column is adjusted, by default
      * as a distinct character with its own probability, when *omitgaps* is
        **False**

    A warning is logged when *msa* has fewer than 100 sequences."""

    msa = getMSA(msa)
    length = msa.shape[1]
    if msa.shape[0] < 100:
        # message names the quantity that is calculated by this function
        LOGGER.warning(
            "Shannon entropy is estimated more reliably with higher number "
            "of sequences, and minimal number of sequences is recommended "
            "as 100."
        )
    entropy = empty(length, float)
    from .msatools import msaentropy
    return msaentropy(msa, entropy,
                      ambiguity=bool(ambiguity), omitgaps=bool(omitgaps))
def wwPDBServer(*key):
    """Set/get `wwPDB`_ FTP/HTTP server location used for downloading PDB
    structures.  Without an argument, the current setting is returned.
    Use one of the following keywords for setting a server:

    +---------------------------+-----------------------------+
    | wwPDB FTP server          | *Key* (case insensitive)    |
    +===========================+=============================+
    | RCSB PDB (USA) (default)  | RCSB, USA, US               |
    +---------------------------+-----------------------------+
    | PDBe (Europe)             | PDBe, Europe, Euro, EU      |
    +---------------------------+-----------------------------+
    | PDBj (Japan)              | PDBj, Japan, Jp             |
    +---------------------------+-----------------------------+

    .. _wwPDB: http://www.wwpdb.org/

    :raises TypeError: when more than one key is given, or the key is not
        a string
    :raises ValueError: when the key is not a known server identifier"""

    if not key:
        return SETTINGS.get('wwpdb', None)

    if len(key) != 1:
        raise TypeError('one wwPDB server identifier is expected, {0} given'
                        .format(len(key)))

    try:
        server = key[0].lower()
    except AttributeError:
        raise TypeError('key must be a string')

    if server not in WWPDB_FTP_SERVERS:
        raise ValueError('{0} is not a valid wwPDB server identifier'
                         .format(repr(server)))

    SETTINGS['wwpdb'] = server
    SETTINGS.save()
    LOGGER.info('wwPDB server is set to {}.'
                .format(WWPDB_FTP_SERVERS[server][0]))
def buildSCAMatrix(msa, turbo=True, **kwargs):
    """Return SCA matrix calculated for *msa*, which may be an :class:`.MSA`
    instance or a 2D Numpy character array.  The matrix is square, with as
    many rows as there are columns in *msa*.

    Implementation is case insensitive and handles ambiguous amino acids
    as follows:

      * **B** (Asx) count is allocated to *D* (Asp) and *N* (Asn)
      * **Z** (Glx) count is allocated to *E* (Glu) and *Q* (Gln)
      * **J** (Xle) count is allocated to *I* (Ile) and *L* (Leu)
      * **X** (Xaa) count is allocated to the twenty standard amino acids
      * Joint probability of observing a pair of ambiguous amino acids is
        allocated to all potential combinations, e.g. probability of **XX**
        is allocated to 400 combinations of standard amino acids, similarly
        probability of **XB** is allocated to 40 combinations of *D* and *N*
        with the standard amino acids.

    Selenocysteine (**U**, Sec) and pyrrolysine (**O**, Pyl) are considered
    as distinct amino acids.  All non-alphabet characters are considered as
    gaps.

    *turbo* is converted to a boolean and passed to the underlying
    routine."""

    msa = getMSA(msa)
    from .msatools import msasca
    LOGGER.timeit('_sca')

    n_columns = msa.shape[1]
    sca = msasca(msa, zeros((n_columns, n_columns), float),
                 turbo=bool(turbo))

    LOGGER.report('SCA matrix was calculated in %.2fs.', '_sca')
    return sca
def getResnums(self, gaps=False):
    """Return list of residue numbers associated with non-gapped *seq*.
    When *gaps* is **True**, return a list containing the residue numbers
    with gaps appearing as **None**.

    Residue numbers are inferred from the full label.  When label does not
    contain residue number information, or the numbers do not agree with
    the number of residues, a range of numbers starting from 1 is
    returned."""

    title, start, end = splitSeqLabel(self.getLabel(True))
    try:
        start, end = int(start), int(end)
    except (TypeError, ValueError):
        # label does not hold values that can be converted to integers
        LOGGER.info('Cannot parse label start, end values, Setting '
                    'resnums 1 to {0:d}'.format(self.numResidues()))
        start, end = 1, self.numResidues()
    else:
        if (end - start + 1) != self.numResidues():
            LOGGER.info('Label start-end position does not match '
                        'length of ungapped sequence. Setting '
                        'resnums 1 to {0:d}'.format(self.numResidues()))
            start, end = 1, self.numResidues()

    resnums = iter(range(start, end + 1))
    if gaps:
        # a residue number is consumed only for alphabet characters
        return [next(resnums) if torf else None
                for torf in char.isalpha(self._array)]
    else:
        return list(resnums)
def parsePQR(filename, **kwargs):
    """Returns an :class:`.AtomGroup` containing data parsed from a PQR
    file, or **None** when no atoms are parsed.

    :arg filename: a PQR filename
    :type filename: str

    :arg title: title of the atom group, *name* is also accepted, default
        is the lowercase filename without its extension(s)
    :type title: str

    :arg chain: chain identifiers for parsing specific chains
    :type chain: str

    :arg subset: name of the subset of atoms to parse
    :type subset: str

    :arg ag: an existing atom group to pass to the line parser instead of
        a new one
    :type ag: :class:`.AtomGroup`

    :arg max_n_atoms: passed to the line parser, default is 1e5

    :raises IOError: when *filename* is not an existing file
    :raises TypeError: when *subset* or *chain* is not a string, or *ag* is
        not an :class:`.AtomGroup`
    :raises ValueError: when *subset* is not valid or *chain* is empty"""

    title = kwargs.get('title', kwargs.get('name'))
    chain = kwargs.get('chain')
    subset = kwargs.get('subset')
    max_n_atoms = kwargs.get('max_n_atoms', 1e5)

    if not os.path.isfile(filename):
        raise IOError('No such file: {0}'.format(repr(filename)))
    if title is None:
        fn, ext = os.path.splitext(os.path.split(filename)[1])
        if ext == '.gz':
            # compressed file, the remaining extension is dropped as well
            fn, ext = os.path.splitext(fn)
        title = fn.lower()

    title_suffix = ''
    if subset:
        try:
            subset = _PDBSubsets[subset.lower()]
        except AttributeError:
            raise TypeError('subset must be a string')
        except KeyError:
            raise ValueError('{0} is not a valid subset'
                             .format(repr(subset)))
        title_suffix = '_' + subset
    if chain is not None:
        if not isinstance(chain, str):
            raise TypeError('chain must be a string')
        elif len(chain) == 0:
            raise ValueError('chain must not be an empty string')
        title_suffix = '_' + chain + title_suffix

    if 'ag' in kwargs:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        n_csets = ag.numCoordsets()
    else:
        ag = AtomGroup(title + title_suffix)
        n_csets = 0

    pqr = openFile(filename, 'rt')
    try:
        lines = pqr.readlines()
    finally:
        # the file is closed even when reading fails
        pqr.close()

    LOGGER.timeit()
    ag = _parsePDBLines(ag, lines, split=0, model=1, chain=chain,
                        subset=subset, altloc_torf=False, format='pqr',
                        max_n_atoms=max_n_atoms)
    if ag.numAtoms() > 0:
        LOGGER.report('{0} atoms and {1} coordinate sets were '
                      'parsed in %.2fs.'.format(ag.numAtoms(),
                      ag.numCoordsets() - n_csets))
        return ag
    else:
        return None
def calcCrossProjection(ensemble, mode1, mode2, scale=None, **kwargs):
    """Return projection of conformational deviations onto modes from
    different models.

    :arg ensemble: ensemble for which deviations will be projected
    :type ensemble: :class:`.Ensemble`

    :arg mode1: normal mode to project conformations onto
    :type mode1: :class:`.Mode`, :class:`.Vector`

    :arg mode2: normal mode to project conformations onto
    :type mode2: :class:`.Mode`, :class:`.Vector`

    :arg scale: scale width of the projection onto mode ``x`` or ``y``,
        scaling factor is taken from *scalar* keyword argument when it is
        given, otherwise it is calculated from the projections so that
        their widths become equal and their correlation positive; the
        factor is reported using the logger

    :arg rmsd: passed to :func:`calcProjection`, default is **True**

    :returns: a tuple of projections onto *mode1* and *mode2*"""

    if not isinstance(ensemble, (Ensemble, Conformation, Vector, TrajBase)):
        raise TypeError('ensemble must be Ensemble, Conformation, Vector, '
                        'or a Trajectory, not {0}'.format(type(ensemble)))
    if not isinstance(mode1, VectorBase):
        raise TypeError('mode1 must be a Mode instance, not {0}'
                        .format(type(mode1)))
    if not mode1.is3d():
        raise ValueError('mode1 must be 3-dimensional')
    if not isinstance(mode2, VectorBase):
        raise TypeError('mode2 must be a Mode instance, not {0}'
                        .format(type(mode2)))
    if not mode2.is3d():
        raise ValueError('mode2 must be 3-dimensional')

    if scale is not None:
        assert isinstance(scale, str), 'scale must be a string'
        scale = scale.lower()
        assert scale in ('x', 'y'), 'scale must be x or y'

    use_rmsd = kwargs.pop('rmsd', True)
    xcoords = calcProjection(ensemble, mode1, use_rmsd)
    ycoords = calcProjection(ensemble, mode2, use_rmsd)
    if not scale:
        return xcoords, ycoords

    scalar = kwargs.get('scalar', None)
    if scalar:
        assert isinstance(scalar, (float, int)), 'scalar must be a number'
    else:
        # ratio of projection widths, signed by their correlation
        y_width = ycoords.max() - ycoords.min()
        x_width = xcoords.max() - xcoords.min()
        scalar = (y_width / x_width) * np.sign(np.dot(xcoords, ycoords))

    if scale == 'x':
        LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                    .format(mode1, scalar))
        xcoords = xcoords * scalar
    else:
        # factor is defined for x, its inverse is used for y
        scalar = 1 / scalar
        LOGGER.info('Projection onto {0} is scaled by {1:.2f}'
                    .format(mode2, scalar))
        ycoords = ycoords * scalar
    return xcoords, ycoords
def pathVMD(*path):
    """Return VMD path, or set it to be a user specified *path*.

    Without an argument, the path saved in settings is returned when it is
    executable.  Otherwise VMD is looked for, in the registry on Windows
    and on the search path elsewhere; an executable that is found is saved
    in settings and returned, and **None** is returned when none is found.
    With a single argument, *path* is saved in settings.

    :raises OSError: when the given *path* is not executable
    :raises ValueError: when more than one argument is given"""

    if not path:
        path = SETTINGS.get('vmd', None)
        if isExecutable(path):
            return path

        LOGGER.warning('VMD path is not set by user, looking for it.')
        vmdbin = None
        if PLATFORM == 'Windows':
            if PY3K:
                import winreg
            else:
                import _winreg as winreg  # PY3K: OK
            roots = ('Software\\University of Illinois\\VMD\\',
                     'Software\\WOW6432node\\University of Illinois\\VMD\\')
            for vmdversion in ('1.8.7', '1.9', '1.9.1'):
                for root in roots:
                    try:
                        key = winreg.OpenKey(winreg.HKEY_LOCAL_MACHINE,
                                             root + vmdversion)
                        vmddir = winreg.QueryValueEx(key, 'VMDDIR')[0]
                        vmdbin = join(vmddir, 'vmd.exe')
                    except OSError:
                        # this version is not registered under this key
                        pass
        else:
            vmdbin = which('vmd')

        if isExecutable(vmdbin):
            setVMDpath(vmdbin)
            return vmdbin
    elif len(path) == 1:
        path = path[0]
        if isExecutable(path):
            SETTINGS['vmd'] = path
            SETTINGS.save()
            LOGGER.info("VMD path is set to '{0}'.".format(path))
        else:
            raise OSError('{0} is not executable.'.format(str(path)))
    else:
        raise ValueError('specify a single path string')
def setVMDpath(path):
    """Save *path* in settings as the path to VMD executable.

    :raises OSError: when *path* is not executable"""

    if not isExecutable(path):
        raise OSError("{0:s} is not executable.".format(str(path)))

    SETTINGS["vmd"] = path
    SETTINGS.save()
    LOGGER.info("VMD path is set to '{0:s}'.".format(path))
def superpose(self):
    """Superpose the ensemble onto the reference coordinates.

    Elapsed time is reported using the logger.

    :raises ValueError: when reference coordinates or conformations are
        not set"""

    if self._coords is None:
        raise ValueError('coordinates are not set, use `setCoords`')
    confs = self._confs
    if confs is None or len(confs) == 0:
        raise ValueError('conformations are not set, use `addCoordset`')

    LOGGER.timeit()
    # trans kwarg is used by PDBEnsemble
    self._superpose(trans=True)
    LOGGER.timing('Superposition completed in %.2f seconds.')
def _setCoords(self, coords, label=None, overwrite=False):
    """Set coordinates without data type checking.  *coords* must
    be a :class:`~numpy.ndarray`, but may have data type other than
    :class:`numpy.float64`, e.g. :class:`numpy.float32`.  *label*
    argument may be used to label coordinate sets.  *label* may be
    a string or a list of strings length equal to the number of
    coordinate sets.

    Coordinate sets are replaced when none is set, when *overwrite* is
    true, or when *coords* holds more than one coordinate set; the active
    coordinate set index is then set to zero.  Otherwise, only the active
    coordinate set is updated.

    :raises ValueError: when number of atoms in *coords* does not match"""

    n_atoms = self._n_atoms
    if n_atoms:
        if coords.shape[-2] != n_atoms:
            raise ValueError('coords array has incorrect number of atoms')
    else:
        self._n_atoms = n_atoms = coords.shape[-2]

    ndim = coords.ndim
    shape = coords.shape
    if self._coords is None or overwrite or (ndim == 3 and shape[0] > 1):
        if ndim == 2:
            self._coords = coords.reshape((1, n_atoms, 3))
            if label is None:
                self._cslabels = [None]
            else:
                self._cslabels = [str(label)]
            self._n_csets = n_csets = 1
        else:
            self._coords = coords
            self._n_csets = n_csets = shape[0]

            if label is None or isinstance(label, str):
                self._cslabels = [label] * n_csets
            elif isinstance(label, (list, tuple)):
                if len(label) == n_csets:
                    self._cslabels = list(label)
                else:
                    self._cslabels = [None] * n_csets
                    LOGGER.warn('Number of labels does not match number '
                                'of coordinate sets.')
            else:
                # labels are reset, so that labels of the replaced
                # coordinate sets do not stay behind
                self._cslabels = [None] * n_csets
                LOGGER.warn('Wrong type for `label` argument.')
        self._acsi = 0
        self._setTimeStamp()
    else:
        acsi = self._acsi
        if ndim == 2:
            self._coords[acsi] = coords
        else:
            self._coords[acsi] = coords[0]
        self._setTimeStamp(acsi)
        if label is not None:
            self._cslabels[acsi] = str(label)
def performSVD(self, coordsets):
    """Calculate principal modes using singular value decomposition (SVD).
    *coordsets* argument may be a :class:`.Atomic`, :class:`.Ensemble`,
    or :class:`numpy.ndarray` instance.  If *coordsets* is a numpy array,
    its shape must be ``(n_csets, n_atoms, 3)``.  Note that coordinate
    sets must be aligned prior to SVD calculations.

    This is a considerably faster way of performing PCA calculations
    compared to eigenvalue decomposition of covariance matrix, but is
    an approximate method when heterogeneous datasets are analyzed.
    Covariance method should be preferred over this one for analysis of
    ensembles with missing atomic data.  See :ref:`pca-xray-calculations`
    example for comparison of results from SVD and covariance methods.

    :raises TypeError: when *coordsets* is not of an accepted type
    :raises ValueError: when *coordsets* is not a valid coordinate array,
        or has fewer than 3 coordinate sets or atoms"""

    linalg = importLA()
    start = time.time()
    if not isinstance(coordsets, (Ensemble, Atomic, np.ndarray)):
        raise TypeError('coordsets must be an Ensemble, Atomic, Numpy '
                        'array instance')
    if isinstance(coordsets, np.ndarray):
        if (coordsets.ndim != 3 or coordsets.shape[2] != 3 or
                coordsets.dtype not in (np.float32, float)):
            raise ValueError('coordsets is not a valid coordinate array')
        deviations = coordsets - coordsets.mean(0)
    else:
        if isinstance(coordsets, Ensemble):
            deviations = coordsets.getDeviations()
        elif isinstance(coordsets, Atomic):
            deviations = (coordsets._getCoordsets() -
                          coordsets._getCoords())

    # messages state the limits that are actually checked
    n_confs = deviations.shape[0]
    if n_confs < 3:
        raise ValueError('coordsets must have at least 3 coordinate sets')
    n_atoms = deviations.shape[1]
    if n_atoms < 3:
        raise ValueError('coordsets must have at least 3 atoms')

    dof = n_atoms * 3
    deviations = deviations.reshape((n_confs, dof)).T

    vectors, values, self._temp = linalg.svd(deviations,
                                             full_matrices=False)
    values = (values ** 2) / n_confs
    self._dof = dof
    self._n_atoms = n_atoms
    # modes with vanishing variance are left out
    which = values > 1e-18
    self._eigvals = values[which]
    self._array = vectors[:, which]
    self._vars = self._eigvals
    self._trace = self._vars.sum()
    self._n_modes = len(self._eigvals)
    LOGGER.debug('{0} modes were calculated in {1:.2f}s.'
                 .format(self._n_modes, time.time()-start))
def fetchPDBviaHTTP(*pdb, **kwargs):
    """Retrieve PDB file(s) for specified *pdb* identifier(s) and return
    path(s).  Downloaded files will be stored in local PDB folder, if one
    is set using :meth:`.pathPDBFolder`, and copied into *folder*, if
    specified by the user.  If no destination folder is specified, files
    will be saved in the current working directory.  If *compressed* is
    **False**, decompressed files will be copied into *folder*.

    A single path (or **None** on failure) is returned when exactly one
    identifier was requested, otherwise a list with one entry per
    identifier."""

    check = kwargs.get('check', True)
    identifiers = checkIdentifiers(*pdb) if check else list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    report = kwargs.get('report', True)

    extension = '.pdb'
    local_folder = pathPDBFolder()

    # ``getPath`` tells where the raw download is written; ``second`` is the
    # follow-up step (copy / decompress / nothing) yielding the final path.
    if local_folder:
        local_folder, is_divided = local_folder

        if is_divided:
            def getPath(ident):
                subfolder = makePath(join(local_folder, ident[1:3]))
                return join(subfolder, 'pdb' + ident + '.pdb.gz')
        else:
            def getPath(ident):
                return join(local_folder, ident + '.pdb.gz')

        if output_folder is None:
            def second(filename, ident):
                return filename
        elif compressed:
            def second(filename, ident):
                target = join(output_folder, ident + extension + '.gz')
                return copyFile(filename, target)
        else:
            def second(filename, ident):
                target = join(output_folder, ident + extension)
                return gunzip(filename, target)
    else:
        if output_folder is None:
            output_folder = getcwd()

        if compressed:
            def getPath(ident):
                return join(output_folder, ident + extension + '.gz')

            def second(filename, ident):
                return filename
        else:
            def getPath(ident):
                return join(output_folder, ident + extension)

            def second(filename, ident):
                # Decompress the download in place.
                return gunzip(getPath(ident), getPath(ident))

    getURL = WWPDB_HTTP_URL[wwPDBServer() or 'us']

    filenames = []
    success = 0
    failure = 0
    for ident in identifiers:
        if ident is None:
            filenames.append(None)
            continue

        try:
            handle = openURL(getURL(ident))
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(ident, str(err)))
            failure += 1
            filenames.append(None)
            continue

        data = handle.read()
        if not len(data):
            LOGGER.warn('{0} download failed, reason unknown.'.format(ident))
            failure += 1
            filenames.append(None)
            continue

        filename = getPath(ident)
        with open(filename, 'w+b') as pdbfile:
            pdbfile.write(data)

        filename = normpath(relpath(second(filename, ident)))
        LOGGER.debug('{0} downloaded ({1})'.format(ident, sympath(filename)))
        success += 1
        filenames.append(filename)

    if report:
        LOGGER.debug('PDB download via HTTP completed ({0} downloaded, '
                     '{1} failed).'.format(success, failure))

    return filenames[0] if len(identifiers) == 1 else filenames
def addCoordset(self, coords, weights=None, label=None, **kwargs):
    """Add coordinate set(s) to the ensemble.  *coords* must be a Numpy
    array with suitable shape and dimensionality, or an object with
    :meth:`getCoordsets`. *weights* is an optional argument.
    If provided, its length must match number of atoms.  Weights of
    missing (not resolved) atoms must be ``0`` and weights of those
    that are resolved can be anything greater than ``0``.  If not
    provided, weights of all atoms for this coordinate set will be
    set equal to ``1``. *label*, which may be a PDB identifier or a
    list of identifiers, is used to label conformations.

    Recognized keyword arguments:

    :arg degeneracy: when true, only the first coordinate set of *coords*
        is added, default is **False**
    :arg data: mapping of data label to an array whose first dimension
        matches the number of coordinate sets being added
    :arg sequence: sequence(s) for the added coordinate sets, used when
        *coords* does not provide :meth:`getSequence`"""

    degeneracy = kwargs.pop('degeneracy', False)
    adddata = kwargs.pop('data', None)

    atoms = coords
    n_atoms = self._n_atoms
    n_select = self.numSelected()
    n_confs = self.numCoordsets()

    try:
        if degeneracy:
            if self._coords is not None:
                if isinstance(coords, Ensemble):
                    coords = coords._getCoords(selected=False)
                elif hasattr(coords, '_getCoords'):
                    coords = coords._getCoords()
            else:
                if isinstance(coords, Ensemble):
                    coords = coords.getCoords(selected=False)
                elif hasattr(coords, 'getCoords'):
                    coords = coords.getCoords()
        else:
            if self._coords is not None:
                if isinstance(coords, Ensemble):
                    coords = coords._getCoordsets(selected=False)
                elif hasattr(coords, '_getCoordsets'):
                    coords = coords._getCoordsets()
            else:
                if isinstance(coords, Ensemble):
                    coords = coords.getCoordsets(selected=False)
                elif hasattr(coords, 'getCoordsets'):
                    coords = coords.getCoordsets()

    except AttributeError:
        label = label or 'Unknown'

    else:
        if coords is None:
            raise ValueError('coordinates are not set')
        elif label is None and isinstance(atoms, Atomic):
            if not isinstance(atoms, AtomGroup):
                ag = atoms.getAtomGroup()
            else:
                ag = atoms
            label = ag.getTitle()
            # Fewer coordinate sets than the atom group holds: tag the
            # label with the active coordinate set index.
            if coords.shape[0] < ag.numCoordsets():
                label += '_m' + str(atoms.getACSIndex())
        else:
            label = label or 'Unknown'

    # check coordinates
    try:
        checkCoords(coords, csets=True, natoms=n_atoms)
    except Exception:
        # Retry against the number of selected atoms.
        try:
            checkCoords(coords, csets=True, natoms=n_select)
        except TypeError:
            raise TypeError('coords must be a numpy array or an object '
                            'with `getCoords` method')

    if coords.ndim == 2:
        n_nodes, _ = coords.shape
        coords = coords.reshape((1, n_nodes, 3))
        n_csets = 1
    else:
        n_csets, n_nodes, _ = coords.shape
        if degeneracy:
            coords = coords[:1]

    n_repeats = 1 if degeneracy else n_csets

    if not n_atoms:
        self._n_atoms = n_nodes

    if n_nodes == n_select and self.isSelected():
        # Coordinates cover only the selection: start from the reference
        # coordinates and overwrite the selected positions.
        full_coords = np.repeat(self._coords[np.newaxis, :, :],
                                n_csets, axis=0)
        full_coords[:, self._indices, :] = coords
        coords = full_coords

    # check weights
    if weights is None:
        weights = np.ones((n_csets, n_atoms, 1), dtype=float)
    else:
        weights = checkWeights(weights, n_atoms, n_csets)

    if degeneracy:
        weights = weights[:1]

    # check sequences
    seqs = None
    sequence = kwargs.pop('sequence', None)
    if hasattr(atoms, 'getSequence'):
        if sequence is not None:
            LOGGER.warn(
                'sequence is supplied though coords has getSequence')
        sequence = atoms.getSequence()
        seqs = [sequence for _ in range(n_repeats)]
    else:
        if sequence is None:
            try:
                sequence = self._atoms.getSequence()
            except AttributeError:
                if self._msa:
                    sequence = ''.join('X' for _ in range(n_atoms))
                # sequence and seqs remains to be None if MSA has not
                # been created
        if isinstance(sequence, Sequence):
            seqs = [str(sequence)]
        elif isinstance(sequence, MSA):
            seqs = [str(seq) for seq in sequence]
        elif np.isscalar(sequence):
            seqs = [sequence for _ in range(n_repeats)]

    if seqs:
        if len(seqs) != n_repeats:
            raise ValueError(
                'the number of sequences should be either one or '
                'that of coordsets')

    # assign new values
    # update labels
    if n_csets > 1 and not degeneracy:
        if isinstance(label, str):
            labels = [
                '{0}_m{1}'.format(label, i + 1) for i in range(n_csets)
            ]
        else:
            if len(label) != n_csets:
                raise ValueError('length of label and number of '
                                 'coordinate sets must be the same')
            labels = label
    else:
        labels = [label] if np.isscalar(label) else label

    self._labels.extend(labels)

    # update sequences
    if seqs:
        msa = MSA(seqs, title=self.getTitle(), labels=labels)
        if self._msa is None:
            if n_confs > 0:
                # Earlier conformations had no sequences: fill with 'X'.
                def_seqs = np.chararray((n_confs, n_atoms))
                def_seqs[:] = 'X'

                old_labels = [self._labels[i] for i in range(n_confs)]
                self._msa = MSA(def_seqs, title=self.getTitle(),
                                labels=old_labels)
                self._msa.extend(msa)
            else:
                self._msa = msa
        else:
            self._msa.extend(msa)

    # update coordinates
    if self._confs is None and self._weights is None:
        self._confs = coords
        self._weights = weights
    elif self._confs is not None and self._weights is not None:
        self._confs = np.concatenate((self._confs, coords), axis=0)
        self._weights = np.concatenate((self._weights, weights), axis=0)
    else:
        raise RuntimeError('_confs and _weights must be set or None at '
                           'the same time')

    # appending new data
    # Run whenever either side carries data.  The previous ``and`` guard
    # made the two defaults below unreachable, so data passed to an ensemble
    # without data was dropped, and existing data was not zero-padded when
    # no *data* was passed.
    if self._data is not None or adddata is not None:
        if self._data is None:
            self._data = {}
        if adddata is None:
            adddata = {}
        all_keys = set(list(self._data.keys()) + list(adddata.keys()))

        for key in all_keys:
            if key in self._data:
                data = self._data[key]
                if key not in adddata:
                    # No new values for an existing label: pad with zeros.
                    shape = [n_repeats]
                    for s in data.shape[1:]:
                        shape.append(s)
                    newdata = np.zeros(shape, dtype=data.dtype)
                else:
                    newdata = np.asarray(adddata[key])
                    if newdata.shape[0] != n_repeats:
                        raise ValueError(
                            'the length of data["%s"] does not match that of coords'
                            % key)
            else:
                # New label: pad the already stored conformations with zeros.
                newdata = np.asarray(adddata[key])
                shape = [self._n_csets]
                for s in newdata.shape[1:]:
                    shape.append(s)
                data = np.zeros(shape, dtype=newdata.dtype)
            self._data[key] = np.concatenate((data, newdata), axis=0)

    # update the number of coordinate sets
    self._n_csets += n_repeats
def calcPerturbResponse(model, **kwargs):
    """Returns a matrix of profiles from scanning the response of the
    structure to random perturbations at specific atom (or node) positions.
    The function implements the perturbation response scanning (PRS) method
    described in [CA09]_.  Rows of the matrix are the average magnitude of the
    responses obtained by perturbing the atom/node position at that row index,
    i.e. ``prs_profile[i,j]`` will give the response of residue/node *j* to
    perturbations in residue/node *i*.  PRS is performed using the covariance
    matrix from *model*, e.g. :class:`.ANM` instance.  Every row of the
    returned matrix is divided by its diagonal element.

    When an *atoms* instance is given, the PRS matrix will be added as data,
    which can be retrieved with ``atoms.getData('prs_matrix')``, and the
    function returns ``(atoms, matrix)`` instead of just the matrix.
    *model* and *atoms* must have the same number of atoms.  *atoms* must be
    an :class:`.AtomGroup` instance (a :class:`.Selection` is copied first).

    .. [CA09] Atilgan C, Atilgan AR, Perturbation-Response Scanning
       Reveals Ligand Entry-Exit Mechanisms of Ferric Binding Protein.
       *PLoS Comput Biol* **2009** 5(10):e1000544.

    The PRS matrix can be calculated and saved as follows::

      prs_matrix = calcPerturbResponse(p38_anm, saveMatrix=True)

    The PRS matrix can also be saved later as follows::

      writeArray('prs_matrix.txt', prs_matrix, format='%8.6f', delimiter='\\t')

    :arg atoms: atoms to attach the PRS matrix to, default is **None**
    :type atoms: :class:`.AtomGroup`

    :arg saveMatrix: whether to save the last matrix generated to a text file.
        Default is False
    :type saveMatrix: bool

    :arg saveName: The file name for saved matrices
        Default is 'response_matrix.txt'.
    :type saveName: str

    :arg suppressDiag: whether to zero the diagonal of the returned matrix.
        Default is False
    :type suppressDiag: bool
    """

    if not isinstance(model, (NMA, ModeSet, Mode)):
        raise TypeError('model must be an NMA, ModeSet, or Mode instance')
    if isinstance(model, NMA) and len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    atoms = kwargs.get('atoms', None)
    if atoms is not None:
        if isinstance(atoms, Selection):
            atoms = atoms.copy()
        if not isinstance(atoms, AtomGroup):
            raise TypeError('atoms must be an AtomGroup instance')
        elif atoms.numAtoms() != model.numAtoms():
            raise ValueError('model and atoms must have the same number atoms')

    n_atoms = model.numAtoms()
    LOGGER.timeit('_prody_prs_all')
    LOGGER.info('Calculating covariance matrix')
    LOGGER.timeit('_prody_cov')

    cov = calcCovariance(model)
    if cov is None:
        raise ValueError('model did not return a covariance matrix')

    LOGGER.clear()
    LOGGER.report('Covariance matrix calculated in %.1fs.', '_prody_cov')

    LOGGER.progress('Calculating perturbation response', n_atoms,
                    '_prody_prs_mat')

    cov_squared = cov ** 2
    if not model.is3d():
        prs_matrix = cov_squared
    else:
        # Sum every 3x3 block of the squared covariance: first collapse the
        # three rows of each atom, then the three columns of each atom.
        row_summed = cov_squared.reshape(
            n_atoms, 3, 3 * n_atoms).sum(axis=1)
        prs_matrix = row_summed.reshape(n_atoms, n_atoms, 3).sum(axis=2)

    LOGGER.clear()
    LOGGER.report('Perturbation response matrix calculated in %.1fs.',
                  '_prody_prs_mat')

    saveMatrix = kwargs.get('saveMatrix', False)
    suppressDiag = kwargs.get('suppressDiag', False)
    saveName = kwargs.get('saveName', 'response_matrix.txt')

    # Normalize each row by its self-response; the column vector is
    # broadcast across the row.
    self_dp = np.diag(prs_matrix).reshape(n_atoms, 1)
    norm_prs_matrix = prs_matrix / self_dp

    if suppressDiag:
        # suppress the diagonal (self displacement) to facilitate
        # visualizing the response profile
        norm_prs_matrix = norm_prs_matrix - np.diag(np.diag(norm_prs_matrix))

    if saveMatrix:
        np.savetxt(saveName, norm_prs_matrix, delimiter='\t', fmt='%8.6f')

    LOGGER.report('Perturbation response scanning completed in %.1fs.',
                  '_prody_prs_all')

    if atoms is not None:
        atoms.setData('prs_matrix', norm_prs_matrix)
        return atoms, norm_prs_matrix
    return norm_prs_matrix
def assignSecstr(header, atoms, coil=False):
    """Assign secondary structure from *header* dictionary to *atoms*.
    *header* must be a dictionary parsed using the :func:`.parsePDB`.
    *atoms* may be an instance of :class:`.AtomGroup`, :class:`.Selection`,
    :class:`.Chain` or :class:`.Residue`.  ProDy can be configured to
    automatically parse and assign secondary structure information using
    ``confProDy(auto_secondary=True)`` command.  See also :func:`.confProDy`
    function.

    The Dictionary of Protein Secondary Structure, in short DSSP, type
    single letter code assignments are used:

      * **G** = 3-turn helix (310 helix). Min length 3 residues.
      * **H** = 4-turn helix (alpha helix). Min length 4 residues.
      * **I** = 5-turn helix (pi helix). Min length 5 residues.
      * **T** = hydrogen bonded turn (3, 4 or 5 turn)
      * **E** = extended strand in parallel and/or anti-parallel
        beta-sheet conformation. Min length 2 residues.
      * **B** = residue in isolated beta-bridge (single pair beta-sheet
        hydrogen bond formation)
      * **S** = bend (the only non-hydrogen-bond based assignment).
      * **C** = residues not in one of above conformations.

    See http://en.wikipedia.org/wiki/Protein_secondary_structure#The_DSSP_code
    for more details.

    Following PDB helix classes are omitted:

      * Right-handed omega (2, class number)
      * Right-handed gamma (4)
      * Left-handed alpha (6)
      * Left-handed omega (7)
      * Left-handed gamma (8)
      * 2 - 7 ribbon/helix (9)
      * Polyproline (10)

    Secondary structures are assigned to all atoms in a residue.  Amino acid
    residues without any secondary structure assignments in the header
    section will be assigned coil (C) conformation.

    .. note:: The *coil* argument is accepted for backwards compatibility
       but is not consulted: protein atoms are always reset to coil before
       helix and sheet records are applied.

    *atoms* is returned.  :exc:`TypeError` is raised when *header* is not a
    dictionary."""

    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    helix = header.get('helix', {})
    sheet = header.get('sheet', {})
    if len(helix) == 0 and len(sheet) == 0:
        LOGGER.warn('header does not contain secondary structure data')
        return atoms

    ssa = atoms.getSecstrs()
    if ssa is None:
        # No secondary structure fields yet: create zeroed arrays on the
        # parent atom group.
        if isinstance(atoms, AtomGroup):
            ag = atoms
        else:
            ag = atoms.getAtomGroup()
        n_all = ag.numAtoms()
        ag.setSecstrs(np.zeros(n_all, ATOMIC_FIELDS['secondary'].dtype))
        ag.setSecids(np.zeros(n_all, ATOMIC_FIELDS['secid'].dtype))
        ag.setSecclasses(np.zeros(n_all, ATOMIC_FIELDS['secclass'].dtype))
        ag.setSecindices(np.zeros(n_all, ATOMIC_FIELDS['secindex'].dtype))

    # ``select`` gives None when nothing matches; skip the coil reset then
    # instead of failing with AttributeError.
    protein = atoms.select('protein')
    if protein is not None:
        protein.setSecstrs('C')

    hierview = atoms.getHierView()
    getResidue = hierview.getResidue
    count = 0
    # Helix records take their code from ``mapHelix``; sheet records are
    # always 'E'.
    for records, fixed_code in ((helix, None), (sheet, 'E')):
        for key, value in records.items():  # PY3K: OK
            res = getResidue(*key)
            if res is None:
                continue
            res.setSecids(value[2])
            res.setSecclasses(value[0])
            res.setSecindices(value[1])
            if fixed_code is None:
                res.setSecstrs(mapHelix[value[0]])
            else:
                res.setSecstrs(fixed_code)
            count += 1

    LOGGER.info(
        'Secondary structures were assigned to {0} residues.'.format(count))

    return atoms
def writePerturbResponsePDB(prs_matrix, pdbIn=None, **kwargs):
    """Write the average response to perturbation of a particular residue
    (a row of a perturbation response matrix) or the average effect of
    perturbation of a particular residue (a column of a normalized
    perturbation response matrix) into the b-factor field of a PDB file for
    visualisation in a molecular graphics program.
    If no chain is given this will be done for that residue in all chains.

    If no residue number is given then the effectiveness and sensitivity
    profiles will be written out instead. These two profiles are also
    returned as arrays for further analysis if they aren't already provided.

    :arg prs_matrix: a perturbation response matrix or a :class:`.AtomGroup`
        object with a PRS matrix associated as data
    :type prs_matrix: array or :class:`.AtomGroup`

    :arg pdbIn: file name for the input PDB file where you would like the
        PRS data mapped
    :type pdbIn: str

    :arg pdbOut: a list of file names (enclosed in square brackets) for the
        output PDB file, default is to append the chain and residue info
        (name and number) onto the pdbIn stem.  The input for pdbOut can
        also be used as a stem if you enter a single string enclosed in
        quotes.  If no residue number is supplied, chain is ignored and the
        default is to append '_effectiveness' and '_sensitivity' onto the
        stem.
    :type pdbOut: list

    :arg chain: chain identifier for the residue of interest, default is
        all chains.  If you want to analyse residues in a subset of chains,
        concatenate them together e.g. 'AC'
    :type chain: str

    :arg resnum: residue number for the residue of interest
    :type resnum: int

    :arg direction: the direction you want to use to read data out of the
        PRS matrix for plotting: the options are 'effect' or 'response'.
        Default is 'effect'.  A row gives the effect on each residue of
        perturbing the specified residue.  A column gives the response of
        the specified residue to perturbing each residue.  If no residue
        number is provided then this option will be ignored
    :type direction: str

    :arg returnData: whether to return effectiveness and sensitivity for
        analysis, default is False
    :type returnData: bool

    :arg effectiveness: effectiveness profile
    :type effectiveness: array

    :arg sensitivity: sensitivity profile
    :type sensitivity: array

    With *returnData* set and no *resnum*, ``(structure, effectiveness,
    sensitivity)`` is returned; otherwise **None**.
    """

    if not isinstance(prs_matrix, np.ndarray):
        try:
            prs_matrix = prs_matrix.getData('prs_matrix')
        except Exception:
            raise TypeError(
                'Please provide a valid PRS matrix in numpy ndarray format.')

    try:
        with open(pdbIn, 'r') as fi:
            lines = fi.readlines()
    except Exception:
        raise PRSMatrixParseError(
            'Please provide a valid file name for the input PDB.')

    chain = kwargs.get('chain', None)

    structure = parsePDB(pdbIn, subset='ca')
    structure.setData('prs_matrix', prs_matrix)

    hv = structure.getHierView()
    chains = np.array([chainAg.getChids()[0] for chainAg in hv])
    if chain is None:
        chain = ''.join(chains)

    resnum = kwargs.get('resnum', None)
    pdbOut = kwargs.get('pdbOut', None)
    # A single string is an output stem; a list holds explicit file names.
    # The stem is always defined so the profile branch also works when a
    # list was passed (it used to be undefined in that case).
    # NOTE(review): splitting on the first '.' truncates paths such as
    # './model.pdb'; kept as is to preserve existing output names.
    if isinstance(pdbOut, str):
        out_stem = pdbOut.split('.')[0]
        pdbOut = None
    else:
        out_stem = pdbIn.split('.')[0]

    # Zero B-factor padded to the six columns it replaces (line[60:66]).
    zero_field = '{:6.2f}'.format(0.0)

    def select_residue(chid, number):
        return structure.select(
            'chain {0} and resnum {1}'.format(chid, number))

    if resnum is None:
        effectiveness = kwargs.get('effectiveness', None)
        sensitivity = kwargs.get('sensitivity', None)
        if effectiveness is None or sensitivity is None:
            effectiveness, sensitivity = calcPerturbResponseProfiles(
                prs_matrix)

        structure.setData('effectiveness', effectiveness)
        structure.setData('sensitivity', sensitivity)

        file_effs_name = '{0}_effectiveness.pdb'.format(out_stem)
        file_sens_name = '{0}_sensitivity.pdb'.format(out_stem)

        max_effs = np.max(structure.getData('effectiveness'))
        max_sens = np.max(structure.getData('sensitivity'))

        with open(file_effs_name, 'w') as fileEffs, \
                open(file_sens_name, 'w') as fileSens:
            for line in lines:
                if line.startswith('ATOM'):
                    residue = select_residue(line[21], line[22:26])
                    # Profiles are rescaled so that the maximum is 100.
                    effs = float(
                        residue.getData('effectiveness')) * 100 / max_effs
                    sens = float(
                        residue.getData('sensitivity')) * 100 / max_sens
                    fileEffs.write(
                        line[:60] + '{:6.2f}'.format(effs) + line[66:])
                    fileSens.write(
                        line[:60] + '{:6.2f}'.format(sens) + line[66:])
                elif line.startswith('HETATM'):
                    fileEffs.write(line[:60] + zero_field + line[66:])
                    fileSens.write(line[:60] + zero_field + line[66:])
                elif not line.startswith('ANISOU'):
                    # ANISOU records are dropped, all others copied as is.
                    fileEffs.write(line)
                    fileSens.write(line)

        LOGGER.info('The effectiveness and sensitivity profiles were written'
                    ' to {0} and {1}.'.format(file_effs_name, file_sens_name))

        returnData = kwargs.get('returnData', False)
        if returnData:
            return structure, effectiveness, sensitivity
        return

    direction = kwargs.get('direction', 'effect')
    for c in chain:
        if c not in chains:
            raise PRSMatrixParseError('Chain {0} was not found in {1}'.format(
                c, pdbIn))

    if pdbOut is None:
        pdbOut = []
        for c in chain:
            pdbOut.append('{0}_{1}_{2}{3}_{4}.pdb'.format(
                out_stem, c,
                str(select_residue(c, resnum).getResnames()),
                resnum, direction))

    matrix = structure.getData('prs_matrix')
    # Each chain gets its own output file; the previous code indexed pdbOut
    # with a stale loop variable and wrote every chain to the last name.
    for n, c in enumerate(chain):
        target = select_residue(c, resnum).getResindices()
        with open(pdbOut[n], 'w') as fo:
            for line in lines:
                if line.startswith('ATOM'):
                    other = select_residue(
                        line[21], line[22:26]).getResindices()
                    # Compare by value; ``is`` only matched interned strings.
                    if direction == 'effect':
                        value = float(matrix[target, other]) * 100
                    else:
                        value = float(matrix[other, target]) * 100
                    fo.write(line[:60] + '{:6.2f}'.format(value) + line[66:])
                elif line.startswith('HETATM'):
                    fo.write(line[:60] + zero_field + line[66:])
                elif not line.startswith('ANISOU'):
                    fo.write(line)

    LOGGER.info('Perturbation responses for specific residues were written'
                ' to {0}.'.format(', '.join(pdbOut)))
def searchPfam(query, **kwargs):
    """Returns Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, a protein sequence, or a sequence
        file.  Sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    Sequence queries are submitted to the EBI hmmscan service and each match
    is a dictionary with ``accession``, ``class``, ``id``, ``locations`` (a
    dictionary) and ``type`` keys.  Identifier queries are looked up on the
    Pfam server and each match holds the attributes of the XML element plus
    ``locations`` (a list of dictionaries).  **None** is returned when the
    server reports a system error for an identifier query."""

    if isfile(query):
        from prody.sequence import MSAFile
        try:
            seq = next(MSAFile(query))
        except Exception:
            # Not a parsable MSA file: treat the content as a raw sequence.
            with openFile(query) as inp:
                seq = ''.join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError('could not parse a sequence without gaps from ' +
                             query)
    else:
        seq = ''.join(query.split())

    # ``xml.etree.cElementTree`` was removed in Python 3.9; ElementTree
    # offers the same API on both Python 2 and 3.
    import xml.etree.ElementTree as ET
    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))

    if len(seq) >= MINSEQLEN:
        if not seq.isalpha():
            raise ValueError(repr(seq) + ' is not a valid sequence')

        fseq = '>Seq\n' + seq
        parameters = {'hmmdb': 'pfam', 'seq': fseq}
        enc_params = urllib.urlencode(parameters).encode('utf-8')
        request = urllib2.Request(
            'https://www.ebi.ac.uk/Tools/hmmer/search/hmmscan', enc_params)

        results_url = urllib2.urlopen(request).geturl()

        # Ask for the tab separated download of the results page.
        res_params = {'format': 'tsv'}
        enc_res_params = urllib.urlencode(res_params)
        modified_res_url = results_url.replace(
            'results', 'download') + '?' + enc_res_params

        result_request = urllib2.Request(modified_res_url)
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(
            seq[:MINSEQLEN]))

        try:
            tsv = urllib2.urlopen(result_request).read()
        except Exception:
            raise ValueError('No matching Pfam domains were found.')

        if PY3K:
            tsv = tsv.decode()

        lines = tsv.split('\n')
        keys = lines[0].split('\t')
        matches = {}
        # First line is the header, last element is the text after the
        # final newline.
        for line in lines[1:-1]:
            fields = line.split('\t')
            child = dict((key, fields[j]) for j, key in enumerate(keys))

            accession = child['Family Accession']
            pfam_id = accession.split('.')[0]
            matches[pfam_id] = {
                'accession': accession,
                'class': 'Domain',
                'id': child['Family id'],
                'locations': {
                    'ali_end': child['Ali. End'],
                    'ali_start': child['Ali. Start'],
                    'bitscore': child['Bit Score'],
                    'end': child['Env. End'],
                    'cond_evalue': child['Cond. E-value'],
                    'ind_evalue': child['Ind. E-value'],
                    'evidence': 'hmmer v3.0',
                    'hmm_end': child['Model End'],
                    'hmm_start': child['Model Start'],
                    'start': child['Env. Start'],
                },
                'type': 'Pfam-A',
            }

        return matches

    # Only identifier queries (shorter than MINSEQLEN) reach this point.
    accession = None
    if len(seq) <= 5:
        idcode = None
        from prody import parsePDBHeader
        try:
            polymers = parsePDBHeader(seq[:4], 'polymers')
        except Exception as err:
            LOGGER.warn('failed to parse header for {0} ({1})'.format(
                seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != 'UniProt':
                        continue
                    idcode = dbref.idcode
                    accession = dbref.accession
                    LOGGER.info('UniProt ID code {0} for {1} chain '
                                '{2} will be used.'.format(
                                    idcode, seq[:4], poly.chid))
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn('A UniProt ID code for PDB {0} could not be '
                        'parsed.'.format(repr(seq)))
            url = prefix + 'protein/' + seq + '?output=xml'
        else:
            url = prefix + 'protein/' + idcode + '?output=xml'
    else:
        url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # NOTE(review): this loop retries without pausing between attempts.
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            # The payload is bytes (see the ``find(b'...')`` calls below),
            # so the pending markers must be bytes as well.
            if xml not in (b'PEND', b'RUN'):
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    if xml.find(b'There was a system error on your last request.') > 0:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None
    elif xml.find(b'No valid UniProt accession or ID') > 0:
        try:
            # Fails when no accession was found above, which triggers the
            # sequence based fallback.
            url = prefix + 'protein/' + accession + '?output=xml'
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            try:
                ag = parsePDB(seq, subset='ca')
                ag_seq = ag.getSequence()
                return searchPfam(ag_seq, **kwargs)
            except Exception:
                raise ValueError('No valid UniProt accession or ID for: ' +
                                 seq)

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    key = '{' + prefix + '}'
    results = dictElement(root[0], key)
    try:
        xml_matches = results['matches']
    except KeyError:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib['accession'][:7]
        except KeyError:
            raise ValueError('failed to parse results XML, check URL: ' + url)

        if not re.search('^P(F|B)[0-9]{5}$', accession):
            raise ValueError('{0} does not match pfam accession'
                             ' format'.format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault('locations', [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = 'Query ' + repr(query)

    if matches:
        LOGGER.info(query + ' matched {0} Pfam families.'.format(len(matches)))
    else:
        LOGGER.info(query + ' did not match any Pfam families.')

    return matches
def fetchPfamMSA(acc, alignment='full', compressed=False, **kwargs):
    """Returns a path to the downloaded Pfam MSA file.

    :arg acc: Pfam ID or Accession Code
    :type acc: str

    :arg alignment: alignment type, one of ``'full'`` (default), ``'seed'``,
         ``'ncbi'``, ``'metagenomics'``, ``'rp15'``, ``'rp35'``, ``'rp55'``,
         ``'rp75'`` or ``'uniprot'`` where rp stands for representative
         proteomes

    :arg compressed: gzip the downloaded MSA file, default is **False**

    *Alignment Options*

    :arg format: a Pfam supported MSA file format, one of ``'selex'``,
        (default), ``'stockholm'`` or ``'fasta'``

    :arg order: ordering of sequences, ``'tree'`` (default) or
        ``'alphabetical'``

    :arg inserts: letter case for inserts, ``'upper'`` (default) or
        ``'lower'``

    :arg gaps: gap character, one of ``'dashes'`` (default), ``'dots'``,
        ``'mixed'`` or **None** for unaligned

    *Other Options*

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60

    :arg outname: out filename, default is input ``'acc_alignment.format'``

    :arg folder: output folder, default is ``'.'``"""

    # Resolve the user supplied ID or accession to a Pfam accession.
    lookup_url = prefix + 'family/acc?id=' + acc
    timeout = int(kwargs.get('timeout', 60))
    handle = openURL(lookup_url, timeout=timeout)
    orig_acc = acc
    acc = handle.readline().strip()
    if PY3K:
        acc = acc.decode()

    if not re.search('(?<=PF)[0-9]{5}$', acc):
        raise ValueError('{0} is not a valid Pfam ID or Accession Code'.format(
            repr(orig_acc)))

    if alignment not in DOWNLOAD_FORMATS:
        raise ValueError('alignment must be one of full, seed, ncbi or'
                         ' metagenomics')

    # The pre-built gzipped alignment is used for these alignment types and
    # whenever no keyword options were passed at all.
    if alignment in ('ncbi', 'metagenomics', 'uniprot') or not kwargs:
        url = (prefix + 'family/' + acc + '/alignment/' + alignment +
               '/gzipped')
        url_flag = True
        extension = '.sth'
    else:
        url_flag = False

        align_format = kwargs.get('format', 'selex').lower()
        if align_format not in FORMAT_OPTIONS['format']:
            raise ValueError('alignment format must be of type selex'
                             ' stockholm or fasta. MSF not supported')

        if align_format == SELEX:
            align_format, extension = 'pfam', '.slx'
        elif align_format == FASTA:
            extension = '.fasta'
        else:
            extension = '.sth'

        gaps = str(kwargs.get('gaps', 'dashes')).lower()
        if gaps not in FORMAT_OPTIONS['gaps']:
            raise ValueError('gaps must be of type mixed, dots, dashes, '
                             'or None')

        inserts = kwargs.get('inserts', 'upper').lower()
        if inserts not in FORMAT_OPTIONS['inserts']:
            raise ValueError('inserts must be of type lower or upper')

        order = kwargs.get('order', 'tree').lower()
        if order not in FORMAT_OPTIONS['order']:
            raise ValueError('order must be of type tree or alphabetical')

        options = ('format?format=' + align_format +
                   '&alnType=' + alignment + '&order=' + order[0] +
                   '&case=' + inserts[0] + '&gaps=' + gaps + '&download=1')
        url = prefix + 'family/' + acc + '/alignment/' + alignment + '/' + options

    response = openURL(url, timeout=timeout)

    outname = kwargs.get('outname', None) or orig_acc
    folder = str(kwargs.get('folder', '.'))
    filepath = join(makePath(folder), outname + '_' + alignment + extension)

    if compressed:
        filepath = filepath + '.gz'
        # The gzipped download is already compressed and is stored as is.
        opener = open if url_flag else openFile
        f_out = opener(filepath, 'wb')
        f_out.write(response.read())
        f_out.close()
    elif url_flag:
        gunzip(response.read(), filepath)
    else:
        with open(filepath, 'wb') as f_out:
            f_out.write(response.read())

    filepath = relpath(filepath)
    LOGGER.info('Pfam MSA for {0} is written as {1}.'.format(
        orig_acc, filepath))

    return filepath
def iterpose(self, rmsd=0.0001):
    """Run :meth:`Ensemble.iterpose` with *rmsd*, put back the conformations
    held before the call, and finish with :meth:`superpose`."""

    # Keep a copy of the conformations so that the iterative run does not
    # leave its modified coordinates behind.
    saved_confs = copy(self._confs)
    Ensemble.iterpose(self, rmsd)
    self._confs = saved_confs

    LOGGER.info('Final superposition to calculate transformations.')
    self.superpose()
def writeHeatmap(filename, heatmap, **kwargs):
    """Return *filename* that contains *heatmap* in Heat Mapper :file:`.hm`
    file (extension is automatically added when not found).  *filename* may
    also be an output stream.

    :arg title: title of the heatmap
    :type title: str

    :arg xlabel: x-axis lab, default is ``'unknown'``
    :type xlabel: str

    :arg ylabel: y-axis lab, default is ``'unknown'``
    :type ylabel: str

    :arg xorigin: x-axis origin, default is 0
    :type xorigin: float

    :arg xstep: x-axis step, default is 1
    :type xstep: float

    :arg min: minimum value, default is minimum in *heatmap*
    :type min: float

    :arg max: maximum value, default is maximum in *heatmap*
    :type max: float

    :arg format: number format, default is ``'%f'``
    :type format: str

    Other keyword arguments that are sequences with length equal to the
    number of rows of *heatmap* (its first dimension) will be considered as
    *numbering*: one value from each of them is written in front of every
    row.  When none is given, rows are numbered starting from 1.

    :raises TypeError: when *heatmap* is not a two dimensional array"""

    try:
        ndim, shape = heatmap.ndim, heatmap.shape
    except AttributeError:
        raise TypeError('heatmap must be an array object')
    if ndim != 2:
        raise TypeError('heatmap must be a 2D array')

    try:
        # a stream supplied by the caller is left open for the caller
        write, close, stream = filename.write, lambda: None, filename
    except AttributeError:
        out = openFile(addext(filename, '.hm'), 'wb')
        write, close, stream = out.write, out.close, out

    try:
        fmt = kwargs.pop('format', '%f')
        write('-min "{0}"\n'.format(kwargs.pop('min', heatmap.min())))
        write('-max "{0}"\n'.format(kwargs.pop('max', heatmap.max())))
        for label, default in [
            ('title', 'unknown'),
            ('xlabel', 'unknown'),
            ('xorigin', 0),
            ('xstep', 1),
            ('ylabel', 'unknown'),
        ]:
            write('-{0} "{1}"\n'.format(label, kwargs.pop(label, default)))

        # whatever keyword is left is a candidate row numbering
        numbering = []
        numlabels = []
        for key, val in kwargs.items():
            try:
                length = len(val)
            except TypeError:
                LOGGER.warn('Keyword argument {0} is not used.'.format(key))
            else:
                if length == shape[0]:
                    numlabels.append(key)
                    numbering.append(val)
        if not numbering:
            numlabels.append('unknown')
            numbering.append(arange(1, shape[0] + 1))

        write('-numbering "{0}"\n'.format(':'.join(numlabels)))

        for i, row in enumerate(heatmap):
            write(':'.join(str(nums[i]) for nums in numbering) + ':')
            row.tofile(stream, sep=';', format=fmt)
            write(';\n')
    finally:
        # release the file even when writing fails part way
        close()
    return filename
def print_sat_mutagen_figure(filename, rhapsody_obj, res_interval=None,
                             PolyPhen2=True, EVmutation=True,
                             extra_plot=None, fig_height=8, fig_width=None,
                             dpi=300, min_interval_size=15, html=False,
                             main_clsf='main', aux_clsf='aux.'):
    """Save a saturation mutagenesis figure built from the predictions
    stored in *rhapsody_obj*.

    The figure is made of an upper strip (PDB size per residue), a
    20 x residues table of 'best' pathogenicity probabilities with its
    colorbar, and a lower plot of residue-averaged profiles.  It is saved as
    ``<filename>.png``, where any extension found in *filename* is removed
    first.

    :arg filename: output file name
    :type filename: str
    :arg rhapsody_obj: object holding the predictions; it must pass the
        ``_isColSet('main score')`` and ``_isSaturationMutagenesis()`` checks
    :arg res_interval: ``(first, last)`` residues to display, default is the
        range spanned by the variants; it is passed through
        ``_adjust_res_interval`` together with *min_interval_size*
    :arg PolyPhen2: also compute and plot the averaged PolyPhen-2 scores
    :arg EVmutation: also compute and plot the averaged (rescaled)
        EVmutation scores
    :arg extra_plot: additional per-variant values, one per SAV, plotted as
        an extra averaged profile
    :arg fig_height: figure height in inches
    :arg fig_width: figure width in inches, when **None** it is derived from
        *fig_height* and from the number of residues shown
    :arg dpi: resolution of the saved image
    :arg min_interval_size: number of columns added past the last residue
        when sizing the tables; also forwarded to ``_adjust_res_interval``
    :arg html: when **True**, ``<filename>.html`` (image map template) and
        ``<filename>.js`` (per-cell messages) are written as well
    :arg main_clsf: label used in messages for the main classifier
    :arg aux_clsf: label used in messages for the auxiliary classifier

    :returns: the ``info`` dictionary of messages when *html* is **True**,
        otherwise **None**; **None** is also returned, without producing
        anything, when ``_try_import_matplotlib()`` returns **None**

    :raises AssertionError: when an input check fails
    :raises ValueError: when variants come from more than one sequence"""

    # check inputs
    assert isinstance(filename, str), 'filename must be a string'
    assert isinstance(rhapsody_obj, Rhapsody), 'not a Rhapsody object'
    assert rhapsody_obj._isColSet('main score'), 'predictions not found'
    assert rhapsody_obj._isSaturationMutagenesis(), 'unable to create figure'
    if res_interval is not None:
        assert isinstance(res_interval, tuple) and len(res_interval) == 2, \
            'res_interval must be a tuple of 2 values'
        assert res_interval[1] >= res_interval[0], 'invalid res_interval'
    if extra_plot is not None:
        assert len(extra_plot) == rhapsody_obj.numSAVs, \
            'length of additional predictions array is incorrect'
    assert isinstance(fig_height, (int, float))
    assert isinstance(dpi, int)

    matplotlib = _try_import_matplotlib()
    if matplotlib is None:
        return

    # delete extension from filename
    filename = os.path.splitext(filename)[0]

    # make sure that all variants belong to the same Uniprot sequence
    accs = [s.split()[0] for s in rhapsody_obj.data['SAV coords']]
    if len(set(accs)) != 1:
        m = 'Only variants from a single Uniprot sequence can be accepted'
        raise ValueError(m)

    # select an appropriate interval, based on available predictions
    seq_pos = [int(s.split()[1]) for s in rhapsody_obj.data['SAV coords']]
    res_min = np.min(seq_pos)
    res_max = np.max(seq_pos)
    upper_lim = res_max + min_interval_size

    # create empty (20 x num_res) mutagenesis tables
    table_best = np.zeros((20, upper_lim), dtype=float)
    table_best[:] = 'nan'
    table_main = table_best.copy()
    if extra_plot is not None:
        table_other = table_best.copy()
    if PolyPhen2:
        table_PP2 = table_best.copy()
    if EVmutation:
        table_EVmut = table_best.copy()

    # import pathogenicity probabilities from Rhapsody object
    p_best = rhapsody_obj.getPredictions(classifier='best')['path. prob.']
    p_main = rhapsody_obj.data['main path. prob.']
    if PolyPhen2:
        rhapsody_obj._calcPolyPhen2Predictions()
        p_PP2 = rhapsody_obj.data['PolyPhen-2 score']
    if EVmutation:
        rhapsody_obj._calcEVmutationPredictions()
        EVmut_score = np.array(rhapsody_obj.data['EVmutation score'])
        EVmut_cutoff = SETTINGS.get('EVmutation_metrics')['optimal cutoff']
        # rescale so that a score equal to the cutoff maps to -0.5
        p_EVmut = -EVmut_score / EVmut_cutoff * 0.5

    # fill tables with predicted probability
    #  1:    deleterious
    #  0:    neutral
    # 'nan': no prediction/wt
    aa_list = 'ACDEFGHIKLMNPQRSTVWY'
    aa_map = {aa: i for i, aa in enumerate(aa_list)}
    for i, SAV in enumerate(rhapsody_obj.data['SAV coords']):
        aa_mut = SAV.split()[3]
        # table columns are indexed by residue number minus one
        index = int(SAV.split()[1]) - 1
        table_best[aa_map[aa_mut], index] = p_best[i]
        table_main[aa_map[aa_mut], index] = p_main[i]
        if extra_plot is not None:
            table_other[aa_map[aa_mut], index] = extra_plot[i]
        if PolyPhen2:
            table_PP2[aa_map[aa_mut], index] = p_PP2[i]
        if EVmutation:
            table_EVmut[aa_map[aa_mut], index] = p_EVmut[i]

    # compute average pathogenicity profiles
    # NB: I expect to see RuntimeWarnings in this block
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        avg_p_best = np.nanmean(table_best, axis=0)
        avg_p_main = np.nanmean(table_main, axis=0)
        min_p = np.nanmin(table_best, axis=0)
        max_p = np.nanmax(table_best, axis=0)
        if extra_plot is not None:
            avg_p_other = np.nanmean(table_other, axis=0)
        if PolyPhen2:
            avg_p_PP2 = np.nanmean(table_PP2, axis=0)
        if EVmutation:
            avg_p_EVmut = np.nanmean(table_EVmut, axis=0)

    # use upper strip for showing additional info, such as PDB lengths
    upper_strip = np.zeros((1, upper_lim))
    upper_strip[:] = 'nan'
    PDB_sizes = np.zeros(upper_lim, dtype=int)
    PDB_coords = [''] * upper_lim
    for s in rhapsody_obj.data:
        index = int(s['SAV coords'].split()[1]) - 1
        if s['PDB size'] != 0:
            PDB_length = int(s['PDB size'])
            PDBID_chain = ':'.join(s['PDB SAV coords'][0].split()[:2])
            upper_strip[0, index] = PDB_length
            PDB_sizes[index] = PDB_length
            PDB_coords[index] = PDBID_chain
    max_PDB_size = max(PDB_sizes)
    if max_PDB_size != 0:
        # normalize to [0, 1], matching vmin/vmax of the strip image below
        upper_strip[0, :] /= max_PDB_size

    # PLOT FIGURE

    from matplotlib import pyplot as plt
    from matplotlib import gridspec as gridspec

    # portion of the sequence to display
    if res_interval is None:
        res_interval = (res_min, res_max)

    # adjust interval
    res_i, res_f = _adjust_res_interval(res_interval, upper_lim,
                                        min_interval_size)
    nres_shown = res_f - res_i + 1

    # figure proportions
    if fig_width is None:
        fig_width = fig_height / 2  # inches
        fig_width *= nres_shown / 20
    fig, ax = plt.subplots(3, 2, figsize=(fig_width, fig_height))
    wspace = 0.5  # inches
    plt.subplots_adjust(wspace=wspace / fig_width, hspace=0.15)

    # figure structure
    gs = gridspec.GridSpec(3, 2, width_ratios=[nres_shown, 1],
                           height_ratios=[1, 20, 10])
    ax0 = plt.subplot(gs[0, 0])  # secondary structure strip
    ax1 = plt.subplot(gs[1, 0])  # mutagenesis table
    axcb = plt.subplot(gs[1, 1])  # colorbar
    ax2 = plt.subplot(gs[2, 0])  # average profile

    # padding for tick labels
    pad = 0.2 / fig_width

    # top strip
    matplotlib.cm.YlGn.set_bad(color='antiquewhite')
    ax0.imshow(upper_strip[0:1, res_i - 1:res_f], aspect='auto',
               cmap='YlGn', vmin=0, vmax=1)
    ax0.set_ylim((-0.45, .45))
    ax0.set_yticks([])
    ax0.set_ylabel(f'PDB size \n[0-{max_PDB_size} res] ', fontsize=14,
                   ha='right', va='center', rotation=0)
    ax0.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax0.set_xticklabels([])
    # add white grid
    ax0.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax0.tick_params(axis='both', which='minor', length=0)
    ax0.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # mutagenesis table (heatmap)
    matplotlib.cm.coolwarm.set_bad(color='antiquewhite')
    im = ax1.imshow(table_best[:, res_i - 1:res_f], aspect='auto',
                    cmap='coolwarm', vmin=0, vmax=1)
    axcb.figure.colorbar(im, cax=axcb)
    ax1.set_yticks(np.arange(len(aa_list)))
    ax1.set_yticklabels(aa_list, ha='center', position=(-pad, 0),
                        fontsize=14)
    ax1.set_xticks(np.arange(5 - res_i % 5, res_f - res_i + 1, 5))
    ax1.set_xticklabels([])
    ax1.set_ylabel('pathog. probability', labelpad=10)
    # add white grid
    ax1.set_xticks(np.arange(-.5, res_f - res_i + 1, 1), minor=True)
    ax1.set_yticks(np.arange(-.5, 20, 1), minor=True)
    ax1.tick_params(axis='both', which='minor', length=0)
    ax1.grid(which='minor', color='w', linestyle='-', linewidth=.5)

    # average pathogenicity profile
    x_resids = np.arange(1, upper_lim + 1)
    # shading showing range of values
    ax2.fill_between(x_resids, min_p, max_p, alpha=0.5,
                     edgecolor='salmon', facecolor='salmon')
    # plot average profile for other predictions, if available
    if extra_plot is not None:
        ax2.plot(x_resids, avg_p_other, color='gray', lw=1)
    if PolyPhen2:
        ax2.plot(x_resids, avg_p_PP2, color='blue', lw=1)
    if EVmutation:
        ax2.plot(x_resids, avg_p_EVmut, color='green', lw=1)
    # solid line for predictions obtained with full classifier
    ax2.plot(x_resids, avg_p_main, 'ro-')
    # dotted line for predictions obtained with auxiliary classifier
    ax2.plot(x_resids, avg_p_best, 'ro-', markerfacecolor='none',
             ls='dotted')
    # cutoff line
    ax2.axhline(y=0.5, color='grey', lw=.8, linestyle='dashed')

    ax2.set_xlim((res_i - .5, res_f + .5))
    ax2.set_xlabel('residue number')
    ax2.set_ylim((-0.05, 1.05))
    ax2.set_ylabel('average', rotation=90, labelpad=10)
    ax2.set_yticklabels([])
    ax2r = ax2.twinx()
    ax2r.set_ylim((-0.05, 1.05))
    ax2r.set_yticks([0, .5, 1])
    ax2r.set_yticklabels(['0', '0.5', '1'])
    ax2r.tick_params(axis='both', which='major', pad=15)

    tight_padding = 0.1
    fig.savefig(filename + '.png', format='png', bbox_inches='tight',
                pad_inches=tight_padding, dpi=dpi)
    plt.close()
    plt.rcParams.update(plt.rcParamsDefault)
    LOGGER.info(f'Saturation mutagenesis figure saved to {filename}.png')

    # write a map in html format, to make figure clickable
    if html:
        all_axis = {'strip': ax0, 'table': ax1, 'bplot': ax2}

        # precompute some useful quantities for html code
        html_data = {}
        # dpi of printed figure
        html_data["dpi"] = dpi
        # figure size *before* tight
        html_data["fig_size"] = fig.get_size_inches()
        # tight bbox as used by fig.savefig()
        html_data["tight_bbox"] = fig.get_tightbbox(fig.canvas.get_renderer())
        # compute new origin and height, based on tight box and padding
        html_data["new_orig"] = html_data["tight_bbox"].min - tight_padding
        html_data["new_height"] = (html_data["tight_bbox"].height +
                                   2 * tight_padding)

        def get_area_coords(ax, d):
            # NOTE: ax_type is not a parameter here; it is read from the
            # enclosing scope, i.e. the loop variable of the calling loops
            assert ax_type in ("strip", "table", "bplot")
            # get bbox coordinates (x0, y0, x1, y1)
            bbox = ax.get_position().get_points()
            # get bbox coordinates in inches
            b_inch = bbox * d["fig_size"]
            # adjust bbox coordinates based on tight bbox
            b_adj = b_inch - d["new_orig"]
            # use html reference system (y = 1 - y)
            b_html = b_adj * np.array([1, -1]) + np.array([0, d["new_height"]])
            # convert to pixels
            b_px = (d["dpi"] * b_html).astype(int)
            b_px = np.sort(b_px, axis=0)
            # put in html format
            coords = '{},{},{},{}'.format(*b_px.flatten())
            # output
            return coords

        # html templates
        # ($name fields are filled in here; {{name}} fields are written out
        # unchanged)
        area_html = Template('<area shape="rect" coords="$coords" '
                             'id="{{map_id}}_$areaid" {{area_attrs}}> \n')

        # write html
        with open(filename + '.html', 'w') as f:
            f.write('<div>\n')
            f.write('<map name="{{map_id}}" id="{{map_id}}" {{map_attrs}}>\n')
            for ax_type, ax in all_axis.items():
                fields = {'areaid': ax_type}
                fields['coords'] = get_area_coords(ax, html_data)
                f.write(area_html.substitute(fields))
            f.write('</map>\n')
            f.write('</div>\n')

        # populate info table that will be passed as a javascript variable
        best_preds = rhapsody_obj.getPredictions()
        best_avg_preds = rhapsody_obj.getResAvgPredictions()
        # NOTE: rebinds PDB_coords, which held the per-residue list above
        PDB_coords = rhapsody_obj.getPDBcoords()
        abbrev = {
            '?': '?',
            'deleterious': 'del',
            'neutral': 'neu',
            'prob.delet.': 'p.del',
            'prob.neutral': 'p.neu'
        }
        info = {}
        for k in ['strip', 'table', 'bplot']:
            n_cols = 20 if k == 'table' else 1
            info[k] = [[''] * nres_shown for i in range(n_cols)]
        for i, row in enumerate(rhapsody_obj.data):
            SAV = row['SAV coords']
            acc, resid, aa_wt, aa_mut = SAV.split()
            resid = int(resid)
            # consider only residues shown in figure
            if not (res_i <= resid <= res_f):
                continue
            # SAV coordinates
            SAV_code = f'{aa_wt}{resid}{aa_mut}'
            # coordinates on table
            t_i = aa_map[aa_mut]
            t_j = resid - 1
            # coordinates on *shown* table
            ts_i = t_i
            ts_j = resid - res_i
            # compose message for table
            bp = best_preds[i]
            pprob = bp['path. prob.']
            pclass = bp['path. class']
            clsf = main_clsf if row['best classifier'] == 'main' else aux_clsf
            m = f'{SAV_code}: Rhapsody-{clsf} = {pprob:<3.2f} ({pclass})'
            if PolyPhen2:
                score = bp['PolyPhen-2 score']
                pclass = abbrev[bp['PolyPhen-2 path. class']]
                m += f', PolyPhen-2 = {score:<3.2f} ({pclass})'
            if EVmutation:
                score = bp['EVmutation score']
                pclass = abbrev[bp['EVmutation path. class']]
                m += f', EVmutation = {score:<3.2f} ({pclass})'
            if extra_plot is not None:
                score = table_other[t_i, t_j]
                m += f', other = {score:<3.2f}'
            info['table'][ts_i][ts_j] = m
            info['table'][aa_map[aa_wt]][ts_j] = f'{SAV_code[:-1]}: wild-type'
            # presumably the first of the 19 variants listed for each
            # residue -- the per-residue messages are composed once here
            if i % 19 == 0:
                # compose message for upper strip
                PDBID, ch, resid, aa, size = PDB_coords[i][[
                    'PDBID', 'chain', 'resid', 'resname', 'PDB size'
                ]]
                if size > 0:
                    m = f'{PDBID}:{ch}, resid {resid}, aa {aa}, size {size}'
                else:
                    m = 'no PDB found'
                info['strip'][0][ts_j] = m
                # compose message for bottom plot (residue-averages)
                bap = best_avg_preds[int(i / 19)]
                pprob = bap['path. prob.']
                pcl = bap['path. class']
                m = f'{SAV_code[:-1]}: Rhapsody-{clsf} = {pprob:<3.2f} ({pcl})'
                if PolyPhen2:
                    score = bap['PolyPhen-2 score']
                    pcl = abbrev[bap['PolyPhen-2 path. class']]
                    m += f', PolyPhen-2 = {score:<3.2f} ({pcl})'
                if EVmutation:
                    score = bap['EVmutation score']
                    pcl = abbrev[bap['EVmutation path. class']]
                    m += f', EVmutation = {score:<3.2f} ({pcl})'
                if extra_plot is not None:
                    score = avg_p_other[t_j]
                    m += f', other = {score:<3.2f}'
                info['bplot'][0][ts_j] = m

        def create_info_msg(ax_type, d):
            # render the nested list of messages as an array-of-arrays
            # literal of double-quoted strings (ax_type is not used)
            text = '[ \n'
            for row in d:
                text += ' ['
                for m in row:
                    text += f'"{m}",'
                text += '], \n'
            text += ']'
            return text

        area_js = Template('{{map_data}}["{{map_id}}_$areaid"] = { \n'
                           ' "img_id": "{{img_id}}", \n'
                           ' "map_id": "{{map_id}}", \n'
                           ' "coords": [$coords], \n'
                           ' "num_rows": $num_rows, \n'
                           ' "num_cols": $num_cols, \n'
                           ' "info_msg": $info_msg, \n'
                           '}; \n')

        # dump info in javascript format
        with open(filename + '.js', 'w') as f:
            f.write('var {{map_data}} = {}; \n')
            for ax_type, d in info.items():
                vars = {'areaid': ax_type}
                vars['coords'] = get_area_coords(all_axis[ax_type], html_data)
                vars['num_rows'] = 20 if ax_type == 'table' else 1
                vars['num_cols'] = nres_shown
                vars['info_msg'] = create_info_msg(ax_type, d)
                f.write(area_js.substitute(vars))

        return info

    return
def calcEvolProperties(self, resid='all', refresh=False, folder=None,
                       max_cols=None, max_seqs=25000, **kwargs):
    '''
    Computes Evol properties, i.e. Shannon entropy and Mutual Information,
    from Pfam Multiple Sequence Alignments, for a given residue or, when
    *resid* is ``'all'``, for every Pfam domain found for the sequence.

    For every processed domain the entries ``'mapping'``, ``'ref_MSA'``,
    ``'entropy'``, ``'MutInfo'`` and ``'DirInfo'`` of ``self.Pfam[domain]``
    are (re)set. Direct Information is not computed: ``'DirInfo'`` is always
    left as NaN. Failures are not raised: they are logged, and when the MSA
    cannot be fetched, parsed or mapped the error message is stored in
    ``'mapping'``.

    :arg resid: residue number, or ``'all'``
    :arg refresh: recompute properties even if a mapping is already stored
    :type refresh: bool
    :arg folder: currently not used
    :arg max_cols: currently not used
    :arg max_seqs: the MSA is refined with increasing row occupancy until
        it contains at most this number of sequences
    :arg kwargs: passed to ``parseMSA`` and to the final ``refineMSA`` call

    :returns: dictionary with the entries of ``self.Pfam`` for the selected
        domains
    :raises RuntimeError: when no Pfam domain is selected
    '''
    assert type(refresh) is bool
    # recover Pfam mapping (if not found already)
    self._searchPfam(refresh=refresh)
    if resid == 'all':
        PF_list = list(self.Pfam)
    else:
        # get list of Pfam domains containing resid
        PF_list = [
            k for k in self.Pfam
            if any(int(segment['start']) <= resid <= int(segment['end'])
                   for segment in self.Pfam[k]['locations'])
        ]
    if len(PF_list) == 0:
        raise RuntimeError(f'No Pfam domain for resid {resid}.')
    # several domains are the expected outcome when all of them are asked for
    if resid != 'all' and len(PF_list) > 1:
        LOGGER.warn(f'Residue {resid} is found in multiple '
                    f'({len(PF_list)}) Pfam domains.')
    # iterate over Pfam families
    for PF in PF_list:
        d = self.Pfam[PF]
        # skip if properties are pre-computed
        if not refresh and d.get('mapping') is not None:
            continue
        d['mapping'] = None
        d['ref_MSA'] = None
        d['entropy'] = np.nan
        d['MutInfo'] = np.nan
        d['DirInfo'] = np.nan
        try:
            LOGGER.info('Processing {}...'.format(PF))
            # fetch & parse MSA without saving downloaded MSA
            f = pd.fetchPfamMSA(PF)
            try:
                msa = pd.parseMSA(f, **kwargs)
            finally:
                # do not leave the download behind when parsing fails
                os.remove(f)
            # slice MSA to match all segments of the Uniprot sequence
            sliced_msa, indexes = self._sliceMSA(msa)
            # get mapping between Uniprot sequence and Pfam domain
            d['mapping'] = self._mapUniprot2Pfam(PF, sliced_msa, indexes)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
            d['mapping'] = str(e)
            continue
        try:
            # refine MSA ('seqid' param. is set as in PolyPhen-2)
            rowocc = 0.6
            while True:
                sliced_msa = pd.refineMSA(sliced_msa, rowocc=rowocc)
                rowocc += 0.02
                if sliced_msa.numSequences() <= max_seqs or rowocc >= 1:
                    break
            ref_msa = pd.refineMSA(sliced_msa, seqid=0.94, **kwargs)
            d['ref_MSA'] = ref_msa
            # compute evolutionary properties
            d['entropy'] = pd.calcShannonEntropy(ref_msa)
            d['MutInfo'] = pd.buildMutinfoMatrix(ref_msa)
        except Exception as e:
            LOGGER.warn('{}: {}'.format(PF, e))
    return {k: self.Pfam[k] for k in PF_list}
def calcPairDeformationDist(model, coords, ind1, ind2, kbt=1.):
    """Returns distribution of the deformations in the distance contributed
    by each mode for selected pair of residues *ind1* *ind2* using *model*
    from a :class:`.ANM`.
    Method described in [EB08]_ equation (10) and figure (2).

    .. [EB08] Eyal E., Bahar I. Toward a Molecular Understanding of
        the Anisotropic Response of Proteins to External Forces:
        Insights from Elastic Network Models. *Biophys J* **2008**
        94:3424-34355.

    :arg model: this is an 3-dimensional :class:`NMA` instance from a
        :class:`.ANM` calculations.
    :type model: :class:`.ANM`

    :arg coords: a coordinate set or an object with :meth:`getCoords` method.
        Recommended: ``coords = parsePDB('pdbfile').select('protein and name CA')``.
    :type coords: :class:`~numpy.ndarray`.

    :arg ind1: first residue number.
    :type ind1: int

    :arg ind2: secound residue number.
    :type ind2: int

    :arg kbt: scaling factor placed over each eigenvalue, default is 1.
    :type kbt: float

    When *coords* provides :meth:`getResnums`, *ind1* and *ind2* are turned
    into atom indices by subtracting the first residue number.  When *coords*
    is a plain array there is no residue numbering, and *ind1* and *ind2* are
    used directly as zero-based atom indices.

    :returns: a list of mode indices and a list with the corresponding
        deformations; modes are taken from index 6 onwards

    :raises TypeError: when *coords* or *model* is of an unsuitable type
    :raises ValueError: when *model* has no modes
    """

    # offset between residue numbers and atom indices
    first_resnum = 0
    try:
        first_resnum = coords.getResnums()[0]
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    if not isinstance(model, NMA):
        raise TypeError('model must be a NMA instance')
    elif not model.is3d():
        raise TypeError('model must be a 3-dimensional NMA instance')
    elif len(model) == 0:
        raise ValueError('model must have normal modes calculated')

    n_modes = model.numModes()
    LOGGER.timeit('_pairdef')

    ind1 = ind1 - first_resnum
    ind2 = ind2 - first_resnum

    # only the unit vector joining the selected pair is needed; its sign is
    # irrelevant because the absolute value of the projection is taken
    separation = coords[ind2] - coords[ind1]
    distance = np.linalg.norm(separation)
    if distance > 0:
        direction = separation / distance
    else:
        # same atom selected twice: no direction, hence no deformation
        direction = np.zeros(3)

    eigvecs = model.getEigvecs()
    eigvals = model.getEigvals()

    # rows holding the x, y and z components of each atom
    rows1 = [ind1 * 3, ind1 * 3 + 1, ind1 * 3 + 2]
    rows2 = [ind2 * 3, ind2 * 3 + 1, ind2 * 3 + 2]

    D_pair_k = []
    mode_nr = []
    for m in range(6, n_modes):
        U_ij_k = eigvecs[rows1, m] - eigvecs[rows2, m]
        D_ij_k = abs(sqrt(kbt / eigvals[m]) * np.vdot(direction, U_ij_k))
        D_pair_k.append(D_ij_k)
        mode_nr.append(m)

    LOGGER.report('Deformation was calculated in %.2lfs.', label='_pairdef')

    return mode_nr, D_pair_k
def _getPolymers(lines):
    """Returns list of polymers (macromolecules).

    *lines* maps ``'pdbid'`` to the identifier of the entry and record names
    (``'SEQRES'``, ``'DBREF '``, ``'DBREF1'``, ``'DBREF2'``, ``'MODRES'``,
    ``'SEQADV'``, ``'COMPND'``) to sequences of ``(index, line)`` pairs.  One
    polymer per chain identifier is created on first use and filled with
    sequence, database references, modified residues, sequence differences
    and compound details.  Records that cannot be interpreted are reported
    through the logger and skipped."""

    pdbid = lines['pdbid']
    polymers = dict()
    for i, line in lines['SEQRES']:
        ch = line[11]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.sequence += ''.join(getSequence(line[19:].split()))

    for i, line in lines['DBREF ']:
        i += 1

        ch = line[12]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('DBREF chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.accession = line[33:41].strip()
        dbref.idcode = line[42:54].strip()

        # reset for every record, so that a failure never picks up the
        # numbers parsed from a previous record
        first = last = None
        try:
            first = int(line[14:18])
        except ValueError:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except ValueError:
            LOGGER.warn('DBREF for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        if first is not None:
            try:
                dbref.first = (first, line[18], int(line[56:60]))
            except (ValueError, IndexError):
                LOGGER.warn('DBREF for chain {2}: failed to parse '
                            'initial sequence number of the database sequence '
                            '({0}:{1})'.format(pdbid, i, ch))
        if last is not None:
            try:
                dbref.last = (last, line[24].strip(), int(line[62:67]))
            except (ValueError, IndexError):
                LOGGER.warn('DBREF for chain {2}: failed to parse '
                            'ending sequence number of the database sequence '
                            '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    dbref1_lines = lines['DBREF1']
    dbref2_lines = lines['DBREF2']
    if len(dbref1_lines) != len(dbref2_lines):
        LOGGER.warn('DBREF1 and DBREF2 records are not complete')
        dbref12 = []
    else:
        dbref12 = zip(dbref1_lines, dbref2_lines)

    for record1, record2 in dbref12:
        i, line = record1
        i += 1
        ch = line[12]
        dbabbr = line[26:32].strip()
        dbref = DBRef()
        dbref.dbabbr = dbabbr
        dbref.database = _PDB_DBREF.get(dbabbr, 'Unknown')
        dbref.idcode = line[47:67].strip()

        first = last = None
        try:
            first = int(line[14:18])
        except ValueError:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'initial sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))
        try:
            last = int(line[20:24])
        except ValueError:
            LOGGER.warn('DBREF1 for chain {2}: failed to parse '
                        'ending sequence number of the PDB sequence '
                        '({0}:{1})'.format(pdbid, i, ch))

        # from here on *line* is the matching DBREF2 record
        i, line = record2
        i += 1
        if line[12] == ' ':
            LOGGER.warn('DBREF2 chain identifier is not specified '
                        '({0}:{1})'.format(pdbid, i))
        elif line[12] != ch:
            LOGGER.warn('DBREF1 and DBREF2 chain id mismatch '
                        '({0}:{1})'.format(pdbid, i))

        dbref.accession = line[18:40].strip()
        if first is not None:
            try:
                dbref.first = (first, line[18].strip(), int(line[45:55]))
            except (ValueError, IndexError):
                LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                            'initial sequence number of the database sequence '
                            '({0}:{1})'.format(pdbid, i, ch))
        if last is not None:
            try:
                dbref.last = (last, line[24].strip(), int(line[57:67]))
            except (ValueError, IndexError):
                LOGGER.warn('DBREF2 for chain {2}: failed to parse '
                            'ending sequence number of the database sequence '
                            '({0}:{1})'.format(pdbid, i, ch))

        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        poly.dbrefs.append(dbref)

    for poly in polymers.values():
        resnum = []
        for dbref in poly.dbrefs:
            dbabbr = dbref.dbabbr
            if dbabbr == 'PDB':
                if not (pdbid == dbref.accession == dbref.idcode):
                    LOGGER.warn('DBREF for chain {1} refers to PDB '
                                'entry {2} ({0})'.format(
                                    pdbid, poly.chid, dbref.accession))
            else:
                if pdbid == dbref.accession or pdbid == dbref.idcode:
                    LOGGER.warn('DBREF for chain {1} is {2}, '
                                'expected PDB ({0})'.format(
                                    pdbid, poly.chid, dbabbr))
                    dbref.database = 'PDB'
            resnum.append((dbref.first[0], dbref.last[0]))
        # ranges sorted by start must not begin before the previous one ends
        resnum.sort()
        last = -10000
        for first, temp in resnum:
            if first <= last:
                LOGGER.warn('DBREF records overlap for chain {0} ({1})'.format(
                    poly.chid, pdbid))
            last = temp

    for i, line in lines['MODRES']:
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('MODRES chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i + 1))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        if poly.modified is None:
            poly.modified = []
        poly.modified.append(
            (line[12:15].strip(), line[18:22].strip() + line[22].strip(),
             line[24:27].strip(), line[29:70].strip()))

    for i, line in lines['SEQADV']:
        i += 1
        ch = line[16]
        if ch == ' ':
            if not len(polymers) == 1:
                LOGGER.warn('SEQADV chain identifier is not specified '
                            '({0}:{1})'.format(pdbid, i))
                continue
            else:
                ch = list(polymers)[0]
        poly = polymers.get(ch, Polymer(ch))
        polymers[ch] = poly
        dbabbr = line[24:28].strip()
        resname = line[12:15].strip()
        # records without parsable numbers are skipped silently
        try:
            resnum = int(line[18:22].strip())
        except ValueError:
            continue
        icode = line[22].strip()
        try:
            dbnum = int(line[43:48].strip())
        except ValueError:
            continue

        comment = line[49:70].strip()
        match = False
        for dbref in poly.dbrefs:
            if not dbref.first[0] <= resnum <= dbref.last[0]:
                continue
            match = True
            if dbref.dbabbr != dbabbr:
                LOGGER.warn('SEQADV for chain {2}: reference database '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.dbabbr),
                                               repr(dbabbr)))
                continue
            dbacc = line[29:38].strip()
            if dbref.accession[:9] != dbacc[:9]:
                LOGGER.warn('SEQADV for chain {2}: accession code '
                            'mismatch, expected {3} parsed {4} '
                            '({0}:{1})'.format(pdbid, i, ch,
                                               repr(dbref.accession),
                                               repr(dbacc)))
                continue
            dbref.diff.append((resname, resnum, icode, dbnum, dbnum, comment))
        if not match:
            LOGGER.warn('SEQADV for chain {2}: database sequence reference '
                        'not found ({0}:{1})'.format(pdbid, i, ch))
            continue

    string = ' '.join([line[10:].strip() for i, line in lines['COMPND']])
    if string.startswith('MOL_ID'):
        dict_ = {}
        for molecule in string[6:].split('MOL_ID'):
            dict_.clear()
            for token in molecule.split(';'):
                token = token.strip()
                if not token:
                    continue
                items = token.split(':', 1)
                if len(items) == 2:
                    key, value = items
                    dict_[key.strip()] = value.strip()

            chains = dict_.pop('CHAIN', '').strip()

            if not chains:
                continue
            for ch in chains.split(','):
                ch = ch.strip()
                poly = polymers.get(ch, Polymer(ch))
                polymers[ch] = poly
                poly.name = dict_.get('MOLECULE', '')

                poly.fragment = dict_.get('FRAGMENT', '')

                poly.comments = dict_.get('OTHER_DETAILS', '')

                val = dict_.get('SYNONYM', '')
                poly.synonyms = [s.strip() for s in val.split(',')
                                 ] if val else []

                val = dict_.get('EC', '')
                poly.ec = [s.strip() for s in val.split(',')] if val else []

                poly.engineered = dict_.get('ENGINEERED', '') == 'YES'
                poly.mutation = dict_.get('MUTATION', '') == 'YES'

    return list(polymers.values())
def writeDCD(filename, trajectory, start=None, stop=None, step=None,
             align=False):
    """Write 32-bit CHARMM format DCD file (also NAMD 2.1 and later).
    *trajectory* can be an :class:`Trajectory`, :class:`DCDFile`, or
    :class:`Ensemble` instance. *filename* is returned upon successful
    output of file.

    :arg filename: output file name, ``'.dcd'`` is appended when missing
    :arg trajectory: source of the coordinate sets
    :arg start: index of the first coordinate set to write
    :arg stop: index at which writing stops
    :arg step: spacing between the coordinate sets that are written
    :arg align: call ``superpose()`` on each frame before writing it

    :raises TypeError: when *trajectory* is of an unsupported type
    :raises ValueError: when no coordinate set or no atom is selected"""

    if not filename.lower().endswith('.dcd'):
        filename += '.dcd'

    if not isinstance(trajectory, (TrajBase, Ensemble, Atomic)):
        raise TypeError('{0} is not a valid type for trajectory'.format(
            type(trajectory)))

    irange = list(
        range(*slice(start, stop, step).indices(trajectory.numCoordsets())))
    n_csets = len(irange)
    if n_csets == 0:
        raise ValueError('trajectory does not have any coordinate sets, or '
                         'no coordinate sets are selected')

    if isinstance(trajectory, Atomic):
        isEnsemble = False
        isAtomic = True
        n_atoms = trajectory.numAtoms()
    else:
        isEnsemble = True
        isAtomic = False
        n_atoms = trajectory.numSelected()
    if n_atoms == 0:
        raise ValueError('no atoms are selected in the trajectory')

    if isinstance(trajectory, TrajBase):
        isTrajectory = True
        unitcell = trajectory.hasUnitcell()
        # remember the position so it can be restored at the end
        nfi = trajectory.nextIndex()
        trajectory.reset()
        if isinstance(trajectory, Trajectory):
            timestep = trajectory.getTimestep()[0]
            first_ts = trajectory.getFirstTimestep()[0]
            framefreq = trajectory.getFrameFreq()[0]
        else:
            timestep = trajectory.getTimestep()
            first_ts = trajectory.getFirstTimestep()
            framefreq = trajectory.getFrameFreq()
    else:
        isTrajectory = False
        unitcell = False
        if isinstance(trajectory, Ensemble):
            frame = trajectory[0]
        else:
            frame = trajectory
            acsi = trajectory.getACSIndex()
        timestep = 1
        first_ts = 0
        framefreq = 1

    dcd = DCDFile(filename, mode='w')
    LOGGER.progress('Writing DCD', len(irange), '_prody_writeDCD')
    prev = -1
    uc = None
    n_written = 0
    time_ = time()
    try:
        for i in irange:
            if isTrajectory:
                # frames are read sequentially, so the ones that were not
                # selected have to be skipped; other inputs are indexed
                diff = i - prev
                if diff > 1:
                    trajectory.skip(diff - 1)
                prev = i
                frame = next(trajectory)
                if frame is None:
                    break
                if unitcell:
                    uc = frame._getUnitcell()
                    uc[3:] = np.sin((PISQUARE / 90) * (90 - uc[3:]))
                    uc = uc[[0, 3, 1, 4, 5, 2]]
            elif isEnsemble:
                frame._index = i
            else:
                frame.setACSIndex(i)
            if align:
                frame.superpose()
            if n_written == 0:
                # header information is passed along with the first frame
                dcd.write(frame._getCoords(), uc, timestep=timestep,
                          firsttimestep=first_ts, framefreq=framefreq)
            else:
                dcd.write(frame._getCoords(), uc)
            n_written += 1
            LOGGER.update(i, label='_prody_writeDCD')
    finally:
        # leave the input and the output file in a consistent state even
        # when writing fails
        if isAtomic:
            trajectory.setACSIndex(acsi)
        dcd.close()
    LOGGER.finish()
    time_ = time() - time_ or 0.01
    dcd_size = 1.0 * (56 + (n_atoms * 3 + 6) * 4) * n_csets / (1024 * 1024)
    LOGGER.info('DCD file was written in {0:.2f} seconds.'.format(time_))
    LOGGER.info('{0:.2f} MB written at input rate {1:.2f} MB/s.'.format(
        dcd_size, dcd_size / time_))
    LOGGER.info(
        '{0} coordinate sets written at output rate {1} frame/s.'.format(
            n_csets, int(n_csets / time_)))
    if n_written != n_csets:
        LOGGER.warn('Warning: {0} frames expected, {1} written.'.format(
            n_csets, n_written))
    if isTrajectory:
        trajectory.goto(nfi)
    return filename
def parseHeatmap(heatmap, **kwargs):
    """Return a two dimensional array and a dictionary with information
    parsed from *heatmap*, which may be an input stream or an :file:`.hm`
    file in VMD plugin Heat Mapper format.

    Lines starting with ``-`` are read as ``-label "data"`` pairs and stored
    in the dictionary after conversion with the function registered for the
    label in ``HMTYPES``.  Data of labels that are not registered is kept as
    a string, and data that cannot be converted is dropped; a warning is
    logged in both cases.  All other non-empty lines are rows of the heatmap.
    When ``numbering`` is present, the numbers found in front of each row are
    also stored in the dictionary, one array per numbering label."""

    try:
        # a stream supplied by the caller is left open for the caller
        readline, close = heatmap.readline, lambda: None
    except AttributeError:
        heatmap = openFile(heatmap)
        readline, close = heatmap.readline, heatmap.close

    meta = {}
    arrs = []

    try:
        line = readline()
        while line:
            if line.startswith('-'):
                parts = line[1:].split(None, 1)
                if parts:
                    label = parts[0].strip()
                    # a label may come without any data
                    data = parts[1].strip() if len(parts) > 1 else ''
                    if len(data) >= 2 and data[0] == data[-1] == '"':
                        data = data[1:-1]
                    try:
                        meta[label] = HMTYPES[label](data)
                    except KeyError:
                        LOGGER.warn('Unrecognized label encountered: {0}'
                                    .format(repr(label)))
                        # no converter is known, keep the text as it is
                        meta[label] = data
                    except (TypeError, ValueError):
                        LOGGER.warn('Could not parse data with label {0}.'
                                    .format(repr(label)))
            elif line.strip():
                arrs.append(line.rstrip())
            line = readline()
    finally:
        close()

    nnums = len(meta.get('numbering', ''))
    heatmap = []
    numbers = []

    for arr in arrs:
        if nnums:
            # numbers in front of the data are separated by colons
            items = arr.split(':', nnums + 1)
            numbers.append(items[:nnums])
        else:
            items = [arr]
        heatmap.append(fromstring(items[-1], float, sep=';'))

    heatmap = array(heatmap)
    if nnums:
        numbering = meta['numbering']
        try:
            numbers = array(numbers, int)
        except ValueError:
            try:
                numbers = array(numbers, float)
            except ValueError:
                LOGGER.warn('Numbering for y-axis could not be parsed.')
                numbering = []
        for i, label in enumerate(numbering):
            meta[label] = numbers[:, i].copy()

    return heatmap, meta
def fetchPDBviaFTP(*pdb, **kwargs):
    """Retrieve PDB (default), PDBML, mmCIF, or EMD file(s) for specified
    *pdb* identifier(s) and return path(s).  Downloaded files will be stored
    in local PDB folder, if one is set using :meth:`.pathPDBFolder`, and
    copied into *folder*, if specified by the user.  If no destination folder
    is specified, files will be saved in the current working directory.  If
    *compressed* is **False**, decompressed files will be copied into
    *folder*.  *format* keyword argument can be used to retrieve
    `PDBML <http://pdbml.pdb.org/>`_, `mmCIF <http://mmcif.pdb.org/>`_
    and `EMD <ftp://ftp.wwpdb.org/pub/emdb/doc/Map-format/current/EMDB_map_format.pdf>`_
    files: ``format='cif'`` will fetch an mmCIF file, ``format='emd'`` (or
    its alias ``format='map'``) will fetch an EMD file, and ``format='xml'``
    will fetch a PDBML file.  If PDBML header file is desired,
    ``noatom=True`` argument will do the job.

    Other keyword arguments read here: *check* (default **True**) runs the
    identifiers through ``checkIdentifiers``; *report* (default **True**)
    enables debug log messages.

    :returns: a single path (or **None** on failure) when one identifier is
        given, otherwise a list with one entry per identifier
    :raises ValueError: when *format* is not one of the supported values"""

    if kwargs.get('check', True):
        identifiers = checkIdentifiers(*pdb)
    else:
        identifiers = list(pdb)

    output_folder = kwargs.pop('folder', None)
    compressed = bool(kwargs.pop('compressed', True))
    format = str(kwargs.pop('format', 'pdb')).lower()
    noatom = bool(kwargs.pop('noatom', False))
    report = kwargs.get('report', True)

    # server-side layout and local file extension for each format
    if format == 'pdb':
        ftp_divided = 'pdb/data/structures/divided/pdb'
        ftp_pdbext = '.ent.gz'
        ftp_prefix = 'pdb'
        extension = '.pdb'
    elif format == 'xml':
        if noatom:
            ftp_divided = 'pdb/data/structures/divided/XML-noatom'
            ftp_pdbext = '-noatom.xml.gz'
            extension = '-noatom.xml'
        else:
            ftp_divided = 'pdb/data/structures/divided/XML'
            ftp_pdbext = '.xml.gz'
            extension = '.xml'
        ftp_prefix = ''
    elif format == 'cif':
        ftp_divided = 'pdb/data/structures/divided/mmCIF'
        ftp_pdbext = '.cif.gz'
        ftp_prefix = ''
        extension = '.cif'
    elif format == 'emd' or format == 'map':
        ftp_divided = 'emdb/structures'
        ftp_pdbext = '.map.gz'
        ftp_prefix = 'emd_'
        extension = '.map'
    else:
        raise ValueError(repr(format) + ' is not valid format')
    # EMD entries live in per-entry folders, not two-letter divided folders
    is_emd = format in ('emd', 'map')

    local_folder = pathPDBFolder()

    # getPath: where the raw download is written
    # second:  post-processing (copy or gunzip) that yields the final path
    if format == 'pdb' and local_folder:
        local_folder, is_divided = local_folder
        if is_divided:
            getPath = lambda pdb: join(makePath(join(local_folder, pdb[1:3])),
                                       'pdb' + pdb + '.pdb.gz')
        else:
            getPath = lambda pdb: join(local_folder, pdb + '.pdb.gz')
        if output_folder is None:
            second = lambda filename, pdb: filename
        else:
            if compressed:
                second = lambda filename, pdb: (copyFile(
                    filename, join(output_folder, pdb + extension + '.gz')))
            else:
                second = lambda filename, pdb: gunzip(
                    filename, join(output_folder, pdb + extension))
    else:
        if output_folder is None:
            output_folder = getcwd()
        if compressed:
            getPath = lambda pdb: join(output_folder, pdb + extension + '.gz')
            second = lambda filename, pdb: filename
        else:
            getPath = lambda pdb: join(output_folder, pdb + extension)
            second = lambda filename, pdb: gunzip(getPath(pdb), getPath(pdb))

    ftp_name, ftp_host, ftp_path = WWPDB_FTP_SERVERS[wwPDBServer() or 'us']
    if report:
        LOGGER.debug('Connecting wwPDB FTP server {0}.'.format(ftp_name))

    from ftplib import FTP
    try:
        ftp = FTP(ftp_host)
    except Exception as error:
        raise type(error)('FTP connection problem, potential reason: '
                          'no internet connectivity')
    else:
        success = 0
        failure = 0
        filenames = []
        ftp.login('')
        for pdb in identifiers:
            if pdb is None:
                filenames.append(None)
                continue
            data = []
            ftp_fn = ftp_prefix + pdb + ftp_pdbext
            try:
                ftp.cwd(ftp_path)
                ftp.cwd(ftp_divided)
                if is_emd:
                    ftp.cwd('EMD-{0}/map'.format(pdb))
                else:
                    ftp.cwd(pdb[1:3])
                ftp.retrbinary('RETR ' + ftp_fn, data.append)
            except Exception as error:
                if ftp_fn in ftp.nlst():
                    LOGGER.warn(
                        '{0} download failed ({1}). It is '
                        'possible that you do not have rights to '
                        'download .gz files in the current network.'.format(
                            pdb, str(error)))
                else:
                    LOGGER.warn('{0} download failed. {1} does not exist '
                                'on {2}.'.format(ftp_fn, pdb, ftp_host))
                failure += 1
                filenames.append(None)
            else:
                if len(data):
                    filename = getPath(pdb)

                    with open(filename, 'w+b') as pdbfile:
                        for block in data:
                            pdbfile.write(block)

                    filename = normpath(relpath(second(filename, pdb)))
                    if report:
                        LOGGER.debug('{0} downloaded ({1})'.format(
                            pdb, sympath(filename)))
                    success += 1
                    filenames.append(filename)
                else:
                    LOGGER.warn(
                        '{0} download failed, reason unknown.'.format(pdb))
                    failure += 1
                    filenames.append(None)

        ftp.quit()
        if report:
            LOGGER.debug('PDB download via FTP completed ({0} downloaded, '
                         '{1} failed).'.format(success, failure))
        if len(identifiers) == 1:
            return filenames[0]
        else:
            return filenames
def parseMSA(filename, **kwargs):
    """Parse the multiple sequence alignment in *filename* and return it as
    an :class:`.MSA` instance holding the sequences and their labels.

    Stockholm, SELEX, and FASTA files are supported, and the file may be
    compressed.  Uncompressed files are handed to the parsers imported from
    ``.msaio``; a ``.gz`` file, a *filter* or *slice* keyword, or
    ``aligned=False`` switches to iterating over ``MSAFile`` in Python.

    **None** is returned (after a warning) when the Python path yields no
    sequences.  :class:`TypeError` is raised when *filename* is not a
    string, and :class:`IOError` when the file does not exist or its
    extension is not recognized."""

    from .msa import MSA

    try:
        exists = isfile(filename)
    except TypeError:
        raise TypeError('filename must be a string')
    if not exists:
        raise IOError('[Errno 2] No such file or directory: ' +
                      repr(filename))

    LOGGER.timeit('_parsemsa')

    stem, ext = splitext(filename)
    title = split(stem)[1]
    aligned = kwargs.get('aligned', True)
    gzipped = ext.lower() == '.gz'

    # compressed input, filtering/slicing, or unaligned sequences all need
    # the Python-level reader
    if gzipped or 'filter' in kwargs or 'slice' in kwargs or not aligned:
        if gzipped:
            # drop the format extension that preceded ".gz"
            title = splitext(title)[0]
        msafile = MSAFile(filename, split=False, **kwargs)
        sequences = []
        labels = []
        mapping = {}
        maxlen = 0
        for position, seq in enumerate(msafile):
            label = seq.getLabel(True)
            labels.append(label)
            if aligned:
                sequences.append(seq._array)
            else:
                maxlen = max(maxlen, len(seq))
                sequences.append(seq)

            key = splitSeqLabel(label)[0]
            if key not in mapping:
                mapping[key] = position
                continue
            # repeated key: grow the list, or promote the single index
            try:
                mapping[key].append(position)
            except AttributeError:
                mapping[key] = [mapping[key], position]

        if not sequences:
            LOGGER.warn('No sequences were parsed from {0}.'.format(filename))
            return
        itemtype = '|S1' if aligned else '|S' + str(maxlen)
        msaarr = array(sequences, itemtype)
    else:
        filesize = getsize(filename)
        fmt = MSAEXTMAP.get(splitext(filename)[1])
        if fmt == FASTA:
            from .msaio import parseFasta as parser
        elif fmt in (SELEX, STOCKHOLM):
            from .msaio import parseSelex as parser
        else:
            raise IOError('MSA file format is not recognized from the '
                          'extension')
        # buffer sized by the file length, passed to and returned by parser
        msaarr = empty(filesize, '|S1')
        msaarr, labels, mapping, lcount = parser(filename, msaarr)
        if lcount != len(msaarr):
            LOGGER.warn('Failed to parse {0} sequence labels.'
                        .format(len(msaarr) - lcount))

    msa = MSA(msa=msaarr, title=title, labels=labels, mapping=mapping,
              aligned=aligned)

    if aligned:
        message = ('{0} sequence(s) with {1} residues were parsed in '
                   '%.2fs.'.format(*msaarr.shape))
    else:
        message = ('{0} sequence(s) were parsed in %.2fs.'
                   .format(*msaarr.shape))
    LOGGER.report(message, '_parsemsa')
    return msa
def buildHessian(self, coords, masses, cutoff=15., gamma=1., **kwargs):
    """Build Hessian matrix for given coordinate set.

    NOTE: MASS WEIGHTING OF HESSIAN IS PERFORMED.  Every 3x3 super element
    is scaled by ``1 / sqrt(masses[i] * masses[j])``.

    :arg coords: a coordinate set or an object with ``getCoords`` method
    :type coords: :class:`numpy.ndarray`

    :arg masses: per-atom masses, indexed by atom index
    :type masses: sequence of float

    :arg cutoff: cutoff distance (Å) for pairwise interactions,
        default is 15.0 Å, minimum is 4.0 Å
    :type cutoff: float

    :arg gamma: spring constant, default is 1.0
    :type gamma: float, :class:`Gamma`

    :arg sparse: elect to use sparse matrices, default is **False**. If
        Scipy is not found, :class:`ImportError` is raised.
    :type sparse: bool

    :arg kdtree: elect to use KDTree for building Hessian matrix,
        default is **False** since KDTree method is slower
    :type kdtree: bool

    Instances of :class:`Gamma` classes and custom functions are
    accepted as *gamma* argument.

    When Scipy is available, user can select to use sparse matrices for
    efficient usage of memory at the cost of computation speed.

    The results are stored on the instance (``_kirchhoff``, ``_hessian``,
    ``_n_atoms``, ``_dof``); nothing is returned."""

    try:
        coords = (coords._getCoords() if hasattr(coords, '_getCoords') else
                  coords.getCoords())
    except AttributeError:
        try:
            checkCoords(coords)
        except TypeError:
            raise TypeError('coords must be a Numpy array or an object '
                            'with `getCoords` method')

    cutoff, g, gamma = checkENMParameters(cutoff, gamma)
    self._reset()
    self._cutoff = cutoff
    self._gamma = g
    n_atoms = coords.shape[0]
    dof = n_atoms * 3
    LOGGER.timeit('_anm_hessian')

    sparse = kwargs.get('sparse', False)
    if sparse:
        try:
            from scipy import sparse as scipy_sparse
        except ImportError:
            raise ImportError('failed to import scipy.sparse, which is '
                              'required for sparse matrix calculations')
        kirchhoff = scipy_sparse.lil_matrix((n_atoms, n_atoms))
        hessian = scipy_sparse.lil_matrix((dof, dof))
    else:
        kirchhoff = np.zeros((n_atoms, n_atoms), 'd')
        hessian = np.zeros((dof, dof), float)

    # NOTE(review): diagonal blocks accumulate the same
    # 1/sqrt(m_i * m_j)-scaled element as the off-diagonal blocks -- confirm
    # this is the intended mass weighting for the diagonal.
    if kwargs.get('kdtree', False):
        LOGGER.info('Using KDTree for building the Hessian.')
        kdtree = KDTree(coords)
        kdtree.search(cutoff)
        for i, j in kdtree.getIndices():
            i2j = coords[j] - coords[i]
            dist2 = np.dot(i2j, i2j)
            g = gamma(dist2, i, j)
            mass_i = masses[i]
            mass_j = masses[j]
            super_element = np.outer(
                i2j, i2j) * (-g / dist2) * (1.0 / np.sqrt(mass_i * mass_j))
            res_i3 = i * 3
            res_i33 = res_i3 + 3
            res_j3 = j * 3
            res_j33 = res_j3 + 3
            hessian[res_i3:res_i33, res_j3:res_j33] = super_element
            hessian[res_j3:res_j33, res_i3:res_i33] = super_element
            hessian[res_i3:res_i33, res_i3:res_i33] = \
                hessian[res_i3:res_i33, res_i3:res_i33] - super_element
            hessian[res_j3:res_j33, res_j3:res_j33] = \
                hessian[res_j3:res_j33, res_j3:res_j33] - super_element
            kirchhoff[i, j] = -g
            kirchhoff[j, i] = -g
            kirchhoff[i, i] = kirchhoff[i, i] + g
            kirchhoff[j, j] = kirchhoff[j, j] + g
    else:
        # logged (not printed) so the block is valid on Python 2 and 3
        LOGGER.info('anm-mod: not using kdtree - using mass weighting')
        cutoff2 = cutoff * cutoff
        for i in range(n_atoms):
            res_i3 = i * 3
            res_i33 = res_i3 + 3
            i_p1 = i + 1
            # vectors from atom i to every later atom
            i2j_all = coords[i_p1:, :] - coords[i]
            for j, dist2 in enumerate((i2j_all**2).sum(1)):
                if dist2 > cutoff2:
                    continue
                i2j = i2j_all[j]
                # convert the offset within i2j_all to an atom index
                j += i_p1
                g = gamma(dist2, i, j)
                mass_i = masses[i]
                mass_j = masses[j]
                res_j3 = j * 3
                res_j33 = res_j3 + 3
                super_element = np.outer(i2j, i2j) * (-g / dist2) * (
                    1.0 / np.sqrt(mass_i * mass_j))
                hessian[res_i3:res_i33, res_j3:res_j33] = super_element
                hessian[res_j3:res_j33, res_i3:res_i33] = super_element
                hessian[res_i3:res_i33, res_i3:res_i33] = \
                    hessian[res_i3:res_i33, res_i3:res_i33] - super_element
                hessian[res_j3:res_j33, res_j3:res_j33] = \
                    hessian[res_j3:res_j33, res_j3:res_j33] - super_element
                kirchhoff[i, j] = -g
                kirchhoff[j, i] = -g
                kirchhoff[i, i] = kirchhoff[i, i] + g
                kirchhoff[j, j] = kirchhoff[j, j] + g

    if sparse:
        kirchhoff = kirchhoff.tocsr()
        hessian = hessian.tocsr()

    LOGGER.report('Hessian was built in %.2fs.', label='_anm_hessian')
    self._kirchhoff = kirchhoff
    self._hessian = hessian
    self._n_atoms = n_atoms
    self._dof = dof
def parsePfamPDBs(query, data=None, **kwargs):
    """Returns a list of :class:`.AtomGroup` objects containing sections of
    chains that correspond to a particular PFAM domain family. These are
    defined by alignment start and end residue numbers.

    :arg query: UniProt ID or PDB ID
        If a PDB ID is provided the corresponding UniProt ID is used.
        If this returns multiple matches then start or end must also be
        provided.  This query is also used for label refinement of the Pfam
        domain MSA.  A query longer than four characters that starts with
        ``PF`` is used directly as the Pfam accession.
    :type query: str

    :arg data: If given the data list from the Pfam mapping table will
        be output through this argument.
    :type data: list

    :keyword start: Residue number for defining the start of the domain.
        The PFAM domain that starts closest to this will be selected.
        Default is **1**
    :type start: int

    :keyword end: Residue number for defining the end of the domain.
        The PFAM domain that ends closest to this will be selected.
        Only consulted when *start* is not an integer.
    :type end: int

    :keyword header: when **True**, ``(atomgroups, headers)`` is returned
        instead of the atom group list alone, default is **False**
    :type header: bool

    Remaining keyword arguments are passed to ``parsePDB``; with
    ``model=0`` its result is returned unchanged.
    """

    start = kwargs.pop('start', 1)
    end = kwargs.pop('end', None)

    if len(query) > 4 and query.startswith('PF'):
        pfam_acc = query
    else:
        pfam_matches = searchPfam(query)
        keys = list(pfam_matches.keys())

        # pick the family whose first location is closest to start (or end)
        if isinstance(start, Integral):
            start_diff = np.array([
                int(pfam_matches[key]['locations'][0]['start']) - start
                for key in pfam_matches
            ])
            pfam_acc = keys[np.where(
                abs(start_diff) == min(abs(start_diff)))[0][0]]
        elif isinstance(end, Integral):
            end_diff = np.array([
                int(pfam_matches[key]['locations'][0]['end']) - end
                for key in pfam_matches
            ])
            pfam_acc = keys[np.where(
                abs(end_diff) == min(abs(end_diff)))[0][0]]
        else:
            raise ValueError('Please provide an integer for start or end '
                             'when using a UniProt ID or PDB ID.')

    from ftplib import FTP
    from .uniprot import queryUniprot

    # download the Pfam-to-PDB mapping table
    data_stream = BytesIO()
    ftp_host = 'ftp.ebi.ac.uk'
    ftp = FTP(ftp_host)
    ftp.login()
    ftp.cwd('pub/databases/Pfam/current_release')
    ftp.retrbinary('RETR pdbmap.gz', data_stream.write)
    ftp.quit()
    zip_data = data_stream.getvalue()
    data_stream.close()

    rawdata = gunzip(zip_data)
    if PY3K:
        rawdata = rawdata.decode()

    fields = [
        'PDB_ID', 'chain', 'nothing', 'PFAM_Name', 'PFAM_ACC', 'UniprotAcc',
        'UniprotResnumRange'
    ]

    # keep only the table rows that mention the selected accession
    data_dicts = []
    for line in rawdata.split('\n'):
        if line.find(pfam_acc) != -1:
            data_dicts.append({})
            for j, entry in enumerate(line.strip().split('\t')):
                data_dicts[-1][fields[j]] = entry.strip(';')

    pdb_ids = [data_dict['PDB_ID'] for data_dict in data_dicts]
    chains = [data_dict['chain'] for data_dict in data_dicts]

    # the caller's flag; kept apart from the per-structure header used below
    want_header = kwargs.pop('header', False)
    model = kwargs.get('model', None)
    results = parsePDB(*pdb_ids, chain=chains, header=True, **kwargs)

    ags, headers = results
    ags, headers = list(ags), list(headers)

    if model == 0:
        LOGGER.info('only header is requested and returned')
        return results

    if want_header:
        results = (ags, headers)
    else:
        results = ags

    LOGGER.progress('Extracting Pfam domains...', len(ags))
    comma_splitter = re.compile(r'\s*,\s*').split
    no_info = []
    for i, ag in enumerate(ags):
        LOGGER.update(i)
        data_dict = data_dicts[i]
        pfamRange = data_dict['UniprotResnumRange'].split('-')
        uniprotAcc = data_dict['UniprotAcc']
        try:
            uniData = queryUniprot(uniprotAcc)
        except Exception:
            LOGGER.warn('No Uniprot record found for {0}'.format(
                data_dict['PDB_ID']))
            continue

        resrange = None
        found = False
        for key, value in uniData.items():
            if not key.startswith('dbReference'):
                continue
            try:
                pdbid = value['PDB']
            except Exception:
                continue
            if pdbid != data_dict['PDB_ID']:
                continue
            pdbchains = value['chains']

            # example chain strings: "A=27-139, B=140-150" or "A/B=27-150"
            pdbchains = comma_splitter(pdbchains)
            for chain in pdbchains:
                chids, resrange = chain.split('=')
                chids = [chid.strip() for chid in chids.split('/')]
                if data_dict['chain'] in chids:
                    resrange = resrange.split('-')
                    found = True
                    break
            if found:
                break

        if found:
            pdb_header = headers[i]
            chain_accessions = [
                dbref.accession
                for dbref in pdb_header[data_dict['chain']].dbrefs
            ]
            try:
                if len(chain_accessions) > 0:
                    right_part = np.where(
                        np.array(chain_accessions) ==
                        data_dict['UniprotAcc'])[0][0]
                else:
                    raise ValueError(
                        'There is no accession for a chain in the Header')
            except Exception:
                LOGGER.warn(
                    'Could not map domains in {0}'.format(
                        data_dict['PDB_ID'] + data_dict['chain']))
                no_info.append(i)
                continue

            right_dbref = pdb_header[data_dict['chain']].dbrefs[right_part]
            chainStart = ag.select('chain {0}'.format(
                data_dict['chain'])).getResnums()[0]
            missing = chainStart - right_dbref.first[0]
            partStart = ag.getResindices()[np.where(
                ag.getResnums() == right_dbref.first[0] + missing)][0]
            pfStart, pfEnd = int(pfamRange[0]), int(pfamRange[1])
            uniStart, uniEnd = int(resrange[0]), int(resrange[1])

            resiStart = pfStart - uniStart + partStart - missing
            resiEnd = pfEnd - uniStart + partStart - missing
            ags[i] = ag.select('resindex {0} to {1}'.format(
                resiStart, resiEnd))
        else:
            no_info.append(i)
    LOGGER.finish()

    # drop entries that could not be mapped, highest index first so the
    # remaining indices stay valid
    for i in reversed(no_info):
        ags.pop(i)
        if want_header:
            headers.pop(i)

    if data is not None:
        if isinstance(data, list):
            data.extend(data_dicts)
        else:
            LOGGER.warn('data should be a list in order to get output')

    return results
def _superpose(self, **kwargs):
    """Superpose every conformation onto the reference coordinates and
    overwrite the stored conformations with the result.

    When ``self._indices`` is set, the fit is computed on that subset of
    atoms and the resulting rotation and translation are applied to all
    atoms of each conformation.  When ``self._weights`` is set, centers and
    the correlation matrix are weighted.  The optional *ref* keyword
    selects the target rows used to compute the target center."""

    ref = kwargs.pop('ref', None)
    indices = self._indices
    weights = self._weights
    confs = self._confs
    subset = indices is not None

    if subset:
        if self._weights is not None:
            weights = weights[indices]
        target = self._coords[indices]
        whole = self._confs
    else:
        target = self._coords
        whole = None

    linalg = importLA()
    svd, det = linalg.svd, linalg.det
    weighted = weights is not None

    if weighted:
        weights_sum = weights.sum()
        weights_dot = dot(weights.T, weights)
        if ref is None:
            center = (target * weights).sum(axis=0) / weights_sum
        else:
            center = (target[ref] * weights[ref]).sum(axis=0) / sum(
                weights[ref])
        target_centered = target - center
        # scratch buffer that receives each centered mobile conformation
        scratch = zeros(target_centered.shape, dtype=confs.dtype)
    else:
        center = target.mean(0) if ref is None else target[ref]
        target_centered = target - center
        scratch = zeros(target_centered.shape, dtype=confs.dtype)
        # unweighted path multiplies by the transposed target directly
        target_centered = target_centered.T

    LOGGER.progress('Superposing ', len(confs), '_prody_ensemble')
    for i, conf in enumerate(confs):
        mobile = conf[indices] if subset else conf

        if weighted:
            mobile_center = (mobile * weights).sum(axis=0) / weights_sum
            subtract(mobile, mobile_center, scratch)
            matrix = dot((target_centered * weights).T,
                         (scratch * weights)) / weights_dot
        else:
            mobile_center = mobile.mean(0)
            matrix = dot(target_centered,
                         subtract(mobile, mobile_center, scratch))

        U, _, Vh = svd(matrix)
        # last diagonal entry carries the sign of the determinant
        handedness = array([[1, 0, 0], [0, 1, 0],
                            [0, 0, sign(det(matrix))]])
        rotation = dot(Vh.T, dot(handedness, U.T))

        if whole is None:
            confs[i] = dot(scratch, rotation)
            add(confs[i], center, confs[i])
        else:
            shift = center - dot(mobile_center, rotation)
            add(dot(whole[i], rotation), shift, whole[i])
        LOGGER.update(i + 1, label='_prody_ensemble')
    LOGGER.finish()
def searchUniprotID(query, search_b=False, skip_a=False, **kwargs):
    """Return the ``id`` attribute of the first element in the Pfam protein
    XML record retrieved for *query* (presumably the UniProt entry name --
    confirm against the Pfam response format), or **None** when the server
    reports a system error for the request.

    :arg query: UniProt ID or PDB identifier; whitespace is removed before
        the value is placed in the request URL
    :type query: str

    :arg search_b: accepted for signature compatibility, not used here
    :type search_b: bool

    :arg skip_a: accepted for signature compatibility, not used here
    :type skip_a: bool

    :arg timeout: timeout for blocking connection attempt in seconds,
        default is 60; the request is retried until this much time has
        elapsed
    :type timeout: int

    :raises IOError: when no response is obtained before the timeout
    :raises ValueError: when the response cannot be parsed as XML"""

    query = str(query)
    seq = ''.join(query.split())

    # cElementTree was removed in Python 3.9; ElementTree is the drop-in
    # replacement there
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit('_pfam')
    timeout = int(kwargs.get('timeout', 60))
    url = prefix + 'protein/' + seq + '?output=xml'

    LOGGER.debug('Retrieving Pfam search results: ' + url)
    xml = None
    # keep retrying failed or empty responses until the time budget is spent
    while LOGGER.timing('_pfam') < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError('Pfam search timed out or failed to parse results '
                      'XML, check URL: ' + url)
    else:
        LOGGER.report('Pfam search completed in %.2fs.', '_pfam')

    # membership test also catches the message at the very start of the body
    if b'There was a system error on your last request.' in xml:
        LOGGER.warn('No Pfam matches found for: ' + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError('failed to parse results XML, check URL: ' + url)

    return root[0].get('id')
def refineMSA(msa, index=None, label=None, rowocc=None, seqid=None,
              colocc=None, **kwargs):
    """Refine *msa* by removing sequences (rows) and residues (columns) that
    contain gaps.

    :arg msa: multiple sequence alignment
    :type msa: :class:`.MSA`

    :arg index: remove columns that are gaps in the sequence with that index
    :type index: int

    :arg label: remove columns that are gaps in the sequence matching label,
        ``msa.getIndex(label)`` must return a sequence index, a PDB
        identifier is also acceptable; ignored when *index* is given
    :type label: str

    :arg rowocc: row occupancy, sequences with less occupancy will be
        removed after *label* refinement is applied
    :type rowocc: float

    :arg seqid: keep unique sequences at specified sequence identity level,
        unique sequences are identified using :func:`.uniqueSequences`
    :type seqid: float

    :arg colocc: column occupancy, residue positions with less occupancy
        will be removed after other refinements are applied
    :type colocc: float

    :arg keep: keep columns corresponding to residues not resolved in the
        PDB structure, default is **False**, applies when *label* is a PDB
        identifier
    :type keep: bool

    For Pfam MSA data, *label* is UniProt entry name for the protein.  You
    may also use PDB structure and chain identifiers, e.g. ``'1p38'`` or
    ``'1p38A'``, for *label* argument and UniProt entry names will be parsed
    using :func:`.parsePDBHeader` function (see also :class:`.Polymer` and
    :class:`.DBRef`).

    The order of refinements are applied in the order of arguments.  If
    *label* and *seqid* are specified, sequence matching *label* will be
    kept in the refined :class:`.MSA` although it may be similar to some
    other sequence.

    A refined character array is returned when *msa* is an array, otherwise
    a new :class:`.MSA` instance.  :class:`TypeError` is raised for an
    unsupported *msa* or *label* type, and :class:`ValueError` when *msa* is
    not a 2D ``'|S1'`` array, when *label* cannot be resolved to a single
    sequence, or when no refinement option is given."""

    # if msa is a char array, it will be refined but label won't work
    try:
        ndim, dtype_ = msa.ndim, msa.dtype
    except AttributeError:
        try:
            arr = msa._getArray()
        except AttributeError:
            raise TypeError('msa must be a character array or an MSA instance')
        ndim, dtype_ = arr.ndim, arr.dtype
    else:
        arr, msa = msa, None

    if dtype('|S1') != dtype_:
        raise ValueError('msa must be a character array or an MSA instance')
    if ndim != 2:
        raise ValueError('msa must be a 2D array or an MSA instance')

    title = []
    cols = None

    if index is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        cols = char.isalpha(arr[index]).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('index=' + str(index))
        LOGGER.report(
            'Index refinement reduced number of columns from {0} to '
            '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

    if label is not None:
        if index is not None:
            LOGGER.info('An index was provided so the label will be ignored.')
        else:
            before = arr.shape[1]
            LOGGER.timeit('_refine')
            try:
                upper, lower = label.upper(), label.lower()
            except AttributeError:
                raise TypeError('label must be a string')

            if msa is None:
                raise TypeError('msa must be an MSA instance, '
                                'label cannot be used')

            index = msa.getIndex(label)
            if index is None:
                index = msa.getIndex(upper)
            if index is None:
                index = msa.getIndex(lower)

            chain = None
            if index is None and (len(label) == 4 or len(label) == 5):
                # label looks like a PDB (+ chain) identifier: resolve it
                # through the database references in the PDB header
                from prody import parsePDB
                try:
                    structure, header = parsePDB(label[:4], header=True)
                except Exception as err:
                    raise IOError(
                        'failed to parse header for {0} ({1})'.format(
                            label[:4], str(err)))

                chid = label[4:].upper()
                matched = None
                for poly in header['polymers']:
                    if chid and poly.chid != chid:
                        continue
                    for dbref in poly.dbrefs:
                        index = msa.getIndex(dbref.idcode)
                        if index is not None:
                            LOGGER.info('{0} idcode {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.idcode,
                                            label[:4], poly.chid, str(msa)))
                            break
                        index = msa.getIndex(dbref.accession)
                        if index is not None:
                            LOGGER.info('{0} accession {1} for {2}{3} '
                                        'is found in chain {4}.'.format(
                                            dbref.database, dbref.accession,
                                            label[:4], poly.chid, str(msa)))
                            break
                    if index is not None:
                        # remember the polymer that produced the hit so the
                        # chain below is not taken from a later polymer
                        matched = poly
                        break
                if matched is not None:
                    chain = structure[matched.chid]
                    resnums = chain.ca.getResnums()

            if index is None:
                raise ValueError('label is not in msa, or msa is not indexed')
            try:
                len(index)
            except TypeError:
                pass
            else:
                raise ValueError(
                    'label {0} maps onto multiple sequences, '
                    'so cannot be used for refinement'.format(label))

            title.append('label=' + label)
            cols = char.isalpha(arr[index]).nonzero()[0]
            arr = arr.take(cols, 1)
            LOGGER.report(
                'Label refinement reduced number of columns from {0} to '
                '{1} in %.2fs.'.format(before, arr.shape[1]), '_refine')

            if chain is not None and not kwargs.get('keep', False):
                before = arr.shape[1]
                LOGGER.timeit('_refine')

                from Bio import pairwise2
                from prody.utilities import MATCH_SCORE, MISMATCH_SCORE
                from prody.utilities import GAP_PENALTY, GAP_EXT_PENALTY, ALIGNMENT_METHOD

                chseq = chain.getSequence()
                # tobytes() replaces ndarray.tostring(), which newer NumPy
                # releases no longer provide
                algn = pairwise2.align.localms(
                    pystr(arr[index].tobytes().upper()), pystr(chseq),
                    MATCH_SCORE, MISMATCH_SCORE,
                    GAP_PENALTY, GAP_EXT_PENALTY,
                    one_alignment_only=1)
                # one flag per MSA column: True when aligned to a residue
                torf = []
                for s, c in zip(*algn[0][:2]):
                    if s == '-':
                        continue
                    elif c != '-':
                        torf.append(True)
                    else:
                        torf.append(False)
                torf = array(torf)
                tsum = torf.sum()
                assert tsum <= before, 'problem in mapping sequence to structure'
                if tsum < before:
                    arr = arr.take(torf.nonzero()[0], 1)
                    resnums = resnums.take(torf.nonzero()[0] -
                                           torf.nonzero()[0][0] + 1)
                    LOGGER.report(
                        'Structure refinement reduced number of '
                        'columns from {0} to {1} in %.2fs.'.format(
                            before, arr.shape[1]), '_refine')
                else:
                    LOGGER.debug(
                        'All residues in the sequence are contained in '
                        'PDB structure {0}.'.format(label))

                labels = msa._labels
                labels[index] = splitSeqLabel(labels[index])[0] + '/' + str(
                    resnums[0]) + '-' + str(resnums[-1])

    from .analysis import calcMSAOccupancy, uniqueSequences

    rows = None
    if rowocc is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        try:
            rowocc = float(rowocc)
        except Exception as err:
            raise TypeError('rowocc must be a float ({0})'.format(str(err)))
        assert 0. <= rowocc <= 1., 'rowocc must be between 0 and 1'

        rows = calcMSAOccupancy(arr, 'row') >= rowocc
        if index is not None:
            # position of the reference sequence among the kept rows
            index = rows[:index].sum()
        rows = (rows).nonzero()[0]
        arr = arr[rows]
        title.append('rowocc>=' + str(rowocc))
        LOGGER.report(
            'Row occupancy refinement reduced number of rows from '
            '{0} to {1} in %.2fs.'.format(before, arr.shape[0]), '_refine')

    if seqid is not None:
        before = arr.shape[0]
        LOGGER.timeit('_refine')
        unique = uniqueSequences(arr, seqid)
        if index is not None:
            # always keep the reference sequence
            unique[index] = True
        unique = unique.nonzero()[0]
        arr = arr[unique]
        title.append('seqid>=' + str(seqid))
        if rows is not None:
            rows = rows[unique]
        else:
            rows = unique
        LOGGER.report(
            'Sequence identity refinement reduced number of rows '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[0]),
            '_refine')

    if colocc is not None:
        before = arr.shape[1]
        LOGGER.timeit('_refine')
        try:
            colocc = float(colocc)
        except Exception as err:
            raise TypeError('colocc must be a float ({0})'.format(str(err)))
        assert 0. <= colocc <= 1., 'colocc must be between 0 and 1'

        cols = (calcMSAOccupancy(arr, 'col') >= colocc).nonzero()[0]
        arr = arr.take(cols, 1)
        title.append('colocc>=' + str(colocc))
        LOGGER.report(
            'Column occupancy refinement reduced number of columns '
            'from {0} to {1} in %.2fs.'.format(before, arr.shape[1]),
            '_refine')

    if not title:
        raise ValueError(
            'label, index, seqid, rowocc, colocc all cannot be None')

    # depending on slicing of rows, arr may not have its own memory
    if arr.base is not None:
        arr = arr.copy()

    if msa is None:
        return arr
    else:
        if rows is None:
            from copy import copy
            labels = copy(msa._labels)
        else:
            labels = msa._labels
            labels = [labels[i] for i in rows]
        return MSA(arr,
                   title=msa.getTitle() +
                   ' refined ({0})'.format(', '.join(title)),
                   labels=labels)
def rhapsody(query, query_type='SAVs', main_classifier=None,
             aux_classifier=None, custom_PDB=None, force_env=None,
             refresh=False, log=True, **kwargs):
    """Obtain Rhapsody pathogenicity predictions on a list of human missense
    variants ([ref]_)

    :arg query: Single Amino Acid Variants (SAVs) in Uniprot coordinates

      - if *query_type* = ``'SAVs'`` (default), it should be a filename, a
        string or a list/tuple of strings, containing Uniprot SAV
        coordinates, with the format ``'P17516 135 G E'``. The string could
        also be just a single Uniprot sequence identifier (e.g.
        ``'P17516'``), or the coordinate of a specific site in a sequence
        (e.g. ``'P17516 135'``), in which case all possible 19 amino acid
        substitutions at the specified positions will be analyzed.
      - if *query_type* = ``'PolyPhen2'``, it should be a filename
        containing the output from PolyPhen-2, usually named
        :file:`pph2-full.txt`
    :type query: str, list

    :arg query_type: ``'SAVs'`` or ``'PolyPhen2'``
    :type query_type: str

    :arg main_classifier: main classifier's filename. If **None**, the
        default *full* Rhapsody classifier will be used
    :type main_classifier: str

    :arg aux_classifier: auxiliary classifier's filename. If both
        *main_classifier* and *aux_classifier* are **None**, the default
        *reduced* Rhapsody classifier will be used
    :type aux_classifier: str

    :arg custom_PDB: a PDBID, a filename or an :class:`Atomic` to be used
        for computing structural and dynamical features, instead of the PDB
        structure automatically selected by the program
    :type custom_PDB: str, :class:`AtomGroup`

    :arg force_env: force a specific environment model for GNM/ANM
        calculations, among ``'chain'``, ``'reduced'`` and ``'sliced'``. If
        **None** (default), the model of individual dynamical features will
        match that found in the classifier's feature set
    :type force_env: str

    :arg refresh: if **True**, precomputed features and PDB mappings found
        in the working directory will be ignored and computed again
    :type refresh: bool

    :arg log: if **True**, log messages will be saved in
        :file:`rhapsody-log.txt`
    :type log: bool

    Remaining keyword arguments are passed to the ``Rhapsody`` constructor.
    The populated ``Rhapsody`` object is returned.

    .. [ref] Ponzoni L, Bahar I. Structural dynamics is a determinant of
        the functional significance of missense variants. *PNAS* **2018**
        115 (16) 4164-4169.
    """

    assert query_type in ['SAVs', 'PolyPhen2'], 'Invalid query type.'

    if log:
        LOGGER.start('rhapsody-log.txt')

    try:
        # select classifiers
        if main_classifier is None:
            main_classifier = getDefaultClassifiers()['full']
            if aux_classifier is None:
                aux_classifier = getDefaultClassifiers()['reduced']

        # initialize object that will contain all results and predictions
        r = Rhapsody(**kwargs)

        # import classifiers and feature set from pickle
        r.importClassifiers(main_classifier, aux_classifier,
                            force_env=force_env)

        # import custom PDB structure
        if custom_PDB is not None:
            r.setCustomPDB(custom_PDB)

        # obtain or import PolyPhen-2 results
        if query_type == 'SAVs':
            r.queryPolyPhen2(query)
        elif query_type == 'PolyPhen2':
            r.importPolyPhen2output(query)

        # compute predictions
        r.getPredictions(refresh=refresh)

        # print predictions to file
        r.printPredictions()
        if aux_classifier is not None:
            # print both 'full' and 'reduced' predictions in a more
            # detailed format
            r.printPredictions(
                classifier="both", PolyPhen2=False, EVmutation=False,
                filename='rhapsody-predictions-full_vs_reduced.txt')

        # save pickle
        r.savePickle()
    finally:
        # close the log even when one of the steps above raises
        if log:
            LOGGER.close('rhapsody-log.txt')

    return r
def _parseHeader(self):
    """Read the header of the DCD file in ``self._file``.

    On success the following attributes are set from the header:
    ``_n_csets`` (number of frames), ``_first_ts`` (starting timestep),
    ``_framefreq`` (timesteps between saves), ``_n_fixed`` (number of fixed
    atoms), ``_timestep``, ``_unitcell``, ``_dcdtitle``, ``_remarks`` (only
    when a second title record is present), ``_n_atoms``, ``_is64bit``,
    ``_endian``, ``_n_floats``, ``_bytes_per_frame``, ``_dtype``,
    ``_itemsize``, ``_first_byte``, ``_coords`` (first coordinate set) and
    ``_nfi``.  The file is left positioned at the first frame.

    If the number of frames in the header disagrees with the number derived
    from the file size, a warning is logged and the latter is used.

    :returns: **None**.  X-PLOR format files are not supported; for those a
        message is logged and the method returns early without setting the
        attributes above.
    :raises IOError: if the header is not recognized, the file is a CHARMM
        64-bit DCD, or the file contains fixed atoms.
    """
    dcd = self._file
    # Byte-order prefix for struct formats.  Kept as bytes throughout so it
    # can be concatenated with the bytes format strings below.
    endian = b''  # native endian
    rec_scale = RECSCALE32BIT
    charmm = None
    dcdcordmagic = unpack(endian + b'i', b'CORD')[0]

    # Check magic number in file header and determine byte order
    bits = dcd.read(calcsize('ii'))
    temp = unpack(endian + b'ii', bits)

    if temp[0] + temp[1] == 84:
        LOGGER.info('Detected CHARMM -i8 64-bit DCD file of native '
                    'endianness.')
        rec_scale = RECSCALE64BIT
    elif temp[0] == 84 and temp[1] == dcdcordmagic:
        # standard 32-bit DCD file of native endianness
        pass
    else:
        # NOTE(review): this selects the byte order whose reading *equals*
        # the native one, which looks inverted for an "opposite endianness"
        # probe -- confirm intent before relying on this branch.
        if unpack(b'>ii', bits) == temp:
            endian = b'>'
        else:
            endian = b'<'
        temp = unpack(endian + b'ii', bits)
        if temp[0] + temp[1] == 84:
            rec_scale = RECSCALE64BIT
            LOGGER.info('Detected CHARMM -i8 64-bit DCD file of opposite '
                        'endianness.')
        else:
            endian = b''
            temp = unpack(endian + b'ii', bits)
            if temp[0] == 84 and temp[1] == dcdcordmagic:
                LOGGER.info('Detected standard 32-bit DCD file of '
                            'opposite endianness.')
            else:
                raise IOError('Unrecognized DCD header or unsupported '
                              'DCD format.')

    # Long record markers are detected above but not supported
    if rec_scale == RECSCALE64BIT:
        raise IOError('CHARMM 64-bit DCD files are not yet supported.')

    # Buffer the entire header for random access
    bits = dcd.read(80)

    # CHARMm-generated DCD files set the last integer in the
    # header, which is unused by X-PLOR, to its version number.
    # Checking if this is nonzero tells us this is a CHARMm file
    # and to look for other CHARMm flags.
    temp = unpack(endian + b'i' * 20, bits)
    if temp[-1] != 0:
        charmm = True

    if charmm:
        # CHARMM format DCD file (also NAMD 2.1 and later); the 10th
        # field is re-read as a float
        temp = unpack(endian + b'i' * 9 + b'f' + b'i' * 10, bits)
    else:
        LOGGER.info('X-PLOR format DCD file (also NAMD 2.0 and earlier) '
                    'is not supported.')
        return None

    # Store the number of sets of coordinates (NSET)
    self._n_csets = temp[0]
    # Store ISTART, the starting timestep
    self._first_ts = temp[1]
    # Store NSAVC, the number of timesteps between dcd saves
    self._framefreq = temp[2]
    # Store NAMNF, the number of fixed atoms
    self._n_fixed = temp[8]

    if self._n_fixed > 0:
        raise IOError('DCD files with fixed atoms is not yet supported.')

    # Read in the timestep, DELTA
    # Note: DELTA is stored as double with X-PLOR but as float with CHARMm
    self._timestep = temp[9]
    self._unitcell = temp[10] == 1

    # Get the end size of the first block
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 84:
        raise IOError('Unrecognized DCD format.')

    # Read in the size of the next block
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))
    if (temp[0] - 4) % 80 != 0:
        raise IOError('Unrecognized DCD format.')
    # A block of 84 bytes holds a single 80 character title line
    noremarks = temp[0] == 84

    # Read NTITLE, the number of 80 character title strings there are
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))

    self._dcdtitle = dcd.read(80)

    if not noremarks:
        self._remarks = dcd.read(80)

    # Get the ending size for this block
    temp = unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))
    if (temp[0] - 4) % 80 != 0:
        raise IOError('Unrecognized DCD format.')

    # Read in an integer '4'
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
        raise IOError('Unrecognized DCD format.')

    # Read in the number of atoms
    self._n_atoms = unpack(endian + b'i',
                           dcd.read(rec_scale * calcsize('i')))[0]

    # Read in an integer '4'
    if unpack(endian + b'i', dcd.read(rec_scale * calcsize('i')))[0] != 4:
        raise IOError('Bad DCD format.')

    self._is64bit = rec_scale == RECSCALE64BIT
    self._endian = endian
    self._n_floats = (self._n_atoms + 2) * 3

    if self._is64bit:
        if self._unitcell:
            self._bytes_per_frame = 56 + self._n_floats * 8
        else:
            self._bytes_per_frame = self._n_floats * 8
        LOGGER.warning('Reading of 64 bit DCD files has not been tested. '
                       'Please report any problems that you may find.')
        self._dtype = np.float64
        self._itemsize = 8
    else:
        if self._unitcell:
            self._bytes_per_frame = 56 + self._n_floats * 4
        else:
            self._bytes_per_frame = self._n_floats * 4
        self._dtype = np.float32
        self._itemsize = 4

    self._first_byte = self._file.tell()

    # Trust the file size over the header for the number of frames
    n_csets = ((getsize(self._filename) - self._first_byte) //
               self._bytes_per_frame)
    if n_csets != self._n_csets:
        LOGGER.warning('DCD header claims {0} frames, file size '
                       'indicates there are actually {1} frames.'.format(
                           self._n_csets, n_csets))
        self._n_csets = n_csets

    # Cache the first frame, then rewind to the start of the frames
    self._coords = self.nextCoordset()
    self._file.seek(self._first_byte)
    self._nfi = 0
def mapSAVs2PDB(SAV_coords, custom_PDB=None, refresh=False,
                status_file=None, status_prefix=None):
    """Map each SAV in *SAV_coords* to a residue in a PDB structure.

    :arg SAV_coords: SAVs in Uniprot coordinates, each formatted as
        ``'<accession> <position> <wt aa> <mutated aa>'``
    :type SAV_coords: list of str

    :arg custom_PDB: if not **None**, the Uniprot sequence is aligned to
        this structure and SAVs are mapped onto it instead of the
        automatically selected PDB structures

    :arg refresh: if **True**, previously pickled Uniprot mappings are not
        recovered
    :type refresh: bool

    :arg status_file: if not **None**, path of a file where the progress bar
        is written; the file is deleted once mapping completes
    :type status_file: str

    :arg status_prefix: text prepended to progress messages
    :type status_prefix: str

    :returns: a structured array with fields ``'orig. SAV coords'``,
        ``'unique SAV coords'``, ``'PDB SAV coords'`` and ``'PDB size'``,
        with rows in the same order as *SAV_coords*.  When mapping fails for
        a SAV, ``'PDB SAV coords'`` holds the error message and
        ``'PDB size'`` is 0.
    :rtype: :class:`numpy.ndarray`
    """
    LOGGER.info('Mapping SAVs to PDB structures...')
    LOGGER.timeit('_map2PDB')

    # sort SAVs, so to group together those
    # with identical accession number
    accs = [s.split()[0] for s in SAV_coords]
    sorting_map = np.argsort(accs)

    # define a structured array
    PDBmap_dtype = np.dtype([
        ('orig. SAV coords', 'U25'),
        ('unique SAV coords', 'U25'),
        ('PDB SAV coords', 'U100'),
        ('PDB size', 'i')])
    nSAVs = len(SAV_coords)
    mapped_SAVs = np.zeros(nSAVs, dtype=PDBmap_dtype)

    # define how to report progress
    if status_prefix is None:
        status_prefix = ''
    bar_format = '{l_bar}{bar}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}]'
    work_items = [(i, SAV_coords[i]) for i in sorting_map]
    status_fh = None
    if status_file is not None:
        status_fh = open(status_file, 'w')
        progress_bar = tqdm(work_items, file=status_fh,
                            bar_format=bar_format + '\n')
    else:
        progress_bar = tqdm(work_items, bar_format=bar_format)

    try:
        # map to PDB using Uniprot class; the mapping of the last seen
        # accession number is cached because SAVs are grouped by accession
        cache = {'acc': None, 'obj': None}
        for indx, SAV in progress_bar:
            acc, pos, aa1, aa2 = SAV.split()
            pos = int(pos)

            # report progress
            progress_msg = f"{status_prefix}Mapping SAV '{SAV}' to PDB"
            progress_bar.set_description(progress_msg)

            # map Uniprot to PDB chains
            if acc == cache['acc']:
                # use mapping from previous iteration
                U2P_map = cache['obj']
            else:
                # save previous mapping
                if isinstance(cache['obj'], UniprotMapping):
                    cache['obj'].savePickle()
                cache['acc'] = acc
                # compute the new mapping; on failure the error message
                # is cached in place of the mapping object
                try:
                    U2P_map = UniprotMapping(acc,
                                             recover_pickle=not (refresh))
                    if custom_PDB is not None:
                        LOGGER.info(
                            'Aligning Uniprot sequence to custom PDB...')
                        U2P_map.alignCustomPDB(custom_PDB, 'all')
                except Exception as e:
                    U2P_map = str(e)
                cache['obj'] = U2P_map

            # map specific SAV
            try:
                if isinstance(U2P_map, str):
                    raise RuntimeError(U2P_map)
                # check wt aa
                if not 0 < pos <= len(U2P_map.sequence):
                    raise ValueError('Index out of range')
                wt_aa = U2P_map.sequence[pos - 1]
                if aa1 != wt_aa:
                    raise ValueError(
                        f'Incorrect wt aa: {aa1} instead of {wt_aa}')
                # map to PDB. Format: [('2DZF', 'A', 150, 'N', 335)]
                if custom_PDB is None:
                    r = U2P_map.mapSingleResidue(pos, check_aa=True)
                else:
                    r = U2P_map.mapSingleRes2CustomPDBs(pos, check_aa=True)
                if len(r) == 0:
                    raise RuntimeError('Unable to map SAV to PDB')
                PDBID, chID, resid, aa, PDB_size = r[0]
                # NB: check for blank "chain" field
                if chID.strip() == '':
                    chID = '?'
                res_map = f'{PDBID} {chID} {resid} {aa}'
            except Exception as e:
                res_map = str(e)
                PDB_size = 0

            # store SAVs mapped on PDB chains and unique Uniprot coordinates
            if isinstance(U2P_map, str):
                uniq_coords = U2P_map
            else:
                uniq_coords = f'{U2P_map.uniq_acc} {pos} {aa1} {aa2}'
            mapped_SAVs[indx] = (SAV, uniq_coords, res_map, PDB_size)

        # save last pickle
        if isinstance(cache['obj'], UniprotMapping):
            cache['obj'].savePickle()
    finally:
        # release the progress bar and its output file even on failure
        progress_bar.close()
        if status_fh is not None:
            status_fh.close()

    n = sum(mapped_SAVs['PDB size'] != 0)
    LOGGER.report(f'{n} out of {nSAVs} SAVs have been mapped to PDB in %.1fs.',
                  '_map2PDB')

    # the status file is only meaningful while mapping is in progress
    if status_fh is not None:
        os.remove(status_fh.name)

    return mapped_SAVs
def buildBiomolecules(header, atoms, biomol=None):
    """Returns *atoms* after applying biomolecular transformations from
    *header* dictionary.

    Biomolecular transformations are applied to all coordinate sets in the
    molecule.

    Some PDB files contain transformations for more than 1 biomolecules. A
    specific set of transformations can be chosen using *biomol* argument.
    Transformation sets are identified by numbers, e.g. ``"1"``, ``"2"``, ...

    If multiple biomolecular transformations are provided in the *header*
    dictionary, biomolecules will be returned as :class:`.AtomGroup`
    instances in a :func:`list`.  If a single biomolecule is built, it is
    returned directly.  **None** is returned when *biomol* is not found in
    the header or when no biomolecule could be built.

    Each transformed copy is assigned its own segment name, taken in order
    from the letters ``A`` to ``Z`` (cycled).

    :raises TypeError: if *header* is not a :class:`dict` or *atoms* is not
        an :class:`Atomic` instance
    :raises ValueError: if *header* does not contain biomolecular
        transformations
    """
    if not isinstance(header, dict):
        raise TypeError('header must be a dictionary')
    if not isinstance(atoms, Atomic):
        raise TypeError('atoms must be an Atomic instance')
    biomt = header.get('biomoltrans')
    if not isinstance(biomt, dict) or len(biomt) == 0:
        raise ValueError("header doesn't contain biomolecular transformations")

    if not isinstance(atoms, AtomGroup):
        atoms = atoms.copy()
    biomols = []
    if biomol is None:
        keys = list(biomt)
    else:
        biomol = str(biomol)
        if biomol in biomt:
            keys = [biomol]
        else:
            LOGGER.warn('Transformations for biomolecule {0} was not '
                        'found in the header dictionary.'.format(biomol))
            return None

    keys.sort()
    for i in keys:
        segnm = list('ABCDEFGHIJKLMNOPQRSTUVWXYZ' * 20)
        ags = []
        mt = biomt[i]
        # mt is a list processed in groups of 4 items: within each group,
        # items 1-3 are the three rows of a transformation (3 rotation
        # terms followed by 1 translation term); chain identifiers are
        # read from the first item of the list
        if len(mt) % 4 != 0:
            LOGGER.warn('Biomolecular transformations {0} were not '
                        'applied'.format(i))
            continue

        for times in range(len(mt) // 4):
            rotation = np.zeros((3, 3))
            translation = np.zeros(3)

            for row in range(3):
                line = np.fromstring(mt[times * 4 + 1 + row], sep=' ')
                rotation[row, :] = line[:3]
                translation[row] = line[3]
            t = Transformation(rotation, translation)

            # check the selection before copying: select() yields None when
            # no atom matches, and None has no copy() method
            selection = atoms.select('chain ' + ' '.join(mt[0]))
            if selection is None:
                continue
            newag = selection.copy()
            newag.all.setSegnames(segnm.pop(0))
            # apply the transformation to every coordinate set
            for acsi in range(newag.numCoordsets()):
                newag.setACSIndex(acsi)
                newag = t.apply(newag)
            newag.setACSIndex(0)
            ags.append(newag)

        if ags:
            # merge all transformed copies into a single atom group
            newag = ags.pop(0)
            while ags:
                newag += ags.pop(0)
            newag.setTitle('{0} biomolecule {1}'.format(atoms.getTitle(), i))
            biomols.append(newag)

    if biomols:
        if len(biomols) == 1:
            return biomols[0]
        else:
            return biomols
    else:
        return None
def matchAlign(mobile, target, **kwargs):
    """Superpose *mobile* onto *target* based on best matching pair of
    chains.  This function uses :func:`matchChains` for matching chains and
    returns a tuple that contains the following items:

      * *mobile* after it is superposed,
      * matching chain from *mobile* as a :class:`.AtomMap` instance,
      * matching chain from *target* as a :class:`.AtomMap` instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    **None** is returned when no chains could be matched.

    :arg mobile: atoms that contain a protein chain
    :type mobile: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg target: atoms that contain a protein chain
    :type target: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg tarsel: *target* atoms that will be used for alignment,
        default is ``'calpha'``
    :type tarsel: str

    :arg allcsets: align all coordinate sets of *mobile*, default is
        **True**.  The misspelled keyword ``allcets`` is also accepted for
        backward compatibility.
    :type allcsets: bool

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    :raises ValueError: if *tarsel* does not match any atoms in *target*
    """
    selstr = kwargs.pop('tarsel', 'calpha')
    if selstr == 'calpha':
        selstr = None
    subset = 'calpha'
    if selstr:
        if selstr in _SUBSETS:
            subset = selstr
        else:
            subset = 'all'
        sel = target.select(selstr)
        if sel is None:
            raise ValueError('selection {0} did not match any atoms'
                             .format(repr(selstr)))
        # when the selection lies on a single chain, restrict matching
        # to that chain
        chid = set(sel.getChids())
        if len(chid) == 1:
            chid = chid.pop()
            target = target.select('chain ' + chid)

    match = matchChains(mobile, target, subset=subset, **kwargs)
    if not match:
        return
    match = match[0]
    mob = match[0]
    tar = match[1]
    if selstr:
        which = SELECT.getIndices(tar, selstr)
        n_atoms = len(which)
    else:
        which = slice(None)
        n_atoms = len(tar)
        selstr = 'calpha'

    # honour the documented keyword; fall back to the historical typo
    if kwargs.get('allcsets', kwargs.get('allcets', True)):
        csets = range(mobile.numCoordsets())  # PY3K: OK
    else:
        csets = [mobile.getACSIndex()]

    LOGGER.info('Alignment is based on {0} atoms matching {1}.'
                .format(n_atoms, repr(selstr)))
    printRMSD(tar._getCoords()[which], mob._getCoordsets()[:, which],
              msg='Before alignment ')
    for acsi in csets:
        mob.setACSIndex(acsi)
        mobile.setACSIndex(acsi)
        calcTransformation(mob._getCoords()[which],
                           tar._getCoords()[which]).apply(mobile)
    printRMSD(tar._getCoords()[which], mob._getCoordsets()[:, which],
              msg='After alignment ')
    return (mobile, ) + match
def writeDeformProfile(model, pdb, filename='dp_out',
                       selstr='protein and name CA',
                       pdb_selstr='protein', loadToVMD=True):
    """Calculate deformability (plasticity) profile of molecule based on
    mechanical stiffness matrix (see [EB08]_).

    :arg model: a 3-dimensional NMA instance from an :class:`.ANM`
        calculation
    :type model: :class:`.ANM`

    :arg pdb: atoms to be written out; must provide a ``select`` method
    :type pdb: :class:`.Atomic`

    :arg filename: prefix of the output files, default is ``'dp_out'``
    :type filename: str

    :arg selstr: selection of the atoms that were used for building *model*
    :type selstr: str

    :arg pdb_selstr: selection of the atoms written to the PDB file and
        visualized in VMD
    :type pdb_selstr: str

    :arg loadToVMD: if true, the generated TCL script is loaded into VMD
    :type loadToVMD: bool

    Three files are written: :file:`<filename>_mean.txt` with the mean
    stiffness per node, a PDB file carrying the mean stiffness (rounded to
    two decimals) in the occupancy column, and :file:`<filename>.tcl` for
    visualization in VMD.
    """
    from collections import Counter

    pdb = pdb.select(pdb_selstr)
    coords = pdb.select(selstr)
    mean_stiff = np.mean(model.getStiffness(), axis=0)

    # mean value of Kij for each residue
    with open(filename + '_mean.txt', 'w') as out_mean:
        for nr_i, value in enumerate(mean_stiff):
            out_mean.write("{} {}\n".format(nr_i, value))

    # repeat each residue's mean stiffness once per atom of that residue;
    # dict views are not indexable on Python 3, hence the list()
    atoms_per_residue = list(Counter(pdb.getResindices()).values())
    mean_stiff_all = []
    for i in range(coords.numAtoms()):
        mean_stiff_all.extend(
            atoms_per_residue[i] * [round(mean_stiff[i], 2)])

    kw = {'occupancy': mean_stiff_all}
    writePDB(filename, pdb, **kw)
    LOGGER.info('PDB file with deformability profile has been saved.')
    LOGGER.info('Creating TCL file.')

    with open(filename + '.tcl', 'w') as out_tcl:
        out_tcl.write(
            'display resetview \nmol addrep 0 \ndisplay resetview \n')
        out_tcl.write('mol new {./' + filename + '.pdb} type {pdb} '
                      'first 0 last -1 step 1 waitfor 1 \n')
        out_tcl.write(
            'animate style Loop \ndisplay projection Orthographic \n')
        out_tcl.write('display depthcue off \ndisplay rendermode GLSL \n'
                      'axes location Off \n')
        out_tcl.write('color Display Background white \n')
        out_tcl.write('mol modstyle 0 0 NewCartoon 0.300000 10.000000 '
                      '4.100000 0 \n')
        out_tcl.write(
            'mol modmaterial 0 0 Diffuse \nmol modcolor 0 0 Occupancy \n')
        out_tcl.write('menu colorscalebar on \n')

    if loadToVMD:
        from prody import pathVMD
        LOGGER.info('File will be loaded to VMD program.')
        os.system(pathVMD() + " -e " + str(filename) + ".tcl")
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: sequence identity level of the clusters to download; when
        **None** (default), clusters for all levels in ``PDB_CLUSTERS`` are
        downloaded

    :raises ValueError: if *sqid* is not one of the available levels
    """
    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)
    # progress is measured against what was actually requested
    LOGGER.progress('Downloading sequence clusters', len(keys),
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue
        try:
            out = openFile(filename + '.gz', 'w', folder=PDB_CLUSTERS_PATH)
            try:
                out.write(inp.read())
            finally:
                out.close()
        finally:
            inp.close()
        count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()
    if len(keys) == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def matchTNMAChains(atoms1, atoms2, prepareForHungarian=True, **kwargs):
    """Modified matchChains that only uses the Biopython chain matching.

    Return pairs of chains matched based on Biopython similarity.  Makes an
    all-to-all comparison of chains in *atoms1* and *atoms2*.  Chains are
    obtained from hierarchical views (:class:`.HierView`) of atom groups.
    This function returns a list of matching chains in a tuples that contain
    4 items:

      * matching chain from *atoms1* as a :class:`.AtomMap` instance,
      * matching chain from *atoms2* as a :class:`.AtomMap` instance,
      * percent sequence identity of the match,
      * percent sequence overlap of the match.

    **None** is returned when no chains are matched.

    When *prepareForHungarian* is false, the list of matches is sorted in
    decreasing percent sequence identity order; otherwise matches are left
    in the order in which chain pairs were compared.  :class:`.AtomMap`
    instances can be used to calculate RMSD values and superpose atom
    groups.

    :arg atoms1: atoms that contain a chain
    :type atoms1: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg atoms2: atoms that contain a chain
    :type atoms2: :class:`.Chain`, :class:`.AtomGroup`, :class:`.Selection`

    :arg prepareForHungarian: if true (default), do not sort the matches
    :type prepareForHungarian: bool

    :keyword subset: one of the following well-defined subsets of atoms:
        ``"calpha"`` (or ``"ca"``), ``"backbone"`` (or ``"bb"``),
        ``"heavy"`` (or ``"noh"``), or ``"all"``, default is ``"calpha"``
    :type subset: string

    :keyword seqid: percent sequence identity, default is 90
    :type seqid: float

    :keyword overlap: percent overlap, default is 90
    :type overlap: float

    :keyword pwalign: perform pairwise sequence alignment
    :type pwalign: bool

    If *subset* is set to *calpha* or *backbone*, only alpha carbon
    atoms or backbone atoms will be paired.  If set to *all*, all atoms
    common to matched residues will be returned.

    All chains in *atoms1* are compared to all chains in *atoms2*.  Matches
    based on residue numbers and names are evaluated and logged, but never
    accepted: every chain pair is passed on to pairwise sequence alignment
    (:mod:`Bio.pairwise2`), and matching is performed based on that
    alignment.  If ``pwalign=False`` is passed, no alignment is performed
    and **None** is returned.

    :raises TypeError: if *atoms1* or *atoms2* is of an unsupported type
    :raises ValueError: if *subset* is not a valid subset name
    """
    if not isinstance(atoms1, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms1 must be an AtomGroup, Chain, or Selection')
    if not isinstance(atoms2, (AtomGroup, Chain, Selection)):
        raise TypeError('atoms2 must be an AtomGroup, Chain, or Selection')

    subset = kwargs.get('subset', 'calpha')
    if subset not in _SUBSETS:
        raise ValueError('{0} is not a valid subset argument'
                         .format(str(subset)))
    seqid = kwargs.get('seqid', 90.)
    assert isinstance(seqid, (float, int)), 'seqid must be float'
    assert 0 < seqid <= 100, 'seqid must be in the range from 0 to 100'
    coverage = kwargs.get('overlap')
    if coverage is None:
        coverage = kwargs.get('coverage', 90.)
    assert isinstance(coverage, (float, int)), 'overlap must be float'
    assert 0 < coverage <= 100, 'overlap must be in the range from 0 to 100'
    pwalign = kwargs.get('pwalign', None)

    if isinstance(atoms1, Chain):
        chains1 = [atoms1]
        atoms1 = atoms1.getAtomGroup()
    else:
        chains1 = list(atoms1.getHierView().iterChains())
        if not isinstance(atoms1, AtomGroup):
            atoms1 = atoms1.getAtomGroup()
    # keep only chains that yield a non-empty SimpleChain
    chains = list()
    for ch in chains1:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains1 = chains
    if not isinstance(atoms1, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'
                     .format(str(atoms1), len(chains1)))

    if isinstance(atoms2, Chain):
        chains2 = [atoms2]
        atoms2 = atoms2.getAtomGroup()
    else:
        chains2 = list(atoms2.getHierView().iterChains())
        if not isinstance(atoms2, AtomGroup):
            atoms2 = atoms2.getAtomGroup()
    chains = list()
    for ch in chains2:
        simpch = SimpleChain(ch)
        if len(simpch) > 0:
            chains.append(simpch)
    chains2 = chains
    if not isinstance(atoms2, Chain):
        LOGGER.debug('Checking {0}: {1} chains are identified'
                     .format(str(atoms2), len(chains2)))

    matches = []
    unmatched = []
    LOGGER.debug('Trying to match chains based on residue numbers and names:')
    for simpch1 in chains1:
        for simpch2 in chains2:
            LOGGER.debug('  Comparing {0} (len={1}) and {2} (len={3}):'
                         .format(simpch1.getTitle(), len(simpch1),
                                 simpch2.getTitle(), len(simpch2)))

            match1, match2, nmatches = getTrivialMatch(simpch1, simpch2)
            _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
            _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))

            # trivial matches are deliberately ignored: every pair is
            # deferred to the sequence alignment stage below
            if _seqid >= seqid and _cover >= coverage:
                LOGGER.debug('\tIgnoring Match: {0} residues match with '
                             '{1:.0f}% sequence identity and {2:.0f}% '
                             'overlap.'
                             .format(len(match1), _seqid, _cover))
                unmatched.append((simpch1, simpch2))
            else:
                LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                             'overlap={1:.0f}%).'.format(_seqid, _cover))
                unmatched.append((simpch1, simpch2))

    if pwalign or (not matches and (pwalign is None or pwalign)):
        pairwise2 = importBioPairwise2()
        if pairwise2:
            LOGGER.debug('Trying to match chains based on {0} sequence '
                         'alignment:'.format(ALIGNMENT_METHOD))
            for simpch1, simpch2 in unmatched:
                LOGGER.debug(' Comparing {0} (len={1}) and {2} '
                             '(len={3}):'
                             .format(simpch1.getTitle(), len(simpch1),
                                     simpch2.getTitle(), len(simpch2)))
                match1, match2, nmatches = getAlignedMatch(simpch1, simpch2)
                _seqid = nmatches * 100 / min(len(simpch1), len(simpch2))
                _cover = len(match2) * 100 / max(len(simpch1), len(simpch2))
                if _seqid >= seqid and _cover >= coverage:
                    LOGGER.debug('\tMatch: {0} residues match with {1:.0f}% '
                                 'sequence identity and {2:.0f}% overlap.'
                                 .format(len(match1), _seqid, _cover))
                    matches.append((match1, match2, _seqid, _cover,
                                    simpch1, simpch2))
                else:
                    LOGGER.debug('\tFailed to match chains (seqid={0:.0f}%, '
                                 'overlap={1:.0f}%).'
                                 .format(_seqid, _cover))
        else:
            LOGGER.warning('Pairwise alignment could not be performed.')
    if not matches:
        return None

    # translate the user-facing subset name to its short form
    subset = _SUBSETS[subset]
    for mi, result in enumerate(matches):
        match1, match2, _seqid, _cover, simpch1, simpch2 = result

        # collect indices of atoms that are present in both residues
        indices1 = []
        indices2 = []

        for i in range(len(match1)):
            ares = match1[i]
            bres = match2[i]

            if subset == 'ca':
                try:
                    aid = ares.getNames().tolist().index('CA')
                except ValueError:
                    aid = None
                try:
                    bid = bres.getNames().tolist().index('CA')
                    if aid is not None:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
                except ValueError:
                    pass
            elif subset == 'bb':
                for bban in ('N', 'CA', 'C', 'O'):
                    try:
                        aid = ares.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(bban)
                    except ValueError:
                        continue
                    else:
                        indices1.append(ares._indices[aid])
                        indices2.append(bres._indices[bid])
            elif subset == 'noh':
                for han, aid, noh in zip(ares.getNames(), ares._indices,
                                         ares.getFlags('noh')):
                    if not noh:
                        continue
                    try:
                        bid = bres.getNames().tolist().index(han)
                    except ValueError:
                        continue
                    else:
                        indices1.append(aid)
                        indices2.append(bres._indices[bid])
            elif subset is None or subset == 'all':
                aans = ares.getNames()
                bans = bres.getNames().tolist()

                aids = ares.getIndices()
                for j in range(len(aans)):
                    try:
                        bid = bres._indices[bans.index(aans[j])]
                        indices1.append(aids[j])
                        indices2.append(bid)
                    except ValueError:
                        pass

        indices1 = np.array(indices1, int)
        indices2 = np.array(indices2, int)

        match1 = AM(atoms1, indices1, atoms1.getACSIndex(),
                    title=simpch1.getTitle() + ' -> ' + simpch2.getTitle(),
                    intarrays=True)
        match2 = AM(atoms2, indices2, atoms2.getACSIndex(),
                    title=simpch2.getTitle() + ' -> ' + simpch1.getTitle(),
                    intarrays=True)

        matches[mi] = (match1, match2, _seqid, _cover)

    # sort by decreasing sequence identity unless the caller wants the
    # original comparison order preserved
    if len(matches) > 1 and not prepareForHungarian:
        matches.sort(key=lambda m: m[2], reverse=True)

    return matches