def pathEVmutationFolder(folder=None):
    """Query or configure the local folder that holds EVmutation data.

    Without an argument, the folder stored in the package settings is
    returned when it is an existing directory; when a folder is stored but
    is not a directory a warning is logged and ``None`` is returned.

    With an existing directory, its absolute path is stored in the
    settings, which are then saved.  With any other value (e.g.
    ``folder=''``) the stored folder is released; :exc:`IOError` is raised
    when there was nothing to release.
    """
    key = 'EVmutation_local_folder'

    # Query mode.
    if folder is None:
        stored = SETTINGS.get(key)
        if not stored:
            return None
        if isdir(stored):
            return stored
        LOGGER.warn('Local folder {} is not accessible.'.format(
            repr(stored)))
        return None

    # Set mode: a valid directory replaces the stored value.
    if isdir(folder):
        folder = abspath(folder)
        LOGGER.info('Local EVmutation folder is set: {}'.format(
            repr(folder)))
        SETTINGS[key] = folder
        SETTINGS.save()
        return None

    # Release mode: an invalid path drops the stored value.
    released = SETTINGS.pop(key)
    if not released:
        raise IOError('{} is not a valid path.'.format(repr(folder)))
    LOGGER.info('EVmutation folder {0} is released.'.format(
        repr(released)))
    SETTINGS.save()
def __or__(self, other):
    """Return a :class:`Selection` holding the union of *self* and *other*.

    *other* must expose ``getAtomGroup`` (otherwise :exc:`TypeError`) and
    belong to the same atom group (otherwise :exc:`ValueError`).  When the
    active coordinate set indices differ, a warning is logged and index 0
    is used for the result.
    """
    if self is other:
        return self

    try:
        other_ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != other_ag:
        raise ValueError('both selections must be from the same AtomGroup')

    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    merged = unique(concatenate((self._getIndices(),
                                 other._getIndices())))
    # A trailing dummy index is dropped from the merged, sorted array.
    if merged[-1] == atommap.DUMMY:
        merged = merged[:-1]

    selstr = '({0:s}) or ({1:s})'.format(self.getSelstr(),
                                         other.getSelstr())
    return Selection(self._ag, merged, selstr, acsi, unique=True)
def parseCCD(ids):
    """Download and parse Chemical Component Dictionary (CCD) entries.

    :arg ids: a single component identifier or a list-like of identifiers
    :type ids: str, list

    :returns: the parsed entry when exactly one identifier was given,
        otherwise a list with one item per identifier, in input order.
        An item is **None** when its download failed or returned no data.
    """
    if isListLike(ids):
        n_ids = len(ids)
    else:
        ids = [ids]
        n_ids = 1

    ret = []
    for ccd_id in ids:
        id_url = 'http://ligand-expo.rcsb.org/reports/{0}/{1}/{1}.cif'.format(
            ccd_id[0], ccd_id)
        try:
            handle = openURL(id_url)
        except Exception as err:
            # The message takes a single argument, so the field must be {0};
            # the former {1} raised IndexError inside this handler.
            LOGGER.warn('download failed ({0}).'.format(str(err)))
            # Keep the output aligned with the input (and non-empty for a
            # single identifier) by recording the failure explicitly.
            ret.append(None)
            continue

        data = handle.read()
        if len(data):
            if PY3K:
                data = data.decode()
            parsingDict, prog = parseSTARLines(data.split('\n'), shlex=True)
            star_dict = StarDict(parsingDict, prog, ccd_id)
            ret.append(star_dict[ccd_id])
        else:
            ret.append(None)
            LOGGER.warn('Could not parse CCD data for {0}'.format(ccd_id))

    if n_ids == 1:
        return ret[0]
    return ret
def calcSignatureSqFlucts(mode_ensemble, **kwargs):
    """Return the signature square fluctuations of *mode_ensemble*.

    Square fluctuations are computed for every mode set in the ensemble and
    stacked row-wise into an :class:`sdarray`.

    :arg mode_ensemble: an ensemble of structures or ENMs
    :type mode_ensemble: :class: `ModeEnsemble`
    """
    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('mode_ensemble should be an instance of ModeEnsemble')

    if not mode_ensemble.isMatched():
        LOGGER.warn('modes in mode_ensemble did not match cross modesets. '
                    'Consider running mode_ensemble.match() prior to using this function')

    stacked = np.vstack([calcSqFlucts(modes) for modes in mode_ensemble])

    title_str = '%d modes' % mode_ensemble.numModes()

    weights = mode_ensemble.getWeights()
    if weights is not None:
        # Only the first entry along the last axis is kept.
        weights = weights[:, :, 0]

    labels = mode_ensemble.getLabels()

    # The result is flagged as not 3d whatever the underlying model is.
    return sdarray(stacked, title=title_str, weights=weights,
                   labels=labels, is3d=False)
def __init__(self, PDB, n_modes='all', recover_pickle=False, **kwargs):
    """Initialise the object from a PDB identifier or an Atomic instance.

    *PDB* is stored as ``PDBID`` when it is a string; otherwise a copy of
    it is stored as ``_pdb``.  When *recover_pickle* is true,
    ``recoverPickle(**kwargs)`` is attempted first and ``refresh()`` is
    only called if that fails; otherwise ``refresh()`` is called directly.
    """
    assert isinstance(PDB, (str, Atomic)), \
        'PDB must be either a PDBID or an Atomic instance.'
    assert type(recover_pickle) is bool

    # Exactly one of PDBID / _pdb is populated.
    given_as_id = isinstance(PDB, str)
    if given_as_id:
        self.PDBID, self._pdb = PDB, None
    else:
        self.PDBID = None
        self._pdb = PDB.copy()

    self.n_modes = n_modes
    self.chids = None
    self.resids = None
    self.feats = None
    self._gnm = None
    self._anm = None
    self.timestamp = None

    if not recover_pickle:
        self.refresh()
        return

    try:
        self.recoverPickle(**kwargs)
    except Exception as e:
        LOGGER.warn('Unable to recover pickle: %s' % e)
        self.refresh()
    return
def __and__(self, other):
    """Return a :class:`Selection` holding the atoms common to both operands.

    :raises TypeError: when *other* is not an :class:`AtomPointer`
    :raises ValueError: when the operands belong to different atom groups

    When the active coordinate set indices differ, a single warning is
    logged and index 0 is used.  **None** is returned when the operands
    share no atoms.
    """
    if self is other:
        return self

    if not isinstance(other, AtomPointer):
        raise TypeError('other must be an AtomPointer')

    if self._ag != other.getAtomGroup():
        raise ValueError('both selections must be from the same AtomGroup')

    # This check used to be duplicated, logging two warnings (one of them
    # mentioning a "union"); it is performed once here.
    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    # np.unique() on a Python set produces a one-element object array, so
    # the intersection is computed on the index arrays directly;
    # intersect1d returns the sorted, unique common values.
    indices = np.intersect1d(self._getIndices(), other.getIndices())
    if len(indices):
        return Selection(self._ag, indices, '({0:s}) and ({1:s})'.format(
            self.getSelstr(), other.getSelstr()), acsi)
def range2selstr(rangestr):
    """Translate a comma-separated list of ``from-to`` ranges into a
    ``resindex`` selection string.

    **None** is returned for a blank input.  A fragment that does not split
    into exactly two parts on ``-`` is treated as ``1-<last part>``, and a
    descending range ``a-b`` is rewritten as ``1-a``; both cases log a
    warning.  Fragments that raise :exc:`ValueError` are reported and
    skipped.
    """
    if not rangestr.strip():
        return None

    sels = []
    for frag in rangestr.split(','):
        try:
            bounds = frag.split('-')
            if len(bounds) != 2:
                LOGGER.warn('range "%s" is irregular' % rangestr)
                fro, to = '1', bounds[-1]
            else:
                fro, to = bounds

            lo = intResnum(fro)
            hi = intResnum(to)
            if lo > hi:
                LOGGER.warn('range "%s" is irregular' % rangestr)
                lo, hi = 1, lo

            fro, to = str(lo), str(hi)
        except ValueError:
            print('error occurred when parsing "%s"' % rangestr)
            continue
        sels.append('resindex %s to %s' % (fro, to))

    return ' or '.join(sels)
def writePDB(self, filename=None, single=True, **kwargs):
    '''
    Save the conformers in PDB format.

    :arg filename: Output name; the title of the object is used when it is
        None (default).
    :type filename: str

    :arg single: When True (default) everything is written into one PDB
        file.  Otherwise *filename* is used as a directory (created if it
        does not exist) and one file per label is written into it, each
        with the corresponding coordinate set index.
    :type single: bool
    '''
    if filename is None:
        filename = self.getTitle()

    if single:
        filename = writePDB(filename, self)
        LOGGER.info('PDB file saved as %s' % filename)
        return

    direc = filename
    if not isdir(direc):
        mkdir(direc)
    else:
        LOGGER.warn('%s is not empty; will be flooded' % direc)

    LOGGER.info('Saving files ...')
    for index, label in enumerate(self.getLabels()):
        target = '%s/%s' % (direc, label)
        writePDB(target, self, csets=index)
    LOGGER.info('PDB files saved in %s ...' % direc)
def calcAuxPredictions(self, aux_clsf, force_env=None):
    """Compute predictions from an auxiliary classifier and merge them with
    the primary ones.

    :arg aux_clsf: path to a pickled classifier dictionary; its
        ``'features'`` entry lists the features the classifier uses
    :arg force_env: None, or one of ``'chain'``, ``'reduced'``,
        ``'sliced'``; when given, the trailing ``-<env>`` part of ANM/GNM
        feature names listed in ``RHAPSODY_FEATS['PDB']`` is replaced by it

    :returns: ``(auxPreds, mixPreds)``, or None when no auxiliary
        predictions could be computed.  ``mixPreds`` takes the auxiliary
        entry wherever the primary ``'score'`` is NaN.
    """
    assert self.predictions is not None, 'Primary predictions not found.'
    assert self.featMatrix is not None, 'Features not computed.'
    assert force_env in [None, 'chain', 'reduced', 'sliced']

    # import feature subset
    # NOTE: unpickling can execute arbitrary code -- only load classifier
    # files from a trusted source.  The context manager guarantees that the
    # file handle is closed (it used to be leaked).
    with open(aux_clsf, 'rb') as clsf_file:
        clsf_dict = pickle.load(clsf_file)
    LOGGER.info('Auxiliary Random Forest classifier imported.')

    feat_subset = clsf_dict['features']
    if force_env is not None:
        # force a given ENM environment model
        for i, f in enumerate(feat_subset):
            if f in RHAPSODY_FEATS['PDB'] and \
                    (f.startswith('ANM') or f.startswith('GNM')):
                old_env = f.split('-')[-1]
                feat_subset[i] = f.replace(old_env, force_env)
    assert all(f in self.featSet for f in feat_subset), \
        'The new set of features must be a subset of the original one.'

    # reduce original feature matrix
    sel = [i for i, f in enumerate(self.featSet) if f in feat_subset]
    fm = self.featMatrix[:, sel]
    p_a = calcPredictions(fm, clsf_dict, SAV_coords=self.SAVcoords['text'])
    if p_a is None:
        LOGGER.warn('No additional predictions.')
        return None

    self.auxPreds = p_a
    p_o = self.predictions
    self.mixPreds = np.where(np.isnan(p_o['score']), p_a, p_o)
    return self.auxPreds, self.mixPreds
def showSignatureOverlaps(mode_ensemble):
    """Plot the mean and standard deviation of the signature overlaps.

    For every row of the overlaps returned by
    ``calcSignatureOverlaps(mode_ensemble, diag=True)``, the mean and the
    standard deviation are taken over the strict upper triangle and passed
    to ``showSignatureAtomicLines``, whose result is returned.
    """
    from matplotlib.pyplot import xlabel, ylabel

    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('mode_ensemble should be an instance of ModeEnsemble')

    if not mode_ensemble.isMatched():
        LOGGER.warn('modes in mode_ensemble did not match cross modesets. '
                    'Consider running mode_ensemble.match() prior to using this function')

    overlaps = calcSignatureOverlaps(mode_ensemble, diag=True)

    # k=1 skips the diagonal.
    rows, cols = np.triu_indices(overlaps.shape[1], k=1)
    upper = overlaps[:, rows, cols]

    mean_values = upper.mean(axis=1)
    std_values = upper.std(axis=1)

    show = showSignatureAtomicLines(mean_values, std_values)
    xlabel('Mode index')
    ylabel('Overlap')
    return show
def calcSignatureCrossCorr(mode_ensemble, norm=True):
    """Calculate cross-correlations for every mode set of a ModeEnsemble.

    :arg mode_ensemble: the ensemble to analyse
    :type mode_ensemble: :class:`ModeEnsemble`

    :arg norm: forwarded to ``calcCrossCorr``
    :type norm: bool

    :returns: an :class:`sdarray` with one cross-correlation matrix per
        mode set.  Pairwise weights (outer product of each weight vector
        with itself) are attached when the ensemble has weights, otherwise
        the weights are **None**.

    :raises TypeError: when *mode_ensemble* is not a :class:`ModeEnsemble`
    """
    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('mode_ensemble should be an instance of ModeEnsemble')

    if not mode_ensemble.isMatched():
        LOGGER.warn('modes in mode_ensemble did not match cross modesets. '
                    'Consider running mode_ensemble.match() prior to using this function')

    matches = mode_ensemble
    n_atoms = matches.numAtoms()
    n_sets = len(matches)

    C = np.zeros((n_sets, n_atoms, n_atoms))
    for i in range(n_sets):
        C[i, :, :] = calcCrossCorr(matches[i], norm=norm)

    title_str = '%d modes' % mode_ensemble.numModes()

    # W must exist even when the ensemble carries no weights; it used to be
    # left unbound in that case and the sdarray() call raised a NameError.
    W = None
    weights = mode_ensemble.getWeights()
    if weights is not None:
        W = np.zeros((mode_ensemble.numModeSets(),
                      mode_ensemble.numAtoms(),
                      mode_ensemble.numAtoms()))
        for i, w in enumerate(weights):
            W[i, :, :] = np.outer(w, w)

    labels = mode_ensemble.getLabels()

    # The result is flagged as not 3d whatever the underlying model is.
    sig = sdarray(C, title=title_str, weights=W, labels=labels, is3d=False)
    return sig
def calcSignatureCollectivity(mode_ensemble, masses=None):
    """Calculate collectivities for every mode set of a ModeEnsemble.

    One row per mode set and one column per mode; *masses* is forwarded to
    ``calcCollectivity``.  The result is returned as an unweighted
    :class:`sdarray`.
    """
    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('mode_ensemble should be an instance of ModeEnsemble')

    if not mode_ensemble.isMatched():
        LOGGER.warn('modes in mode_ensemble did not match cross modesets. '
                    'Consider running mode_ensemble.match() prior to using this function')

    n_modes = mode_ensemble.numModes()
    n_sets = len(mode_ensemble)

    collectivities = np.zeros((n_sets, n_modes))
    for row in range(n_sets):
        collectivities[row, :] = calcCollectivity(mode_ensemble[row],
                                                  masses=masses)

    title_str = 'collectivities of %d modes' % mode_ensemble.numModes()
    labels = mode_ensemble.getLabels()

    # The result is flagged as not 3d whatever the underlying model is.
    return sdarray(collectivities, title=title_str, weights=None,
                   labels=labels, is3d=False)
def calcDeepFunctionOverlaps(*goa_data, **kwargs):
    """Calculate overlaps between the deepest (most detailed) molecular
    functions found for each set of GO terms.

    :arg goa_data: the sets of GO terms to compare, one positional
        argument per set

    :arg return_funcs: when true, the retained deepest functions are
        returned as well.  Default is **False**
    :type return_funcs: bool

    Sets with no deepest functions are reported with a warning and left out
    of the comparison.  Remaining keyword arguments are forwarded to both
    ``findDeepestFunctions`` and ``calcGoOverlap``.
    """
    return_funcs = kwargs.pop('return_funcs', False)

    found = [findDeepestFunctions(entry, **kwargs) for entry in goa_data]

    kept = []
    for source, funcs in zip(goa_data, found):
        if len(funcs) == 0:
            LOGGER.warn(
                'ensemble member {0} has no deep molecular functions and was omitted'
                .format(source._title))
        else:
            kept.append(funcs)

    overlaps = calcGoOverlap(*kept, **kwargs)

    if return_funcs:
        return overlaps, kept
    return overlaps
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for an ensemble via
    :func:`calcDeepFunctionOverlaps`.

    :arg ens: an ensemble with labels (the first five characters of each
        label are used as identifier), or a list-like of identifiers
    :type ens: :class:`Ensemble`

    Members without molecular functions are reported with a warning and
    left out.  Keyword arguments are forwarded to ``queryGOA`` and
    :func:`calcDeepFunctionOverlaps`.
    """
    is_ensemble = isinstance(ens, Ensemble)
    if not (is_ensemble or isListLike(ens)):
        raise TypeError('ens should be an ensemble or list-like')

    ids = [label[:5] for label in ens.getLabels()] if is_ensemble else ens

    if not isinstance(ids[0], str):
        raise TypeError('ens should have labels')

    goa_ens = queryGOA(ids, **kwargs)

    # First report the members that will be dropped, then filter them out.
    for member in goa_ens:
        if len(member._molecular) == 0:
            LOGGER.warn(
                'ensemble member {0} has no molecular functions and was omitted'
                .format(member._title))
    goa_ens = [member for member in goa_ens if len(member._molecular) > 0]

    return calcDeepFunctionOverlaps(*goa_ens, **kwargs)
def _checkAccessionNumber(self, acc):
    """Return *acc* without an isoform suffix.

    When *acc* contains ``-`` only the part before the first ``-`` is kept
    and a warning is logged; otherwise *acc* is returned unchanged.
    """
    if '-' not in acc:
        return acc

    acc = acc.split('-')[0]
    LOGGER.warn('Isoforms are not allowed, the main sequence for ' +
                acc + ' will be used instead.')
    return acc
def calcMinBranchLength(go_id1, go_id2, go):
    '''Return the branch length between two GO terms through their deepest
    common ancestor.

    :arg go_id1: the first GO ID
    :type go_id1: str

    :arg go_id2: the second GO ID
    :type go_id2: str

    :arg go: object containing a gene ontology (GO) directed acyclic graph (DAG)
    :type go: `~goatools.obo_parser.GODag`

    **None** is returned, after a warning, when the terms have no common
    ancestor.
    '''
    ancestor = findDeepestCommonAncestor([go_id1, go_id2], go)
    if ancestor is None:
        LOGGER.warn(
            'There are no common ancestors between {0} and {1} so no meaningful distance can be calculated.'
            .format(go_id1, go_id2))
        return None

    # Depth differences measure how far each term lies below the ancestor.
    base_depth = go[ancestor].depth
    down_to_first = go[go_id1].depth - base_depth
    down_to_second = go[go_id2].depth - base_depth

    # Total distance: up to the common ancestor and back down again.
    return down_to_first + down_to_second
def calcEnsembleFunctionOverlaps(ens, **kwargs):
    """Calculate function overlaps for an ensemble via
    :func:`calcDeepFunctionOverlaps`.

    :arg ens: an ensemble with labels (the first five characters of each
        label are used as identifier), or a list-like of identifiers
    :type ens: :class:`Ensemble`

    Members without molecular functions are reported with a warning and
    left out.  Keyword arguments are forwarded to ``queryGOA`` and
    :func:`calcDeepFunctionOverlaps`.
    """
    from_ensemble = isinstance(ens, Ensemble)
    if not (from_ensemble or isListLike(ens)):
        raise TypeError('ens should be an ensemble or list-like')

    if from_ensemble:
        ids = [label[:5] for label in ens.getLabels()]
    else:
        ids = ens

    if not isinstance(ids[0], str):
        raise TypeError('ens should have labels')

    annotations = queryGOA(ids, **kwargs)

    # Report what gets dropped before filtering.
    for record in annotations:
        if len(record._molecular) == 0:
            LOGGER.warn(
                'ensemble member {0} has no molecular functions and was omitted'.format(record._title))
    annotations = [record for record in annotations
                   if len(record._molecular) > 0]

    overlaps = calcDeepFunctionOverlaps(*annotations, **kwargs)
    return overlaps
def calcMinBranchLength(go_id1, go_id2, go):
    '''Return the branch length between two GO terms through their deepest
    common ancestor.

    :arg go_id1: the first GO ID
    :type go_id1: str

    :arg go_id2: the second GO ID
    :type go_id2: str

    :arg go: object containing a gene ontology (GO) directed acyclic graph (DAG)
    :type go: `~goatools.obo_parser.GODag`

    **None** is returned, after a warning, when the terms have no common
    ancestor.
    '''
    dca = findDeepestCommonAncestor([go_id1, go_id2], go)

    if dca is not None:
        # Distance from the common ancestor down to each term, summed.
        dca_depth = go[dca].depth
        first_leg = go[go_id1].depth - dca_depth
        second_leg = go[go_id2].depth - dca_depth
        return first_leg + second_leg

    LOGGER.warn('There are no common ancestors between {0} and {1} so no meaningful distance can be calculated.'.format(
        go_id1, go_id2))
    return None
def calcSignatureFractVariance(mode_ensemble):
    """Calculate the fractional variance for every mode set of a
    ModeEnsemble and return the values as an unweighted :class:`sdarray`.

    The ``is3d`` flag of the result is taken from the first mode set; it is
    **None** for an ensemble without mode sets.
    """
    if not isinstance(mode_ensemble, ModeEnsemble):
        raise TypeError('mode_ensemble should be an instance of ModeEnsemble')

    if not mode_ensemble.isMatched():
        LOGGER.warn('modes in mode_ensemble did not match cross modesets. '
                    'Consider running mode_ensemble.match() prior to using this function')

    variances = []
    is3d = None
    for index in range(len(mode_ensemble)):
        modeset = mode_ensemble[index]
        variances.append(calcFractVariance(modeset))
        if is3d is None:
            is3d = modeset.is3d()

    title_str = '%d modes' % mode_ensemble.numModes()
    labels = mode_ensemble.getLabels()

    return sdarray(variances, title=title_str, weights=None,
                   labels=labels, is3d=is3d)
def calcMBSfromSim(simMatrix, nEvals=20, remove_outliers=True,
                   remove_offset=True, **kwargs):
    """Compute an MBS value for every row index of *simMatrix*.

    For each index the matrix is modified with ``MBSPointMutation``, the
    smallest-magnitude eigenvalues of its normalised Laplacian are
    computed, and the reciprocals of all but the smallest are summed.
    Positions where this fails are set to NaN and a warning is logged.

    When at least one value is not NaN, outliers are removed (if
    *remove_outliers* is True; *kwargs* go to ``_removeOutliers``) and the
    minimum non-NaN value is subtracted (if *remove_offset* is True).
    """
    LOGGER.timeit('_MBS')

    size = simMatrix.shape[0]
    mbs = np.zeros(size)

    for site in range(size):
        try:
            # cut "non-covalent" bonds around atom 'site'
            mutated = MBSPointMutation(simMatrix, site)
            # spectrum of the normalised Laplacian
            lap = sparse.csgraph.laplacian(mutated, normed=True)
            spectrum = sparse.linalg.eigsh(lap, k=min(nEvals, size-1),
                                           which='SM',
                                           return_eigenvectors=False)
            # ascending order, then skip the smallest eigenvalue
            spectrum = np.sort(spectrum)
            mbs[site] = np.sum(1./spectrum[1:])
        except Exception as err:
            LOGGER.warn('Unable to compute MBS at position '
                        '{0}. {1}'.format(site, err))
            mbs[site] = np.nan

    if any(~np.isnan(mbs)):
        if remove_outliers is True:
            mbs = _removeOutliers(mbs, **kwargs)
        if remove_offset is True:
            # The mask is recomputed on the current values.
            mbs = mbs - min(mbs[~np.isnan(mbs)])

    LOGGER.report('MBS computed in %.1fs.', '_MBS')
    return mbs
def setDrugGroup(self, group):
    """Set ``drug_group`` and, when no data are present, update the home
    page.

    :arg group: group of drugs; with DrugBank the options are
        ``"Approved"`` or ``"All"`` (case-insensitive). **None** means
        ``"All"``
    :type group: str

    With any other data source nothing is set; a warning is logged when
    *group* is not **None**.
    """
    if self.data_source != 'DrugBank':
        if group is not None:
            LOGGER.warn('there are no groups when using STITCH')
        return

    if group is None:
        group = 'All'
    elif not isinstance(group, str):
        raise TypeError('group must be string or None')
    else:
        canonical = {'all': 'All', 'approved': 'Approved'}
        lowered = group.lower()
        if lowered not in canonical:
            raise ValueError('group should be approved, all or None')
        group = canonical[lowered]

    self.drug_group = group
    if self.no_data:
        self.updateHomePage()
def searchQuartataWeb(data_source=None, drug_group=None, input_type=None,
                      query_type=None, data=None, num_predictions=None,
                      browser_type=None, job_id=None, filename=None,
                      result_type='Chemical'):
    """Convenience wrapper for searching QuartataWeb.

    :arg result_type: type of results to get from QuartataWeb.
        So far only ``'Chemical'`` is supported; any other value logs a
        warning and returns **None**.
    :type result_type: str

    All other arguments are the same as :class:`.QuartataWebBrowser`.
    """
    if result_type != 'Chemical':
        LOGGER.warn('No other result types are supported yet')
        return None

    return QuartataChemicalRecord(data_source, drug_group, input_type,
                                  query_type, data, num_predictions,
                                  browser_type, job_id, filename)
def __and__(self, other):
    """Return a :class:`Selection` holding the atoms common to both operands.

    :raises TypeError: when *other* has no ``getAtomGroup`` method
    :raises ValueError: when the operands belong to different atom groups

    When the active coordinate set indices differ, a single warning is
    logged and index 0 is used.  **None** is returned when the operands
    share no atoms.
    """
    if self is other:
        return self

    try:
        ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != ag:
        raise ValueError('both selections must be from the same AtomGroup')

    # This check used to be duplicated, logging two warnings (one of them
    # mentioning a "union"); it is performed once here.
    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    common = set(self._getIndices()).intersection(other.getIndices())
    if common:
        # unique() needs a sequence: applied to a set it returns a
        # one-element object array rather than the sorted indices.
        indices = unique(list(common))
        # A trailing dummy index is dropped from the sorted array.
        if indices[-1] == atommap.DUMMY:
            indices = indices[:-1]
        return Selection(self._ag, indices, '({0}) and ({1})'
                         .format(self.getSelstr(), other.getSelstr()),
                         acsi)
def getHits(self):
    """Return the ``_alignPDB`` dictionary of the record.

    A warning is logged when it is still **None**, which is then returned
    as is."""
    hits = self._alignPDB
    if hits is None:
        LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
    return hits
def __or__(self, other):
    """Return a :class:`Selection` holding the union of *self* and *other*.

    *other* must expose ``getAtomGroup`` (otherwise :exc:`TypeError`) and
    belong to the same atom group (otherwise :exc:`ValueError`).  When the
    active coordinate set indices differ, a warning is logged and index 0
    is used for the result.
    """
    if self is other:
        return self

    try:
        their_ag = other.getAtomGroup()
    except AttributeError:
        raise TypeError('other must be an AtomPointer')

    if self._ag != their_ag:
        raise ValueError('both selections must be from the same AtomGroup')

    acsi = self.getACSIndex()
    if acsi != other.getACSIndex():
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    combined = concatenate((self._getIndices(), other._getIndices()))
    combined = unique(combined)
    # A trailing dummy index is dropped from the merged, sorted array.
    if combined[-1] == atommap.DUMMY:
        combined = combined[:-1]

    description = '({0}) or ({1})'.format(self.getSelstr(),
                                          other.getSelstr())
    return Selection(self._ag, combined, description, acsi, unique=True)
def pathRhapsodyFolder(folder=None):
    """Query or configure the local folder where files and pickles needed
    to run Rhapsody are stored.

    Without an argument, the folder stored in the package settings is
    returned when it is an existing directory; when a folder is stored but
    is not a directory a warning is logged and ``None`` is returned.

    With an existing directory, its absolute path is stored in the
    settings, which are then saved.  With any other value (e.g.
    ``folder=''``) the stored folder is released; :exc:`IOError` is raised
    when there was nothing to release.
    """
    key = 'rhapsody_local_folder'

    # Query mode.
    if folder is None:
        stored = SETTINGS.get(key)
        if not stored:
            return None
        if isdir(stored):
            return stored
        LOGGER.warn('Local folder {} is not accessible.'.format(
            repr(stored)))
        return None

    # Set mode: a valid directory replaces the stored value.
    if isdir(folder):
        folder = abspath(folder)
        LOGGER.info('Local Rhapsody folder is set: {}'.format(
            repr(folder)))
        SETTINGS[key] = folder
        SETTINGS.save()
        return None

    # Release mode: an invalid path drops the stored value.
    released = SETTINGS.pop(key)
    if not released:
        raise IOError('{} is not a valid path.'.format(repr(folder)))
    LOGGER.info('Rhapsody folder {0} is released.'.format(
        repr(released)))
    SETTINGS.save()
def mapOntoChainByAlignment(atoms, chain, **kwargs):
    """Like :func:`.mapOntoChain`, but the correspondence between chains is
    taken from a provided alignment.

    :arg alignments: A list of predefined alignments (the entry at keyword
        *index*, default 0, is used).  It can also be a dictionary or
        :class:`MSA` instance keyed by the titles of *atoms* and *chain*.
        Without it, the call is forwarded to :func:`.mapOntoChain`.
    :type alignments: list, dict, :class:`MSA`

    The first chain of *atoms* whose sequence equals the gap-free target
    sequence of the alignment is mapped.  An empty list is returned, after
    a warning, when no chain matches.
    """
    alignments = kwargs.pop('alignments', None)
    if alignments is None:
        return mapOntoChain(atoms, chain, **kwargs)

    if isinstance(alignments, (MSA, dict)):
        refseq = str(alignments[chain.getTitle()])
        tarseq = str(alignments[atoms.getTitle()])
        alignment = [refseq, tarseq]
    else:
        alignment = alignments[kwargs.pop('index', 0)]

    # Strip gap characters from the target row of the alignment.
    expected_seq = alignment[-1]
    for gap in GAPCHARS:
        expected_seq = expected_seq.replace(gap, '').upper()

    for candidate in atoms.getHierView().iterChains():
        if candidate.getSequence().upper() == expected_seq:
            return mapOntoChain(candidate, chain, pwalign=alignment,
                                **kwargs)

    LOGGER.warn('The sequence of chain does not match that in alignment (%s).'%atoms.getTitle())
    return []
def _isSaturationMutagenesis(self, queryUniprot=False):
    """Return whether the SAV list is a saturation-mutagenesis scan.

    The answer is computed once and cached in
    ``self.saturation_mutagenesis``.  The unique SAV list must equal the
    list produced by ``Uniprot.seqScanning`` for the single accession
    number (and single position, if only one is present); any failure
    along the way is logged and leaves the answer at False.
    """
    assert self._isColSet('SAV coords'), 'SAV list not set.'

    if self.saturation_mutagenesis is not None:
        return self.saturation_mutagenesis

    self.saturation_mutagenesis = False
    try:
        SAVs = self.getUniqueSAVcoords()
        SAV_list = list(SAVs['unique SAV coords'])

        accessions = list(set(SAVs['Uniprot ID']))
        if len(accessions) != 1:
            raise RuntimeError('Multiple accession numbers found')
        acc = accessions[0]

        positions = list(set(SAVs['position']))
        query = f'{acc} {positions[0]}' if len(positions) == 1 else acc

        # generate target scanning list
        if queryUniprot:
            expected = Uniprot.seqScanning(query)
        else:
            # every 19th entry supplies one wild-type residue
            seq = ''.join(SAVs['wt. aa'][range(0, len(SAVs), 19)])
            expected = Uniprot.seqScanning(query, sequence=seq)

        if not (SAV_list == expected):
            raise RuntimeError('Missing SAVs detected.')
        self.saturation_mutagenesis = True
    except Exception as e:
        LOGGER.warn(f'Not a saturation mutagenesis list: {e}')

    return self.saturation_mutagenesis
def __getitem__(self, index):
    """Return a mode or a :class:`ModeSet` for *index*.

    A slice yields a :class:`ModeSet` (a warning is logged when its start
    or stop exceeds the number of modes); a list, tuple or array yields a
    single mode when it has one element and a :class:`ModeSet` otherwise;
    anything else is converted with :func:`int` and yields a single mode.

    :raises ValueError: when no modes have been calculated
    :raises IndexError: when *index* cannot be interpreted (this includes
        a slice that selects nothing)
    """
    if self._n_modes == 0:
        raise ValueError('{0} modes are not calculated, use '
                         'calcModes() method'.format(str(self)))

    if isinstance(index, slice):
        exceeds = any(bound is not None and bound > self.numModes()
                      for bound in (index.stop, index.start))
        if exceeds:
            LOGGER.warn(
                'The selection index contains a higher number than the total mode number ({0})'
                .format(self.numModes()))
        chosen = np.arange(*index.indices(len(self)))
        if len(chosen) > 0:
            return ModeSet(self, chosen)
        # an empty slice falls through to the int() conversion below
    elif isinstance(index, (list, tuple, np.ndarray)):
        if len(index) != 1:
            return ModeSet(self, index)
        return self._getMode(index[0])

    try:
        index = int(index)
    except Exception:
        raise IndexError('indices must be int, slice, list, or tuple')
    return self._getMode(index)
def range2selstr(rangestr):
    """Translate a comma-separated list of ``from-to`` ranges into a
    ``resnum`` selection string.

    **None** is returned for a blank input.  A fragment that does not split
    into exactly two parts on ``-`` is treated as ``1-<last part>``, and a
    descending range ``a-b`` is rewritten as ``1-a``; both cases log a
    warning.  Fragments that raise :exc:`ValueError` are reported and
    skipped.
    """
    if not rangestr.strip():
        return None

    pieces = []
    for fragment in rangestr.split(','):
        try:
            parts = fragment.split('-')
            if len(parts) != 2:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                fro, to = '1', parts[-1]
            else:
                fro, to = parts

            first = intResnum(fro)
            last = intResnum(to)
            if first > last:
                LOGGER.warn('range "%s" is irregular'%rangestr)
                first, last = 1, first

            fro, to = str(first), str(last)
        except ValueError:
            print('error occurred when parsing "%s"'%rangestr)
            continue
        pieces.append('resnum %s to %s'%(fro, to))

    return ' or '.join(pieces)
def parseEMDStream(stream, **kwargs):
    """Return an :class:`.AtomGroup` built from a stream of an EMD file.

    :arg stream: Anything that implements the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)

    Recognised keywords: *cutoff* (default 1.20), *n_nodes* (default 1000),
    *num_iter* (default 20), *ag* (an existing :class:`.AtomGroup` to fill)
    and *title* (used when a new atom group is created)."""
    cutoff = float(kwargs.get('cutoff', 1.20))
    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))

    title_suffix = ''
    if 'ag' not in kwargs:
        ag = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
        n_csets = 0
    else:
        ag = kwargs['ag']
        if not isinstance(ag, AtomGroup):
            raise TypeError('ag must be an AtomGroup instance')
        # remembered so that only newly added coordinate sets are reported
        n_csets = ag.numCoordsets()

    LOGGER.warn('Building coordinates from electron density map. This may take a while.')
    LOGGER.timeit()
    _parseEMDLines(ag, stream, cutoff=cutoff, n_nodes=n_nodes,
                   num_iter=num_iter, format='EMD')
    LOGGER.report('{0} atoms and {1} coordinate sets were '
                  'parsed in %.2fs.'.format(ag.numAtoms(),
                                            ag.numCoordsets() - n_csets))
    return ag
def getESSAEnsemble(self):
    'Returns a sliced copy of the ESSA mode ensemble; logs a warning and returns None when lowmem is set.'
    if not self._lowmem:
        return self._ensemble[:]
    LOGGER.warn('ModeEnsemble was not generated due to lowmem=True')
def getParticularSMILES(self, key):
    """Return the SMILES string stored for the chemical *key*.

    :arg key: key of the chemical in the record's chemical dictionary

    A warning is logged when the record is not marked as successful; the
    lookup is attempted regardless.
    """
    if not self.isSuccess:
        # The two literals are concatenated: the space after the first
        # sentence was missing, producing "yet.Please".
        LOGGER.warn(
            'Quartata Chemical Record does not have any data yet. '
            'Please run fetch again, possibly with different parameters.')
    return self._chemDict[key]['SMILES']
def _try_import_matplotlib():
    """Import matplotlib, apply the default font settings and return the
    module; log a warning and return None when it is not installed."""
    font_settings = {'font.size': 20, 'font.family': 'Arial'}
    try:
        import matplotlib as plt
        plt.rcParams.update(font_settings)
    except ImportError:
        LOGGER.warn('matplotlib is required for generating figures')
        plt = None
    return plt
def saveESSAEnsemble(self):
    'Saves the ESSA mode ensemble under "<title>_<enm>"; only logs a warning when lowmem is set.'
    if not self._lowmem:
        target = '{}_{}'.format(self._title, self._enm)
        saveModeEnsemble(self._ensemble, filename=target)
        return
    LOGGER.warn('ModeEnsemble was not generated due to lowmem=True')
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html

    :arg go_obo_url: URL to download the OBO file from when it is not
        available locally.  Default is the ``go-basic.obo`` PURL
    :type go_obo_url: str

    :arg data_folder: folder holding ``go-basic.obo``; it is created when
        missing.  Default is ``Data`` under the current working directory
    :type data_folder: str

    :raises ImportError: when GOATools is not installed
    :raises IOError: when the OBO file is neither present nor downloadable
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    if os.path.isfile(data_folder):
        raise Exception(
            'Data path (' + data_folder + ') exists as a file. '
            'Please rename, remove or change the desired location of the data path.'
        )

    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if e.errno != 17:  # 17 is EEXIST
            raise

    # The path is bound unconditionally: it used to be set only when the
    # file already existed, so a fresh download ended in an
    # UnboundLocalError at the GODag() call.
    go_obo = data_folder + '/go-basic.obo'

    if not os.path.isfile(go_obo):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)
                LOGGER.debug('{0} downloaded ({1})'.format(
                    go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn(
                    '{0} download failed, reason unknown.'.format(go_obo_url))

    if not os.path.isfile(go_obo):
        raise IOError('{0} is not available and could not be downloaded '
                      'from {1}.'.format(go_obo, go_obo_url))

    return obo_parser.GODag(go_obo)
def getPDBs(self, filtered=True):
    """Return the filtered PDB list, or the complete list when *filtered*
    is false.  A warning is logged when the record holds no data yet."""
    if self._alignPDB is None:
        LOGGER.warn('Dali Record does not have any data yet. Please run fetch.')
    return self._pdbList if filtered else self._pdbListAll
def parseOBO(**kwargs):
    """Parse a GO OBO file containing the GO itself.
    See `OBO`_ for more information on the file format.

    .. _OBO: http://owlcollab.github.io/oboformat/doc/obo-syntax.html

    :arg go_obo_url: URL to download the OBO file from when it is not
        available locally.  Default is the ``go-basic.obo`` PURL
    :type go_obo_url: str

    :arg data_folder: folder holding ``go-basic.obo``; it is created when
        missing.  Default is ``Data`` under the current working directory
    :type data_folder: str

    :raises ImportError: when GOATools is not installed
    :raises IOError: when the OBO file is neither present nor downloadable
    """
    try:
        from goatools import obo_parser
    except ImportError:
        raise ImportError('GOATools needs to be installed to use parseOBO')

    go_obo_url = kwargs.get('go_obo_url', None)
    if go_obo_url is None:
        go_obo_url = 'http://purl.obolibrary.org/obo/go/go-basic.obo'

    data_folder = kwargs.get('data_folder', None)
    if data_folder is None:
        data_folder = os.getcwd() + '/Data'

    if os.path.isfile(data_folder):
        raise Exception('Data path (' + data_folder + ') exists as a file. '
                        'Please rename, remove or change the desired location of the data path.')

    # Emulate mkdir -p (no error if folder exists)
    try:
        os.mkdir(data_folder)
    except OSError as e:
        if e.errno != 17:  # 17 is EEXIST
            raise

    # The path is bound unconditionally: it used to be set only when the
    # file already existed, so a fresh download ended in an
    # UnboundLocalError at the GODag() call.
    go_obo = data_folder+'/go-basic.obo'

    if not os.path.isfile(go_obo):
        try:
            handle = openURL(go_obo_url)
        except Exception as err:
            LOGGER.warn('{0} download failed ({1}).'.format(
                go_obo_url, str(err)))
        else:
            data = handle.read()
            if len(data):
                with open(go_obo, 'w+b') as obofile:
                    obofile.write(data)
                LOGGER.debug('{0} downloaded ({1})'
                             .format(go_obo_url, sympath(go_obo)))
            else:
                LOGGER.warn('{0} download failed, reason unknown.'
                            .format(go_obo_url))

    if not os.path.isfile(go_obo):
        raise IOError('{0} is not available and could not be downloaded '
                      'from {1}.'.format(go_obo, go_obo_url))

    return obo_parser.GODag(go_obo)
def _setCoords(self, coords, label=None, overwrite=False):
    """Set coordinates without data type checking.

    *coords* must be a :class:`~numpy.ndarray`, but may have data type
    other than :class:`numpy.float64`, e.g. :class:`numpy.float32`.
    *label* argument may be used to label coordinate sets.  *label* may be
    a string or a list of strings length equal to the number of coordinate
    sets.

    All stored coordinate sets are replaced when no coordinates are stored
    yet, when *overwrite* is true, or when *coords* is 3-dimensional with
    more than one set; otherwise only the active coordinate set is
    updated."""

    n_atoms = self._n_atoms
    if n_atoms:
        # the second-to-last axis is taken as the atom axis
        if coords.shape[-2] != n_atoms:
            raise ValueError('coords array has incorrect number of atoms')
    else:
        # first assignment: the number of atoms is taken from *coords*
        self._n_atoms = n_atoms = coords.shape[-2]

    ndim = coords.ndim
    shape = coords.shape
    if self._coords is None or overwrite or (ndim == 3 and shape[0] > 1):
        # replace everything that is stored
        if ndim == 2:
            # a single set is stored with a leading axis of length one
            self._coords = coords.reshape((1, n_atoms, 3))
            if label is None:
                self._cslabels = [None]
            else:
                self._cslabels = [str(label)]
            self._n_csets = n_csets = 1

        else:
            self._coords = coords
            self._n_csets = n_csets = shape[0]

            if label is None or isinstance(label, str):
                # one label shared by all coordinate sets
                self._cslabels = [label] * n_csets

            elif isinstance(label, (list, tuple)):
                if len(label) == n_csets:
                    self._cslabels = list(label)

                else:
                    self._cslabels = [None] * n_csets
                    LOGGER.warn('Number of labels does not match number '
                                'of coordinate sets.')
            else:
                # NOTE(review): _cslabels is left untouched in this branch,
                # so it may no longer match the new number of coordinate
                # sets -- confirm whether that is intended
                LOGGER.warn('Wrong type for `label` argument.')
        self._acsi = 0
        self._setTimeStamp()

    else:
        # update the active coordinate set in place
        acsi = self._acsi
        if ndim == 2:
            self._coords[acsi] = coords
        else:
            # reaching this point with ndim == 3 implies a single set
            self._coords[acsi] = coords[0]
        self._setTimeStamp(acsi)
        if label is not None:
            self._cslabels[acsi] = str(label)
def SCN(M, **kwargs):
    """Normalise *M* by alternately rescaling its columns and rows by the
    reciprocal of their sums, then symmetrise the result.

    :arg M: the matrix to normalise; it is copied, not modified
    :type M: :class:`numpy.ndarray`

    :arg total_count: when given, the result is rescaled so that its sum
        equals this value; ``'original'`` means the sum of *M*
    :type total_count: float, str

    :arg max_loops: maximum number of iterations, **None** for no limit.
        Default is 100
    :type max_loops: int

    :arg tol: iteration stops when the ratio of the current asymmetry to
        the asymmetry after the first iteration changes by less than this
        value between iterations.  Default is 1e-5
    :type tol: float
    """
    # The return value is not used; the call is retained in case callers
    # rely on the import being attempted here.
    importLA()

    total_count = kwargs.pop('total_count', None)
    max_loops = kwargs.pop('max_loops', 100)
    tol = kwargs.pop('tol', 1e-5)

    N = M.copy()
    n = 0
    d0 = None
    p = 1
    last_p = None

    while True:
        # rescale columns, then rows, by the reciprocal of their sums
        C = np.diag(div0(1., np.sum(N, axis=0)))
        N = np.dot(N, C)

        R = np.diag(div0(1., np.sum(N, axis=1)))
        N = np.dot(R, N)

        n += 1

        # check convergence of symmetry
        d = np.mean(np.abs(N - N.T))

        if d0 is not None:
            p = div0(d, d0)
            dp = np.abs(p - last_p)
            if dp < tol:
                break
        else:
            # the first iteration only records the reference asymmetry
            d0 = d
        LOGGER.debug('Iteration {0}: d = {1}, p = {2}'.format(str(n),
                                                              str(d),
                                                              str(p)))
        last_p = p

        if max_loops is not None:
            if n >= max_loops:
                LOGGER.warn('The SCN algorithm did not converge after {0} '
                            'iterations.'.format(max_loops))
                break

    # guarantee symmetry
    N = (N + N.T) / 2.

    # Compare by value: `is 'original'` depended on string interning.  The
    # isinstance guard avoids an element-wise comparison for array input.
    if isinstance(total_count, str) and total_count == 'original':
        total_count = np.sum(M)

    if total_count is not None:
        sum_N = np.sum(N)
        k = total_count / sum_N
        N = N * k
    return N
def parseEMDStream(stream, **kwargs):
    """Parse EMD data from a stream of an EMD file.

    :arg stream: Any object with the method ``readlines``
        (e.g. :class:`file`, buffer, stdin)

    Recognised keywords: *cutoff* (default None), *n_nodes* (default
    1000), *num_iter* (default 20), *map* (default True), *make_nodes*
    (default False), *title* and *title_suffix*.

    Whatever ``_parseEMDLines`` returns is passed on: when *make_nodes* is
    false it is returned directly; when both *map* and *make_nodes* are
    true it is unpacked into a map and an :class:`.AtomGroup`, returned as
    a pair; when only *make_nodes* is true it is returned as the atom
    group.  If both flags are False, *make_nodes* is switched on with a
    warning."""
    cutoff = kwargs.get('cutoff', None)
    if cutoff is not None:
        cutoff = float(cutoff)

    n_nodes = int(kwargs.get('n_nodes', 1000))
    num_iter = int(kwargs.get('num_iter', 20))
    want_map = kwargs.get('map',True)
    make_nodes = kwargs.get('make_nodes',False)

    if want_map is False and make_nodes is False:
        LOGGER.warn('At least one of map and make_nodes should be True. '
                    'Setting map to False was an intentional change from the default '
                    'behaviour so make_nodes has been set to True.')
        make_nodes = True

    title_suffix = kwargs.get('title_suffix','')
    atomgroup = AtomGroup(str(kwargs.get('title', 'Unknown')) + title_suffix)
    atomgroup._n_atoms = n_nodes

    # Identical options are used for every call of the line parser.
    parse_opts = dict(cutoff=cutoff, n_nodes=n_nodes, num_iter=num_iter,
                      map=want_map, make_nodes=make_nodes)

    if not make_nodes:
        return _parseEMDLines(atomgroup, stream, **parse_opts)

    LOGGER.info('Building coordinates from electron density map. This may take a while.')
    LOGGER.timeit()

    if want_map:
        emd, atomgroup = _parseEMDLines(atomgroup, stream, **parse_opts)
    else:
        atomgroup = _parseEMDLines(atomgroup, stream, **parse_opts)

    LOGGER.report('{0} atoms and {1} coordinate sets were '
                  'parsed in %.2fs.'.format(atomgroup.numAtoms(),
                                            atomgroup.numCoordsets()))

    if want_map:
        return emd, atomgroup
    return atomgroup
def __add__(self, other):
    """Return a new :class:`AtomGroup` holding the atoms of this group
    followed by the atoms of *other*.

    Coordinates are copied as a whole when both groups have the same number
    of coordinate sets; otherwise only the active coordinate sets are
    joined.  Data arrays present in just one of the groups are padded with
    zeros for the atoms of the other group, read-only fields are skipped,
    and bond indices of *other* are shifted by the number of atoms in this
    group.

    :raises TypeError: when *other* is not an :class:`AtomGroup`"""

    if not isinstance(other, AtomGroup):
        raise TypeError('unsupported operand type(s) for +: {0} and '
                        '{1}'.format(repr(type(self).__name__),
                                     repr(type(other).__name__)))

    combined = AtomGroup(self._title + ' + ' + other._title)

    # coordinates
    if not self._n_csets:
        if other._n_csets:
            LOGGER.warn('No coordinate sets are copied to {0}'
                        .format(combined.getTitle()))
    elif self._n_csets == other._n_csets:
        combined.setCoords(np.concatenate((self._coords, other._coords), 1))
        if self._n_csets > 1:
            LOGGER.info('All {0} coordinate sets are copied to '
                        '{1}.'.format(self._n_csets, combined.getTitle()))
    else:
        combined.setCoords(np.concatenate((self._getCoords(),
                                           other._getCoords())))
        LOGGER.info('Active coordinate sets are copied to {0}.'
                    .format(combined.getTitle()))

    # per-atom data
    for key in set(list(self._data) + list(other._data)):
        if key in ATOMIC_FIELDS and ATOMIC_FIELDS[key].readonly:
            continue
        mine = self._data.get(key)
        theirs = other._data.get(key)
        if mine is None and theirs is None:
            continue
        if mine is None:
            # zero-fill this group's share, shaped after the other array
            shape = list(theirs.shape)
            shape[0] = len(self)
            mine = np.zeros(shape, theirs.dtype)
        elif theirs is None:
            shape = list(mine.shape)
            shape[0] = len(other)
            theirs = np.zeros(shape, mine.dtype)
        combined._data[key] = np.concatenate((mine, theirs))

    # bonds
    if self._bonds is None:
        if other._bonds is not None:
            combined.setBonds(other._bonds + self._n_atoms)
    elif other._bonds is None:
        combined.setBonds(self._bonds.copy())
    else:
        combined.setBonds(np.concatenate([self._bonds,
                                          other._bonds + self._n_atoms]))

    return combined
def addCoordset(self, coords, label=None):
    """Add a coordinate set.  *coords* argument may be an object with
    :meth:`getCoordsets` method.

    When no coordinates are set yet, the call is handed over to
    :meth:`setCoords` and *label* is not used.  Otherwise the new sets are
    appended to the existing ones, time stamped, and labeled.

    :arg coords: coordinate array for one or more coordinate sets, or an
        object that provides them
    :arg label: a single label (str or **None**) shared by all added sets,
        or a list/tuple with one label per added set; when the number of
        labels does not match or the type is not supported, a warning is
        logged and the added sets are left unlabeled

    :raises ValueError: when *coords* is an object whose coordinates are
        not set
    :raises TypeError: when *coords* cannot be interpreted as coordinates"""

    if self._coords is None:
        return self.setCoords(coords)

    n_atoms = self._n_atoms
    atoms = coords
    try:
        coords = (atoms._getCoordsets()
                  if hasattr(coords, '_getCoordsets') else
                  atoms.getCoordsets())
    except AttributeError:
        # not an atomic object, treat *coords* as an array
        pass
    else:
        if coords is None:
            raise ValueError('coordinates of {0} are not set'
                             .format(str(atoms)))

    try:
        checkCoords(coords, csets=True, natoms=n_atoms, dtype=None)
    except TypeError:
        raise TypeError('coords must be a numpy array or an '
                        'object with `getCoordsets` method')

    if coords.ndim == 2:
        coords = coords.reshape((1, n_atoms, 3))

    diff = coords.shape[0]  # number of coordinate sets being added
    self._coords = np.concatenate((self._coords, coords), axis=0)
    self._n_csets = self._coords.shape[0]

    # keep old time stamps, stamp the new sets with the current time
    timestamps = self._timestamps
    self._timestamps = np.zeros(self._n_csets)
    self._timestamps[:len(timestamps)] = timestamps
    self._timestamps[len(timestamps):] = time()
    self._kdtrees.extend([None] * diff)

    if label is None or isinstance(label, str):
        labels = [label] * diff
    elif isinstance(label, (list, tuple)):
        if len(label) == diff:
            labels = [str(lbl) for lbl in label]
        else:
            LOGGER.warn('Number of labels does not match number '
                        'of coordinate sets.')
            labels = [None] * diff
    else:
        LOGGER.warn('Wrong type for `label` argument.')
        labels = [None] * diff
    # always extend, so that there is one label entry per coordinate set
    self._cslabels.extend(labels)
def prody_select(selstr, *pdbs, **kwargs):
    """Write selected atoms from a PDB file in PDB format.

    :arg selstr: atom selection string, see :ref:`selections`

    :arg pdbs: :term:`PDB` identifier(s) or filename(s)

    :arg output: output filename, default is :file:`pdb_selected.pdb`

    :arg prefix: prefix for output file, default is PDB filename

    :arg suffix: output filename suffix, default is :file:`_selected`

    A file for which the selection does not match any atoms is skipped with
    a warning, and remaining files are still processed.

    :raises ValueError: when no *pdbs* argument is provided"""

    from os.path import isfile
    from prody import LOGGER, parsePDB, writePDB

    if not pdbs:
        raise ValueError('pdb argument must be provided')

    # Backwards compatibility: arguments used to be given as (pdb, selstr).
    if ((isfile(selstr) or len(selstr) == 4 and selstr[0].isdigit()) and
            len(pdbs) == 1 and not isfile(pdbs[0])):
        pdbs, selstr = selstr, pdbs[0]
        LOGGER.warn('The order of selstr and pdb arguments have switched '
                    'to support multiple files, old order will be supported '
                    'until v1.4.')
        pdbs = [pdbs]

    prefix = kwargs.get('prefix', None)
    suffix = kwargs.get('suffix', '_selected')
    output = kwargs.get('output', None)

    for pdb in pdbs:
        pdb = parsePDB(pdb)

        pdbselect = pdb.select(selstr)
        if pdbselect is None:
            LOGGER.warn('Selection {0:s} did not match any atoms.'
                        .format(repr(selstr)))
            # move on to the next file instead of dropping the rest
            continue
        LOGGER.info('Selection {0:s} matched {1:d} atoms.'
                    .format(repr(selstr), len(pdbselect)))

        outname = output or ((prefix or pdb.getTitle()) + suffix)
        LOGGER.info('Selection is written into: ' +
                    writePDB(outname, pdbselect))
def pathPDBFolder(folder=None, divided=False):
    """Return or specify the local PDB folder for storing PDB files
    downloaded from `wwPDB <http://www.wwpdb.org/>`_ servers.  Files stored
    in this folder can be accessed via :func:`.fetchPDB` from any working
    directory.  To release the current folder, pass an invalid path, e.g.
    ``folder=''``.

    If *divided* is **True**, the divided folder structure of wwPDB servers
    will be assumed when reading from and writing to the local folder.  For
    example, a structure with identifier **1XYZ** will be present as
    :file:`pdblocalfolder/yz/pdb1xyz.pdb.gz`.

    If *divided* is **False**, a plain folder structure will be expected and
    adopted when saving files.  For example, the same structure will be
    present as :file:`pdblocalfolder/1xyz.pdb.gz`.

    Finally, in either case, lower case letters will be used and compressed
    files will be stored.

    When called without *folder*, a ``(folder, divided)`` tuple is returned
    if a folder is saved in settings and exists; **None** is returned in all
    other cases.

    :raises IOError: when *folder* is not a valid path and there is no
        saved folder to release"""

    if folder is None:
        folder = SETTINGS.get('pdb_local_folder')
        if folder:
            if isdir(folder):
                return folder, SETTINGS.get('pdb_local_divided', True)
            else:
                LOGGER.warn('PDB local folder {0} is not accessible.'
                            .format(repr(folder)))
    else:
        if isdir(folder):
            folder = abspath(folder)
            LOGGER.info('Local PDB folder is set: {0}'.format(repr(folder)))
            if divided:
                LOGGER.info('wwPDB divided folder structure will be assumed.')
            else:
                LOGGER.info('A plain folder structure will be assumed.')
            SETTINGS['pdb_local_folder'] = folder
            SETTINGS['pdb_local_divided'] = bool(divided)
            SETTINGS.save()
        else:
            # an invalid path releases the folder that is currently set
            current = SETTINGS.pop('pdb_local_folder')
            if current:
                LOGGER.info('PDB folder {0} is released.'
                            .format(repr(current)))
                SETTINGS.pop('pdb_local_divided')
                SETTINGS.save()
            else:
                raise IOError('{0} is not a valid path.'.format(repr(folder)))
def fetchPDBClusters(sqid=None):
    """Retrieve PDB sequence clusters.  PDB sequence clusters are results of
    the weekly clustering of protein chains in the PDB generated by
    blastclust.  They are available at FTP site:
    ftp://resources.rcsb.org/sequence/clusters/

    This function will download about 10 Mb of data and save it after
    compressing in your home directory in :file:`.prody/pdbclusters`.
    Compressed files will be less than 4 Mb in size.  Cluster data can
    be loaded using :func:`loadPDBClusters` function and be accessed
    using :func:`listPDBCluster`.

    :arg sqid: sequence identity level to download, default is **None**
        to download all available levels

    :raises ValueError: when *sqid* is not one of the available levels"""

    if sqid is not None:
        if sqid not in PDB_CLUSTERS:
            raise ValueError('sqid must be one of ' + PDB_CLUSTERS_SQID_STR)
        keys = [sqid]
    else:
        keys = list(PDB_CLUSTERS)

    PDB_CLUSTERS_PATH = os.path.join(getPackagePath(), 'pdbclusters')
    if not os.path.isdir(PDB_CLUSTERS_PATH):
        os.mkdir(PDB_CLUSTERS_PATH)

    # progress and outcome are measured against the files requested
    n_keys = len(keys)
    LOGGER.progress('Downloading sequence clusters', n_keys,
                    '_prody_fetchPDBClusters')
    count = 0
    for i, x in enumerate(keys):
        filename = 'bc-{0}.out'.format(x)
        url = ('ftp://resources.rcsb.org/sequence/clusters/' + filename)
        try:
            inp = openURL(url)
        except IOError:
            LOGGER.warning('Clusters at {0}% sequence identity level could '
                           'not be downloaded.'.format(x))
            continue

        try:
            out = openFile(filename+'.gz', 'w', folder=PDB_CLUSTERS_PATH)
            try:
                out.write(inp.read())
            finally:
                out.close()
        finally:
            inp.close()
        count += 1
        LOGGER.update(i, '_prody_fetchPDBClusters')
    LOGGER.clear()

    if n_keys == count:
        LOGGER.info('All PDB clusters were downloaded successfully.')
    elif count == 0:
        LOGGER.warn('PDB clusters could not be downloaded.')
def _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames):
    """Append one coordinate set to *atomgroup* for each alternate location
    found in *altloc*.

    *altloc* maps an alternate location key to a list of ``(line, i)``
    pairs, where *line* is an atom record and *i* its index in the file
    (reported as ``i+1`` in messages).  For each key, a copy of the
    coordinates of *atomgroup* is made and the position of every atom that
    can be matched by chain identifier, residue number, residue name, and
    atom name is replaced by the coordinates read from the line.  Lines that
    cannot be matched or parsed are reported and skipped.  A coordinate set
    is added, labeled ``'altloc ' + key``, when at least one line is parsed
    successfully.

    *chainids*, *resnums*, *resnames*, and *atomnames* are arrays describing
    the atoms of *atomgroup*."""

    altloc_keys = list(altloc)
    altloc_keys.sort()
    # cache: chain id -> atom indices; (chain id, resnum) -> residue data
    indices = {}
    for key in altloc_keys:
        xyz = atomgroup.getCoords()
        success = 0
        lines = altloc[key]
        for line, i in lines:
            aan = line[12:16].strip()
            arn = line[17:21].strip()
            ach = line[21]
            ari = int(line[22:26].split()[0])
            rn, ids, ans = indices.get((ach, ari), (None, None, None))
            if ids is None:
                ids = indices.get(ach, None)
                if ids is None:
                    ids = (chainids == ach).nonzero()[0]
                    indices[ach] = ids
                ids = ids[resnums[ids] == ari]
                if len(ids) == 0:
                    LOGGER.warn("failed to parse altloc {0} at line {1}, "
                                "residue not present for altloc 'A'".format(
                                repr(key), i+1))
                    continue
                rn = resnames[ids[0]]
                ans = atomnames[ids]
                indices[(ach, ari)] = (rn, ids, ans)
            if rn != arn:
                LOGGER.warn("failed to parse altloc {0} at line {1}, "
                            "residue name mismatch (expected {2}, "
                            "parsed {3})".format(repr(key), i+1, repr(rn),
                                                 repr(arn)))
                continue
            index = ids[(ans == aan).nonzero()[0]]
            if len(index) != 1:
                LOGGER.warn("failed to parse altloc {0} at line {1}, atom"
                            " {2} not found in the residue"
                            .format(repr(key), i+1, repr(aan)))
                continue
            try:
                # parse all three values first, so that a bad field does
                # not leave a partially updated position behind
                x = float(line[30:38])
                y = float(line[38:46])
                z = float(line[46:54])
            except ValueError:
                LOGGER.warn('failed to parse altloc {0} at line {1}, could'
                            ' not read coordinates'.format(repr(key), i+1))
                continue
            xyz[index[0], 0] = x
            xyz[index[0], 1] = y
            xyz[index[0], 2] = z
            success += 1
        LOGGER.info('{0} out of {1} altloc {2} lines were parsed.'
                    .format(success, len(lines), repr(key)))
        if success > 0:
            LOGGER.info('Altloc {0} is appended as a coordinate set to '
                        'atomgroup {1}.'.format(repr(key),
                                                atomgroup.getTitle()))
            atomgroup.addCoordset(xyz, label='altloc ' + key)
def checkIdentifiers(*pdb):
    """Return a list with one entry per item in *pdb*: the identifier
    stripped of surrounding whitespace and in lower case when it is valid
    (four alphanumeric characters), **None** otherwise.  A warning is
    logged for every invalid item."""

    checked = []
    for pid in pdb:
        try:
            pid = pid.strip().lower()
        except AttributeError:
            # not a string-like object, *pid* is left as it was passed
            valid = False
        else:
            valid = len(pid) == 4 and pid.isalnum()

        if valid:
            checked.append(pid)
        else:
            LOGGER.warn('{0} is not a valid identifier.'.format(repr(pid)))
            checked.append(None)
    return checked
def __or__(self, other):
    """Return a :class:`Selection` with the atoms found in this instance or
    in *other*.  Both must point to the same atom group.  When their active
    coordinate set indices differ, a warning is logged and index zero is
    used.

    :raises TypeError: when *other* is not an :class:`AtomPointer`
    :raises ValueError: when *other* belongs to a different atom group"""

    if other is self:
        return self

    if not isinstance(other, AtomPointer):
        raise TypeError('other must be an AtomPointer')

    if self._ag != other.getAtomGroup():
        raise ValueError('both selections must be from the same AtomGroup')

    acsi = self.getACSIndex()
    other_acsi = other.getACSIndex()
    if acsi != other_acsi:
        LOGGER.warn('Active coordinate set indices do not match, it will '
                    'be set to zero.')
        acsi = 0

    merged = np.concatenate((self._getIndices(), other._getIndices()))
    indices = np.unique(merged)
    selstr = '({0:s}) or ({1:s})'.format(self.getSelstr(),
                                         other.getSelstr())
    return Selection(self._ag, indices, selstr, acsi, unique=True)
def addNonstdAminoacid(resname, *properties):
    """Add non-standard amino acid *resname* with *properties* selected from:

      * {props}

    .. ipython:: python

       addNonstdAminoacid('PTR', 'acidic', 'aromatic', 'cyclic', 'large',
       'polar', 'surface')

    Default set of non-standard amino acids can be restored as follows:

    .. ipython:: python

       flagDefinition(reset='nonstdaa')"""

    resname = str(resname)
    if len(resname) > 4:
        LOGGER.warn('Residue name {0} is unusually long.'
                    .format(repr(resname)))

    # Properties are crossed off as they are recognized; at most one
    # property may come from each category.
    unrecognized = set(properties)
    for members in CATEGORIES.values():
        shared = members.intersection(unrecognized)
        if not shared:
            continue
        if len(shared) > 1:
            raise ValueError('amino acid properties {0} cannot be '
                             'present together'
                             .format(', '.join([repr(prp) for prp in shared])))
        (prop,) = shared
        unrecognized.remove(prop)

    if unrecognized:
        raise ValueError('amino acid property {0} is not valid'
                         .format(repr(unrecognized.pop())))

    nonstd = SETTINGS.get(NONSTANDARD_KEY, NONSTANDARD)
    nonstd[resname] = set(properties)
    updateNonstandard(nonstd)
def _getTrace(self):
    """Returns trace, and emit a warning message if trace is calculated
    using eigenvalues of a subset of variances (eigenvalues or inverse
    eigenvalues).

    A stored trace is returned as is.  Otherwise the trace is the sum of
    the available variances, and the warning is issued when more modes are
    missing than tolerated: six for 3-dimensional models, one otherwise.

    :raises ValueError: when neither trace nor variances are available"""

    trace = self._trace
    if trace is None:
        if self._vars is None:
            raise ValueError('variances are not set or calculated')
        trace = self._vars.sum()
        diff = self._dof - self._n_modes
        # the tolerance depends on dimensionality; the two cases must be
        # exclusive, or else the 3-d tolerance is never effective
        if self._is3d:
            incomplete = diff > 6
        else:
            incomplete = diff > 1
        if incomplete:
            from prody import LOGGER
            LOGGER.warn('Total variance for {0} is calculated using '
                        '{1} available modes out of {2} possible.'
                        .format(str(self), self._n_modes, self._dof))
    return trace
def getDictMapping(target, chain, map_dict):
    """Returns lists of matching residues (based on *map_dict*).

    The mapping is looked up in *map_dict* using a key made of the first
    four characters of the title (lower case) and the chain identifier
    (upper case) of *chain*.  It is a pair of index sequences: positions in
    *target* and corresponding positions in *chain*.

    :returns: ``(amatch, bmatch, n_match, n_mapped)``, where *amatch* holds
        the residue of every item in *target*, *bmatch* the mapped residue
        from *chain* or **None**, *n_match* the number of mapped pairs with
        identical residue names, and *n_mapped* the number of mapped pairs;
        **None** when the mapping is missing or refers to residues that are
        not in *chain*"""

    pdbid = chain._chain.getTitle()[:4].lower()
    chid = chain._chain.getChid().upper()
    key = pdbid + chid

    mapping = map_dict.get(key)
    if mapping is None:
        LOGGER.warn('map_dict does not have the mapping for {0}'.format(key))
        return None

    tar_indices = mapping[0]
    chn_indices = mapping[1]

    chain_res_list = list(chain)

    # position of the first occurrence of each target index, built once so
    # that the lookup in the loop does not scan the sequence every time
    position = {}
    for n, index in enumerate(tar_indices):
        position.setdefault(index, n)

    amatch = []
    bmatch = []
    n_match = 0
    n_mapped = 0
    for i, a in enumerate(target):
        ares = a.getResidue()
        amatch.append(ares)
        n = position.get(i)
        if n is None:
            bmatch.append(None)
            continue

        try:
            b = chain_res_list[chn_indices[n]]
        except IndexError:
            LOGGER.warn('\nthe number of residues in the map_dict ({0} residues) is inconsistent with {2} ({1} residues)'
                        .format(max(chn_indices)+1, len(chain_res_list), chain.getTitle()))
            return None
        bres = b.getResidue()
        bmatch.append(bres)
        if a.getResname() == b.getResname():
            n_match += 1
        n_mapped += 1

    return amatch, bmatch, n_match, n_mapped
def alignPDBEnsemble(ensemble, suffix='_aligned', outdir='.', gzip=False):
    """Align PDB files using transformations from *ensemble*, which may be
    a :class:`.PDBEnsemble` or a :class:`.PDBConformation` instance. Label of
    the conformation (see :meth:`~.PDBConformation.getLabel`) will be used to
    determine the PDB structure and model number.  First four characters of
    the label is expected to be the PDB identifier and ending numbers to be the
    model number.  For example, the :class:`.Transformation` from conformation
    with label *2k39_ca_selection_'resnum_<_71'_m116* will be applied to 116th
    model of structure **2k39**.  After applicable transformations are made,
    structure will be written into *outdir* as :file:`2k39_aligned.pdb`.
    If *gzip* is **True**, output files will be compressed.  Return value is
    the output filename or list of filenames, in the order files are processed.
    Note that if multiple models from a file are aligned, that filename will
    appear in the list multiple times.  **None** takes the place of the
    filename for conformations whose PDB file is not found or whose model
    number is out of range.

    :raises TypeError: when *ensemble* is of an unsupported type
    :raises ValueError: when a conformation has no transformation"""

    if not isinstance(ensemble, (PDBEnsemble, PDBConformation)):
        raise TypeError('ensemble must be a PDBEnsemble or PDBConformation')
    if isinstance(ensemble, PDBConformation):
        ensemble = [ensemble]
    ext = '.gz' if gzip else ''
    output = []
    # multi-model structures that are parsed already, written at the end
    pdbdict = {}
    for conf in ensemble:
        trans = conf.getTransformation()
        if trans is None:
            raise ValueError('transformations are not calculated, call '
                             '`superpose` or `iterpose`')
        label = conf.getLabel()

        pdb = label[:4]
        # fetch the file only when the structure is not parsed before
        if pdb in pdbdict:
            filename = pdbdict[pdb]
        else:
            filename = fetchPDB(pdb)
        if filename is None:
            LOGGER.warning('PDB file for conformation {0} is not found.'
                           .format(label))
            output.append(None)
            continue
        LOGGER.info('Parsing PDB file {0} for conformation {1}.'
                    .format(pdb, label))

        # model number is expected after the last 'm' in the label
        acsi = None
        model = label.rfind('m')
        if model > 3:
            model = label[model+1:]
            if model.isdigit():
                acsi = int(model) - 1
            LOGGER.info('Applying transformation to model {0}.'
                        .format(model))

        if isinstance(filename, str):
            ag = parsePDB(filename)
        else:
            ag = filename

        if acsi is not None:
            if acsi >= ag.numCoordsets():
                LOGGER.warn('Model number {0} for {1} is out of range.'
                            .format(model, pdb))
                output.append(None)
                continue
            ag.setACSIndex(acsi)
        trans.apply(ag)
        outfn = os.path.join(outdir, pdb + suffix + '.pdb' + ext)
        if ag.numCoordsets() > 1:
            pdbdict[pdb] = ag
        else:
            writePDB(outfn, ag)
        output.append(os.path.normpath(outfn))

    for pdb, ag in pdbdict.items():
        writePDB(os.path.join(outdir, pdb + suffix + '.pdb' + ext), ag)
    if len(output) == 1:
        return output[0]
    else:
        return output
def searchPfam(query, **kwargs):
    """Return Pfam search results in a dictionary.  Matching Pfam accession
    as keys will map to evalue, alignment start and end residue positions.

    :arg query: UniProt ID, PDB identifier, protein sequence, or a sequence
        file, sequence queries must not contain gaps and must be at least
        16 characters long
    :type query: str

    :arg timeout: timeout for blocking connection attempt in seconds, default
        is 60
    :type timeout: int

    *query* can also be a PDB identifier, e.g. ``'1mkp'`` or ``'1mkpA'`` with
    chain identifier.  UniProt ID of the specified chain, or the first
    protein chain will be used for searching the Pfam database.

    **None** is returned when the server reports a system error for the
    request.

    :raises ValueError: when a sequence cannot be parsed or is not valid, or
        when results cannot be parsed
    :raises IOError: when results cannot be retrieved before *timeout*"""

    prefix = "{http://pfam.xfam.org/}"
    query = str(query)
    if isfile(query):
        from prody.sequence import MSAFile

        try:
            seq = next(MSAFile(query))
        except Exception:
            # not a readable alignment file, use the content as sequence
            with openFile(query) as inp:
                seq = "".join(inp.read().split())
        else:
            seq = seq[0][1]
        if not seq.isalpha():
            raise ValueError("could not parse a sequence without gaps from " + query)
    else:
        seq = "".join(query.split())

    # cElementTree is not available in recent Python versions
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    LOGGER.timeit("_pfam")
    timeout = int(kwargs.get("timeout", 60))

    if len(seq) >= MINSEQLEN:
        # sequence search, results are returned from within this branch
        if not seq.isalpha():
            raise ValueError(repr(seq) + " is not a valid sequence")

        fseq = ">Seq\n" + seq
        parameters = {"hmmdb": "pfam", "seq": fseq}
        enc_params = urllib.urlencode(parameters)
        request = urllib2.Request("http://hmmer.janelia.org/search/hmmscan", enc_params)

        url = urllib2.urlopen(request).geturl() + "?output=xml"
        LOGGER.debug('Submitted Pfam search for sequence "{0}...".'.format(seq[:MINSEQLEN]))

        xml = openURL(url, timeout=timeout).read()

        try:
            root = ET.XML(xml)
        except Exception:
            raise ValueError("failed to parse results XML, check URL: " + url)

        matches = {}
        for child in root[0]:
            if child.tag == "hits":
                accession = child.get("acc")
                pfam_id = accession.split(".")[0]
                domain = child[0]
                matches[pfam_id] = {
                    "accession": accession,
                    "class": "Domain",
                    "id": child.get("name"),
                    "locations": {
                        "ali_end": domain.get("alisqto"),
                        "ali_start": domain.get("alisqfrom"),
                        "bitscore": domain.get("bitscore"),
                        "end": domain.get("alisqto"),
                        "evalue": child.get("evalue"),
                        "evidence": "hmmer v3.0",
                        "hmm_end": domain.get("alihmmto"),
                        "hmm_start": domain.get("alihmmfrom"),
                        "significant": domain.get("significant"),
                        "start": domain.get("alisqfrom"),
                    },
                    "type": "Pfam-A",
                }
        return matches

    # From here on *seq* is shorter than MINSEQLEN, i.e. it is an identifier.
    if len(seq) <= 5:
        # PDB identifier with optional chain identifier
        idcode = None
        from prody import parsePDBHeader

        try:
            polymers = parsePDBHeader(seq[:4], "polymers")
        except Exception as err:
            LOGGER.warn("failed to parse header for {0} ({1})".format(seq[:4], str(err)))
        else:
            chid = seq[4:].upper()
            for poly in polymers:
                if chid and poly.chid != chid:
                    continue
                for dbref in poly.dbrefs:
                    if dbref.database != "UniProt":
                        continue
                    idcode = dbref.idcode
                    LOGGER.info(
                        "UniProt ID code {0} for {1} chain "
                        "{2} will be used.".format(idcode, seq[:4], poly.chid)
                    )
                    break
                if idcode is not None:
                    break
        if idcode is None:
            LOGGER.warn("A UniProt ID code for PDB {0} could not be "
                        "parsed.".format(repr(seq)))
            url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"
        else:
            url = "http://pfam.xfam.org/protein/" + idcode + "?output=xml"
    else:
        url = "http://pfam.xfam.org/protein/" + seq + "?output=xml"

    LOGGER.debug("Retrieving Pfam search results: " + url)
    xml = None
    # keep trying until some content is received or time is up
    while LOGGER.timing("_pfam") < timeout:
        try:
            xml = openURL(url, timeout=timeout).read()
        except Exception:
            pass
        else:
            if xml:
                break

    if not xml:
        raise IOError("Pfam search timed out or failed to parse results "
                      "XML, check URL: " + url)
    else:
        LOGGER.report("Pfam search completed in %.2fs.", "_pfam")

    if xml.find(b"There was a system error on your last request.") > 0:
        LOGGER.warn("No Pfam matches found for: " + seq)
        return None

    try:
        root = ET.XML(xml)
    except Exception:
        raise ValueError("failed to parse results XML, check URL: " + url)

    results = dictElement(root[0], prefix)
    try:
        xml_matches = results["matches"]
    except KeyError:
        raise ValueError("failed to parse results XML, check URL: " + url)

    matches = dict()
    for child in xml_matches:
        try:
            accession = child.attrib["accession"][:7]
        except KeyError:
            raise ValueError("failed to parse results XML, check URL: " + url)

        if not re.search("^P(F|B)[0-9]{5}$", accession):
            raise ValueError("{0} does not match pfam accession"
                             " format".format(accession))

        match = matches.setdefault(accession, dict(child.items()))
        locations = match.setdefault("locations", [])
        for loc in child:
            locations.append(dict(loc.items()))

    query = "Query " + repr(query)

    if matches:
        LOGGER.info(query + " matched {0} Pfam families.".format(len(matches)))
    else:
        LOGGER.info(query + " did not match any Pfam families.")
    return matches
def fetchPDBLigand(cci, filename=None):
    """Fetch PDB ligand data from PDB_ for chemical component *cci*.
    *cci* may be 3-letter chemical component identifier or a valid XML
    filename.  If *filename* is given, XML file will be saved with that name.

    If you query ligand data frequently, you may configure ProDy to save XML
    files in your computer.  Set ``ligand_xml_save`` option **True**, i.e.
    ``confProDy(ligand_xml_save=True)``.  Compressed XML files will be save
    to ProDy package folder, e.g. :file:`/home/user/.prody/pdbligands`.  Each
    file is around 5Kb when compressed.

    This function is compatible with PDBx/PDBML v 4.0.

    Ligand data is returned in a dictionary.  Ligand coordinate atom data with
    *model* and *ideal* coordinate sets are also stored in this dictionary.
    Note that this dictionary will contain data that is present in the XML
    file and all Ligand Expo XML files do not contain every possible data
    field.  So, it may be better if you use :meth:`dict.get` instead of
    indexing the dictionary, e.g. to retrieve formula weight (or relative
    molar mass) of the chemical component use ``data.get('formula_weight')``
    instead of ``data['formula_weight']`` to avoid exceptions when this data
    field is not found in the XML file.  URL and/or path of the XML file are
    returned in the dictionary with keys ``url`` and ``path``, respectively.

    Following example downloads data for ligand STI (a.k.a. Gleevec and
    Imatinib) and calculates RMSD between model (X-ray structure 1IEP) and
    ideal (energy minimized) coordinate sets:

    .. ipython:: python

       from prody import *
       ligand_data = fetchPDBLigand('STI')
       ligand_data['model_coordinates_db_code']
       ligand_model = ligand_data['model']
       ligand_ideal = ligand_data['ideal']
       transformation = superpose(ligand_ideal.noh, ligand_model.noh)
       calcRMSD(ligand_ideal.noh, ligand_model.noh)

    :raises TypeError: when *cci* is not a string
    :raises ValueError: when *cci* is neither a file nor a valid identifier
    :raises IOError: when the XML file cannot be retrieved online"""

    if not isinstance(cci, str):
        raise TypeError('cci must be a string')

    folder = None  # set only when XML files are saved in the package folder
    if isfile(cci):
        with openFile(cci) as inp:
            xml = inp.read()
        url = None
        path = cci
        cci = splitext(splitext(split(cci)[1])[0])[0].upper()
    elif len(cci) > 4 or not cci.isalnum():
        raise ValueError('cci must be 3-letters long and alphanumeric or '
                         'a valid filename')
    else:
        xml = None
        cci = cci.upper()
        if SETTINGS.get('ligand_xml_save'):
            folder = join(getPackagePath(), 'pdbligands')
            if not isdir(folder):
                makePath(folder)
            xmlgz = path = join(folder, cci + '.xml.gz')
            if isfile(xmlgz):
                with openFile(xmlgz) as inp:
                    xml = inp.read()
        else:
            path = None
        url = 'http://www.pdb.org/pdb/files/ligand/{0}.xml'.format(cci.upper())
        if not xml:
            try:
                inp = openURL(url)
            except IOError:
                raise IOError('XML file for ligand {0} is not found online'
                              .format(cci))
            else:
                xml = inp.read()
                inp.close()
            if filename:
                # the package folder is used only when it was set up above
                if folder is None:
                    out = openFile(filename, mode='w')
                else:
                    out = openFile(filename, mode='w', folder=folder)
                out.write(xml)
                out.close()
            if SETTINGS.get('ligand_xml_save'):
                with openFile(xmlgz, 'w') as out:
                    out.write(xml)

    # cElementTree is not available in recent Python versions
    try:
        import xml.etree.cElementTree as ET
    except ImportError:
        import xml.etree.ElementTree as ET

    root = ET.XML(xml)
    if (root.get('{http://www.w3.org/2001/XMLSchema-instance}'
                 'schemaLocation') !=
            'http://pdbml.pdb.org/schema/pdbx-v40.xsd pdbx-v40.xsd'):
        LOGGER.warn('XML is not in PDBx/PDBML v 4.0 format, resulting '
                    'dictionary may not contain all data fields')
    ns = root.tag[:root.tag.rfind('}')+1]
    len_ns = len(ns)
    dict_ = {'url': url, 'path': path}

    # general data, tags are stored without namespace and 'pdbx_' prefix
    for child in list(root.find(ns + 'chem_compCategory')[0]):
        tag = child.tag[len_ns:]
        if tag.startswith('pdbx_'):
            tag = tag[5:]
        dict_[tag] = child.text
    weight = dict_.get('formula_weight')
    if weight is not None:
        dict_['formula_weight'] = float(weight)

    # categories are compared with None, truth value of an element only
    # tells whether it has children
    identifiers_and_descriptors = []
    results = root.find(ns + 'pdbx_chem_comp_identifierCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    results = root.find(ns + 'pdbx_chem_comp_descriptorCategory')
    if results is not None:
        identifiers_and_descriptors.extend(results)
    for child in identifiers_and_descriptors:
        program = child.get('program').replace(' ', '_')
        type_ = child.get('type').replace(' ', '_')
        dict_[program + '_' + type_] = child[0].text
        dict_[program + '_version'] = child.get('program_version')

    audits = root.find(ns + 'pdbx_chem_comp_auditCategory')
    dict_['audits'] = [(audit.get('action_type'), audit.get('date'))
                       for audit in (audits if audits is not None else [])]

    atoms = list(root.find(ns + 'chem_comp_atomCategory'))
    n_atoms = len(atoms)
    ideal_coords = np.zeros((n_atoms, 3))
    model_coords = np.zeros((n_atoms, 3))

    atomnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['name'].dtype)
    elements = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['element'].dtype)
    resnames = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['resname'].dtype)
    charges = np.zeros(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)
    # NOTE(review): residue numbers use the dtype of the charge field,
    # confirm whether the resnum field dtype was intended
    resnums = np.ones(n_atoms, dtype=ATOMIC_FIELDS['charge'].dtype)

    alternate_atomnames = np.zeros(n_atoms,
                                   dtype=ATOMIC_FIELDS['name'].dtype)
    leaving_atom_flags = np.zeros(n_atoms, bool)
    aromatic_flags = np.zeros(n_atoms, bool)
    stereo_configs = np.zeros(n_atoms, bool)
    ordinals = np.zeros(n_atoms, int)

    name2index = {}

    for i, atom in enumerate(atoms):
        data = dict([(child.tag[len_ns:], child.text) for child in list(atom)])

        name = data.get('pdbx_component_atom_id', 'X')
        name2index[name] = i
        atomnames[i] = name
        elements[i] = data.get('type_symbol', 'X')
        resnames[i] = data.get('pdbx_component_comp_id', 'UNK')
        charges[i] = float(data.get('charge', 0))

        alternate_atomnames[i] = data.get('alt_atom_id', 'X')
        leaving_atom_flags[i] = data.get('pdbx_leaving_atom_flag') == 'Y'
        aromatic_flags[i] = data.get('pdbx_aromatic_flag') == 'Y'
        stereo_configs[i] = data.get('pdbx_stereo_config') == 'Y'
        ordinals[i] = int(data.get('pdbx_ordinal', 0))

        model_coords[i, 0] = float(data.get('model_Cartn_x', 0))
        model_coords[i, 1] = float(data.get('model_Cartn_y', 0))
        model_coords[i, 2] = float(data.get('model_Cartn_z', 0))
        ideal_coords[i, 0] = float(data.get('pdbx_model_Cartn_x_ideal', 0))
        ideal_coords[i, 1] = float(data.get('pdbx_model_Cartn_y_ideal', 0))
        ideal_coords[i, 2] = float(data.get('pdbx_model_Cartn_z_ideal', 0))

    pdbid = dict_.get('model_coordinates_db_code')
    if pdbid:
        model = AtomGroup(cci + ' model ({0})'.format(pdbid))
    else:
        model = AtomGroup(cci + ' model')
    model.setCoords(model_coords)
    model.setNames(atomnames)
    model.setResnames(resnames)
    model.setResnums(resnums)
    model.setElements(elements)
    model.setCharges(charges)
    model.setFlags('leaving_atom_flags', leaving_atom_flags)
    model.setFlags('aromatic_flags', aromatic_flags)
    model.setFlags('stereo_configs', stereo_configs)
    model.setData('ordinals', ordinals)
    model.setData('alternate_atomnames', alternate_atomnames)
    dict_['model'] = model
    ideal = model.copy()
    ideal.setTitle(cci + ' ideal')
    ideal.setCoords(ideal_coords)
    dict_['ideal'] = ideal

    bonds = []
    warned = set()  # atom names reported already
    bond_category = root.find(ns + 'chem_comp_bondCategory')
    for bond in (list(bond_category) if bond_category is not None else []):
        name_1 = bond.get('atom_id_1')
        name_2 = bond.get('atom_id_2')
        try:
            bonds.append((name2index[name_1], name2index[name_2]))
        except KeyError:
            if name_1 not in warned and name_1 not in name2index:
                warned.add(name_1)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_1), cci))
            if name_2 not in warned and name_2 not in name2index:
                warned.add(name_2)
                LOGGER.warn('{0} specified {1} in bond category is not '
                            'a valid atom name.'.format(repr(name_2), cci))
    if bonds:
        bonds = np.array(bonds, int)
        model.setBonds(bonds)
        ideal.setBonds(bonds)
    return dict_
def psiBlastCycle(sequence=None, filename=None, **kwargs):
    """Run a single cycle of EBI psiblast.  Returns the job identifier
    together with a :class:`PsiBlastRecord` instance that contains the
    results, or only the job identifier when *cycle* is 1.  **None** is
    returned when the search times out.

    :arg sequence: an object with an associated sequence string
        or a sequence string itself
    :type sequence: :class:`Atomic`, :class:`Sequence`, or str

    :arg filename: a *filename* to save the results in XML format
    :type filename: str

    The following search parameters can be adjusted by the user.
    We use the same default values as
    http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/
    wherever applicable.

    :arg email: email address for reporting problems
        default is [email protected]
    :type email: str with an @ before a .

    :arg matrix: The comparison matrix to be used to score alignments when
        searching the database
        possible values are 'BLOSUM45', 'BLOSUM62', 'BLOSUM80', 'PAM30' and
        'PAM70'
        default is 'BLOSUM62'
    :type matrix: str

    :arg gapopen: Penalty taken away from the score when a gap is created in
        sequence alignments.  Increasing the gap opening penalty will
        decrease the number of gaps in the final alignment.
        Possible values range from 8 to 16 inclusive, default is 11
    :type gapopen: int

    :arg gapext: Penalty taken away from the score for each base or residue
        in the gap.  Increasing the gap extension penalty favors short gaps
        in the final alignment, conversely decreasing the gap extension
        penalty favors long gaps in the final alignment.
        Possible values range from 0 to 3, default is 1
    :type gapext: int

    :arg expthr: Expectation threshold that limits the number of scores and
        alignments reported.  This is the maximum number of times the match
        is expected to occur by chance.
        Possible values are 1.0e-200, 1.0e-100, 1.0e-50, 1.0e-10, 1.0e-5,
        1.0e-4, 1.0e-3, 1.0e-2, 0.1, 1.0, 10.0, 100, 1000
        default is 10.0
    :type expthr: float

    :arg psithr: Expectation value threshold for automatic selection of
        matched sequences for inclusion in the PSSM at each iteration.
        Possible values are 1.0e-6, 1.0e-5, 1.0e-4, 2.0e-4, 5.0e-4, 1.0e-3,
        2.0e-3, 5.0e-3, 1.0e-2, 2.0e-2, 0.1, 0.3, 0.5, 1.0, 3.0, 10.0
        default is 1.0e-3
    :type psithr: float

    :arg scores: Maximum number of match score summaries reported in the
        result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type scores: int

    :arg alignments: Maximum number of match alignments reported in the
        result output.
        Possible values are 5, 10, 20, 50, 100, 200, 500, 750, 1000, or 5000
        Default is 500
    :type alignments: int

    :arg dropoff: The amount a score can drop before extension of word hits
        is halted
        Possible values are 0, 2, 4, 6, 8, 10, 15, 20, 25, or 30
        Default is 15
    :type dropoff: int

    :arg finaldropoff: Dropoff value for final gapped alignment
        Possible values are 10, 12, 14, 16, 18, 20, 22, 24, 25, 26, 28, or 30
        Default is 25
    :type finaldropoff: int

    :arg filter: Filter regions of low sequence complexity.  This can avoid
        issues with low complexity sequences where matches are found due to
        composition rather than meaningful sequence similarity.  However, in
        some cases filtering also masks regions of interest and so should be
        used with caution.
        Possible values are T and F, default is F
    :type filter: str

    :arg seqrange: Specify a range or section of the input sequence to use
        in the search.  Example: Specifying '34-89' in an input sequence of
        total length 100, will tell BLAST to only use residues 34 to 89,
        inclusive.
    :type seqrange: str of form START-END

    :arg database: a database name from those available.  See
        http://www.ebi.ac.uk/Tools/services/rest/psiblast/parameterdetails/database
        default is pdb
    :type database: str

    :arg previousjobid: The job identifier for the previous PSI-BLAST
        iteration.
        default is None
        You can change this if you want to continue from a previous run
    :type previousjobid: str

    :arg selectedHits: Name of a file containing a list of identifiers of
        the hits from the previous iteration to use to construct the search
        PSSM for this iteration.
        default is None
    :type selectedHits: str

    :arg cpfile: Name of a Checkpoint file from the previous iteration.
        default is None
    :type cpfile: str

    :arg sleep: how long to wait to reconnect for status
        Sleep time is multiplied by 1.5 when results are not ready.
        default is 2 seconds
    :type sleep: float

    :arg timeout: when to give up waiting for the results
        default is 120 seconds
    :type timeout: float

    :arg cycle: cycle number
    :type cycle: int
    """

    cycle = kwargs.get('cycle',0)

    if sequence == 'runexample':
        sequence = ('ASFPVEILPFLYLGCAKDSTNLDVLEEFGIKYILNVTPNLPNLFENAGEFKYKQIPI'
                    'SDHWSQNLSQFFPEAISFIDEARGKNCGVLVHSLAGISRSVTVTVAYLMQKLNLSMN'
                    'DAYDIVKMKKSNISPNFNFMGQLLDFERTL')

    elif isinstance(sequence, Atomic):
        sequence = sequence.calpha.getSequence()

    elif isinstance(sequence, Sequence):
        sequence = str(sequence)

    elif isinstance(sequence, str):
        # short strings are taken as PDB (and chain) identifiers
        if len(sequence) in [4, 5, 6]:
            ag = parsePDB(sequence)
            sequence = ag.calpha.getSequence()
        sequence = ''.join(sequence.split())

    elif sequence is None:
        if cycle == 0:
            cycle = 1
    else:
        raise TypeError('sequence must be Atomic, Sequence, or str not {0}'
                        .format(type(sequence)))

    if cycle == 0:
        query = [('sequence', sequence)]
    else:
        query = []

    email = kwargs.get('email','*****@*****.**')
    if not isinstance(email, str):
        raise TypeError('email must be a string')
    elif email.find('@') == -1 or email.find('.') == -1 or len(email.split('@')) != 2:
        raise ValueError('email must be a valid email address with at least one . and exactly one @ sign')
    elif not email.find('@') < email.find(email.split('.')[-1]):
        raise ValueError('email must be a valid email address with a . after the @ sign')
    query.append(('email', email))
    query.append(('title', 'ProDy psiBlastPDB request'))

    # strings are compared by value, identity of equal strings is not
    # guaranteed
    previousjobid = kwargs.get('previousjobid','')
    if previousjobid != '':
        query.append(('previousjobid',previousjobid))

    selectedHits = kwargs.get('selectedHits','')
    if selectedHits != '':
        query.append(('selectedHits',selectedHits))

    # validated search parameters, added to the query in this order
    for name, default in (('database', 'pdb'),
                          ('matrix', 'BLOSUM62'),
                          ('gapopen', 11),
                          ('gapext', 1),
                          ('expthr', 10.),
                          ('psithr', 1.0e-3),
                          ('scores', 500),
                          ('alignments', 500)):
        value = kwargs.get(name, default)
        checkPsiBlastParameter(name, value)
        query.append((name, value))

    query.append(('alignView',0))

    for name, default in (('dropoff', 15),
                          ('finaldropoff', 25),
                          ('filter', 'F')):
        value = kwargs.get(name, default)
        checkPsiBlastParameter(name, value)
        query.append((name, value))

    if previousjobid == '' and selectedHits == '':
        seqrange = kwargs.get('seqrange', None)
        if seqrange is None:
            seqrange = '0-' + str(len(sequence))
        elif not isinstance(seqrange, str):
            raise TypeError('seqrange should be a string')
        elif len(seqrange.split('-')) != 2:
            raise ValueError('seqrange should take the form START-END')
        try:
            start, end = seqrange.split('-')
            int(start)
            int(end)
        except ValueError:
            raise ValueError('seqrange should be START-END with START and END being integers')
        query.append(('seqrange',seqrange))

    headers = { 'User-Agent' : 'ProDy' }

    try:
        import urllib.parse
        urlencode = lambda data: bytes(urllib.parse.urlencode(data), 'utf-8')
    except ImportError:
        from urllib import urlencode

    sleep = float(kwargs.pop('sleep', 2))
    timeout = float(kwargs.pop('timeout', 120))

    data = urlencode(query)

    # submit the job
    base_url = 'http://www.ebi.ac.uk/Tools/services/rest/psiblast/'
    url = base_url + 'run/'
    LOGGER.timeit('_prody_psi-blast')
    if cycle == 0:
        LOGGER.info('PSI-Blast searching PDB database for "{0}..."'
                    .format(sequence[:5]))
    else:
        LOGGER.info('PSI-Blast searching PDB database, cycle={0}'
                    .format(cycle))

    handle = openURL(url, data=data, headers=headers)
    job_id = handle.read()
    handle.close()

    # check the status
    url = base_url + 'status/' + job_id
    handle = openURL(url)
    status = handle.read()
    handle.close()

    # keep checking the status until it's no longer running
    while status == 'RUNNING':
        LOGGER.sleep(int(sleep), 'to reconnect to EBI for status.')
        LOGGER.write('Connecting to EBI for status...')
        handle = openURL(url)
        status = handle.read()
        handle.close()
        LOGGER.clear()
        sleep = int(sleep * 1.5)
        if LOGGER.timing('_prody_psi-blast') > timeout:
            LOGGER.warn('PSI-Blast search time out.')
            return None

    LOGGER.info('The status is {0}'.format(status))
    LOGGER.clear()
    LOGGER.report('PSI-Blast search completed in %.1fs.', '_prody_psi-blast')

    if cycle != 1:
        # get the results
        url = base_url + 'result/' + job_id + '/xml'
        handle = openURL(url)
        results = handle.read()
        handle.close()

        try:
            ext_xml = filename.lower().endswith('.xml')
        except AttributeError:
            # no filename is given, results are not saved
            pass
        else:
            if not ext_xml:
                filename += '.xml'
            with open(filename, 'w') as f_out:
                f_out.write(results)
            LOGGER.info('Results are saved as {0}.'.format(repr(filename)))

        return job_id, PsiBlastRecord(results, sequence)
    else:
        return job_id
def writePDBStream(stream, atoms, csets=None, **kwargs):
    """Write *atoms* in PDB format to a *stream*.

    :arg stream: anything that implements a :meth:`write` method (e.g. file,
        buffer, stdout)

    :arg atoms: object holding the atomic data, it must provide coordinates
        through a ``_getCoordsets`` or ``_getCoords`` method and either be
        an Atomic-like object itself or give one via ``getAtoms``

    :arg csets: passed as is to ``atoms._getCoordsets`` to pick the
        coordinate sets that are written (presumably coordinate set indices)

    :arg renumber: whether to renumber atoms with serial indices
        Default is **True**
    :type renumber: bool

    :arg occupancy: values written to the occupancy column instead of those
        stored in *atoms*, length must be equal to number of atoms

    :arg beta: values written to the temperature factor column instead of
        those stored in *atoms*, length must be equal to number of atoms

    :arg secondary: whether to write HELIX and SHEET records when secondary
        structure data is available, default is **True**
    :type secondary: bool

    :raises TypeError: when *atoms* does not provide the required methods
    :raises ValueError: when coordinates or atom names are missing, or when
        *occupancy*/*beta* do not match the number of atoms
    """

    renumber = kwargs.get('renumber', True)

    remark = str(atoms)
    try:
        coordsets = atoms._getCoordsets(csets)
    except AttributeError:
        try:
            coordsets = atoms._getCoords()
        except AttributeError:
            raise TypeError('atoms must be an object with coordinate sets')
        if coordsets is not None:
            coordsets = [coordsets]
    else:
        # test for None first, otherwise a missing coordinate set surfaces
        # as an AttributeError on ``ndim`` instead of the ValueError below
        if coordsets is not None and coordsets.ndim == 2:
            coordsets = [coordsets]
    if coordsets is None:
        raise ValueError('atoms does not have any coordinate sets')

    try:
        # the call only verifies the interface, its result is not needed
        atoms.getACSIndex()
    except AttributeError:
        try:
            atoms = atoms.getAtoms()
        except AttributeError:
            raise TypeError('atoms must be an Atomic instance or an object '
                            'with `getAtoms` method')
        else:
            if atoms is None:
                raise ValueError('atoms is not associated with an Atomic '
                                 'instance')
            try:
                atoms.getACSIndex()
            except AttributeError:
                raise TypeError('atoms does not have a valid type')

    try:
        atoms.getIndex()
    except AttributeError:
        pass
    else:
        # objects exposing ``getIndex`` are replaced by a selection
        # (presumably these are single atoms -- confirm against callers)
        atoms = atoms.select('all')

    n_atoms = atoms.numAtoms()

    occupancy = kwargs.get('occupancy')
    if occupancy is None:
        occupancies = atoms._getOccupancies()
        if occupancies is None:
            occupancies = np.zeros(n_atoms, float)
    else:
        occupancies = np.array(occupancy)
        if len(occupancies) != n_atoms:
            raise ValueError('len(occupancy) must be equal to number of atoms')

    beta = kwargs.get('beta')
    if beta is None:
        bfactors = atoms._getBetas()
        if bfactors is None:
            bfactors = np.zeros(n_atoms, float)
    else:
        bfactors = np.array(beta)
        if len(bfactors) != n_atoms:
            raise ValueError('len(beta) must be equal to number of atoms')

    atomnames = atoms.getNames()
    if atomnames is None:
        raise ValueError('atom names are not set')
    # names shorter than four characters are shifted right by one column
    for i, an in enumerate(atomnames):
        if len(an) < 4:
            atomnames[i] = ' ' + an

    # dtype character of string arrays on this interpreter
    s_or_u = np.array(['a']).dtype.char

    altlocs = atoms._getAltlocs()
    if altlocs is None:
        altlocs = np.zeros(n_atoms, s_or_u + '1')

    resnames = atoms._getResnames()
    if resnames is None:
        # an array (not a list) so that boolean masks used for HELIX/SHEET
        # records can index it
        resnames = np.array(['UNK'] * n_atoms, s_or_u + '3')

    chainids = atoms._getChids()
    if chainids is None:
        chainids = np.zeros(n_atoms, s_or_u + '1')

    resnums = atoms._getResnums()
    if resnums is None:
        resnums = np.ones(n_atoms, int)

    serials = atoms._getSerials()
    if serials is None or renumber:
        serials = np.arange(n_atoms, dtype=int) + 1

    icodes = atoms._getIcodes()
    if icodes is None:
        icodes = np.zeros(n_atoms, s_or_u + '1')

    hetero = ['ATOM'] * n_atoms
    heteroflags = atoms._getFlags('hetatm')
    if heteroflags is None:
        heteroflags = atoms._getFlags('hetero')
    if heteroflags is not None:
        hetero = np.array(hetero, s_or_u + '6')
        hetero[heteroflags] = 'HETATM'

    elements = atoms._getElements()
    if elements is None:
        elements = np.zeros(n_atoms, s_or_u + '1')
    else:
        elements = np.char.rjust(elements, 2)

    segments = atoms._getSegnames()
    if segments is None:
        segments = np.zeros(n_atoms, s_or_u + '6')

    # write remarks
    stream.write('REMARK {0}\n'.format(remark))

    # write secondary structures (if any)
    secondary = kwargs.get('secondary', True)
    secstrs = atoms._getSecstrs()
    if secstrs is not None and secondary:
        secindices = atoms._getSecindices()
        secclasses = atoms._getSecclasses()
        secids = atoms._getSecids()

        # write helices
        for i in range(1, max(secindices) + 1):
            torf = np.logical_and(isHelix(secstrs), secindices == i)
            if torf.any():
                helix_resnums = resnums[torf]
                helix_chainids = chainids[torf]
                helix_resnames = resnames[torf]
                helix_secclasses = secclasses[torf]
                helix_secids = secids[torf]
                helix_icodes = icodes[torf]
                length = helix_resnums[-1] - helix_resnums[0] + 1

                stream.write(HELIXLINE.format(
                    serNum=i, helixID=helix_secids[0],
                    initResName=helix_resnames[0],
                    initChainID=helix_chainids[0],
                    initSeqNum=helix_resnums[0],
                    initICode=helix_icodes[0],
                    endResName=helix_resnames[-1],
                    endChainID=helix_chainids[-1],
                    endSeqNum=helix_resnums[-1],
                    endICode=helix_icodes[-1],
                    helixClass=helix_secclasses[0], length=length))

        # write strands
        torf_all_sheets = isSheet(secstrs)
        sheet_secids = secids[torf_all_sheets]

        for sheet_id in np.unique(sheet_secids):
            torf_strands_in_sheet = np.logical_and(torf_all_sheets,
                                                   secids == sheet_id)
            strand_indices = secindices[torf_strands_in_sheet]
            numStrands = len(np.unique(strand_indices))

            for i in np.unique(strand_indices):
                torf_strand = np.logical_and(torf_strands_in_sheet,
                                             secindices == i)
                strand_resnums = resnums[torf_strand]
                strand_chainids = chainids[torf_strand]
                strand_resnames = resnames[torf_strand]
                strand_secclasses = secclasses[torf_strand]
                strand_icodes = icodes[torf_strand]

                stream.write(SHEETLINE.format(
                    strand=i, sheetID=sheet_id, numStrands=numStrands,
                    initResName=strand_resnames[0],
                    initChainID=strand_chainids[0],
                    initSeqNum=strand_resnums[0],
                    initICode=strand_icodes[0],
                    endResName=strand_resnames[-1],
                    endChainID=strand_chainids[-1],
                    endSeqNum=strand_resnums[-1],
                    endICode=strand_icodes[-1],
                    sense=strand_secclasses[0]))

    # write atoms
    multi = len(coordsets) > 1
    write = stream.write
    for m, coords in enumerate(coordsets):
        # every model starts with the regular line format
        pdbline = PDBLINE_LT100K
        if multi:
            write('MODEL{0:9d}\n'.format(m+1))
        for i, xyz in enumerate(coords):
            # switch line format once, when the serial numbers get too large
            if pdbline != PDBLINE_GE100K and (i == MAX_N_ATOM or
                                              serials[i] > MAX_N_ATOM):
                LOGGER.warn('Indices are exceeding 99999 and hexadecimal format is being used')
                pdbline = PDBLINE_GE100K
            write(pdbline % (hetero[i], serials[i],
                             atomnames[i], altlocs[i],
                             resnames[i], chainids[i], resnums[i],
                             icodes[i],
                             xyz[0], xyz[1], xyz[2],
                             occupancies[i], bfactors[i],
                             segments[i], elements[i]))
        if multi:
            write('ENDMDL\n')
            # alternate location labels are blanked for the models that follow
            altlocs = np.zeros(n_atoms, s_or_u + '1')
def _parsePDBLines(atomgroup, lines, split, model, chain, subset,
                   altloc_torf, format='PDB'):
    """Returns an AtomGroup. See also :func:`.parsePDBStream()`.

    :arg atomgroup: atom group that is filled with the parsed data, when it
        already has atoms only coordinate sets are expected to match in size

    :arg lines: PDB/PQR lines

    :arg split: starting index for coordinate data lines

    :arg model: when not **None**, parsing stops at the first ``END``/
        ``ENDMDL`` record met after atoms were read; for PDB input a value
        other than 1 also skips lines up to that ``MODEL`` record

    :arg chain: when not **None**, only atoms whose chain identifier is
        contained in *chain* are kept

    :arg subset: ``'ca'`` or ``'bb'`` to keep only matching atom names in
        amino acid residues, any false value keeps all atoms

    :arg altloc_torf: a string lists the alternate location identifiers to
        parse; for any other value blank and ``'A'`` locations are parsed
        and the remaining ones are handed to :func:`_evalAltlocs`

    :arg format: ``'PDB'`` (case insensitive), any other value makes lines
        be read as whitespace separated PQR fields

    :raises PDBParseError: when requested *model* is not found or when
        coordinates cannot be parsed
    :raises ValueError: when number of parsed atoms does not match number of
        atoms in *atomgroup*"""

    format = format.upper()
    if format == 'PDB':
        isPDB = True
    else:
        isPDB = False

    if subset:
        if subset == 'ca':
            subset = set(('CA',))
        elif subset in 'bb':
            # substring test, so both 'b' and 'bb' select the backbone
            subset = flags.BACKBONE
        only_subset = True
        protein_resnames = flags.AMINOACIDS
    else:
        only_subset = False
    if chain is None:
        only_chains = False
    else:
        only_chains = True
    # becomes True after the first model, later models give coordinates only
    onlycoords = False
    n_atoms = atomgroup.numAtoms()
    if n_atoms > 0:
        asize = n_atoms
    else:
        # there cannot be more atoms than remaining lines
        asize = len(lines) - split
    addcoords = False
    if atomgroup.numCoordsets() > 0:
        addcoords = True
    alength = asize
    coordinates = np.zeros((asize, 3), dtype=float)
    atomnames = np.zeros(asize, dtype=ATOMIC_FIELDS['name'].dtype)
    resnames = np.zeros(asize, dtype=ATOMIC_FIELDS['resname'].dtype)
    resnums = np.zeros(asize, dtype=ATOMIC_FIELDS['resnum'].dtype)
    chainids = np.zeros(asize, dtype=ATOMIC_FIELDS['chain'].dtype)
    hetero = np.zeros(asize, dtype=bool)
    termini = np.zeros(asize, dtype=bool)
    altlocs = np.zeros(asize, dtype=ATOMIC_FIELDS['altloc'].dtype)
    icodes = np.zeros(asize, dtype=ATOMIC_FIELDS['icode'].dtype)
    serials = np.zeros(asize, dtype=ATOMIC_FIELDS['serial'].dtype)
    charges = np.zeros(asize, dtype=ATOMIC_FIELDS['charge'].dtype)
    if isPDB:
        segnames = np.zeros(asize, dtype=ATOMIC_FIELDS['segment'].dtype)
        elements = np.zeros(asize, dtype=ATOMIC_FIELDS['element'].dtype)
        bfactors = np.zeros(asize, dtype=ATOMIC_FIELDS['beta'].dtype)
        occupancies = np.zeros(asize, dtype=ATOMIC_FIELDS['occupancy'].dtype)
        # allocated lazily, when the first ANISOU/SIGUIJ record is met
        anisou = None
        siguij = None
    else:
        radii = np.zeros(asize, dtype=ATOMIC_FIELDS['radius'].dtype)

    asize = 2000  # increase array length by this much when needed

    start = split
    stop = len(lines)
    nmodel = 0
    # if a specific model is requested, skip lines until that one
    if isPDB and model is not None and model != 1:
        for i in range(split, len(lines)):
            if lines[i][:5] == 'MODEL':
                nmodel += 1
                if model == nmodel:
                    start = i+1
                    stop = len(lines)
                    break
        if nmodel != model:
            raise PDBParseError('model {0} is not found'.format(model))
    if isinstance(altloc_torf, str):
        if altloc_torf.strip() != 'A':
            LOGGER.info('Parsing alternate locations {0}.'
                        .format(altloc_torf))
            which_altlocs = ' ' + ''.join(altloc_torf.split())
        else:
            which_altlocs = ' A'
        altloc_torf = False
    else:
        which_altlocs = ' A'
        altloc_torf = True

    acount = 0
    coordsets = None
    # lines of skipped alternate locations, keyed by their identifier
    altloc = defaultdict(list)
    i = start
    END = False
    while i < stop:
        line = lines[i]
        if not isPDB:
            fields = line.split()
            if len(fields) == 10:
                # ten fields are taken as a line without chain identifier
                fields.insert(4, '')
            elif len(fields) != 11:
                LOGGER.warn('wrong number of fields for PQR format at line %d'%i)
                i += 1
                continue

        if isPDB:
            startswith = line[0:6].strip()
        else:
            startswith = fields[0]

        if startswith == 'ATOM' or startswith == 'HETATM':
            if isPDB:
                atomname = line[12:16].strip()
                resname = line[17:21].strip()
            else:
                atomname = fields[2]
                resname = fields[3]

            if only_subset:
                if not (atomname in subset and resname in protein_resnames):
                    i += 1
                    continue

            if isPDB:
                chid = line[21]
            else:
                chid = fields[4]

            if only_chains:
                if not chid in chain:
                    i += 1
                    continue

            if isPDB:
                alt = line[16]
                if alt not in which_altlocs:
                    # keep the line for evaluation after the model is parsed
                    altloc[alt].append((line, i))
                    i += 1
                    continue
            else:
                alt = ' '
            try:
                if isPDB:
                    coordinates[acount, 0] = line[30:38]
                    coordinates[acount, 1] = line[38:46]
                    coordinates[acount, 2] = line[46:54]
                else:
                    coordinates[acount, 0] = fields[6]
                    coordinates[acount, 1] = fields[7]
                    coordinates[acount, 2] = fields[8]
            except Exception:
                # either text is not a number or acount is past the array end
                if acount >= n_atoms > 0:
                    if nmodel == 0:
                        raise ValueError(format + ' file and AtomGroup ag must '
                                         'have same number of atoms')
                    LOGGER.warn('Discarding model {0}, which contains {1} more '
                                'atoms than first model does.'
                                .format(nmodel+1,acount-n_atoms+1))
                    acount = 0
                    nmodel += 1
                    coordinates = np.zeros((n_atoms, 3), dtype=float)
                    if isPDB:
                        # jump to the end of the discarded model
                        while lines[i][:6] != 'ENDMDL':
                            i += 1
                else:
                    raise PDBParseError('invalid or missing coordinate(s) at '
                                        'line {0}'.format(i+1))
            if onlycoords:
                acount += 1
                i += 1
                continue

            try:
                serials[acount] = int(line[6:11]) if isPDB else int(fields[1])
            except ValueError:
                try:
                    # serial numbers may be written in hexadecimal format
                    serials[acount] = int(line[6:11], 16) if isPDB else int(fields[1], 16)
                except ValueError:
                    LOGGER.warn('failed to parse serial number in line {0}'
                                .format(i))
                    serials[acount] = serials[acount-1]+1
            altlocs[acount] = alt
            atomnames[acount] = atomname
            resnames[acount] = resname
            chainids[acount] = chid
            if isPDB:
                resnums[acount] = line[22:26]
                icodes[acount] = line[26]
            else:
                resnum = fields[5]
                if resnum[-1].isalpha():
                    icode = resnum[-1]
                else:
                    icode = ' '
                resnums[acount] = resnum
                icodes[acount] = icode

            if isPDB:
                try:
                    occupancies[acount] = line[54:60]
                except Exception:
                    LOGGER.warn('failed to parse occupancy at line {0}'
                                .format(i))
                try:
                    bfactors[acount] = line[60:66]
                except Exception:
                    LOGGER.warn('failed to parse beta-factor at line {0}'
                                .format(i))
                hetero[acount] = startswith[0] == 'H'
                segnames[acount] = line[72:76]
                elements[acount] = line[76:78]
                try:
                    # the two characters are swapped before conversion, so
                    # text such as '2-' is read as int('-2')
                    charges[acount] = int(line[79] + line[78])
                except Exception:
                    charges[acount] = 0
            else:
                try:
                    charges[acount] = fields[9]
                except Exception:
                    LOGGER.warn('failed to parse charge at line {0}'
                                .format(i))
                try:
                    radii[acount] = fields[10]
                except Exception:
                    LOGGER.warn('failed to parse radius at line {0}'
                                .format(i))
            acount += 1
            if n_atoms == 0 and acount >= alength:
                # if arrays are short extend them with zeros
                alength += asize
                coordinates = np.concatenate(
                    (coordinates, np.zeros((asize, 3), float)))
                atomnames = np.concatenate((atomnames,
                    np.zeros(asize, ATOMIC_FIELDS['name'].dtype)))
                resnames = np.concatenate((resnames,
                    np.zeros(asize, ATOMIC_FIELDS['resname'].dtype)))
                resnums = np.concatenate((resnums,
                    np.zeros(asize, ATOMIC_FIELDS['resnum'].dtype)))
                chainids = np.concatenate((chainids,
                    np.zeros(asize, ATOMIC_FIELDS['chain'].dtype)))
                hetero = np.concatenate((hetero, np.zeros(asize, bool)))
                termini = np.concatenate((termini, np.zeros(asize, bool)))
                altlocs = np.concatenate((altlocs,
                    np.zeros(asize, ATOMIC_FIELDS['altloc'].dtype)))
                icodes = np.concatenate((icodes,
                    np.zeros(asize, ATOMIC_FIELDS['icode'].dtype)))
                serials = np.concatenate((serials,
                    np.zeros(asize, ATOMIC_FIELDS['serial'].dtype)))
                if isPDB:
                    bfactors = np.concatenate((bfactors,
                        np.zeros(asize, ATOMIC_FIELDS['beta'].dtype)))
                    occupancies = np.concatenate((occupancies,
                        np.zeros(asize, ATOMIC_FIELDS['occupancy'].dtype)))
                    segnames = np.concatenate((segnames,
                        np.zeros(asize, ATOMIC_FIELDS['segment'].dtype)))
                    elements = np.concatenate((elements,
                        np.zeros(asize, ATOMIC_FIELDS['element'].dtype)))
                    if anisou is not None:
                        anisou = np.concatenate((anisou, np.zeros((asize, 6),
                            ATOMIC_FIELDS['anisou'].dtype)))
                    if siguij is not None:
                        siguij = np.concatenate((siguij, np.zeros((asize, 6),
                            ATOMIC_FIELDS['siguij'].dtype)))
                else:
                    charges = np.concatenate((charges,
                        np.zeros(asize, ATOMIC_FIELDS['charge'].dtype)))
                    radii = np.concatenate((radii,
                        np.zeros(asize, ATOMIC_FIELDS['radius'].dtype)))
        #elif startswith == 'END ' or startswith == 'CONECT':
        #    i += 1
        #    break
        elif not onlycoords and (startswith == 'TER ' or
                                 startswith.strip() == 'TER'):
            # flag the last parsed atom as a chain terminus
            termini[acount - 1] = True
        elif startswith == 'ENDMDL' or startswith[:3] == 'END':
            if acount == 0:
                # If there is no atom record between ENDMDL & END skip to next
                i += 1
                continue
            if model is not None:
                i += 1
                break
            diff = stop - i - 1
            # remaining lines cannot hold another model of the same size
            END = diff < acount
            if coordsets is not None:
                END = END or nmodel >= coordsets.shape[0]
            if onlycoords:
                if acount < n_atoms:
                    LOGGER.warn('Discarding model {0}, which contains '
                                '{1} fewer atoms than the first model '
                                'does.'.format(nmodel+1, n_atoms-acount))
                else:
                    coordsets[nmodel] = coordinates
                    nmodel += 1
                acount = 0
                if not END:
                    coordinates = coordsets[nmodel]
            else:
                if acount != n_atoms > 0:
                    raise ValueError('PDB file and AtomGroup ag must have '
                                     'same number of atoms')
                # this is where to decide if more coordsets should be expected
                if END:
                    coordinates.resize((acount, 3), refcheck=False)
                    if addcoords:
                        atomgroup.addCoordset(coordinates)
                    else:
                        atomgroup._setCoords(coordinates)
                else:
                    # room for the number of models the remaining lines may hold
                    coordsets = np.zeros((int(diff//acount+1), acount, 3))
                    coordsets[0] = coordinates[:acount]
                    onlycoords = True
                atomnames.resize(acount, refcheck=False)
                resnames.resize(acount, refcheck=False)
                resnums.resize(acount, refcheck=False)
                chainids.resize(acount, refcheck=False)
                hetero.resize(acount, refcheck=False)
                termini.resize(acount, refcheck=False)
                altlocs.resize(acount, refcheck=False)
                icodes.resize(acount, refcheck=False)
                serials.resize(acount, refcheck=False)
                if not only_subset:
                    atomnames = np.char.strip(atomnames)
                    resnames = np.char.strip(resnames)
                atomgroup.setNames(atomnames)
                atomgroup.setResnames(resnames)
                atomgroup.setResnums(resnums)
                atomgroup.setChids(chainids)
                atomgroup.setFlags('hetatm', hetero)
                atomgroup.setFlags('pdbter', termini)
                atomgroup.setAltlocs(altlocs)
                atomgroup.setIcodes(np.char.strip(icodes))
                atomgroup.setSerials(serials)
                if isPDB:
                    bfactors.resize(acount, refcheck=False)
                    occupancies.resize(acount, refcheck=False)
                    segnames.resize(acount, refcheck=False)
                    elements.resize(acount, refcheck=False)
                    atomgroup.setBetas(bfactors)
                    atomgroup.setOccupancies(occupancies)
                    atomgroup.setSegnames(np.char.strip(segnames))
                    atomgroup.setElements(np.char.strip(elements))
                    from prody.utilities.misctools import getMasses
                    atomgroup.setMasses(getMasses(np.char.strip(elements)))
                    if anisou is not None:
                        anisou.resize((acount, 6), refcheck=False)
                        atomgroup.setAnisous(anisou / 10000)
                    if siguij is not None:
                        siguij.resize((acount, 6), refcheck=False)
                        atomgroup.setAnistds(siguij / 10000)
                else:
                    charges.resize(acount, refcheck=False)
                    radii.resize(acount, refcheck=False)
                    atomgroup.setCharges(charges)
                    atomgroup.setRadii(radii)

                nmodel += 1
                n_atoms = acount
                acount = 0
                coordinates = np.zeros((n_atoms, 3), dtype=float)
                if altloc and altloc_torf:
                    _evalAltlocs(atomgroup, altloc, chainids, resnums,
                                 resnames, atomnames)
                    altloc = defaultdict(list)
                if END:
                    break
        elif isPDB and startswith == 'ANISOU':
            if anisou is None:
                anisou = np.zeros((alength, 6),
                                  dtype=ATOMIC_FIELDS['anisou'].dtype)
            try:
                # record belongs to the atom parsed just before it
                index = acount - 1
                anisou[index, 0] = line[28:35]
                anisou[index, 1] = line[35:42]
                # U(3,3) is in columns 43-49, the slice must start at 42 or
                # the leading digit/sign of the value is lost
                anisou[index, 2] = line[42:49]
                anisou[index, 3] = line[49:56]
                anisou[index, 4] = line[56:63]
                anisou[index, 5] = line[63:70]
            except Exception:
                LOGGER.warn('failed to parse anisotropic temperature '
                            'factors at line {0}'.format(i))
        elif isPDB and startswith =='SIGUIJ':
            if siguij is None:
                siguij = np.zeros((alength, 6),
                                  dtype=ATOMIC_FIELDS['siguij'].dtype)
            try:
                index = acount - 1
                siguij[index, 0] = line[28:35]
                siguij[index, 1] = line[35:42]
                # same column layout as ANISOU records
                siguij[index, 2] = line[42:49]
                siguij[index, 3] = line[49:56]
                siguij[index, 4] = line[56:63]
                siguij[index, 5] = line[63:70]
            except Exception:
                LOGGER.warn('failed to parse standard deviations of '
                            'anisotropic temperature factors at line {0}'.format(i))
        elif startswith =='SIGATM':
            pass
        i += 1
    if onlycoords:
        # store the last model if it is complete and drop unused slots
        if acount == atomgroup.numAtoms():
            coordsets[nmodel] = coordinates
            nmodel += 1
        del coordinates
        coordsets.resize((nmodel, atomgroup.numAtoms(), 3), refcheck=False)
        if addcoords:
            atomgroup.addCoordset(coordsets)
        else:
            atomgroup._setCoords(coordsets)
    elif not END:
        # this means last line was an ATOM line, so atomgroup is not decorated
        coordinates.resize((acount, 3), refcheck=False)
        if addcoords:
            atomgroup.addCoordset(coordinates)
        else:
            atomgroup._setCoords(coordinates)
        atomnames.resize(acount, refcheck=False)
        resnames.resize(acount, refcheck=False)
        resnums.resize(acount, refcheck=False)
        chainids.resize(acount, refcheck=False)
        hetero.resize(acount, refcheck=False)
        termini.resize(acount, refcheck=False)
        altlocs.resize(acount, refcheck=False)
        icodes.resize(acount, refcheck=False)
        serials.resize(acount, refcheck=False)
        if not only_subset:
            atomnames = np.char.strip(atomnames)
            resnames = np.char.strip(resnames)
        atomgroup.setNames(atomnames)
        atomgroup.setResnames(resnames)
        atomgroup.setResnums(resnums)
        atomgroup.setChids(chainids)
        atomgroup.setFlags('hetatm', hetero)
        atomgroup.setFlags('pdbter', termini)
        atomgroup.setAltlocs(altlocs)
        atomgroup.setIcodes(np.char.strip(icodes))
        atomgroup.setSerials(serials)
        if isPDB:
            if anisou is not None:
                anisou.resize((acount, 6), refcheck=False)
                atomgroup.setAnisous(anisou / 10000)
            if siguij is not None:
                siguij.resize((acount, 6), refcheck=False)
                atomgroup.setAnistds(siguij / 10000)
            bfactors.resize(acount, refcheck=False)
            occupancies.resize(acount, refcheck=False)
            segnames.resize(acount, refcheck=False)
            elements.resize(acount, refcheck=False)
            atomgroup.setSegnames(np.char.strip(segnames))
            atomgroup.setElements(np.char.strip(elements))
            from prody.utilities.misctools import getMasses
            atomgroup.setMasses(getMasses(np.char.strip(elements)))
            atomgroup.setBetas(bfactors)
            atomgroup.setOccupancies(occupancies)
        else:
            charges.resize(acount, refcheck=False)
            radii.resize(acount, refcheck=False)
            atomgroup.setCharges(charges)
            atomgroup.setRadii(radii)

    if altloc and altloc_torf:
        _evalAltlocs(atomgroup, altloc, chainids, resnums, resnames, atomnames)

    return atomgroup