def prody_pca(coords, **kwargs):
    """Perform PCA calculations for PDB or DCD format *coords* file.

    Options missing from *kwargs* are filled in from the module-level
    ``DEFAULTS`` mapping before anything else happens.

    :arg coords: path to the coordinate file.  When its extension (after
        stripping an optional ``.gz``) is ``.dcd`` it is opened with
        :class:`prody.DCDFile`; anything else is given to
        :func:`prody.parsePDB`.

    Keyword arguments read by this function: ``outdir``, ``prefix``,
    ``nmodes``, ``select``, ``psf``, ``pdb``, ``aligned``, ``extend``,
    ``outnpz``, ``outall``, ``outeig``, ``outcov``, ``outcc``, ``outhm``,
    ``outsf``, ``outproj``, ``numdelim``, ``numext``, ``numformat``,
    ``figall``, ``figcc``, ``figsf``, ``figproj``, ``figformat``,
    ``figwidth``, ``figheight`` and ``figdpi``.

    :raises IOError: when ``outdir`` is not an existing directory
    :raises ValueError: when the input has fewer than two frames/models,
        when the selection matches no atoms, or when the selection size does
        not match the number of atoms in the DCD file
    """

    for key in DEFAULTS:
        if key not in kwargs:
            kwargs[key] = DEFAULTS[key]

    from os.path import isdir, splitext, join

    outdir = kwargs.get('outdir')
    if not isdir(outdir):
        raise IOError('{0} is not a valid path'.format(repr(outdir)))

    import prody
    LOGGER = prody.LOGGER

    prefix = kwargs.get('prefix')
    nmodes = kwargs.get('nmodes')
    selstr = kwargs.get('select')
    aligned = kwargs.get('aligned')

    # Look through a trailing ".gz" to find the real file type.
    ext = splitext(coords)[1].lower()
    if ext == '.gz':
        ext = splitext(coords[:-3])[1].lower()

    if ext == '.dcd':
        pdb = kwargs.get('psf') or kwargs.get('pdb')
        if pdb:
            if splitext(pdb)[1].lower() == '.psf':
                pdb = prody.parsePSF(pdb)
            else:
                pdb = prody.parsePDB(pdb)
        dcd = prody.DCDFile(coords)
        # '_pca' / '_eda' are treated as "default prefix" markers and are
        # expanded with the trajectory title.
        if prefix == '_pca' or prefix == '_eda':
            prefix = dcd.getTitle() + prefix
        if len(dcd) < 2:
            raise ValueError('DCD file must have multiple frames')
        if pdb:
            select = pdb.select(selstr)
            # Fail with a clear message instead of an AttributeError or
            # TypeError further down when nothing is selected.
            if select is None:
                raise ValueError('selection {0} do not match any atoms'
                                 .format(repr(selstr)))
            if pdb.numAtoms() == dcd.numAtoms():
                dcd.setAtoms(select)
                LOGGER.info('{0} atoms are selected for calculations.'
                            .format(len(select)))
            else:
                if select.numAtoms() != dcd.numAtoms():
                    raise ValueError('number of selected atoms ({0}) does '
                                     'not match number of atoms in the DCD '
                                     'file ({1})'.format(select.numAtoms(),
                                                         dcd.numAtoms()))
                if pdb.numCoordsets():
                    dcd.setCoords(select.getCoords())
        else:
            select = prody.AtomGroup()
            select.setCoords(dcd.getCoords())
        pca = prody.PCA(dcd.getTitle())
        if len(dcd) > 1000:
            # Long trajectories: build the covariance from the file object.
            pca.buildCovariance(dcd, aligned=aligned)
            pca.calcModes(nmodes)
            ensemble = dcd
        else:
            ensemble = dcd[:]
            if not aligned:
                ensemble.iterpose()
            pca.performSVD(ensemble)
    else:
        pdb = prody.parsePDB(coords)
        if pdb.numCoordsets() < 2:
            raise ValueError('PDB file must contain multiple models')
        if prefix == '_pca' or prefix == '_eda':
            prefix = pdb.getTitle() + prefix
        select = pdb.select(selstr)
        # The None check must come before len(select) is evaluated.
        if select is None:
            raise ValueError('selection {0} do not match any atoms'
                             .format(repr(selstr)))
        LOGGER.info('{0} atoms are selected for calculations.'
                    .format(len(select)))
        LOGGER.info('{0} atoms will be used for PCA calculations.'
                    .format(len(select)))
        ensemble = prody.Ensemble(select)
        pca = prody.PCA(pdb.getTitle())
        if not aligned:
            ensemble.iterpose()
        pca.performSVD(ensemble)

    LOGGER.info('Writing numerical output.')
    if kwargs.get('outnpz'):
        prody.saveModel(pca, join(outdir, prefix))
    prody.writeNMD(join(outdir, prefix + '.nmd'), pca[:nmodes], select)

    extend = kwargs.get('extend')
    if extend:
        if pdb:
            if extend == 'all':
                extended = prody.extendModel(pca[:nmodes], select, pdb)
            else:
                extended = prody.extendModel(pca[:nmodes], select,
                                             select | pdb.bb)
            prody.writeNMD(join(outdir,
                                prefix + '_extended_' + extend + '.nmd'),
                           *extended)
        else:
            prody.LOGGER.warn('Model could not be extended, provide a PDB or '
                              'PSF file.')

    outall = kwargs.get('outall')
    delim = kwargs.get('numdelim')
    ext = kwargs.get('numext')
    num_format = kwargs.get('numformat')

    if outall or kwargs.get('outeig'):
        prody.writeArray(join(outdir, prefix + '_evectors'+ext),
                         pca.getArray(), delimiter=delim, format=num_format)
        prody.writeArray(join(outdir, prefix + '_evalues'+ext),
                         pca.getEigvals(), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outcov'):
        prody.writeArray(join(outdir, prefix + '_covariance'+ext),
                         pca.getCovariance(), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outcc') or kwargs.get('outhm'):
        # Computed once and shared by the text and heatmap outputs.
        cross_corr = prody.calcCrossCorr(pca)
        if outall or kwargs.get('outcc'):
            prody.writeArray(join(outdir,
                                  prefix + '_cross-correlations' + ext),
                             cross_corr, delimiter=delim, format=num_format)
        if outall or kwargs.get('outhm'):
            resnums = select.getResnums()
            hmargs = {} if resnums is None else {'resnums': resnums}
            prody.writeHeatmap(join(outdir,
                                    prefix + '_cross-correlations.hm'),
                               cross_corr, xlabel='Residue',
                               ylabel='Residue',
                               title=pca.getTitle() + ' cross-correlations',
                               **hmargs)
    if outall or kwargs.get('outsf'):
        prody.writeArray(join(outdir, prefix + '_sqfluct'+ext),
                         prody.calcSqFlucts(pca), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outproj'):
        prody.writeArray(join(outdir, prefix + '_proj'+ext),
                         prody.calcProjection(ensemble, pca),
                         delimiter=delim, format=num_format)

    figall = kwargs.get('figall')
    cc = kwargs.get('figcc')
    sf = kwargs.get('figsf')
    sp = kwargs.get('figproj')

    if figall or cc or sf or sp:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warning('Matplotlib could not be imported. '
                           'Figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            LOGGER.info('Saving graphical output.')
            fig_format = kwargs.get('figformat').lower()
            width = kwargs.get('figwidth')
            height = kwargs.get('figheight')
            dpi = kwargs.get('figdpi')

            if figall or cc:
                plt.figure(figsize=(width, height))
                prody.showCrossCorr(pca)
                plt.savefig(join(outdir, prefix + '_cc.'+fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sf:
                plt.figure(figsize=(width, height))
                prody.showSqFlucts(pca)
                plt.savefig(join(outdir, prefix + '_sf.'+fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sp:
                # Each whitespace-separated item is "a-b" (inclusive range),
                # "a,b,c" (explicit list) or "a" (single mode), all 1-based.
                # *sp* may be unset when only ``figall`` was requested.
                indices = []
                for item in (sp or '').split():
                    try:
                        if '-' in item:
                            bounds = item.split('-')
                            if len(bounds) == 2:
                                indices.append(list(range(
                                    int(bounds[0])-1, int(bounds[1]))))
                        elif ',' in item:
                            indices.append([int(i)-1
                                            for i in item.split(',')])
                        else:
                            indices.append(int(item)-1)
                    except ValueError:
                        # Best effort: skip what cannot be parsed, but say so.
                        LOGGER.warning('Projection specification {0} is not '
                                       'valid and is ignored.'
                                       .format(repr(item)))
                for index in indices:
                    plt.figure(figsize=(width, height))
                    prody.showProjection(ensemble, pca[index])
                    if isinstance(index, int):
                        index = [index]
                    labels = [str(i+1) for i in index]
                    plt.savefig(join(outdir, prefix + '_proj_' +
                                     '_'.join(labels) + '.' + fig_format),
                                dpi=dpi, format=fig_format)
                    plt.close('all')
def prody_pca(opt):
    """Perform PCA calculations based on command line arguments.

    *opt* is the parsed command line namespace.  The attributes read here
    are ``outdir``, ``coords``, ``prefix``, ``nmodes``, ``select``, ``psf``,
    ``pdb``, ``npz``, ``all``, ``delim``, ``ext``, ``numformat``, ``eigen``,
    ``covar``, ``ccorr``, ``sqflucts``, ``proj``, ``figures``, ``cc``,
    ``sf``, ``sp``, ``figformat``, ``width``, ``height``, ``dpi`` and
    ``subparser``.  Usage problems are reported through
    ``opt.subparser.error``.
    """

    outdir = opt.outdir
    if not os.path.isdir(outdir):
        opt.subparser.error('{0:s} is not a valid path'.format(outdir))

    import prody
    LOGGER = prody.LOGGER

    coords = opt.coords
    prefix = opt.prefix
    nmodes, selstr = opt.nmodes, opt.select

    if os.path.splitext(coords)[1].lower() == '.dcd':
        ag = opt.psf or opt.pdb
        if ag:
            if os.path.splitext(ag)[1].lower() == '.psf':
                ag = prody.parsePSF(ag)
            else:
                ag = prody.parsePDB(ag)
        dcd = prody.DCDFile(opt.coords)
        if len(dcd) < 2:
            # The parser object itself is not callable; use its error().
            opt.subparser.error("DCD file must contain multiple frames.")
        # Expand the default-prefix marker the same way the PDB branch does,
        # otherwise output files are named just "_pca...".
        if prefix == '_pca':
            prefix = dcd.getTitle() + '_pca'
        if ag:
            dcd.setAtomGroup(ag)
            select = dcd.select(selstr)
            if select is None:
                opt.subparser.error('Selection "{0:s}" do not match any '
                                    'atoms.'.format(selstr))
            LOGGER.info('{0:d} atoms are selected for calculations.'
                        .format(len(select)))
        else:
            select = prody.AtomGroup()
            select.setCoords(dcd.getCoords())
        pca = prody.PCA(dcd.getTitle())
        # *ensemble* is needed later for projections, so bind it here too.
        if len(dcd) > 1000:
            pca.buildCovariance(dcd)
            # calcModes expects the number of modes, not the trajectory.
            pca.calcModes(nmodes)
            ensemble = dcd
        else:
            ensemble = dcd[:]
            pca.performSVD(ensemble)
    else:
        pdb = prody.parsePDB(opt.coords)
        if pdb.numCoordsets() < 2:
            opt.subparser.error("PDB file must contain multiple models.")
        if prefix == '_pca':
            prefix = pdb.getTitle() + '_pca'
        select = pdb.select(selstr)
        # The None check must come before len(select) is evaluated.
        if select is None:
            opt.subparser.error('Selection "{0:s}" do not match any atoms.'
                                .format(selstr))
        LOGGER.info('{0:d} atoms are selected for calculations.'
                    .format(len(select)))
        LOGGER.info('{0:d} atoms will be used for PCA calculations.'
                    .format(len(select)))
        ensemble = prody.Ensemble(select)
        pca = prody.PCA(pdb.getTitle())
        ensemble.iterpose()
        pca.performSVD(ensemble)

    LOGGER.info('Writing numerical output.')
    if opt.npz:
        # Save next to the other outputs rather than in the working directory.
        prody.saveModel(pca, os.path.join(outdir, prefix))
    prody.writeNMD(os.path.join(outdir, prefix + '.nmd'), pca[:nmodes],
                   select)

    outall = opt.all
    delim, ext, num_format = opt.delim, opt.ext, opt.numformat

    if outall or opt.eigen:
        prody.writeArray(os.path.join(outdir, prefix + '_evectors'+ext),
                         pca.getArray(), delimiter=delim, format=num_format)
        # NOTE(review): later revisions of this function call getEigvals();
        # confirm which accessor the installed ProDy provides.
        prody.writeArray(os.path.join(outdir, prefix + '_evalues'+ext),
                         pca.getEigenvalues(), delimiter=delim,
                         format=num_format)
    if outall or opt.covar:
        prody.writeArray(os.path.join(outdir, prefix + '_covariance'+ext),
                         pca.getCovariance(), delimiter=delim,
                         format=num_format)
    if outall or opt.ccorr:
        prody.writeArray(os.path.join(outdir, prefix +
                                      '_cross-correlations' + ext),
                         prody.calcCrossCorr(pca), delimiter=delim,
                         format=num_format)
    if outall or opt.sqflucts:
        prody.writeArray(os.path.join(outdir, prefix + '_sqfluct'+ext),
                         prody.calcSqFlucts(pca), delimiter=delim,
                         format=num_format)
    if outall or opt.proj:
        prody.writeArray(os.path.join(outdir, prefix + '_proj'+ext),
                         prody.calcProjection(ensemble, pca),
                         delimiter=delim, format=num_format)

    figall, cc, sf, sp = opt.figures, opt.cc, opt.sf, opt.sp

    if figall or cc or sf or sp:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warning('Matplotlib could not be imported. '
                           'Figures are not saved.')
        else:
            LOGGER.info('Saving graphical output.')
            fig_format, width, height, dpi = \
                opt.figformat, opt.width, opt.height, opt.dpi
            fig_format = fig_format.lower()
            if figall or cc:
                plt.figure(figsize=(width, height))
                prody.showCrossCorr(pca)
                plt.savefig(os.path.join(outdir, prefix + '_cc.'+fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sf:
                plt.figure(figsize=(width, height))
                prody.showSqFlucts(pca)
                plt.savefig(os.path.join(outdir, prefix + '_sf.'+fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sp:
                # Each whitespace-separated item is "a-b" (inclusive range),
                # "a,b,c" (explicit list) or "a" (single mode), all 1-based.
                # *sp* may be unset when only the all-figures flag is given.
                indices = []
                for item in (sp or '').split():
                    try:
                        if '-' in item:
                            bounds = item.split('-')
                            if len(bounds) == 2:
                                indices.append(list(range(
                                    int(bounds[0])-1, int(bounds[1]))))
                        elif ',' in item:
                            indices.append([int(i)-1
                                            for i in item.split(',')])
                        else:
                            indices.append(int(item)-1)
                    except ValueError:
                        # Best effort: skip what cannot be parsed, but say so.
                        LOGGER.warning('Projection specification {0} is not '
                                       'valid and is ignored.'
                                       .format(repr(item)))
                for index in indices:
                    plt.figure(figsize=(width, height))
                    prody.showProjection(ensemble, pca[index])
                    if isinstance(index, int):
                        index = [index]
                    labels = [str(i+1) for i in index]
                    plt.savefig(os.path.join(outdir, prefix + '_proj_' +
                                             '_'.join(labels) + '.' +
                                             fig_format),
                                dpi=dpi, format=fig_format)
                    plt.close('all')
def prody_pca(coords, **kwargs):
    """Perform PCA calculations for PDB or DCD format *coords* file.

    Options missing from *kwargs* are filled in from the module-level
    ``DEFAULTS`` mapping before anything else happens.

    :arg coords: path to the coordinate file.  When its extension (after
        stripping an optional ``.gz``) is ``.dcd`` it is opened with
        :class:`prody.DCDFile`; anything else is given to
        :func:`prody.parsePDB`.

    Keyword arguments read by this function: ``outdir``, ``prefix``,
    ``nmodes``, ``select``, ``quiet``, ``altloc``, ``psf``, ``pdb``,
    ``aligned``, ``nproc``, ``extend``, ``outnpz``, ``outscipion``,
    ``outall``, ``outeig``, ``outcov``, ``outcc``, ``outhm``, ``outsf``,
    ``outproj``, ``numdelim``, ``numext``, ``numformat``, ``figall``,
    ``figcc``, ``figsf``, ``figproj``, ``figformat``, ``figwidth``,
    ``figheight`` and ``figdpi``.

    :raises IOError: when ``outdir`` is not an existing directory
    :raises ValueError: when the input has fewer than two frames/models,
        when the selection matches no atoms, or when the selection size does
        not match the number of atoms in the DCD file
    :raises ImportError: when ``nproc`` is set but :mod:`threadpoolctl`
        cannot be imported
    """

    for key in DEFAULTS:
        if key not in kwargs:
            kwargs[key] = DEFAULTS[key]

    from numbers import Integral
    from os.path import isdir, splitext, join

    outdir = kwargs.get('outdir')
    if not isdir(outdir):
        raise IOError('{0} is not a valid path'.format(repr(outdir)))

    import prody
    LOGGER = prody.LOGGER

    prefix = kwargs.get('prefix')
    nmodes = kwargs.get('nmodes')
    selstr = kwargs.get('select')
    quiet = kwargs.pop('quiet', False)
    altloc = kwargs.get('altloc')
    aligned = kwargs.get('aligned')
    nproc = kwargs.get('nproc')

    def _with_thread_limit(func):
        """Call *func*, capping BLAS threads at *nproc* when it is set."""
        if not nproc:
            return func()
        try:
            from threadpoolctl import threadpool_limits
        except ImportError:
            raise ImportError(
                'Please install threadpoolctl to control threads')
        with threadpool_limits(limits=nproc, user_api="blas"):
            return func()

    # Look through a trailing ".gz" to find the real file type.
    ext = splitext(coords)[1].lower()
    if ext == '.gz':
        ext = splitext(coords[:-3])[1].lower()

    if ext == '.dcd':
        pdb = kwargs.get('psf') or kwargs.get('pdb')
        if pdb:
            if splitext(pdb)[1].lower() == '.psf':
                pdb = prody.parsePSF(pdb)
            else:
                # Forward the option only when it was actually provided so
                # that parsePDB keeps its own default otherwise.
                parse_kwargs = {} if altloc is None else {'altloc': altloc}
                pdb = prody.parsePDB(pdb, **parse_kwargs)
        dcd = prody.DCDFile(coords)
        # '_pca' / '_eda' are treated as "default prefix" markers and are
        # expanded with the trajectory title.
        if prefix == '_pca' or prefix == '_eda':
            prefix = dcd.getTitle() + prefix
        if len(dcd) < 2:
            raise ValueError('DCD file must have multiple frames')
        if pdb:
            select = pdb.select(selstr)
            # Fail with a clear message instead of an AttributeError or
            # TypeError further down when nothing is selected.
            if select is None:
                raise ValueError(
                    'selection {0} do not match any atoms'.format(
                        repr(selstr)))
            if pdb.numAtoms() == dcd.numAtoms():
                dcd.setAtoms(select)
                LOGGER.info(
                    '{0} atoms are selected for calculations.'.format(
                        len(select)))
            else:
                if select.numAtoms() != dcd.numAtoms():
                    raise ValueError('number of selected atoms ({0}) does '
                                     'not match number of atoms in the DCD '
                                     'file ({1})'.format(
                                         select.numAtoms(), dcd.numAtoms()))
                if pdb.numCoordsets():
                    dcd.setCoords(select.getCoords())
        else:
            select = prody.AtomGroup()
            select.setCoords(dcd.getCoords())

        pca = prody.PCA(dcd.getTitle())

        def _trajectory_modes():
            """Compute the modes; return ``(ensemble, nmodes)``."""
            if len(dcd) > 1000:
                # Long trajectories: build the covariance from the file
                # object.
                pca.buildCovariance(dcd, aligned=aligned, quiet=quiet)
                pca.calcModes(nmodes)
                return dcd, nmodes
            frames = dcd[:]
            if not aligned:
                frames.iterpose(quiet=quiet)
            pca.performSVD(frames)
            return frames, pca.numModes()

        ensemble, nmodes = _with_thread_limit(_trajectory_modes)
    else:
        pdb = prody.parsePDB(coords)
        if pdb.numCoordsets() < 2:
            raise ValueError('PDB file must contain multiple models')
        if prefix == '_pca' or prefix == '_eda':
            prefix = pdb.getTitle() + prefix
        select = pdb.select(selstr)
        # The None check must come before len(select) is evaluated.
        if select is None:
            raise ValueError('selection {0} do not match any atoms'.format(
                repr(selstr)))
        LOGGER.info('{0} atoms are selected for calculations.'.format(
            len(select)))
        LOGGER.info('{0} atoms will be used for PCA calculations.'.format(
            len(select)))
        ensemble = prody.Ensemble(select)
        pca = prody.PCA(pdb.getTitle())
        if not aligned:
            # Honour ``quiet`` the same way the trajectory branch does.
            ensemble.iterpose(quiet=quiet)
        _with_thread_limit(lambda: pca.performSVD(ensemble))

    LOGGER.info('Writing numerical output.')
    if kwargs.get('outnpz'):
        prody.saveModel(pca, join(outdir, prefix))

    if kwargs.get('outscipion'):
        prody.writeScipionModes(outdir, pca)

    prody.writeNMD(join(outdir, prefix + '.nmd'), pca[:nmodes], select)

    extend = kwargs.get('extend')
    if extend:
        if pdb:
            if extend == 'all':
                extended = prody.extendModel(pca[:nmodes], select, pdb)
            else:
                extended = prody.extendModel(pca[:nmodes], select,
                                             select | pdb.bb)
            prody.writeNMD(
                join(outdir, prefix + '_extended_' + extend + '.nmd'),
                *extended)
        else:
            prody.LOGGER.warn('Model could not be extended, provide a PDB or '
                              'PSF file.')

    outall = kwargs.get('outall')
    delim = kwargs.get('numdelim')
    ext = kwargs.get('numext')
    num_format = kwargs.get('numformat')

    if outall or kwargs.get('outeig'):
        prody.writeArray(join(outdir, prefix + '_evectors' + ext),
                         pca.getArray(), delimiter=delim, format=num_format)
        prody.writeArray(join(outdir, prefix + '_evalues' + ext),
                         pca.getEigvals(), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outcov'):
        prody.writeArray(join(outdir, prefix + '_covariance' + ext),
                         pca.getCovariance(), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outcc') or kwargs.get('outhm'):
        # Computed once and shared by the text and heatmap outputs.
        cross_corr = prody.calcCrossCorr(pca)
        if outall or kwargs.get('outcc'):
            prody.writeArray(join(outdir, prefix +
                                  '_cross-correlations' + ext),
                             cross_corr, delimiter=delim, format=num_format)
        if outall or kwargs.get('outhm'):
            resnums = select.getResnums()
            hmargs = {} if resnums is None else {'resnums': resnums}
            prody.writeHeatmap(join(outdir,
                                    prefix + '_cross-correlations.hm'),
                               cross_corr, xlabel='Residue',
                               ylabel='Residue',
                               title=pca.getTitle() + ' cross-correlations',
                               **hmargs)
    if outall or kwargs.get('outsf'):
        prody.writeArray(join(outdir, prefix + '_sqfluct' + ext),
                         prody.calcSqFlucts(pca), delimiter=delim,
                         format=num_format)
    if outall or kwargs.get('outproj'):
        prody.writeArray(join(outdir, prefix + '_proj' + ext),
                         prody.calcProjection(ensemble, pca),
                         delimiter=delim, format=num_format)

    figall = kwargs.get('figall')
    cc = kwargs.get('figcc')
    sf = kwargs.get('figsf')
    sp = kwargs.get('figproj')

    if figall or cc or sf or sp:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            LOGGER.warning('Matplotlib could not be imported. '
                           'Figures are not saved.')
        else:
            prody.SETTINGS['auto_show'] = False
            LOGGER.info('Saving graphical output.')
            fig_format = kwargs.get('figformat').lower()
            width = kwargs.get('figwidth')
            height = kwargs.get('figheight')
            dpi = kwargs.get('figdpi')

            if figall or cc:
                plt.figure(figsize=(width, height))
                prody.showCrossCorr(pca)
                plt.savefig(join(outdir, prefix + '_cc.' + fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sf:
                plt.figure(figsize=(width, height))
                prody.showSqFlucts(pca)
                plt.savefig(join(outdir, prefix + '_sf.' + fig_format),
                            dpi=dpi, format=fig_format)
                plt.close('all')
            if figall or sp:
                # Each whitespace-separated item is "a-b" (inclusive range),
                # "a,b,c" (explicit list) or "a" (single mode), all 1-based.
                # *sp* may be unset when only ``figall`` was requested.
                indices = []
                for item in (sp or '').split():
                    try:
                        if '-' in item:
                            bounds = item.split('-')
                            if len(bounds) == 2:
                                indices.append(list(range(
                                    int(bounds[0]) - 1, int(bounds[1]))))
                        elif ',' in item:
                            indices.append(
                                [int(i) - 1 for i in item.split(',')])
                        else:
                            indices.append(int(item) - 1)
                    except ValueError:
                        # Best effort: skip what cannot be parsed, but say so.
                        LOGGER.warning('Projection specification {0} is not '
                                       'valid and is ignored.'
                                       .format(repr(item)))
                for index in indices:
                    plt.figure(figsize=(width, height))
                    prody.showProjection(ensemble, pca[index])
                    if isinstance(index, Integral):
                        index = [index]
                    labels = [str(i + 1) for i in index]
                    plt.savefig(join(
                        outdir, prefix + '_proj_' + '_'.join(labels) + '.' +
                        fig_format), dpi=dpi, format=fig_format)
                    plt.close('all')