def rg_analysis(model, **kwargs):
    """Summarise the ``metricRG`` projection per macrostate and over all frames.

    Parameters
    ----------
    model : object
        Model exposing ``macronum``; it is handed to ``get_data`` and
        ``getStateStatistic``.
    **kwargs
        Extra keyword arguments forwarded unchanged to ``get_data``.

    Returns
    -------
    numpy.ndarray
        Array built from two rows: the means first, the standard deviations
        second. Within each row the per-macrostate values come first and the
        value computed over every pooled frame is appended last.
    """
    from htmd.model import getStateStatistic
    from IDP_htmd.MetricRadiusGyration import metricRG
    from IDP_htmd.model_utils import get_data
    import numpy as np

    projected = get_data(model, metricRG, **kwargs)

    per_state_mean = getStateStatistic(
        model, projected, states=range(model.macronum))
    per_state_std = getStateStatistic(
        model, projected, states=range(model.macronum), method=np.std)

    # Pool the values of every trajectory into one flat array so the global
    # statistics are computed over all frames at once.
    pooled = np.array(
        [value for traj in projected.dat for value in traj.ravel().tolist()])

    mean_row = np.append(per_state_mean, np.mean(pooled))
    std_row = np.append(per_state_std, np.std(pooled))
    return np.array([mean_row, std_row])
def aux_plot(model, mol, plot_func, metric=None, skip=1, normalize=False,
             method=np.mean, data=None, **kwargs):
    """Aggregate projected data by macrostate and pass it to a plotting function.

    Parameters
    ----------
    model : object
        Model to extract the data from; must expose ``macronum``.
    mol : object
        Molecule passed to ``plot_func``. Its ``resid`` attribute is read when
        ``normalize`` is set.
    plot_func : callable
        Called as ``plot_func(data_summary, mol, **kwargs)``.
    metric : object, optional
        Metric used to project the model through ``get_data``. Only used when
        ``data`` is not given.
    skip : int, optional
        Forwarded to ``get_data`` as ``skip``.
    normalize : bool, optional
        If true, divide the aggregated data by the number of occurrences of
        each distinct value in ``mol.resid``.
    method : callable, optional
        Aggregation applied per macrostate by ``getStateStatistic``.
    data : object, optional
        Already projected data. Takes precedence over ``metric``.
    **kwargs
        Additional arguments for the plotting function.

    Raises
    ------
    ValueError
        If neither ``metric`` nor ``data`` is provided.

    Notes
    -----
    Errors raised by ``plot_func`` are reported on stdout and not propagated,
    so a failing plot does not abort the caller.
    """
    # Identity checks: data/metric objects may be array-like or define their
    # own truthiness, so `not data` is not a reliable "was it passed" test.
    if metric is None and data is None:
        raise ValueError("Either a metric or a data object must be provided")
    if data is None:
        data = get_data(model, metric, skip=skip)

    data_summary = getStateStatistic(
        model, data, method=method, states=range(model.macronum),
        statetype="macro")

    if normalize:
        _, counts = np.unique(mol.resid, return_counts=True)
        data_summary = np.array(data_summary) / counts

    try:
        plot_func(data_summary, mol, **kwargs)
    except Exception as e:
        # Deliberate best-effort: report the failure instead of crashing.
        print("Plotting error: ", e)
def cluster_macro(model, data, macro, method=np.mean, cluster_method=MiniBatchKMeans):
    """Cluster the microstates of the given macrostate(s) into new groups.

    The per-microstate statistic of ``data`` is computed for every microstate
    belonging to ``macro`` and those values are clustered with
    ``cluster_method``. The function returns the resulting grouping; it does
    not itself assign the new groups to the model.

    Parameters
    ----------
    model : <htmd.model.Model>
        Model whose macrostate(s) are to be split. ``metastable_states(model)``
        is called first.
        NOTE(review): presumably that call populates ``model.metastable_sets``
        -- confirm.
    data : object
        Projected data handed to ``getStateStatistic``.
    macro : int or iterable of int
        Macrostate index, or indices, to be split.
    method : callable, optional
        Aggregation applied per microstate by ``getStateStatistic``.
    cluster_method : callable, optional
        Factory called with no arguments; the returned estimator must provide
        ``fit`` and, once fitted, ``cluster_centers_`` and ``labels_``.

    Returns
    -------
    numpy.ndarray
        One entry per cluster holding the microstate indices assigned to it.
        A regular 2-D array when all clusters have the same size, otherwise a
        1-D object array of index arrays.

    Raises
    ------
    IndexError
        If a macrostate index is negative or not smaller than
        ``model.macronum``.
    """
    metastable_states(model)

    # Accept NumPy integers too (e.g. an index taken from an array).
    if isinstance(macro, (int, np.integer)):
        macro = [macro]

    all_micros = np.array([], dtype=int)
    for i in macro:
        # Valid indices are 0 .. macronum - 1; `>` would let macronum through.
        if i < 0 or i >= model.macronum:
            raise IndexError("Macro out of bounds")
        all_micros = np.concatenate([all_micros, model.metastable_sets[i]])

    data_by_micro = getStateStatistic(
        model, data, states=all_micros, statetype="micro", method=method)
    clusters = cluster_method().fit(data_by_micro)

    new_macro_assignment = [
        all_micros[np.where(clusters.labels_ == label)[0]]
        for label in range(len(clusters.cluster_centers_))
    ]

    try:
        return np.array(new_macro_assignment)
    except ValueError:
        # Clusters of different sizes form a ragged sequence, which recent
        # NumPy versions refuse to convert implicitly. Build the object array
        # explicitly, one group per element.
        ragged = np.empty(len(new_macro_assignment), dtype=object)
        for position, members in enumerate(new_macro_assignment):
            ragged[position] = members
        return ragged
def create_bulk(model, metric=None, data=None, threshold=0.2, skip=1):
    """Create a bulk macrostate from the microstates with the lowest values.

    Modifies the passed model through ``model.createState``. It is intended to
    be used in ligand binding scenarios.

    Parameters
    ----------
    model : object
        Model to extract a bulk from; must expose ``micronum`` and
        ``createState``.
    metric : object, optional
        Metric to describe a bulk vs not-bulk situation. In general it is the
        contacts between protein and ligand selection with groupsels set to
        'all'. Only used when ``data`` is not given.
    data : object, optional
        Already projected data. Takes precedence over ``metric``.
    threshold : float, optional
        Microstates whose statistic is strictly below this value form the
        bulk.
    skip : int, optional
        Forwarded to ``get_data`` as ``skip``.

    Returns
    -------
    sequence of int
        Indices of the microstates placed in the new macrostate. If no
        microstate is below ``threshold``, the single microstate with the
        lowest value is used instead.

    Raises
    ------
    ValueError
        If neither ``metric`` nor ``data`` is provided.
    """
    # Identity checks: data/metric objects may be array-like or define their
    # own truthiness, so `not data` is not a reliable "was it passed" test.
    if metric is None and data is None:
        raise ValueError("Either a metric or a data object must be provided")
    if data is None:
        data = get_data(model, metric, skip=skip)

    data_by_micro = np.array(
        getStateStatistic(model, data, states=range(model.micronum),
                          statetype="micro"))

    min_contacts = np.where(data_by_micro < threshold)[0]
    if len(min_contacts) == 0:
        # Nothing is below the threshold: fall back to the microstate holding
        # the global minimum. Taking argmin of the boolean mask would always
        # give 0 here because the mask is all False.
        lowest = np.unravel_index(
            np.argmin(data_by_micro), data_by_micro.shape)[0]
        min_contacts = [lowest]

    model.createState(min_contacts)
    print(f"Macrostate created with micros: {min_contacts}")
    return min_contacts
def plotClusters(
    self,
    dimX,
    dimY,
    resolution=100,
    s=4,
    c=None,
    cmap="Greys",
    logplot=False,
    plot=True,
    save=None,
    data=None,
    levels=7,
):
    """Scatter the cluster centers over the count histogram of the data.

    Parameters
    ----------
    dimX : int
        Index of projected dimension to use for the X axis.
    dimY : int
        Index of projected dimension to use for the Y axis.
    resolution : int
        Resolution of bincount grid.
    s : float
        Marker size for clusters.
    c : list
        Colors or indexes for each cluster. When omitted the markers are
        drawn with the color ``"r"`` and no colorbar is added.
    cmap : matplotlib.colors.Colormap
        Matplotlib colormap, passed both to the counts plot and to the
        scatter plot.
    logplot : bool
        Set True to plot the logarithm of counts.
    plot : bool
        If the method should display the plot
    save : str
        Path of the file in which to save the figure
    data : :class:`MetricData` object
        Optionally you can pass a different MetricData object than the one
        used for clustering. For example if the user wants to cluster on
        distances but wants to plot the centers on top of RMSD values. The
        new MetricData object needs to have the same simlist as this object.
    levels : int
        Forwarded to ``self._plotCounts`` as ``levels``.

    Raises
    ------
    RuntimeError
        If the data has not been clustered, or if ``data`` does not share
        this object's frame count and simlist.
    """
    if self.Centers is None:
        raise RuntimeError("Data has not been clustered yet. Cannot plot clusters.")

    from matplotlib import pylab as plt

    if data is None:
        data = self
        centers = self.Centers
    else:
        from htmd.model import getStateStatistic

        # Only compare the simlists when the frame counts already agree.
        if self.numFrames != data.numFrames:
            mismatch = True
        else:
            mismatch = not np.all(
                [s1 == s2 for s1, s2 in zip(self.simlist, data.simlist)]
            )
        if mismatch:
            raise RuntimeError(
                "The data argument you provided uses a different simlist than this object."
            )
        # Recompute the centers in the space of the alternative projection.
        centers = np.vstack(
            getStateStatistic(self, data, range(self.K), statetype="cluster")
        )

    if data.description is not None:
        xlabel = data.description.description[dimX]
        ylabel = data.description.description[dimY]
    else:
        xlabel = "Dimension {}".format(dimX)
        ylabel = "Dimension {}".format(dimY)

    if logplot:
        title = "Clusters plotted onto logarithmic counts histogram"
    else:
        title = "Clusters plotted onto counts histogram"

    f, ax, cf = self._plotCounts(
        dimX,
        dimY,
        resolution=resolution,
        logplot=logplot,
        levels=levels,
        cmap=cmap,
        title=title,
        xlabel=xlabel,
        ylabel=ylabel,
    )

    marker_color = "r" if c is None else c
    y = ax.scatter(
        centers[:, dimX],
        centers[:, dimY],
        s=s,
        c=marker_color,
        cmap=cmap,
        linewidths=0,
        marker="o",
    )
    if c is not None:
        self._setColorbar(f, y, "Cluster groups")

    if save is not None:
        plt.savefig(save, dpi=300, bbox_inches="tight", pad_inches=0.2)
    if plot:
        plt.show()
# Ad-hoc script: load a saved projection and a saved model, average the
# projection per macrostate and render it with contact_plot into a PNG file.
from htmd.projections.metric import MetricData
from htmd.projections.metricdistance import MetricDistance
from htmd.model import Model
from htmd.molecule.molecule import Molecule
import numpy as np

data_path = "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/testing.dat"
model_path = "/workspace8/p27_sj403/10-11-2018_p27_short_sj403/analysis/17_11_2018/model.dat"
figure_path = "/home/pablo/test.png"

data = MetricData()
data.load(data_path)

model = Model()
model.load(model_path)

# Structure file of the first simulation referenced by the model.
mol = Molecule(model.data.simlist[0].molfile)

mean_dat = getStateStatistic(model, data, range(model.macronum))

# The same selection is used on both sides of the distance metric.
selection = "noh and protein or resname MOL"
met = MetricDistance(
    sel1=selection,
    sel2=selection,
    groupsel1="residue",
    groupsel2="residue",
    metric="distances",
    pbc=False,
)
mapping = met.getMapping(mol)

contact_plot(
    mean_dat,
    mol,
    rows=2,
    cols=2,
    model=model,
    plot=False,
    save=figure_path,
    mapping=mapping,
)
def plotClusters(self, dimX, dimY, resolution=100, s=4, c=None, cmap=None,
                 logplot=False, plot=True, save=None, data=None):
    """Scatter the cluster centers over a contour plot of the data counts.

    Parameters
    ----------
    dimX : int
        Index of projected dimension to use for the X axis.
    dimY : int
        Index of projected dimension to use for the Y axis.
    resolution : int
        Resolution of bincount grid.
    s : float
        Marker size for clusters.
    c : list
        Colors or indexes for each cluster. A colorbar is added only when
        this is given.
    cmap : matplotlib.colors.Colormap
        Matplotlib colormap for the scatter plot. Defaults to
        ``plt.cm.jet`` when omitted.
    logplot : bool
        Set True to plot the logarithm of counts.
    plot : bool
        If the method should display the plot
    save : str
        Path of the file in which to save the figure
    data : :class:`MetricData` object
        Optionally you can pass a different MetricData object than the one
        used for clustering. For example if the user wants to cluster on
        distances but wants to plot the centers on top of RMSD values. The
        new MetricData object needs to have the same simlist as this object.

    Raises
    ------
    RuntimeError
        If the data has not been clustered, or if ``data`` does not share
        this object's frame count and simlist.
    """
    if self.Centers is None:
        raise RuntimeError('Data has not been clustered yet. Cannot plot clusters.')

    from matplotlib import pylab as plt

    if data is None:
        data = self
        centers = self.Centers
    else:
        from htmd.model import getStateStatistic

        # Only compare the simlists when the frame counts already agree.
        if self.numFrames != data.numFrames:
            mismatch = True
        else:
            mismatch = not np.all(
                [s1 == s2 for s1, s2 in zip(self.simlist, data.simlist)])
        if mismatch:
            raise RuntimeError('The data argument you provided uses a different simlist than this object.')
        # Recompute the centers in the space of the alternative projection.
        centers = np.vstack(
            getStateStatistic(self, data, range(self.K), statetype='cluster'))

    if cmap is None:
        cmap = plt.cm.jet

    if data.description is not None:
        xlabel = data.description.description[dimX]
        ylabel = data.description.description[dimY]
    else:
        xlabel = 'Dimension {}'.format(dimX)
        ylabel = 'Dimension {}'.format(dimY)

    title = 'Clusters plotted onto counts histogram'

    # All trajectories stacked into one array of frames.
    stacked = np.concatenate(data.dat)
    f, ax, cf = self._contourPlot(
        stacked[:, dimX], stacked[:, dimY], resolution=resolution,
        xlabel=xlabel, ylabel=ylabel, title=title, logplot=logplot)

    y = ax.scatter(centers[:, dimX], centers[:, dimY], s=s, c=c, cmap=cmap,
                   linewidths=0, marker='o')
    if c is not None:
        self._setColorbar(f, y, 'Cluster groups')

    if save is not None:
        plt.savefig(save, dpi=300, bbox_inches='tight', pad_inches=0.2)
    if plot:
        plt.show()
def plot_model_by(model, dat1, dat2, method1=np.mean, s=15, method2=np.mean,
                  cmap="Set1", legend=True, ylabel=None, xlabel=None,
                  ylim=(None, None), xlim=(None, None)):
    """Scatter one per-microstate quantity against another, colored by macrostate.

    Parameters
    ----------
    model : object
        Model exposing ``micronum``, ``macronum``, ``macro_ofmicro`` and
        ``eqDistribution``.
    dat1, dat2 : object
        Data for the X and Y axes. Each is aggregated per microstate with
        ``getStateStatistic`` unless its method is falsy, in which case it is
        used as given and must hold one value per microstate.
    method1, method2 : callable or None, optional
        Per-microstate aggregation for ``dat1`` / ``dat2``. Pass ``None`` to
        use the data without aggregation.
    s : float, optional
        Marker size.
    cmap : str, optional
        Colormap name, resampled to ``model.macronum`` colors.
    legend : bool, optional
        Whether to draw the legend.
    ylabel, xlabel : str, optional
        Axis labels; left untouched when not given.
    ylim, xlim : tuple, optional
        ``(min, max)`` axis limits. A ``None`` entry is replaced by the data
        extreme widened by 20 % of its magnitude.

    Returns
    -------
    tuple
        ``(cum_dat1, cum_dat2)``, the per-microstate values that were plotted.
    """
    if method1:
        cum_dat1 = np.array(
            getStateStatistic(model, dat1, states=range(model.micronum),
                              statetype="micro", method=method1)).ravel()
    else:
        # asarray keeps an ndarray as is and makes plain sequences indexable
        # with the microstate index arrays used below.
        cum_dat1 = np.asarray(dat1)
    if method2:
        cum_dat2 = np.array(
            getStateStatistic(model, dat2, states=range(model.micronum),
                              statetype="micro", method=method2)).ravel()
    else:
        cum_dat2 = np.asarray(dat2)

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
    # available in recent Matplotlib releases.
    cmap = plt.get_cmap(cmap, model.macronum)
    macro_pop = np.round(model.eqDistribution(plot=False) * 100, 1)

    for i in range(model.macronum):
        c = cmap(i)
        macro_in_micro = np.where(model.macro_ofmicro == i)[0]
        plt.scatter(cum_dat1[macro_in_micro], cum_dat2[macro_in_micro],
                    color=c, s=s, alpha=0.6, edgecolor=c,
                    label=f"Macro {i}, {macro_pop[i]}%")

    if legend:
        plt.legend()
    if xlabel:
        plt.xlabel(xlabel)
    if ylabel:
        plt.ylabel(ylabel)

    # Widen by 20 % of the magnitude, away from the data. Plain scaling by
    # 0.8 / 1.2 moves the limit into the data when the extreme is negative.
    xmin, xmax = xlim
    if xmin is None:
        low = np.min(cum_dat1)
        xmin = low - 0.2 * abs(low)
    if xmax is None:
        high = np.max(cum_dat1)
        xmax = high + 0.2 * abs(high)
    ymin, ymax = ylim
    if ymin is None:
        low = np.min(cum_dat2)
        ymin = low - 0.2 * abs(low)
    if ymax is None:
        high = np.max(cum_dat2)
        ymax = high + 0.2 * abs(high)

    plt.ylim(ymin, ymax)
    plt.xlim(xmin, xmax)
    return cum_dat1, cum_dat2
def plot_model_by_rmsd(model, rmsd_dat=None, rmsd_mean=None, rmsd_std=None,
                       cmap='jet', legend=True, save=None, ax=None):
    """Scatter the per-microstate RMSD mean against its standard deviation.

    Points are colored by macrostate and the macrostates are drawn in order
    of decreasing equilibrium population.

    Parameters
    ----------
    model : object
        Model exposing ``micronum``, ``macronum``, ``macro_ofmicro`` and
        ``eqDistribution``.
    rmsd_dat : object, optional
        Projected data. When given, the per-microstate mean and standard
        deviation are computed from it with ``getStateStatistic`` and any
        passed ``rmsd_mean`` / ``rmsd_std`` are ignored.
    rmsd_mean, rmsd_std : array-like, optional
        Precomputed per-microstate values. Both are required when
        ``rmsd_dat`` is not given.
    cmap : str, optional
        Colormap name, resampled to ``model.macronum`` colors.
    legend : bool, optional
        Whether to draw the legend.
    save : str, optional
        Path of the file in which to save the figure.
    ax : matplotlib axes, optional
        Axes to draw on. A new 8x8 figure is created when omitted.

    Returns
    -------
    tuple of numpy.ndarray
        ``(rmsd_mean, rmsd_std)`` as flat arrays.

    Raises
    ------
    RuntimeError
        If neither ``rmsd_dat`` nor both ``rmsd_mean`` and ``rmsd_std`` are
        provided.
    """
    # Without rmsd_dat, both precomputed arrays are needed; one alone would
    # fail later with an unrelated indexing error.
    if rmsd_dat is None and (rmsd_mean is None or rmsd_std is None):
        raise RuntimeError(
            "Either rmsd_dat or rmsd_mean & rmsd_std should be defined")

    if rmsd_dat is not None:
        rmsd_mean = getStateStatistic(model, rmsd_dat,
                                      states=range(model.micronum),
                                      statetype="micro", method=np.mean)
        rmsd_std = getStateStatistic(model, rmsd_dat,
                                     states=range(model.micronum),
                                     statetype="micro", method=np.std)
    rmsd_mean = np.array(rmsd_mean).ravel()
    rmsd_std = np.array(rmsd_std).ravel()

    if ax is not None:
        plt.sca(ax)
    else:
        plt.figure(figsize=(8, 8))

    # pyplot.get_cmap is used because matplotlib.cm.get_cmap is no longer
    # available in recent Matplotlib releases.
    cmap = plt.get_cmap(cmap, model.macronum)
    macro_pop = np.round(model.eqDistribution(plot=False) * 100, 1)
    macro_sort = np.argsort(macro_pop)[::-1]

    for idx, i in enumerate(macro_sort):
        # The most populated macrostate gets the last color of the map.
        c = cmap(model.macronum - 1 - idx)
        macro_in_micro = np.where(model.macro_ofmicro == i)[0]
        plt.scatter(rmsd_mean[macro_in_micro], rmsd_std[macro_in_micro],
                    color=c, s=15, alpha=0.5, edgecolor=c,
                    label=f"Macro {i}, {macro_pop[i]}%")

    if legend:
        plt.legend()
    plt.xlabel("Mean RMSD \n by microstate (Å)")
    plt.ylabel(r"$\it{SD\ RMSD \ by\ microstate\ (Å)}$")
    plt.ylim(0, np.max(rmsd_std) * 1.2)
    plt.xlim(0, np.max(rmsd_mean) * 1.2)

    if save:
        plt.savefig(save, dpi=300, bbox_inches='tight', pad_inches=0.2)
    return rmsd_mean, rmsd_std