def calculate_scores(data, kde_bw=0.15, pca_component=0.95, score_weights=None):
    """Calculate scores based on probability density.

    For every atom, the sample vectors are reduced with ``PCA``, a kernel
    density estimator (``kde``) is fitted on the reduced vectors, and its
    ``pdf()`` is called without arguments. The score of a sample is the
    weighted sum of these per-atom densities.

    Parameters
    ----------
    data : ndarray, shape=(n_atoms, n_samples, n_dims)
    kde_bw : scalar
        Bandwidth used for every dimension of the reduced data.
    pca_component : scalar
        Forwarded to ``PCA`` as ``n_components``.
    score_weights : None or dict
        A dictionary that contains the weight for the atoms,
        {atom_idx: weight}. Atoms that are not listed keep a weight of 1.

    Returns
    ---------
    scores : ndarray, shape=(n_samples,), or None
        Weighted sum of the per-atom densities. An all-zero array is returned
        when ``data`` contains no atoms. ``None`` is returned (after printing
        a message) when a ``ValueError`` is raised while fitting or
        evaluating the densities.

    See also
    --------
    pylipid.func.collect_bound_poses
    pylipid.func.vectorize_poses

    """
    data_shape = np.shape(data)
    n_atoms = data_shape[0]

    weights = {atom_idx: 1 for atom_idx in range(n_atoms)}
    if score_weights is not None:
        weights.update(score_weights)

    if n_atoms == 0:
        # A sum over zero atoms is zero for every sample; keep the documented
        # (n_samples,) shape instead of collapsing to a scalar.
        n_samples = data_shape[1] if len(data_shape) > 1 else 0
        return np.zeros(n_samples)

    try:
        weighted_densities = []
        for atom_idx in range(n_atoms):
            transformed_data = PCA(n_components=pca_component).fit_transform(
                data[atom_idx])
            n_components = np.shape(transformed_data)[1]
            # One "c" flag and one bandwidth per retained PCA dimension
            # ("c" presumably marks a continuous variable, as in statsmodels'
            # KDEMultivariate -- confirm against the file's imports).
            density = kde(data=transformed_data,
                          var_type="c" * n_components,
                          bw=[kde_bw] * n_components)
            # evaluate binding poses
            weighted_densities.append(weights[atom_idx] * density.pdf())
        return np.sum(weighted_densities, axis=0)
    except ValueError:
        # Deliberate best-effort: report the problem and hand back None.
        print(
            "Pose generation error -- possibly due to insufficient number of binding event."
        )
        return None
# Common bin edges for every histogram panel.
bins = np.linspace(0, XL, 50)

# Illustrate: one normalised histogram per panel, in panel order.
# Each entry is (sample, legend label, extra keyword arguments for hist).
hist_panels = (
    (P, 'Example sample', {}),
    (q, 'Proposal sample and pdf', {}),
    (q, 'Proposal sample - weighted', {'weights': w}),
    (r, 'resmpl: Residual', {}),
    (s, 'resmpl: Systematic', {}),
    (t, 'resmpl: Stochastic', {}),
)
for panel_idx, (hist_sample, hist_label, hist_kwargs) in enumerate(hist_panels):
    axs[panel_idx].hist(hist_sample, bins, density=True, label=hist_label,
                        **hist_kwargs)

# Add actual pdfs
axs[0].plot(xx, pdf(xx, dof), label='pdf: Target')
axs[1].plot(xx, qdf(xx, scl), label='pdf: Proposal')

# kde pdf comparison: both analytic curves plus one KDE curve per
# resampled set, all drawn on the seventh panel.
kde_ax = axs[6]
kde_ax.plot(xx, pdf(xx, dof), c='k', lw=3, label='pdf: Target')
kde_ax.plot(xx, qdf(xx, scl), c='k', lw=2, label='pdf: Proposal')
kde_curves = (
    (r, 'kde: Residual'),
    (s, 'kde: Systematic'),
    (t, 'kde: Stochastic'),
)
for resampled, kde_label in kde_curves:
    kde_ax.plot(xx, kde(resampled, 'c', bw=[0.1]).pdf(xx), label=kde_label)

# Hide the y tick labels of the first panel, then add a legend to every axes.
axs[0].set_yticklabels([])
for ax in f.axes:
    ax.legend()

plt.pause(.1)
def calculate_scores(dist_matrix, kde_bw=0.15, pca_component=0.90, score_weights=None):
    r"""Calculate scores based on probability density.

    This function first lowers the dimension of dist_matrix by using a
    `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_.
    Then the distribution of the distance vectors for each atom is estimated using
    `KDEMultivariate <https://www.statsmodels.org/devel/generated/statsmodels.nonparametric.kernel_density.KDEMultivariate.html>`_.
    The score of a lipid pose is calculated based on the probability density function of the atom positions in the
    binding site and weights given to the atoms:

    .. math::
        \text { score }=\sum_{i} W_{i} \cdot \hat{f}_{i, H}(D)

    where :math:`W_{i}` is the weight given to atom i of the lipid molecule, H is the bandwidth and
    :math:`\hat{f}_{i, H}(D)` is a multivariate kernel density estimation of the position of atom i in the specified
    binding site. :math:`\hat{f}_{i, H}(D)` is calculated from all the bound lipid poses in that binding site.

    Parameters
    ----------
    dist_matrix : numpy.ndarray, shape=(n_lipid_atoms, n_poses, n_binding_site_residues)
        The distance vectors describing the position of bound poses in the binding site. This dist_matrix can be
        generated by :meth:`~vectorize_poses`.

    kde_bw : scalar, default=0.15
        The bandwidth for kernel density estimation. Used by
        `KDEMultivariate <https://www.statsmodels.org/devel/generated/statsmodels.nonparametric.kernel_density.KDEMultivariate.html>`_.
        By default, the bandwidth is set to 0.15nm which roughly corresponds to the vdw radius of MARTINI 2 beads.

    pca_component : scalar, default=0.9
        The number of components to keep. If ``0 < pca_component < 1``, select the number of components such that the
        amount of variance that needs to be explained is greater than the percentage specified by n_components. It is
        used by `PCA <https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html>`_.

    score_weights : None or dict
        A dictionary that contains the weight for n_lipid_atoms, {idx_atom: weight}. Atoms that are not listed keep
        a weight of 1.

    Returns
    ---------
    scores : numpy.ndarray, shape=(n_poses,), or None
        Scores for bound poses. An all-zero array is returned when ``dist_matrix`` contains no atoms. ``None`` is
        returned (after printing a message) when a ``ValueError`` is raised while fitting or evaluating the
        densities.

    See also
    --------
    pylipid.func.collect_bound_poses
        Collect bound poses from trajectories.
    pylipid.func.vectorize_poses
        Convert bound poses to distance vectors.

    """
    matrix_shape = np.shape(dist_matrix)
    n_atoms = matrix_shape[0]

    weights = {atom_idx: 1 for atom_idx in range(n_atoms)}
    if score_weights is not None:
        weights.update(score_weights)

    if n_atoms == 0:
        # A sum over zero atoms is zero for every pose; keep the documented
        # (n_poses,) shape instead of collapsing to a scalar.
        n_poses = matrix_shape[1] if len(matrix_shape) > 1 else 0
        return np.zeros(n_poses)

    try:
        weighted_densities = []
        for atom_idx in range(n_atoms):
            transformed_data = PCA(n_components=pca_component).fit_transform(
                dist_matrix[atom_idx])
            n_components = np.shape(transformed_data)[1]
            # One continuous ("c") variable flag and one bandwidth per
            # retained PCA dimension.
            density = kde(data=transformed_data,
                          var_type="c" * n_components,
                          bw=[kde_bw] * n_components)
            # evaluate binding poses: pdf() is called without arguments, i.e.
            # on the poses the estimator was fitted with.
            weighted_densities.append(weights[atom_idx] * density.pdf())
        return np.sum(weighted_densities, axis=0)
    except ValueError:
        # Deliberate best-effort: report the problem and hand back None.
        print(
            "Pose generation error -- possibly due to insufficient number of binding event."
        )
        return None