def test_weights_equal_to_zero(self):
    """A trajectory whose weights are zero almost everywhere must not
    dominate the estimated covariances.

    The first trajectory is artificially scaled up but down-weighted to
    zero outside a small window, so the resulting instantaneous
    covariance entries must stay strictly within (0, 1).
    """
    n = 1000
    data = [np.random.random(size=(n, 2)) for _ in range(5)]
    # create some artificial correlations; lower bound 1 so the random
    # scale can never be 0, which would zero out the signal entirely
    data[0][:, 0] *= np.random.randint(1, n)
    weights = [np.ones(n, dtype=np.float32) for _ in range(5)]
    # omit the first trajectory by zeroing its weights except for a
    # small window of 50 frames
    weights[0][:] = 0
    weights[0][800:850] = 1
    est = OnlineCovariance(compute_c0t=True)
    for x, w in zip(data, weights):
        # lag of 3 frames: instantaneous part x[:-3], lagged part x[3:];
        # weights align with the instantaneous part
        est.partial_fit((x[:-3], x[3:]), w[:-3])
    cov = est.fetch_model()
    assert np.all(cov.cov_00 < 1), cov.cov_00
    assert np.all(cov.cov_00 > 0), cov.cov_00
def test_weights_close_to_zero(self):
    """Tiny but non-zero weights must act numerically like zero weights:
    the down-weighted first trajectory may not inflate the instantaneous
    covariance beyond that of uniform data on [0, 1)."""
    n = 1000
    data = [np.random.random(size=(n, 2)) for _ in range(5)]
    # create some artificial correlations; lower bound 1 so the random
    # scale can never be 0, which would zero out the signal entirely
    data[0][:, 0] *= np.random.randint(1, n)
    data = np.asarray(data)
    weights = [np.ones(n, dtype=np.float32) for _ in range(5)]
    # effectively omit the first trajectory: 1e-44 is a float32 denormal,
    # dozens of orders of magnitude below the other unit weights
    weights[0][:] = 1E-44
    weights = np.asarray(weights)
    est = OnlineCovariance(compute_c0t=True)
    for data_traj, weights_traj in zip(data, weights):
        # lag of 3 frames; weights align with the instantaneous part
        est.partial_fit((data_traj[:-3], data_traj[3:]), weights=weights_traj[:-3])
    cov = est.fetch_model()
    assert np.all(cov.cov_00 < 1)
class TICA(Estimator, Transformer):
    r"""Time-lagged independent component analysis (TICA) [1]_, [2]_, [3]_.

    Parameters
    ----------
    lagtime : int
        The lag time between instantaneous and time-shifted data.
    dim : int or float, optional, default 0.95
        Number of dimensions (independent components) to project onto.

        * if dim is not set (None), all available ranks are kept:
          ``n_components == min(n_samples, n_uncorrelated_features)``
        * if dim is an integer >= 1, this number specifies the number
          of dimensions to keep.
        * if dim is a float with ``0 < dim <= 1``, select the number
          of dimensions such that the amount of kinetic variance
          that needs to be explained is greater than the percentage
          specified by dim.
    epsilon : float
        Eigenvalue norm cutoff. Eigenvalues of C0 with norms <= epsilon
        will be cut off. The remaining number of eigenvalues defines the
        size of the output.
    reversible : bool, default=True
        Symmetrize the correlation matrices C_0 and C_{\tau}.
    scaling : str or None, default='kinetic_map'
        * None: unscaled.
        * 'kinetic_map': eigenvectors will be scaled by eigenvalues.
          As a result, Euclidean distances in the transformed data
          approximate kinetic distances [4]_. This is a good choice when
          the data is further processed by clustering.
        * 'commute_map': eigenvector_i will be scaled by
          sqrt(timescale_i / 2). As a result, Euclidean distances in the
          transformed data will approximate commute distances [5]_.
    ncov : int, default=5
        Forwarded to the covariance estimator; bounds the number of extra
        correlation-matrix copies kept by the running estimation.

    Notes
    -----
    Given a sequence of multivariate data :math:`X_t`, this computes the
    mean-free covariance and time-lagged covariance matrix

    .. math::

        C_0      &= (X_t - \mu)^T (X_t - \mu) \\
        C_{\tau} &= (X_t - \mu)^T (X_{t + \tau} - \mu)

    and solves the generalized eigenvalue problem

    .. math:: C_{\tau} r_i = C_0 \lambda_i(\tau) r_i,

    where :math:`r_i` are the independent components and
    :math:`\lambda_i(\tau)` their respective normalized
    time-autocorrelations. The eigenvalues relate to relaxation
    timescales via

    .. math:: t_i(\tau) = -\tau / \ln |\lambda_i|.

    When used as a dimension reduction method, the input data is
    projected onto the dominant independent components.

    References
    ----------
    .. [1] Perez-Hernandez G, F Paul, T Giorgino, G De Fabritiis and F Noe. 2013.
       Identification of slow molecular order parameters for Markov model construction
       J. Chem. Phys. 139, 015102. doi:10.1063/1.4811489
    .. [2] Schwantes C, V S Pande. 2013.
       Improvements in Markov State Model Construction Reveal Many Non-Native Interactions
       in the Folding of NTL9 J. Chem. Theory. Comput. 9, 2000-2009. doi:10.1021/ct300878a
    .. [3] L. Molgedey and H. G. Schuster. 1994.
       Separation of a mixture of independent signals using time delayed correlations
       Phys. Rev. Lett. 72, 3634.
    .. [4] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps from molecular
       dynamics simulation. J. Chem. Theory. Comput. doi:10.1021/acs.jctc.5b00553
    .. [5] Noe, F., Banisch, R., Clementi, C. 2016. Commute maps: separating slowly-mixing
       molecular configurations for kinetic modeling. J. Chem. Theory. Comput.
       doi:10.1021/acs.jctc.6b00762
    """

    def __init__(self, lagtime: int, epsilon=1e-6, reversible=True, dim=0.95,
                 scaling='kinetic_map', ncov=5):
        super(TICA, self).__init__()
        # parameters that live on the model object
        model = self._model
        model.epsilon = epsilon
        model.dim = dim
        model.scaling = scaling
        model.lagtime = lagtime
        # configuration of the online covariance estimation
        self.reversible = reversible
        # NOTE(review): VAMP in this file spells the Bessel flag
        # 'bessels_correction' and additionally passes 'lagtime' to
        # OnlineCovariance — confirm which keyword set matches the
        # actual OnlineCovariance signature.
        self._covar = OnlineCovariance(compute_c00=True, compute_c0t=True,
                                       compute_ctt=False, remove_data_mean=True,
                                       reversible=self.reversible, bessel=False,
                                       ncov=ncov)

    def _create_model(self) -> TICAModel:
        # fresh, empty model; populated later via fetch_model()
        return TICAModel()

    def transform(self, data):
        r"""Project the data onto the dominant independent components.

        Parameters
        ----------
        data : ndarray(n, m)
            The input data.

        Returns
        -------
        Y : ndarray
            The data projected onto the independent components
            (delegated to the current model's transform).
        """
        return self.fetch_model().transform(data)

    def partial_fit(self, X):
        """Incrementally update the covariances and mean with one batch.

        Parameters
        ----------
        X : array, list of arrays
            Input data.

        Returns
        -------
        self
        """
        self._covar.partial_fit(X)
        return self

    def fit(self, X, **kw):
        """Estimate covariances and mean from the complete data set.

        Returns
        -------
        self
        """
        self._covar.fit(X, **kw)
        return self

    def fetch_model(self) -> TICAModel:
        """Copy the current covariance estimates into the model and return it."""
        estimate = self._covar.fetch_model()
        self._model.cov_00 = estimate.cov_00
        self._model.cov_0t = estimate.cov_0t
        self._model.mean_0 = estimate.mean_0
        return self._model
from sktime.covariance.online_covariance import OnlineCovariance

if __name__ == '__main__':
    data = np.random.normal(size=(500000, 10))

    # --------------------------------------------------------------------------
    # compute covariance matrix C00
    # --------------------------------------------------------------------------

    # configure the estimator with estimator-global parameters
    estimator = OnlineCovariance(compute_c00=True, remove_data_mean=True)

    # feed the data in 100 chunks; parameters relevant only to one chunk
    # (weights, column selection) can be supplied per partial_fit call
    for chunk in np.array_split(data, 100):
        estimator.partial_fit(chunk, weights=None, column_selection=None)

    # finalize the partial estimation (extract means & covariance matrices
    # from the running covariances) and obtain the current model
    model = estimator.fetch_model()
    print(model.mean_0)

    # a model copy is an independent object with identical content
    duplicate = model.copy()
    assert np.all(duplicate.mean_0 == model.mean_0) and duplicate is not model

    # --------------------------------------------------------------------------
    # compute covariance matrix C0t
    # --------------------------------------------------------------------------
class VAMP(Estimator):
    r"""Variational approach for Markov processes (VAMP)"""

    def __init__(self, lagtime=1, dim=None, scaling=None, right=False, epsilon=1e-6, ncov=float('inf')):
        r""" Variational approach for Markov processes (VAMP) [1]_.

        Parameters
        ----------
        lagtime : int, default=1
            Lag time between the instantaneous and the time-shifted data;
            forwarded to the underlying covariance estimator.
        dim : float or int, default=None
            Number of dimensions to keep:

            * if dim is not set (None) all available ranks are kept:
              `n_components == min(n_samples, n_uncorrelated_features)`
            * if dim is an integer >= 1, this number specifies the number
              of dimensions to keep.
            * if dim is a float with ``0 < dim < 1``, select the number
              of dimensions such that the amount of kinetic variance
              that needs to be explained is greater than the percentage
              specified by dim.
        scaling : None or string
            Scaling to be applied to the VAMP order parameters upon transformation

            * None: no scaling will be applied, variance of the order parameters is 1
            * 'kinetic map' or 'km': order parameters are scaled by singular value.
              Only the left singular functions induce a kinetic map wrt the
              conventional forward propagator. The right singular functions induce
              a kinetic map wrt the backward propagator.
        right : boolean
            Whether to compute the right singular functions.
            If `right==True`, `get_output()` will return the right singular
            functions. Otherwise, `get_output()` will return the left singular
            functions.
            Beware that only `frames[tau:, :]` of each trajectory returned
            by `get_output()` contain valid values of the right singular
            functions. Conversely, only `frames[0:-tau, :]` of each
            trajectory returned by `get_output()` contain valid values of
            the left singular functions. The remaining frames might
            possibly be interpreted as some extrapolation.
        epsilon : float
            Eigenvalue cutoff. Eigenvalues of :math:`C_{00}` and :math:`C_{11}`
            with norms <= epsilon will be cut off. The remaining number of
            eigenvalues together with the value of `dim` define the size of the
            output.
        ncov : int, default=infinity
            Limit the memory usage of the algorithm from [3]_ to an amount that
            corresponds to ncov additional copies of each correlation matrix.

        Notes
        -----
        VAMP is a method for dimensionality reduction of Markov processes.

        The Koopman operator :math:`\mathcal{K}` is an integral operator
        that describes conditional future expectation values. Let
        :math:`p(\mathbf{x},\,\mathbf{y})` be the conditional probability
        density of visiting an infinitesimal phase space volume around
        point :math:`\mathbf{y}` at time :math:`t+\tau` given that the phase
        space point :math:`\mathbf{x}` was visited at the earlier time
        :math:`t`. Then the action of the Koopman operator on a function
        :math:`f` can be written as follows:

        .. math::

            \mathcal{K}f=\int p(\mathbf{x},\,\mathbf{y})f(\mathbf{y})\,\mathrm{dy}=\mathbb{E}\left[f(\mathbf{x}_{t+\tau}\mid\mathbf{x}_{t}=\mathbf{x})\right]

        The Koopman operator is defined without any reference to an
        equilibrium distribution. Therefore it is well-defined in
        situations where the dynamics is irreversible or/and non-stationary
        such that no equilibrium distribution exists.

        If we approximate :math:`f` by a linear superposition of ansatz
        functions :math:`\boldsymbol{\chi}` of the conformational
        degrees of freedom (features), the operator :math:`\mathcal{K}`
        can be approximated by a (finite-dimensional) matrix :math:`\mathbf{K}`.

        The approximation is computed as follows: From the time-dependent
        input features :math:`\boldsymbol{\chi}(t)`, we compute the mean
        :math:`\boldsymbol{\mu}_{0}` (:math:`\boldsymbol{\mu}_{1}`) from
        all data excluding the last (first) :math:`\tau` steps of every
        trajectory as follows:

        .. math::

            \boldsymbol{\mu}_{0} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\boldsymbol{\chi}(t)

            \boldsymbol{\mu}_{1} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\boldsymbol{\chi}(t)

        Next, we compute the instantaneous covariance matrices
        :math:`\mathbf{C}_{00}` and :math:`\mathbf{C}_{11}` and the
        time-lagged covariance matrix :math:`\mathbf{C}_{01}` as follows:

        .. math::

            \mathbf{C}_{00} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]^{\top}

            \mathbf{C}_{11} :=\frac{1}{T-\tau}\sum_{t=\tau}^{T}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]^{\top}

            \mathbf{C}_{01} :=\frac{1}{T-\tau}\sum_{t=0}^{T-\tau}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]\left[\boldsymbol{\chi}(t+\tau)-\boldsymbol{\mu}_{1}\right]^{\top}

        The Koopman matrix is then computed as follows:

        .. math:: \mathbf{K}=\mathbf{C}_{00}^{-1}\mathbf{C}_{01}

        It can be shown [1]_ that the leading singular functions of the
        half-weighted Koopman matrix

        .. math:: \bar{\mathbf{K}}:=\mathbf{C}_{00}^{-\frac{1}{2}}\mathbf{C}_{01}\mathbf{C}_{11}^{-\frac{1}{2}}

        encode the best reduced dynamical model for the time series.

        The singular functions can be computed by first performing the
        singular value decomposition

        .. math:: \bar{\mathbf{K}}=\mathbf{U}^{\prime}\mathbf{S}\mathbf{V}^{\prime}

        and then mapping the input conformation to the left singular
        functions :math:`\boldsymbol{\psi}` and right singular functions
        :math:`\boldsymbol{\phi}` as follows:

        .. math::

            \boldsymbol{\psi}(t):=\mathbf{U}^{\prime\top}\mathbf{C}_{00}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{0}\right]

            \boldsymbol{\phi}(t):=\mathbf{V}^{\prime\top}\mathbf{C}_{11}^{-\frac{1}{2}}\left[\boldsymbol{\chi}(t)-\boldsymbol{\mu}_{1}\right]

        References
        ----------
        .. [1] Wu, H. and Noe, F. 2017. Variational approach for learning Markov
           processes from time series data. arXiv:1707.04659v1
        .. [2] Noe, F. and Clementi, C. 2015. Kinetic distance and kinetic maps
           from molecular dynamics simulation. J. Chem. Theory. Comput.
           doi:10.1021/acs.jctc.5b00553
        .. [3] Chan, T. F., Golub G. H., LeVeque R. J. 1979. Updating formulae and
           pairwise algorithms for computing sample variances. Technical Report
           STAN-CS-79-773, Department of Computer Science, Stanford University.
        """
        # estimator hyper-parameters; also used to build VAMPModel instances later
        self.dim = dim
        self.scaling = scaling
        self.right = right
        self.epsilon = epsilon
        self.ncov = ncov
        # running covariance estimator accumulating C00, C0t, Ctt and the means
        # NOTE(review): TICA in this file constructs OnlineCovariance with
        # 'bessel=False' and no 'lagtime' argument, whereas here the keywords
        # are 'bessels_correction=False' and 'lagtime=...' — confirm which
        # keyword set matches the actual OnlineCovariance signature.
        self._covar = OnlineCovariance(lagtime=lagtime, compute_c00=True, compute_c0t=True,
                                       compute_ctt=True, remove_data_mean=True, reversible=False,
                                       bessels_correction=False, ncov=self.ncov)
        # goes through the lagtime property below, i.e. writes to self._covar
        # again; self._covar must already exist at this point
        self.lagtime = lagtime
        # deliberately last (unlike TICA, where super().__init__() runs first)
        super(VAMP, self).__init__()

    def fit(self, data, **kw):
        """Fit a new model on the complete data set.

        Creates a fresh VAMPModel from the current hyper-parameters, runs the
        covariance estimation over `data` and finalizes it via fetch_model().

        Returns
        -------
        self
        """
        self._model = VAMPModel(dim=self.dim, epsilon=self.epsilon, scaling=self.scaling, right=self.right)
        self._covar.fit(data, **kw)
        self.fetch_model()
        return self

    def partial_fit(self, X):
        """ incrementally update the covariances and mean.

        Parameters
        ----------
        X: array, list of arrays, PyEMMA reader
            input data.

        Notes
        -----
        The projection matrix is first being calculated upon its first access.
        """
        # lazily create the model on the first batch
        if self._model is None:
            self._model = VAMPModel(dim=self.dim, epsilon=self.epsilon, scaling=self.scaling, right=self.right)
        self._covar.partial_fit(X)
        return self

    def fetch_model(self) -> VAMPModel:
        """Finalize the covariance estimate, copy it into the model,
        re-diagonalize and return the model.

        NOTE(review): assumes fit() or partial_fit() ran before; otherwise
        self._model may still be None — confirm whether Estimator.__init__
        pre-creates the model.
        """
        covar_model = self._covar.fetch_model()
        self._model.cov_00 = covar_model.cov_00
        self._model.cov_0t = covar_model.cov_0t
        self._model.cov_tt = covar_model.cov_tt
        self._model.mean_0 = covar_model.mean_0
        self._model.mean_t = covar_model.mean_t
        # recompute the decomposition from the updated covariances
        self._model._diagonalize()
        return self._model

    @property
    def lagtime(self):
        # the lag time lives on the covariance estimator; VAMP only proxies it
        return self._covar.lagtime

    @lagtime.setter
    def lagtime(self, value):
        self._covar.lagtime = value