def default(dtrajs, n_states: int, lagtime: int, n_samples: int = 100,
            stride: Union[str, int] = 'effective',
            p0_prior: Optional[Union[str, float, np.ndarray]] = 'mixed',
            transition_matrix_prior: Union[str, np.ndarray] = 'mixed',
            separate: Optional[Union[int, List[int]]] = None,
            store_hidden: bool = False,
            reversible: bool = True,
            stationary: bool = False,
            dt_traj: str = '1 step'):
    """Build a :class:`BayesianHMSM` estimator that is initialized with a default prior.

    The prior estimator returned by :meth:`BayesianHMSM.default_prior_estimator` is fitted on the
    given trajectories, the fitted model is restricted with ``submodel_largest`` (connectivity
    threshold ``'1/n'``) and the result is passed as ``init_hmsm`` to a new :class:`BayesianHMSM`.
    The returned estimator itself is not fitted here.

    For a more detailed description of the arguments please refer to
    :class:`HMSM <sktime.markovprocess.hidden_markov_model.HMSM>` or
    :class:`BayesianHMSM <sktime.markovprocess.bayesian_hmsm.BayesianHMSM>`.

    Returns
    -------
    estimator : BayesianHMSM
        Estimator configured with the computed prior and the given sampling options.
    """
    trajectories = ensure_dtraj_list(dtrajs)

    # settings that the prior estimator and the Bayesian estimator have in common
    common = dict(n_states=n_states, lagtime=lagtime, stride=stride,
                  reversible=reversible, stationary=stationary)

    prior_estimator = BayesianHMSM.default_prior_estimator(separate=separate, dt_traj=dt_traj, **common)
    prior_model = prior_estimator.fit(trajectories).fetch_model()
    init_hmsm = prior_model.submodel_largest(connectivity_threshold='1/n', dtrajs=trajectories)

    return BayesianHMSM(init_hmsm=init_hmsm, n_samples=n_samples, p0_prior=p0_prior,
                        transition_matrix_prior=transition_matrix_prior,
                        store_hidden=store_hidden, **common)
def count_states(dtrajs, ignore_negative: bool = False):
    r"""Histogram of visited states over one or several discretized trajectories.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories.
    ignore_negative : bool, default=False
        If True, negative elements are dropped before counting. Otherwise they are handed to
        :func:`numpy.bincount` unchanged, which rejects negative input.

    Returns
    -------
    count : ndarray((n), dtype=int)
        Number of occurrences of each state, where n = max + 1 and max is the largest state
        index found in any trajectory.
    """
    # one bincount per trajectory; their lengths may differ
    per_trajectory = []
    for traj in ensure_dtraj_list(dtrajs):
        if ignore_negative:
            traj = traj[np.where(traj >= 0)]
        per_trajectory.append(np.bincount(traj))

    # the aggregated histogram has to be as long as the longest individual one
    n_bins = max((hist.shape[0] for hist in per_trajectory), default=0)
    total = np.zeros(n_bins, dtype=int)
    for hist in per_trajectory:
        total[:hist.shape[0]] += hist
    return total
def lag_observations(observations, lag, stride=1):
    r"""Create new trajectories that are subsampled at lag but shifted.

    Given a trajectory (s0, s1, s2, s3, s4, ...) and lag 3, this function generates the three
    trajectories (s0, s3, s6, ...), (s1, s4, s7, ...) and (s2, s5, s8, ...). This allows
    parametrizing a MLE at lag times larger than 1 without discarding data. With the default
    stride the resulting trajectories are not suitable for Bayesian estimators, which need
    data whose subsequent transitions are uncorrelated.

    Parameters
    ----------
    observations : array_like or list of array_like
        Observation trajectories.
    lag : int
        Lag time.
    stride : int, default=1
        Step between consecutive shifts, i.e. the shifts used are 0, stride, 2*stride, ...
        below lag. Use larger values for Bayesian analysis.

    Returns
    -------
    list
        The lagged trajectories; those with fewer than two frames are dropped.
    """
    # todo cppify
    lagged = (
        trajectory[offset::lag]
        for trajectory in ensure_dtraj_list(observations)
        for offset in range(0, lag, stride)
    )
    # a trajectory needs at least two frames to contain a transition
    return [sub for sub in lagged if len(sub) > 1]
def initial_guess_gaussian_from_data(dtrajs, n_hidden_states, reversible):
    r""" Makes an initial guess :class:`HMM <HiddenMarkovStateModel>` with Gaussian output model.

    To this end, a Gaussian mixture model is estimated using `scikit-learn <https://scikit-learn.org/>`_.
    Its component means and standard deviations parametrize the output model. Fractional
    transition counts between hidden states are then accumulated from the normalized state
    probabilities of consecutive frames and turned into a transition matrix.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Trajectories which are used for making the initial guess.
    n_hidden_states : int
        Number of hidden states.
    reversible : bool
        Whether the hidden transition matrix is estimated so that it is reversible.

    Returns
    -------
    hmm_init : HiddenMarkovStateModel
        An initial guess for the HMM. Its initial distribution is the stationary distribution
        of the estimated transition matrix.

    See Also
    --------
    GaussianOutputModel : The type of output model this heuristic uses.
    initial_guess_discrete_from_data : Initial guess with :class:`Discrete output model <sktime.markov.hmm.DiscreteOutputModel>`.
    initial_guess_discrete_from_msm : Initial guess from an already existing :class:`MSM <sktime.markov.msm.MarkovStateModel>` with discrete output model.
    """
    from sklearn.mixture import GaussianMixture

    dtrajs = ensure_dtraj_list(dtrajs)
    collected_observations = np.concatenate(dtrajs)
    gmm = GaussianMixture(n_components=n_hidden_states)
    gmm.fit(collected_observations[:, None])
    # With the default 'full' covariance type, covariances_ has shape (n_components, 1, 1) for
    # one-dimensional data. Index both feature axes so that sigmas is one-dimensional, like means.
    output_model = GaussianOutputModel(n_hidden_states, means=gmm.means_[:, 0],
                                       sigmas=np.sqrt(gmm.covariances_[:, 0, 0]))

    # Compute fractional state memberships.
    Nij = np.zeros((n_hidden_states, n_hidden_states))
    for o_t in dtrajs:
        # output probability
        pobs = output_model.to_state_probability_trajectory(o_t)
        # normalize
        pobs /= pobs.sum(axis=1)[:, None]
        # Accumulate fractional transition counts from this trajectory:
        # sum_t outer(pobs[t], pobs[t + 1]) written as a single matrix product.
        Nij += pobs[:-1].T @ pobs[1:]

    # Compute transition matrix maximum likelihood estimate.
    import msmtools.estimation as msmest
    import msmtools.analysis as msmana
    Tij = msmest.transition_matrix(Nij, reversible=reversible)
    pi = msmana.stationary_distribution(Tij)
    return HiddenMarkovStateModel(transition_model=Tij, output_model=output_model, initial_distribution=pi)
def nonempty_obs(self, dtrajs):
    r"""Indices of the observable states visited in the effective (lagged and strided) trajectories.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discrete trajectories; must not be None.

    Returns
    -------
    np.ndarray
        Indices of all states with a positive count in the effective trajectories.

    Raises
    ------
    ValueError
        If `dtrajs` is None.
    """
    if dtrajs is None:
        raise ValueError("Needs nonempty dtrajs to evaluate nonempty obs.")

    trajectories = ensure_dtraj_list(dtrajs)
    effective = compute_dtrajs_effective(trajectories, self.count_model.lagtime,
                                         self.count_model.n_states_full, self.stride)
    visited = count_states(effective) > 0
    return np.where(visited)[0]
def test_2state_rev_step(self):
    """Fitting a BayesianHMSM on a two-state step trajectory must raise NotImplementedError."""
    trajectory = np.array([0] * 5 + [1] * 4, dtype=int)
    trajectory_list = ensure_dtraj_list(trajectory)

    initial_guess = initial_guess_discrete_from_data(trajectory_list, 2, 1, regularize=False)
    ml_estimator = MaximumLikelihoodHMSM(initial_guess, lagtime=1)
    ml_model = ml_estimator.fit(trajectory_list).fetch_model()

    # this will generate disconnected count matrices and should fail:
    with self.assertRaises(NotImplementedError):
        BayesianHMSM(ml_model).fit(trajectory)
def visited_set(dtrajs):
    r"""Return the set of states that have at least one count.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories.

    Returns
    -------
    vis : ndarray((n), dtype=int)
        Indices of the states that occur at least once.
    """
    trajectories = ensure_dtraj_list(dtrajs)
    occupied = count_states(trajectories) > 0
    # argwhere yields one row per hit; the single column holds the state index
    hits = np.argwhere(occupied)
    return hits[:, 0]
def compute_effective_stride(dtrajs, lagtime, n_states) -> int:
    r"""Estimate the stride required to produce approximately uncorrelated samples.

    By default the stride equals the lagtime (lag sampling). A nonreversible MSM is estimated on
    the largest connected set of a sliding-window count model. If that MSM has more states than
    `n_states`, the stride is the minimum of the lagtime and twice the correlation time, where
    the correlation time is the next neglected timescale, bounded below by 1.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories.
    lagtime : int
        Lagtime.
    n_states : int
        Number of resolved states.

    Returns
    -------
    stride : int
        Estimated effective stride to produce approximately uncorrelated samples.
    """
    trajectories = ensure_dtraj_list(dtrajs)

    # quick estimate based on the spectrum of a non-reversible MSM
    from sktime.markovprocess import TransitionCountEstimator
    counting = TransitionCountEstimator(lagtime=lagtime, count_mode="sliding")
    largest_counts = counting.fit(trajectories).fetch_model().submodel_largest()

    from sktime.markovprocess import MaximumLikelihoodMSM
    quick_msm = MaximumLikelihoodMSM(reversible=False, sparse=False).fit(largest_counts).fetch_model()

    if quick_msm.n_states <= n_states:
        # no neglected timescale available: fall back to lag sampling, because there is
        # currently no better theory for deciding how many uncorrelated counts can be made
        return lagtime

    # the MSM is non-reversible, so the ImaginaryEigenValueWarning is silenced on purpose
    import warnings
    from msmtools.util.exceptions import ImaginaryEigenValueWarning
    with warnings.catch_warnings():
        warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning,
                                module='msmtools.analysis.dense.decomposition')
        # the next (neglected) timescale serves as estimate of the de-correlation time
        neglected_timescale = quick_msm.timescales()[n_states - 1]
        correlation_time = max(1, neglected_timescale)

    # use the smaller of the two pessimistic estimates
    return int(min(lagtime, 2 * correlation_time))
def transform_discrete_trajectories_to_observed_symbols(self, dtrajs):
    r"""Map discrete trajectories onto the currently used set of observation symbols.

    For example, if the model was restricted to a connected subset, the returned indices refer
    to positions within that subset. Frames whose symbol is not part of the considered set are
    set to -1.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectories.

    Returns
    -------
    list of array_like
        Curated discretized trajectories in which unconsidered symbols are mapped to -1.
    """
    trajectories = ensure_dtraj_list(dtrajs)

    # lookup table: full symbol index -> index within the observed set, -1 when not observed
    lookup = np.full(self.n_observation_states_full, -1, dtype=np.int32)
    lookup[self.observation_state_symbols] = np.arange(self.n_observation_states)

    mapped = []
    for trajectory in trajectories:
        mapped.append(lookup[trajectory])
    return mapped
def nonempty_obs(self, dtrajs) -> np.ndarray:
    r"""Compute the set of visited observable states given a set of discrete trajectories.

    The trajectories are first converted to effective (lagged and strided) trajectories using
    the lagtime and full state count of the transition model and the stride of this model.

    Parameters
    ----------
    dtrajs : array_like
        Observable trajectory; must not be None.

    Returns
    -------
    symbols : np.ndarray
        The observation symbols which are visited.

    Raises
    ------
    ValueError
        If `dtrajs` is None.
    """
    from sktime.markov.util import compute_dtrajs_effective, count_states

    if dtrajs is None:
        raise ValueError("Needs nonempty dtrajs to evaluate nonempty obs.")

    trajectories = ensure_dtraj_list(dtrajs)
    effective = compute_dtrajs_effective(trajectories,
                                         self.transition_model.lagtime,
                                         self.transition_model.count_model.n_states_full,
                                         self.stride)
    visited = count_states(effective) > 0
    return np.where(visited)[0]
def compute_index_states(dtrajs, subset=None) -> typing.List[np.ndarray]:
    """Generate trajectory/time indices for the given list of states.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories. Negative elements will be
        ignored.
    subset : ndarray((n)), optional, default = None
        Array of states to be indexed. By default all states in dtrajs will be used.

    Returns
    -------
    indices : list of ndarray( (N_i, 2) )
        For each state, all trajectory and time indices where this state occurs. Each matrix has
        as many rows as the corresponding state has occurrences; a row is a tuple (i, t), where
        i is the index of the trajectory and t is the time index within the trajectory.
    """
    from . import _markovprocess_bindings as bindings

    # normalize the input, the actual indexing is done by the compiled bindings
    trajectories = ensure_dtraj_list(dtrajs)
    index_states = bindings.sample.index_states
    return index_states(trajectories, subset)
def compute_viterbi_paths(self, observations) -> List[np.ndarray]:
    r"""Compute the Viterbi paths using the current HMM model.

    Parameters
    ----------
    observations : list of array_like or array_like
        Observation trajectories.

    Returns
    -------
    paths : list of np.ndarray
        One Viterbi path per observation trajectory.
    """
    trajectories = ensure_dtraj_list(observations)
    transition_matrix = self.transition_model.transition_matrix
    initial_distribution = self.initial_distribution

    # first evaluate the output model on every trajectory ...
    probability_trajectories = []
    for trajectory in trajectories:
        probability_trajectories.append(self.output_model.to_state_probability_trajectory(trajectory))

    # ... then decode each state probability trajectory
    return [viterbi(transition_matrix, probabilities, initial_distribution)
            for probabilities in probability_trajectories]
def fit(self, data, *args, **kw):
    r"""Count transitions at the given lag time according to the configuration of the estimator.

    The resulting :class:`TransitionCountModel` is stored as the model of this estimator.

    Parameters
    ----------
    data : array_like or list of array_like
        Discretized trajectories.
    *args, **kw
        Not used by this method.

    Returns
    -------
    self
        Reference to self.

    Raises
    ------
    ValueError
        If the configured count mode is none of 'sliding', 'sliding-effective', 'sample'
        or 'effective'.
    """
    trajectories = ensure_dtraj_list(data)

    # basic count statistics
    state_histogram = count_states(trajectories, ignore_negative=True)

    mode = self.count_mode
    lag = self.lagtime

    # compute the count matrix
    if mode in ('sliding', 'sliding-effective', 'sample'):
        counts = msmest.count_matrix(trajectories, lag, sliding=(mode != 'sample'),
                                     sparse_return=self.sparse)
        if mode == 'sliding-effective':
            counts /= lag
    elif mode == 'effective':
        counts = msmest.effective_count_matrix(trajectories, lag)
        if issparse(counts) and not self.sparse:
            counts = counts.toarray()
    else:
        raise ValueError(f'Count mode {mode} is unknown.')

    # state symbols, full count matrix and full histogram can initially be left None because
    # they coincide with the input arguments
    self._model = TransitionCountModel(
        count_matrix=counts,
        counting_mode=mode,
        lagtime=lag,
        state_histogram=state_histogram,
        physical_time=self.physical_time,
    )
    return self
def fit(self, dtrajs, **kwargs):
    r"""Estimate a discrete hidden Markov state model by maximum likelihood.

    The trajectories are converted to effective (lagged and strided) trajectories, an initial
    HMM is built from ``self.msm_init`` and then refined with ``MaximumLikelihoodHMM``. The
    result is stored as :class:`HiddenMarkovStateModel` in this estimator.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discrete trajectories.
    **kwargs
        Not used by this method.

    Returns
    -------
    self
        Reference to self.

    Raises
    ------
    ValueError
        If the lag time is not smaller than the length of the longest trajectory.
    RuntimeError
        If ``self.msm_init`` is neither a string nor a ``MarkovStateModel``.
    """
    trajectories = ensure_dtraj_list(dtrajs)

    # check the lag time against the trajectory lengths
    lengths = [len(trajectory) for trajectory in trajectories]
    if self.lagtime >= np.max(lengths):
        raise ValueError(f'Illegal lag time {self.lagtime} exceeds longest trajectory length')
    mean_length = np.mean(lengths)
    if self.lagtime > mean_length:
        warnings.warn(f'Lag time {self.lagtime} is on the order of mean trajectory length '
                      f'{mean_length}. It is recommended to fit four lag times in each '
                      'trajectory. HMM might be inaccurate.')

    effective_trajectories = compute_dtrajs_effective(
        trajectories, lagtime=self.lagtime, n_states=self.n_hidden_states, stride=self.stride)

    # initial HMM
    msm_init = self.msm_init
    if isinstance(msm_init, str):
        init_kwargs = dict(observations=effective_trajectories, n_states=self.n_hidden_states,
                           lag=1, reversible=self.reversible, stationary=True, regularize=True,
                           separate=self.separate)
        # any other string leaves the method at the default of init_discrete_hmm
        method_by_init = {'largest-strong': 'lcs-spectral', 'all': 'spectral'}
        if msm_init in method_by_init:
            init_kwargs['method'] = method_by_init[msm_init]
        initial_hmm = init_discrete_hmm(**init_kwargs)
    elif isinstance(msm_init, MarkovStateModel):
        init_counts = self.msm_init.count_model
        p0, P0, pobs0 = init_discrete_hmm_spectral(
            init_counts.count_matrix.toarray(), self.n_hidden_states,
            reversible=self.reversible, stationary=True,
            P=self.msm_init.transition_matrix, separate=self.separate)
        initial_hmm = discrete_hmm(p0, P0, pobs0)
    else:
        raise RuntimeError("msm init was neither a string (largest-strong or spectral) nor "
                           "a MarkovStateModel: {}".format(self.msm_init))

    # estimate the discrete HMM
    from .bhmm.estimators.maximum_likelihood import MaximumLikelihoodHMM
    ml_estimator = MaximumLikelihoodHMM(
        self.n_hidden_states, initial_model=initial_hmm, output='discrete',
        reversible=self.reversible, stationary=self.stationary,
        accuracy=self.accuracy, maxit=self.maxit)
    hmm = ml_estimator.fit(effective_trajectories).fetch_model()

    # count model built from the transition counts of the estimated HMM
    count_model = TransitionCountModel(count_matrix=hmm.transition_counts,
                                       lagtime=self.lagtime,
                                       physical_time=self.physical_time)

    # set model parameters
    self._model = HiddenMarkovStateModel(
        transition_matrix=hmm.transition_matrix,
        observation_probabilities=hmm.output_model.output_probabilities,
        stride=self.stride,
        stationary_distribution=hmm.stationary_distribution,
        initial_counts=hmm.initial_count,
        reversible=self.reversible,
        initial_distribution=hmm.initial_distribution,
        count_model=count_model,
        bhmm_model=hmm,
        observation_state_symbols=None)
    return self
def fit(self, dtrajs, callback=None):
    r"""Sample a posterior of hidden Markov state models around ``self.init_hmsm``.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discrete trajectories.
    callback : optional, default=None
        Passed on unchanged to ``bayesian_hmm``.

    Returns
    -------
    self
        Reference to self; the sampled posterior is stored as the model of this estimator.

    Raises
    ------
    ValueError
        If lagtime or n_states of this estimator differ from those of ``init_hmsm``, or if the
        stride differs from the one of ``init_hmsm`` and the symbols observed in the effective
        trajectories do not match ``init_hmsm.observation_state_symbols``.
    NotImplementedError
        If a reversible sampler is requested but the count matrix of the prior is disconnected.

    Notes
    -----
    If ``self.stride`` is ``'effective'`` it is replaced on the estimator by the computed
    effective stride, and ``self.observe_nonempty`` is set to False.
    """
    dtrajs = ensure_dtraj_list(dtrajs)

    model = BayesianHMMPosterior()

    # check if n_states and lag are compatible
    if self.lagtime != self.init_hmsm.lagtime:
        raise ValueError('BayesianHMSM cannot be initialized with init_hmsm with incompatible lagtime.')
    if self.n_states != self.init_hmsm.n_states:
        raise ValueError('BayesianHMSM cannot be initialized with init_hmsm with incompatible n_states.')

    # EVALUATE STRIDE
    init_stride = self.init_hmsm.stride
    if self.stride == 'effective':
        from sktime.markovprocess.util import compute_effective_stride
        self.stride = compute_effective_stride(dtrajs, self.lagtime, self.n_states)

    # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
    dtrajs_lagged_strided = compute_dtrajs_effective(
        dtrajs, lagtime=self.lagtime, n_states=self.n_states, stride=self.stride
    )
    if self.stride != init_stride:
        symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
        # array_equal also copes with symbol sets of different size, where an elementwise
        # comparison would have to rely on broadcasting
        if not np.array_equal(self.init_hmsm.observation_state_symbols, symbols):
            raise ValueError('Choice of stride has excluded a different set of microstates than in '
                             'init_hmsm. Set of observed microstates in time-lagged strided trajectories '
                             'must match to the one used for init_hmsm estimation.')

    # as mentioned in the docstring, take init_hmsm observed set observation probabilities
    self.observe_nonempty = False

    # update HMM Model
    model.prior = self.init_hmsm.copy()
    prior = model.prior
    prior_count_model = prior.count_model

    # check if we have a valid initial model
    if self.reversible and not is_connected(prior_count_model.count_matrix):
        # report the matrix that was actually tested and the lagtime attribute used throughout
        raise NotImplementedError(f'Encountered disconnected count matrix:\n{prior_count_model.count_matrix} '
                                  f'with reversible Bayesian HMM sampler using lag={self.lagtime}'
                                  f' and stride={self.stride}. Consider using shorter lag, '
                                  'or shorter stride (to use more of the data), '
                                  'or using a lower value for mincount_connectivity.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    n_states_full = number_of_states(dtrajs)

    if prior.n_observation_states < n_states_full:
        eps = 0.01 / n_states_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * np.ones((self.n_states, n_states_full), dtype=np.float64)
        # fill active states
        B_init[:, prior.observation_state_symbols] = np.maximum(eps, prior.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = prior.observation_probabilities

    # HMM sampler
    # NOTE(review): init_hmsm is already dereferenced above, so the else branch (the only
    # consumer of B_init) looks unreachable - confirm whether B_init is still needed.
    if self.init_hmsm is not None:
        hmm_mle = self.init_hmsm.bhmm_model
    else:
        hmm_mle = discrete_hmm(prior.initial_distribution, prior.transition_matrix, B_init)

    sampled_hmm = bayesian_hmm(dtrajs_lagged_strided, hmm_mle, nsample=self.n_samples,
                               reversible=self.reversible, stationary=self.stationary,
                               p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                               store_hidden=self.store_hidden, callback=callback).fetch_model()

    # repackage samples as HMSM objects and re-normalize after restricting to observable set
    samples = []
    for sample in sampled_hmm:
        # restrict to observable set if necessary
        P = sample.transition_matrix
        pi = sample.stationary_distribution
        pobs = sample.output_model.output_probabilities
        init_dist = sample.initial_distribution

        Bobs = pobs[:, prior.observation_state_symbols]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(HiddenMarkovStateModel(P, pobs, stationary_distribution=pi,
                                              count_model=prior_count_model,
                                              initial_counts=sample.initial_count,
                                              reversible=self.reversible,
                                              initial_distribution=init_dist))

    # store results
    if self.store_hidden:
        model.hidden_state_trajectories_samples = [s.hidden_state_trajectories for s in sampled_hmm]
    model.samples = samples

    # set new model
    self._model = model

    return self
def submodel_populous(self, strong=True, connectivity_threshold='1/n', observe_nonempty=True, dtrajs=None):
    r"""Restrict the model to the populous state set of the prior.

    Parameters
    ----------
    strong : bool, default=True
        Passed on to ``self.prior.states_populous``.
    connectivity_threshold : float or '1/n', default='1/n'
        Passed on to ``self.prior.states_populous``.
    observe_nonempty : bool, default=True
        If True, the observable set is determined with ``self.prior.nonempty_obs(dtrajs)``;
        otherwise ``obs=None`` is passed to :meth:`submodel`.
    dtrajs : array_like or list of array_like, optional, default=None
        Discrete trajectories, only used when `observe_nonempty` is True.

    Returns
    -------
    The result of ``self.submodel(states=states, obs=obs)``.
    """
    # dtrajs is optional: only normalize real data, so that None reaches nonempty_obs untouched
    # and calls with observe_nonempty=False do not need any trajectories
    if dtrajs is not None:
        dtrajs = ensure_dtraj_list(dtrajs)
    states = self.prior.states_populous(strong=strong, connectivity_threshold=connectivity_threshold)
    obs = self.prior.nonempty_obs(dtrajs) if observe_nonempty else None
    return self.submodel(states=states, obs=obs)
def score_cv(estimator: _MSMBaseEstimator, dtrajs, lagtime, n=10, count_mode="sliding", score_method='VAMP2',
             score_k=10, random_state=None):
    r"""Score an MSM estimator with the variational approach for Markov processes [1]_ [2]_
    and cross-validation [3]_ .

    The data is divided into training and test data, an MSM is fitted on the training data with
    the given estimator and scored on the test data. Currently only one way of splitting is
    implemented: for each of the n repetitions the data is randomly divided into two
    approximately equally large sets of discrete trajectory fragments with lengths of at least
    the lagtime.

    Currently only implemented using dense matrices - will be slow for large state spaces.

    Parameters
    ----------
    estimator : MSMBaseEstimator like
        Estimator to produce models for CV.
    dtrajs : list of array_like
        Test data (discrete trajectories).
    lagtime : int
        Lag time.
    n : number of samples
        Number of repetitions of the cross-validation. Use large n to get solid means of the score.
    count_mode : str, optional, default='sliding'
        Counting mode of the count matrix estimator; if sliding, the trajectory is split in a
        sliding window fashion. Supports 'sliding' and 'sample'.
    score_method : str, optional, default='VAMP2'
        Overwrite scoring method to be used if desired. If `None`, the estimators scoring
        method will be used.
        Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

        *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the sum of transition
                    matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
        *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the kinetic variance [4]_ .
    score_k : int or None
        The maximum number of eigenvalues or singular values used in the score.
        If set to None, all available eigenvalues will be used.
    random_state : optional, default=None
        Passed on unchanged to ``blocksplit_dtrajs`` and ``cvsplit_dtrajs`` in every repetition.

    Returns
    -------
    scores : np.ndarray
        One score per repetition.

    Raises
    ------
    ValueError
        If `count_mode` is neither 'sliding' nor 'sample'.

    References
    ----------
    .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
        in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
    .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
        from time series data (in preparation).
    .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
        dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015).
    .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
        dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).
    """
    from sktime.markovprocess import TransitionCountEstimator
    from sktime.util import ensure_dtraj_list

    trajectories = ensure_dtraj_list(dtrajs)  # ensure format
    if count_mode not in ('sliding', 'sample'):
        raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
    use_sliding = count_mode == 'sliding'

    collected = []
    for _ in range(n):
        fragments = blocksplit_dtrajs(trajectories, lag=lagtime, sliding=use_sliding,
                                      random_state=random_state)
        train, test = cvsplit_dtrajs(fragments, random_state=random_state)

        # count on the training fragments and keep the largest connected set
        train_counts = TransitionCountEstimator(lagtime, count_mode).fit(train).fetch_model()
        train_counts = train_counts.submodel_largest()

        fitted = estimator.fit(train_counts).fetch_model()
        collected.append(fitted.score(test, score_method=score_method, score_k=score_k))
    return np.array(collected)
def initial_guess_discrete_from_data(dtrajs, n_hidden_states, lagtime, stride=1, mode='largest-regularized',
                                     reversible: bool = True, stationary: bool = False,
                                     separate_symbols=None, states: Optional[np.ndarray] = None,
                                     regularize: bool = True, connectivity_threshold: Union[str, float] = 0.):
    r"""Estimates an initial guess :class:`HMM <sktime.markov.hmm.HiddenMarkovStateModel>` from given
    discrete trajectories.

    Following the procedure described in [1]_: First a :class:`MSM <sktime.markov.msm.MarkovStateModel>`
    is estimated, which is then subsequently coarse-grained with PCCA+ [2]_. After estimation of the MSM,
    this method calls :meth:`initial_guess_discrete_from_msm`.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        A discrete trajectory or a list of discrete trajectories.
    n_hidden_states : int
        Number of hidden states.
    lagtime : int
        The lagtime at which transitions are counted.
    stride : int or str, optional, default=1
        Stride between two lagged trajectories extracted from the input trajectories. Given
        trajectory :code:`s[t]`, stride and lag will result in trajectories

            :code:`s[0], s[lag], s[2 lag], ...`

            :code:`s[stride], s[stride + lag], s[stride + 2 lag], ...`

        Setting stride = 1 will result in using all data (useful for maximum likelihood estimator),
        while a Bayesian estimator requires a longer stride in order to have statistically
        uncorrelated trajectories. Setting :code:`stride='effective'` uses the largest neglected
        timescale as an estimate for the correlation time and sets the stride accordingly.
    mode : str, optional, default='largest-regularized'
        The mode at which the markov state model is estimated. Since the process is assumed to be
        reversible and finite statistics might lead to unconnected regions in state space, a
        subselection can automatically be made and the count matrix can be regularized. The
        following options are available:

        * 'all': all available states are taken into account
        * 'largest': the largest connected state set is selected, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`.
        * 'populous': the connected set with the largest population in the data, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`
          with ``sort_by_population=True``.

        For regularization, each of the options can be suffixed by a '-regularized', e.g.,
        'largest-regularized'. This means that the count matrix has no zero entries and everything
        is reversibly connected. In particular, a prior of the form

        .. math:: b_{ij}=\left \{ \begin{array}{rl}
                     \alpha & \text{, if }c_{ij}+c_{ji}>0, \\
                     0      & \text{, otherwise,}
                  \end{array} \right .

        with :math:`\alpha=10^{-3}` is added and all non-reversibly connected components are
        artificially connected by adding backward paths.
    reversible : bool, optional, default=True
        Whether the HMM transition matrix is estimated so that it is reversible.
    stationary : bool, optional, default=False
        If True, the initial distribution of hidden states is self-consistently computed as the
        stationary distribution of the transition matrix. If False, it will be estimated from the
        starting states. Only set this to true if you're sure that the observation trajectories
        are initiated from a global equilibrium distribution.
    separate_symbols : array_like, optional, default=None
        Force the given set of observed states to stay in a separate hidden state.
        The remaining nstates-1 states will be assigned by a metastable decomposition.
    states : (dtype=int) ndarray, optional, default=None
        Artificially restrict count model to selection of states, even before regularization.
    regularize : bool, optional, default=True
        If set to True, makes sure that the hidden initial distribution and transition matrix have
        nonzero probabilities by setting them to eps and then renormalizing. Avoids zeros that
        would cause estimation algorithms to crash or get stuck in suboptimal states.
    connectivity_threshold : float or '1/n', optional, default=0.
        Connectivity threshold. Counts that are below the specified value are disregarded when
        finding connected sets. In case of '1/n', the threshold gets resolved to
        :math:`1 / \mathrm{n\_states\_full}`.

    Returns
    -------
    hmm_init : HiddenMarkovStateModel
        An initial guess for the HMM

    Raises
    ------
    ValueError
        If `mode` is neither one of the valid modes nor one of them suffixed by '-regularized'.

    See Also
    --------
    DiscreteOutputModel : The type of output model this heuristic uses.
    initial_guess_discrete_from_msm : Initial guess from an already existing :class:`MSM <sktime.markov.msm.MarkovStateModel>`.
    initial_guess_gaussian_from_data : Initial guess with :class:`Gaussian output model <sktime.markov.hmm.GaussianOutputModel>`.

    References
    ----------
    .. [1] F. Noe, H. Wu, J.-H. Prinz and N. Plattner: Projected and hidden Markov models for
        calculating kinetics and metastable states of complex molecules. J. Chem. Phys. 139,
        184114 (2013)
    .. [2] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+: application to Markov
        state models and data classification. Adv Data Anal Classif 7, 147-179 (2013).
    """
    base_modes = initial_guess_discrete_from_data.VALID_MODES
    accepted_modes = base_modes + [base + "-regularized" for base in base_modes]
    if mode not in accepted_modes:
        raise ValueError("mode can only be one of [{}]".format(", ".join(base_modes)))

    dtrajs = ensure_dtraj_list(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs, lagtime=lagtime, n_states=n_hidden_states, stride=stride)
    # the effective trajectories are already lagged, hence counting happens at lag 1
    counts = TransitionCountEstimator(1, 'sliding', sparse=False).fit(dtrajs).fetch_model()
    if states is not None:
        counts = counts.submodel(states)

    if '-regularized' in mode:
        import msmtools.estimation as memest
        counts.count_matrix[...] += memest.prior_neighbor(counts.count_matrix, 0.001)
        # states with any incoming or outgoing count get a diagonal entry of at least 0.001
        in_plus_out = counts.count_matrix.sum(axis=0) + counts.count_matrix.sum(axis=1)
        nonempty = np.where(in_plus_out > 0)[0]
        counts.count_matrix[nonempty, nonempty] = np.maximum(counts.count_matrix[nonempty, nonempty], 0.001)

    # 'all' needs no subselection; the other modes only differ in how connected sets are ranked
    for keyword, by_population in (('largest', False), ('populous', True)):
        if keyword in mode:
            counts = counts.submodel_largest(directed=True, connectivity_threshold=connectivity_threshold,
                                             sort_by_population=by_population)

    msm_estimator = MaximumLikelihoodMSM(reversible=True, allow_disconnected=True, maxerr=1e-3, maxiter=10000)
    msm = msm_estimator.fit(counts).fetch_model()
    return initial_guess_discrete_from_msm(msm, n_hidden_states, reversible, stationary, separate_symbols,
                                           regularize)
def fit(self, dtrajs, initial_model=None, **kwargs):
    r""" Fits a new :class:`HMM <HiddenMarkovStateModel>` to data.

    Starting from a copy of the initial model, the method alternates a forward-backward pass over
    all effective (lagged and strided) trajectories with a model update. It stops once the
    log-likelihood gain of an iteration drops below ``self.accuracy`` or after ``self.maxit``
    iterations. Finally the hidden state trajectories are computed with ``viterbi`` and
    everything is stored as the model of this estimator.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Timeseries data.
    initial_model : HiddenMarkovStateModel, optional, default=None
        Override for :attr:`initial_transition_model`.
    **kwargs
        Ignored kwargs for scikit-learn compatibility.

    Returns
    -------
    self : MaximumLikelihoodHMSM
        Reference to self.

    Raises
    ------
    ValueError
        If neither `initial_model` nor :attr:`initial_transition_model` provides a
        :class:`HiddenMarkovStateModel`.
    """
    if initial_model is None:
        initial_model = self.initial_transition_model
    if initial_model is None or not isinstance(initial_model, HiddenMarkovStateModel):
        raise ValueError("For estimation, an initial model of type "
                         "`sktime.markov.hmm.HiddenMarkovStateModel` is required.")

    # copy initial model, so that the updates below do not modify the object passed in
    transition_matrix = initial_model.transition_model.transition_matrix
    if issparse(transition_matrix):
        # want dense matrix, toarray makes a copy
        transition_matrix = transition_matrix.toarray()
    else:
        # new instance
        transition_matrix = np.copy(transition_matrix)

    hmm_data = MaximumLikelihoodHMSM._HMMModelStorage(transition_matrix=transition_matrix,
                                                      output_model=initial_model.output_model.copy(),
                                                      initial_distribution=initial_model.initial_distribution.copy())

    # estimation runs on the effective (lagged and strided) trajectories
    dtrajs = ensure_dtraj_list(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs, lagtime=self.lagtime, n_states=initial_model.n_hidden_states,
                                      stride=self.stride)

    max_n_frames = max(len(obs) for obs in dtrajs)
    # pre-construct hidden variables
    # alpha and beta are sized for the longest trajectory and shared by all trajectories;
    # gammas and count_matrices hold one array per trajectory
    N = initial_model.n_hidden_states
    alpha = np.zeros((max_n_frames, N))
    beta = np.zeros((max_n_frames, N))
    gammas = [np.zeros((len(obs), N)) for obs in dtrajs]
    count_matrices = [np.zeros((N, N)) for _ in dtrajs]

    it = 0
    # one slot per possible iteration; trimmed to the iterations actually run after the loop
    likelihoods = np.empty(self.maxit)
    # flag if connectivity has changed (e.g. state lost) - in that case the likelihood
    # is discontinuous and can't be used as a convergence criterion in that iteration.
    tmatrix_nonzeros = hmm_data.transition_matrix.nonzero()
    converged = False

    while not converged and it < self.maxit:
        # total log-likelihood of this iteration, summed over all trajectories
        loglik = 0.0
        for obs, gamma, counts in zip(dtrajs, gammas, count_matrices):
            loglik_update, _ = self._forward_backward(hmm_data, obs, alpha, beta, gamma, counts)
            loglik += loglik_update
        assert np.isfinite(loglik), it

        # convergence check: compare against the log-likelihood of the previous iteration
        if it > 0:
            dL = loglik - likelihoods[it - 1]
            if dL < self.accuracy:
                converged = True

        # update model
        self._update_model(hmm_data, dtrajs, gammas, count_matrices, maxiter=self.maxit_reversible)

        # connectivity change check
        tmatrix_nonzeros_new = hmm_data.transition_matrix.nonzero()
        if not np.array_equal(tmatrix_nonzeros, tmatrix_nonzeros_new):
            converged = False  # unset converged
            tmatrix_nonzeros = tmatrix_nonzeros_new

        # end of iteration
        likelihoods[it] = loglik
        it += 1

    # keep only the log-likelihoods of the iterations that were carried out
    likelihoods = np.resize(likelihoods, it)

    transition_counts = self._reduce_transition_counts(count_matrices)

    count_model = TransitionCountModel(count_matrix=transition_counts, lagtime=self.lagtime,
                                       physical_time=self.physical_time)
    transition_model = MarkovStateModel(hmm_data.transition_matrix, reversible=self.reversible,
                                        count_model=count_model)
    # one hidden state path per effective trajectory, decoded with the final model parameters
    hidden_state_trajs = [
        viterbi(hmm_data.transition_matrix, hmm_data.output_model.to_state_probability_trajectory(obs),
                hmm_data.initial_distribution) for obs in dtrajs
    ]
    model = HiddenMarkovStateModel(
        transition_model=transition_model,
        output_model=hmm_data.output_model,
        initial_distribution=hmm_data.initial_distribution,
        likelihoods=likelihoods,
        state_probabilities=gammas,
        initial_count=self._init_counts(gammas),
        hidden_state_trajectories=hidden_state_trajs,
        stride=self.stride)
    self._model = model
    return self