def test_msm_submodel_statdist(disconnected_states, lag, reversible, count_mode):
    """Compare the estimated stationary distribution of each connected set to a reference.

    The reference is the normalized left eigenvector (eigenvalue close to one) of the
    row-normalized count matrix of the respective submodel.
    """
    import scipy.linalg as salg

    counting = TransitionCountEstimator(lagtime=lag, count_mode=count_mode)
    count_model = counting.fit(disconnected_states.dtrajs).fetch_model()

    for cset in count_model.connected_sets():
        submodel = count_model.submodel(cset)
        msm = MaximumLikelihoodMSM(reversible=reversible).fit(submodel).fetch_model()

        # row-normalize the counts of this submodel
        cmat = submodel.count_matrix
        tmat = cmat / np.sum(cmat, axis=-1)[:, None]

        # left eigenvector(s) belonging to eigenvalues within 1e-3 of one
        eigenvalues, left_eigenvectors = salg.eig(tmat, left=True, right=False)
        dominant = np.where(np.real(eigenvalues) > 1. - 1e-3)[0]
        pi = np.real(left_eigenvectors)[:, dominant].squeeze()

        # eigenvectors are only defined up to sign, flip if there are negative entries
        if np.any(pi < 0):
            pi *= -1.
        pi = pi / np.sum(pi)

        assert_array_almost_equal(
            msm.stationary_distribution, pi, decimal=1,
            err_msg="Failed for cset {} with cmat {}".format(cset, submodel.count_matrix)
        )
def test_weakly_connected_count_matrix():
    """Check estimation results on a count matrix that is weakly but not strongly connected."""
    count_matrix = np.array([[10, 1, 0, 0],
                             [0, 1, 1, 0],
                             [0, 1, 1, 1],
                             [0, 0, 0, 1]], dtype=np.float32)

    default_model = MaximumLikelihoodMSM().fit(count_matrix).fetch_model()
    assert_equal(default_model.n_connected_msms, 3,
                 err_msg="Count matrix not strongly connected, should decay into three sets.")

    # the count matrix is weakly connected, so the non-reversible estimate covers all states
    full_msm = MaximumLikelihoodMSM(reversible=False).fit(count_matrix).fetch_model()
    assert_equal(full_msm.reversible, False)
    assert_equal(full_msm.n_states, 4)
    assert_equal(full_msm.lagtime, 1)
    assert_(full_msm.count_model is not None)
    assert_equal(full_msm.count_model.count_matrix, count_matrix)
    # the last state is a sink state
    assert_equal(full_msm.stationary_distribution, [0, 0, 0, 1])
    expected_full = [[10. / 11, 1. / 11, 0, 0],
                     [0, 0.5, 0.5, 0],
                     [0, 1. / 3, 1. / 3, 1. / 3],
                     [0, 0, 0, 1]]
    assert_array_almost_equal(full_msm.transition_matrix, expected_full)
    assert_equal(full_msm.n_eigenvalues, 4)
    assert_equal(full_msm.sparse, False)

    # restrict to the two middle states
    sub_msm = full_msm.submodel(np.array([1, 2]))
    assert_equal(sub_msm.reversible, False)
    assert_equal(sub_msm.n_states, 2)
    assert_equal(sub_msm.count_model.state_symbols, [1, 2])
    assert_equal(sub_msm.lagtime, 1)
    assert_equal(sub_msm.count_model.count_matrix, [[1, 1], [1, 1]])
    assert_equal(sub_msm.stationary_distribution, [0.5, 0.5])
    assert_array_almost_equal(sub_msm.transition_matrix, [[0.5, 0.5], [0.5, 0.5]])
    assert_equal(sub_msm.n_eigenvalues, 2)
    assert_equal(sub_msm.sparse, False)
def msm_double_well(lagtime=100, reversible=True, **kwargs) -> MaximumLikelihoodMSM:
    """Fit a maximum-likelihood MSM estimator on the discrete double well trajectory.

    Transitions are counted in "sliding" mode at the given lagtime and the count model is
    restricted to its largest submodel before estimation.

    Parameters
    ----------
    lagtime : int, default=100
        Lagtime used for counting transitions.
    reversible : bool, default=True
        Forwarded to the :class:`MaximumLikelihoodMSM` constructor.
    **kwargs
        Further keyword arguments for the :class:`MaximumLikelihoodMSM` constructor.

    Returns
    -------
    estimator : MaximumLikelihoodMSM
        The fitted estimator (not the model).
    """
    counting = TransitionCountEstimator(lagtime=lagtime, count_mode="sliding")
    dtraj = datasets.double_well_discrete().dtraj
    largest = counting.fit(dtraj).fetch_model().submodel_largest()

    estimator = MaximumLikelihoodMSM(reversible=reversible, **kwargs)
    estimator.fit(largest)
    return estimator
def test_recover_timescale():
    """The slowest timescale estimated from simulated data must be within 200 of the analytic one."""
    trajs = double_well_discrete().simulate_trajectories(n_trajectories=100, n_steps=50000)
    reference_timescale = double_well_discrete().analytic_msm.timescales(1)[0]

    count_model = TransitionCountEstimator(1, 'sliding').fit(trajs).fetch_model()
    largest = count_model.submodel_largest()
    estimated_msm = MaximumLikelihoodMSM().fit(largest).fetch_model()
    recovered_timescale = estimated_msm.timescales(1)[0]

    deviation = np.abs(reference_timescale - recovered_timescale)
    np.testing.assert_(deviation <= 200.)
def estimate_markov_model(dtrajs, lag, **kw) -> MarkovStateModel:
    """Estimate a maximum-likelihood MSM from discrete trajectories.

    Parameters
    ----------
    dtrajs
        Discrete trajectories handed to the transition count estimator.
    lag : int
        Lagtime used for counting transitions (in "sliding" mode).
    **kw
        The keys ``statdist`` (default None), ``connectivity_threshold`` (default 0.) and
        ``sparse`` (default False) are consumed here; all remaining entries are forwarded
        to the :class:`MaximumLikelihoodMSM` constructor.

    Returns
    -------
    msm : MarkovStateModel
        The model fetched from the fitted estimator.
    """
    pi_constraint = kw.pop('statdist', None)
    threshold = kw.pop('connectivity_threshold', 0.)
    use_sparse = kw.pop('sparse', False)

    counting = TransitionCountEstimator(lagtime=lag, count_mode="sliding", sparse=use_sparse)
    full_counts = counting.fit(dtrajs).fetch_model()
    largest = full_counts.submodel_largest(probability_constraint=pi_constraint,
                                           connectivity_threshold=threshold)

    estimator = MaximumLikelihoodMSM(stationary_distribution_constraint=pi_constraint,
                                     sparse=use_sparse, **kw)
    estimator.fit(largest)
    return estimator.fetch_model()
def test_reversible_disconnected(disconnected_states, lag, count_mode):
    r"""Reversible estimation on disconnected data yields one MSM per connected set.

    Topology of the disconnected states: 2 <- 0 <-> 1 <-> 3 | 7 -> 4 <-> 5 | 6
    """
    counting = TransitionCountEstimator(lagtime=lag, count_mode=count_mode)
    count_model = counting.fit(disconnected_states.dtrajs).fetch_model()

    def fit_reversible(counts):
        # reversible maximum-likelihood estimate on the given count (sub)model
        return MaximumLikelihoodMSM(reversible=True).fit(counts).fetch_model()

    msm = fit_reversible(count_model)
    expected_sets = disconnected_states.connected_sets
    assert_equal(msm.n_connected_msms, len(expected_sets))
    # index-wise comparison is possible because subsets are ordered in decreasing cardinality
    for index, expected_symbols in enumerate(expected_sets):
        assert_equal(msm.state_symbols(index), expected_symbols)

    # states 0-3 are connected, but state 2 is only reachable in one direction
    msm = fit_reversible(count_model.submodel([0, 1, 2, 3]))
    assert_equal(msm.n_connected_msms, 2)
    assert_equal(msm.state_symbols(0), [0, 1, 3])
    assert_equal(msm.state_symbols(1), [2])

    # two states without any connection between them
    msm = fit_reversible(count_model.submodel([6, 2]))
    assert_equal(msm.n_connected_msms, 2)
    assert_equal(msm.state_symbols(0), [6])
    assert_equal(msm.state_symbols(1), [2])
def test_empirical_vs_ground_truth_koopman_model():
    """The score of an MSM estimated from simulated data matches the ground truth Koopman model score."""
    bdc = BirthDeathChain([0, .5, .5], [.5, .5, 0.])
    dtraj = bdc.msm.simulate(10000)

    estimator = MaximumLikelihoodMSM(
        reversible=True,
        stationary_distribution_constraint=bdc.stationary_distribution,
        lagtime=1,
    )
    estimated_msm = estimator.fit_fetch(dtraj)

    reference_score = bdc.msm.koopman_model.score(r=2)
    estimated_score = estimated_msm.score(r=2)
    assert_almost_equal(reference_score, estimated_score, decimal=2)
def test_estimator_params(reversible, statdist, sparse, maxiter, maxerr):
    """Constructor arguments are either rejected (invalid statdist) or exposed as attributes."""
    params = dict(reversible=reversible, stationary_distribution_constraint=statdist,
                  sparse=sparse, maxiter=maxiter, maxerr=maxerr)

    # a constraint with entries outside of [0, 1] is expected to be rejected
    out_of_range = statdist is not None and (np.any(statdist > 1) or np.any(statdist < 0))
    if out_of_range:
        with assert_raises(ValueError):
            MaximumLikelihoodMSM(**params)
        return

    msm = MaximumLikelihoodMSM(**params)
    assert_equal(msm.reversible, reversible)
    # the estimator is expected to expose the normalized constraint
    expected_constraint = None if statdist is None else statdist / np.sum(statdist)
    assert_equal(msm.stationary_distribution_constraint, expected_constraint)
    assert_equal(msm.sparse, sparse)
    assert_equal(msm.maxiter, maxiter)
    assert_equal(msm.maxerr, maxerr)
def compute_effective_stride(dtrajs, lagtime, n_states) -> int:
    r""" Computes the effective stride, an estimate of the striding required to produce
    uncorrelated samples.

    The default is the lagtime itself (lag sampling). A non-reversible MSM is estimated on the
    largest connected set; if it has more states than ``n_states``, the stride is the minimum of
    the lagtime and two times the correlation time given by the next neglected timescale.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Discretized trajectory or list of discretized trajectories
    lagtime : int
        Lagtime
    n_states : int
        Number of resolved states

    Returns
    -------
    stride : int
        Estimated effective stride to produce approximately uncorrelated samples
    """
    from deeptime.util.types import ensure_dtraj_list
    from deeptime.markov import TransitionCountEstimator
    from deeptime.markov.msm import MaximumLikelihoodMSM

    dtrajs = ensure_dtraj_list(dtrajs)

    # quick non-reversible estimate on the largest connected set
    counting = TransitionCountEstimator(lagtime=lagtime, count_mode="sliding")
    largest = counting.fit(dtrajs).fetch_model().submodel_largest()
    msm_non_rev = MaximumLikelihoodMSM(reversible=False, sparse=False).fit(largest).fetch_model()

    if msm_non_rev.n_states <= n_states:
        # no neglected timescale available: fall back to the lag as stride (=lag sampling),
        # because we currently have no better theory for deciding how many uncorrelated
        # counts we can make
        return lagtime

    # there are more timescales than resolved states, so the next (neglected) timescale serves
    # as an estimate of the de-correlation time; the MSM is non-reversible, therefore the
    # ImaginaryEigenValueWarning is silenced
    import warnings
    with warnings.catch_warnings():
        from deeptime.util.exceptions import ImaginaryEigenValueWarning
        warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning)
        correlation_time = max(1, msm_non_rev.timescales()[n_states - 1])

    # use the smaller of these two pessimistic estimates
    return int(min(lagtime, 2 * correlation_time))
def test_estimator(fixed_seed):
    """Train a single-output lobe on ellipsoids data and compare the resulting MSM to the reference."""
    data = deeptime.data.ellipsoids()
    obs = data.observations(6000, n_dim=10).astype(np.float32)

    # set up the lobe and its optimizer
    lobe = nn.Sequential(nn.Linear(10, 1), nn.Tanh())
    optimizer = torch.optim.Adam(lobe.parameters(), lr=1e-2)

    # train the lobe by hand on time-shifted chunks
    n_epochs = 50
    for _ in range(n_epochs):
        for chunk_t, chunk_tau in deeptime.util.data.timeshifted_split(obs, lagtime=1, chunksize=512):
            optimizer.zero_grad()
            features_t = lobe(torch.from_numpy(chunk_t))
            features_tau = lobe(torch.from_numpy(chunk_tau))
            loss_value = vampnet_loss(features_t, features_tau)
            loss_value.backward()
            optimizer.step()

    # now let's compare
    lobe.eval()
    dataset = TrajectoryDataset(1, obs)
    train_loader = DataLoader(dataset, batch_size=512)
    validation_loader = DataLoader(dataset, batch_size=512)

    vampnet = VAMPNet(lobe=lobe)
    vampnet_model = vampnet.fit(train_loader, validation_loader=validation_loader).fetch_model()
    assert_(len(vampnet.train_scores) > 0)
    assert_(len(vampnet.validation_scores) > 0)

    # VAMP on top of the learnt featurization, then discretize and estimate an MSM
    vamp = VAMP(lagtime=1, observable_transform=vampnet_model).fit(obs)
    projection = vamp.transform(obs, propagate=True)
    dtraj = KMeans(2).fit(projection).transform(projection)
    msm_vampnet = MaximumLikelihoodMSM().fit(dtraj, lagtime=1).fetch_model()

    np.testing.assert_array_almost_equal(msm_vampnet.transition_matrix,
                                         data.msm.transition_matrix, decimal=2)
def test_estimator_fit(dtype):
    """Fit a VAMPNet with a pre-set lobe and compare the resulting MSM to the reference."""
    data = deeptime.data.ellipsoids()
    obs = data.observations(60000, n_dim=2).astype(dtype)
    time_lagged = deeptime.data.TimeLaggedDataset.from_trajectory(1, obs)
    train, val = torch.utils.data.random_split(time_lagged, [50000, 9999])

    # set up the lobe with fixed weights and bias
    linear_layer = nn.Linear(2, 1)
    lobe = nn.Sequential(linear_layer, nn.Tanh())
    with torch.no_grad():
        linear_layer.weight[0, 0] = -0.3030
        linear_layer.weight[0, 1] = 0.3060
        linear_layer.bias[0] = -0.7392

    def passthrough_callback(*args):
        # validation score callback which just hands back its arguments
        return args

    net = VAMPNet(lobe=lobe, dtype=dtype, learning_rate=1e-8)
    train_loader = create_timelagged_data_loader(train, lagtime=1, batch_size=512)
    val_loader = create_timelagged_data_loader(val, lagtime=1, batch_size=512)
    net.fit(train_loader, n_epochs=1, validation_data=val_loader,
            validation_score_callback=passthrough_callback)

    # VAMP on top of the network output, then discretize and estimate an MSM
    features = net.transform(obs)
    projection = VAMP(lagtime=1).fit(features).fetch_model().transform(features)
    dtraj = Kmeans(2).fit(projection).transform(projection)
    msm_vampnet = MaximumLikelihoodMSM().fit(dtraj, lagtime=1).fetch_model()

    np.testing.assert_array_almost_equal(msm_vampnet.transition_matrix,
                                         data.msm.transition_matrix, decimal=2)
def test_estimator():
    """Train a single-output lobe on ellipsoids data and compare the resulting MSM to the reference."""
    data = deeptime.data.ellipsoids()
    obs = data.observations(60000, n_dim=10).astype(np.float32)

    # set up the lobe and its optimizer
    lobe = nn.Sequential(nn.Linear(10, 1), nn.Tanh())
    optimizer = torch.optim.Adam(lobe.parameters(), lr=5e-4)

    # train the lobe by hand on time-shifted chunks
    n_epochs = 50
    for _ in range(n_epochs):
        for chunk_t, chunk_tau in deeptime.data.timeshifted_split(obs, lagtime=1, chunksize=512):
            optimizer.zero_grad()
            features_t = lobe(torch.from_numpy(chunk_t))
            features_tau = lobe(torch.from_numpy(chunk_tau))
            loss_value = loss(features_t, features_tau)
            loss_value.backward()
            optimizer.step()

    # now let's compare
    lobe.eval()
    loader = create_timelagged_data_loader(obs, lagtime=1, batch_size=512)
    vampnet = VAMPNet(lobe=lobe)
    vampnet_model = vampnet.fit(loader).fetch_model()

    # VAMP on top of the network output, then discretize and estimate an MSM
    features = vampnet_model.transform(obs)
    projection = VAMP(lagtime=1).fit(features).fetch_model().transform(features)
    dtraj = Kmeans(2).fit(projection).transform(projection)
    msm_vampnet = MaximumLikelihoodMSM().fit(dtraj, lagtime=1).fetch_model()

    np.testing.assert_array_almost_equal(msm_vampnet.transition_matrix,
                                         data.msm.transition_matrix, decimal=2)
def test_nonreversible_disconnected():
    """Two disjoint chains yield a model collection whose active MSM can be switched via select."""
    msm1 = MarkovStateModel([[.7, .3], [.3, .7]])
    msm2 = MarkovStateModel([[.9, .05, .05], [.3, .6, .1], [.1, .1, .8]])

    # states of the second chain are shifted by two so that both chains are disjoint
    two_state_part = msm1.simulate(1000000)
    three_state_part = 2 + msm2.simulate(1000000)
    traj = np.concatenate([two_state_part, three_state_part])

    counts = TransitionCountEstimator(lagtime=1, count_mode="sliding").fit(traj)
    msm = MaximumLikelihoodMSM(reversible=True).fit(counts).fetch_model()

    # initially the three-state model is the selected one
    assert_equal(msm.transition_matrix.shape, (3, 3))
    assert_equal(msm.stationary_distribution.shape, (3,))
    assert_equal(msm.state_symbols(), [2, 3, 4])
    assert_equal(msm.state_symbols(1), [0, 1])

    # switch to the two-state model
    msm.select(1)
    assert_equal(msm.transition_matrix.shape, (2, 2))
    assert_equal(msm.stationary_distribution.shape, (2,))
    assert_equal(msm.state_symbols(), [0, 1])
    assert_equal(msm.state_symbols(0), [2, 3, 4])

    # there are only two models in the collection
    with assert_raises(IndexError):
        msm.select(2)
def __init__(self, reversible, statdist_constraint, sparse, count_mode="sliding"):
    """Estimate a maximum-likelihood MSM on ``self.counts`` and store reference values.

    Parameters
    ----------
    reversible : bool
        Forwarded to the estimator; also selects the set of reference timescales.
    statdist_constraint
        Forwarded to the parent constructor. If truthy, ``self.stationary_distribution`` is
        used as stationary distribution constraint of the estimator.
    sparse : bool
        Forwarded to the parent constructor and to the estimator.
    count_mode : str, default="sliding"
        Forwarded to the parent constructor.
    """
    super().__init__(statdist_constraint=statdist_constraint, sparse=sparse, count_mode=count_mode)

    estimator_args = dict(reversible=reversible, maxerr=1e-12, sparse=sparse)
    if statdist_constraint:
        estimator_args['stationary_distribution_constraint'] = self.stationary_distribution
    estimator = MaximumLikelihoodMSM(**estimator_args)
    estimator.fit(self.counts)

    self._msm = estimator.fetch_model()
    self._msm_estimator = estimator
    # reference values
    self._expectation = 31.73
    if reversible:
        self._timescales = np.array([310.87, 8.5, 5.09])
    else:
        self._timescales = np.array([310.49376926, 8.48302712, 5.02649564])
def test_invalid_arguments():
    """Invalid inputs to the estimator and to the model collection must raise a ValueError."""
    # --- invalid inputs to MaximumLikelihoodMSM.fit ---
    negative_counts = -1 * np.ones((5, 5))
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(negative_counts).fetch_model()

    non_square_counts = np.ones((3, 5))
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(non_square_counts).fetch_model()

    # stationary distribution not over whole state space
    short_statdist = np.array([1 / 3, 1 / 3, 1 / 3])
    with assert_raises(ValueError):
        MaximumLikelihoodMSM(stationary_distribution_constraint=short_statdist).fit(np.ones((5, 5)))

    # no counts but statdist constraint
    with assert_raises(ValueError):
        MaximumLikelihoodMSM(stationary_distribution_constraint=np.array([.5, .5])).fit(np.zeros((2, 2)))

    # transition count estimator that hasn't been fit
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(TransitionCountEstimator(1, "sliding"))

    # bogus object
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(object())

    # timeseries without lagtime
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(np.array([0, 1, 2, 3, 4, 5, 6]))

    # --- invalid inputs to MarkovStateModelCollection ---
    uniform_two_state = np.array([[.5, .5], [.5, .5]])

    # empty collection is not allowed
    with assert_raises(ValueError):
        MarkovStateModelCollection([], [], False, [], 1.)

    # number of elements in lists must match
    with assert_raises(ValueError):
        MarkovStateModelCollection([uniform_two_state], [], False, [], 1.)

    # number of states in lists must match
    with assert_raises(ValueError):
        MarkovStateModelCollection([uniform_two_state], [None], False,
                                   [TransitionCountModel(np.ones((3, 3)))], 1.)
def test_mlmsm_pipeline(self):
    """TICA -> KMeans -> counting pipeline on double well data recovers the reference transition matrix."""
    archive = mdshare.fetch('hmm-doublewell-2d-100k.npz', working_directory='data')
    with np.load(archive) as fh:
        data = fh['trajectory']
        transition_matrix = fh['transition_matrix']

    steps = [
        ('tica', TICA(dim=1, lagtime=1)),
        ('cluster', KMeans(n_clusters=2, max_iter=500)),
        ('counts', TransitionCountEstimator(lagtime=1, count_mode="sliding")),
    ]
    pipeline = Pipeline(steps=steps)
    pipeline.fit(data)

    # the last pipeline step holds the count model
    counts = pipeline[-1].fetch_model().submodel_largest()
    mlmsm = MaximumLikelihoodMSM().fit(counts).fetch_model()
    coarse_grained = mlmsm.pcca(2).coarse_grained_transition_matrix

    # the reference is compared both as-is and transposed; the closer one counts
    distances = (np.linalg.norm(coarse_grained - transition_matrix),
                 np.linalg.norm(coarse_grained - transition_matrix.T))
    mindist = min(distances)
    assert mindist < 0.05
def test_msm_invalid_statdist_constraint(disconnected_states, lagtime, reversible, count_mode):
    """Fitting each connected-set submodel with a uniform four-state constraint must raise a ValueError."""
    uniform_constraint = np.ones(4) / 4.
    counting = TransitionCountEstimator(lagtime=lagtime, count_mode=count_mode)
    count_model = counting.fit(disconnected_states.dtrajs).fetch_model()

    for connected_set in count_model.connected_sets():
        submodel = count_model.submodel(connected_set)
        with assert_raises(ValueError):
            estimator = MaximumLikelihoodMSM(reversible=reversible,
                                             stationary_distribution_constraint=uniform_constraint)
            estimator.fit(submodel)
def test_fit_with_invalid_args():
    """Both ``fit`` and ``fit_from_counts`` must reject unfitted estimators and arbitrary objects."""

    class Bogus:
        pass

    estimator_without_model = TransitionCountEstimator(lagtime=1, count_mode='sliding')

    for invalid_input in (estimator_without_model, Bogus()):
        with assert_raises(ValueError):
            MaximumLikelihoodMSM().fit(invalid_input)
        with assert_raises(ValueError):
            MaximumLikelihoodMSM().fit_from_counts(invalid_input)
def test_mlmsm_pipeline(self):
    """TICA -> KMeans -> counting pipeline on simulated HMM output recovers the hidden transition matrix."""
    hidden_msm = MarkovStateModel([[.8, .2], [.1, .9]])
    output_model = GaussianOutputModel(n_states=2, means=[-10, 10], sigmas=[.1, .1])
    hmm = HiddenMarkovModel(transition_model=hidden_msm, output_model=output_model)
    _, traj = hmm.simulate(10000)
    transition_matrix = hmm.transition_model.transition_matrix

    steps = [
        ('tica', TICA(dim=1, lagtime=1)),
        ('cluster', KMeans(n_clusters=2, max_iter=500)),
        ('counts', TransitionCountEstimator(lagtime=1, count_mode="sliding")),
    ]
    pipeline = Pipeline(steps=steps)
    # the observations get a trailing axis before they enter the pipeline
    pipeline.fit(traj[..., None])

    # the last pipeline step holds the count model
    counts = pipeline[-1].fetch_model().submodel_largest()
    mlmsm = MaximumLikelihoodMSM().fit(counts).fetch_model()
    coarse_grained = mlmsm.pcca(2).coarse_grained_transition_matrix

    # the reference is compared both as-is and transposed; the closer one counts
    distances = (np.linalg.norm(coarse_grained - transition_matrix),
                 np.linalg.norm(coarse_grained - transition_matrix.T))
    mindist = min(distances)
    assert mindist < 0.05
def test_strongly_connected_count_matrix():
    """Reversible estimation on the largest strongly connected set of a short trajectory."""
    # transitions 6->1->2->3->4->6, disconnected are 0 and 5
    dtraj = np.array([0, 6, 1, 2, 3, 4, 6, 5])
    counts = TransitionCountEstimator(lagtime=1, count_mode="sliding").fit(dtraj).fetch_model()
    assert_equal(counts.n_states, 7)

    directed_sets = counts.connected_sets(directed=True)
    assert_equal(len(directed_sets), 3)
    assert_equal(len(directed_sets[0]), 5)

    with assert_raises(BaseException, msg="count matrix not strongly connected, expected failure in rev. case"):
        MaximumLikelihoodMSM().fit(counts)

    # restricted to the largest directed set we are strongly connected;
    # due to reversible estimation we get 6<->1<->2<->3<->4<->6
    counts = counts.submodel_largest(directed=True)
    msm = MaximumLikelihoodMSM(reversible=True).fit(counts).fetch_model()

    # check that the msm has symbols 1,2,3,4,6
    symbols = msm.count_model.state_symbols
    assert_(np.all([symbol in symbols for symbol in (1, 2, 3, 4, 6)]))
    assert_equal(msm.reversible, True)
    assert_equal(msm.n_states, 5)
    assert_equal(msm.lagtime, 1)
    expected_ring = [
        [0., .5, 0., 0., .5],
        [.5, 0., .5, 0., 0.],
        [0., .5, 0., .5, 0.],
        [0., 0., .5, 0., .5],
        [.5, 0., 0., .5, 0.],
    ]
    assert_array_almost_equal(msm.transition_matrix, expected_ring)
    assert_array_almost_equal(msm.stationary_distribution, [1. / 5] * 5)
    assert_equal(msm.n_eigenvalues, 5)
    assert_equal(msm.sparse, False)

    # states 3 and 4 correspond to symbols 4 and 6
    sub_msm = msm.submodel(np.array([3, 4]))
    assert_equal(sub_msm.reversible, True)
    assert_equal(sub_msm.n_states, 2)
    assert_equal(sub_msm.lagtime, 1)
    assert_array_almost_equal(sub_msm.transition_matrix, [[0, 1.], [1., 0]])
    assert_array_almost_equal(sub_msm.stationary_distribution, [0.5, 0.5])
    assert_equal(sub_msm.n_eigenvalues, 2)
    assert_equal(sub_msm.sparse, False)
    assert_equal(sub_msm.count_model.state_symbols, [4, 6])
def fit(self, data, callback: Callable = None):
    """ Performs the estimation on a count matrix, a transition count model, a Markov state
    model, or a previously fitted estimator yielding one of these.

    Parameters
    ----------
    data : (N,N) count matrix or TransitionCountModel or MarkovStateModel or Estimator
        A count matrix, a transition count model that was estimated from data, an already
        estimated Markov state model, or a fitted estimator (e.g., a TransitionCountEstimator
        or MaximumLikelihoodMSM) whose model is one of the former.
    callback: callable, optional, default=None
        Function to be called to indicate progress of sampling.

    Returns
    -------
    self : BayesianMSM
        The return value of :meth:`fit_from_msm`, documented as a reference to self.

    Raises
    ------
    ValueError
        If an estimator without a model is given, if a transition count model (given directly
        or through a fitted estimator) was not estimated with an effective counting mode, or
        if the input type is unsupported.
    """
    from deeptime.markov import TransitionCountModel

    # Unwrap fitted estimators first, so that all checks below operate on the actual model.
    # Otherwise a count model handed in through its estimator would skip the counting mode check.
    if isinstance(data, Estimator):
        if not data.has_model:
            raise ValueError("Can only use estimators as input if they have been fit previously.")
        data = data.fetch_model()

    if isinstance(data, TransitionCountModel) and data.counting_mode is not None \
            and "effective" not in data.counting_mode:
        raise ValueError("The transition count model was not estimated using an effective counting method, "
                         "therefore counts are likely to be strongly correlated yielding wrong confidences.")

    if isinstance(data, TransitionCountModel) or is_square_matrix(data):
        # obtain a maximum-likelihood estimate as starting point, using this estimator's settings
        msm = MaximumLikelihoodMSM(
            reversible=self.reversible,
            stationary_distribution_constraint=self.stationary_distribution_constraint,
            sparse=self.sparse, maxiter=self.maxiter, maxerr=self.maxerr
        ).fit(data).fetch_model()
    elif isinstance(data, MarkovStateModel):
        msm = data
    else:
        raise ValueError("Unsupported input data, can only be count matrix (or TransitionCountModel, "
                         "TransitionCountEstimator) or a MarkovStateModel instance or an estimator producing "
                         "Markov state models.")

    return self.fit_from_msm(msm, callback=callback)
def metastable_from_data(dtrajs, n_hidden_states, lagtime, stride=1, mode='largest-regularized',
                         reversible: bool = True, stationary: bool = False,
                         separate_symbols=None, states: Optional[np.ndarray] = None,
                         regularize: bool = True, connectivity_threshold: Union[str, float] = 0.):
    r"""Estimates an initial guess :class:`HMM <deeptime.markov.hmm.HiddenMarkovModel>` from given
    discrete trajectories.

    Following the procedure described in :footcite:`noe2013projected`: First
    a :class:`MSM <deeptime.markov.msm.MarkovStateModel>` is estimated, which is then subsequently
    coarse-grained with PCCA+ :footcite:`roblitz2013fuzzy`. After estimation of the MSM, this method
    calls :meth:`metastable_from_msm`.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        A discrete trajectory or a list of discrete trajectories.
    n_hidden_states : int
        Number of hidden states.
    lagtime : int
        The lagtime at which transitions are counted.
    stride : int or str, optional, default=1
        Stride between two lagged trajectories extracted from the input trajectories. Given
        trajectory :code:`s[t]`, stride and lag will result in trajectories

            :code:`s[0], s[lag], s[2 lag], ...`

            :code:`s[stride], s[stride + lag], s[stride + 2 lag], ...`

        Setting stride = 1 will result in using all data (useful for maximum likelihood estimator),
        while a Bayesian estimator requires a longer stride in order to have statistically
        uncorrelated trajectories. Setting :code:`stride='effective'` uses the largest neglected
        timescale as an estimate for the correlation time and sets the stride accordingly.
    mode : str, optional, default='largest-regularized'
        The mode at which the Markov state model is estimated. Since the process is assumed to be
        reversible and finite statistics might lead to unconnected regions in state space, a
        subselection can automatically be made and the count matrix can be regularized. The
        following options are available:

        * 'all': all available states are taken into account
        * 'largest': the largest connected state set is selected, see
          :meth:`TransitionCountModel.submodel_largest <deeptime.markov.TransitionCountModel.submodel_largest>`.
        * 'populous': the connected set with the largest population in the data, see
          :meth:`TransitionCountModel.submodel_largest <deeptime.markov.TransitionCountModel.submodel_largest>`.

        For regularization, each of the options can be suffixed by a '-regularized', e.g.,
        'largest-regularized'. This means that the count matrix has no zero entries and everything
        is reversibly connected. In particular, a prior of the form

        .. math:: b_{ij}=\left \{ \begin{array}{rl}
                     \alpha & \text{, if }c_{ij}+c_{ji}>0, \\
                     0      & \text{, otherwise,}
                     \end{array} \right .

        with :math:`\alpha=10^{-3}` is added and all non-reversibly connected components are
        artificially connected by adding backward paths.
    reversible : bool, optional, default=True
        Whether the HMM transition matrix is estimated so that it is reversible.
    stationary : bool, optional, default=False
        If True, the initial distribution of hidden states is self-consistently computed as the
        stationary distribution of the transition matrix. If False, it will be estimated from the
        starting states. Only set this to true if you're sure that the observation trajectories
        are initiated from a global equilibrium distribution.
    separate_symbols : array_like, optional, default=None
        Force the given set of observed states to stay in a separate hidden state.
        The remaining nstates-1 states will be assigned by a metastable decomposition.
    states : (dtype=int) ndarray, optional, default=None
        Artificially restrict count model to selection of states, even before regularization.
    regularize : bool, optional, default=True
        If set to True, makes sure that the hidden initial distribution and transition matrix
        have nonzero probabilities by setting them to eps and then renormalizing. Avoids zeros
        that would cause estimation algorithms to crash or get stuck in suboptimal states.
    connectivity_threshold : float or '1/n', optional, default=0.
        Connectivity threshold. Counts that are below the specified value are disregarded when
        finding connected sets. In case of '1/n', the threshold gets resolved to
        :math:`1 / \mathrm{n\_states\_full}`.

    Returns
    -------
    hmm_init : HiddenMarkovModel
        An initial guess for the HMM

    Raises
    ------
    ValueError
        If ``mode`` is neither one of the valid modes nor one of them suffixed by '-regularized'.

    See Also
    --------
    DiscreteOutputModel
        The type of output model this heuristic uses.

    :func:`metastable_from_msm`
        Initial guess from an already existing :class:`MSM <deeptime.markov.msm.MarkovStateModel>`.

    :func:`deeptime.markov.hmm.init.gaussian.from_data`
        Initial guess with :class:`Gaussian output model <deeptime.markov.hmm.GaussianOutputModel>`.

    References
    ----------
    .. footbibliography::
    """
    plain_modes = metastable_from_data.VALID_MODES
    regularized_modes = [m + "-regularized" for m in plain_modes]
    if mode not in plain_modes + regularized_modes:
        raise ValueError("mode can only be one of [{}]".format(", ".join(plain_modes)))

    from deeptime.markov import compute_dtrajs_effective, TransitionCountEstimator
    from deeptime.markov.msm import MaximumLikelihoodMSM

    dtrajs = ensure_dtraj_list(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs, lagtime=lagtime, n_states=n_hidden_states, stride=stride)
    # the effective trajectories are counted with lagtime 1
    counts = TransitionCountEstimator(1, 'sliding', sparse=False).fit(dtrajs).fetch_model()
    if states is not None:
        counts = counts.submodel(states)

    if '-regularized' in mode:
        import deeptime.markov.tools.estimation as memest
        # add the neighbor prior to the count matrix in place
        prior = memest.prior_neighbor(counts.count_matrix, 0.001)
        counts.count_matrix[...] += prior
        # every state that has any in- or outgoing counts gets a diagonal count of at least 0.001
        occupancy = counts.count_matrix.sum(axis=0) + counts.count_matrix.sum(axis=1)
        nonempty = np.where(occupancy > 0)[0]
        diagonal = counts.count_matrix[nonempty, nonempty]
        counts.count_matrix[nonempty, nonempty] = np.maximum(diagonal, 0.001)

    # mode 'all' keeps every state; the other modes restrict to a connected set
    for keyword, by_population in (('largest', False), ('populous', True)):
        if keyword in mode:
            counts = counts.submodel_largest(directed=True,
                                             connectivity_threshold=connectivity_threshold,
                                             sort_by_population=by_population)

    # the intermediate MSM is always estimated reversibly; `reversible` only affects the HMM guess
    estimator = MaximumLikelihoodMSM(reversible=True, allow_disconnected=True, maxerr=1e-3, maxiter=10000)
    msm = estimator.fit(counts).fetch_model()
    return metastable_from_msm(msm, n_hidden_states, reversible, stationary, separate_symbols, regularize)
def test_sanity():
    """An MSM estimated from a long swissroll model trajectory recovers its transition matrix."""
    dtraj, _ = swissroll_model(100000)
    estimator = MaximumLikelihoodMSM(lagtime=1)
    msm = estimator.fit(dtraj).fetch_model()
    assert_almost_equal(msm.transition_matrix, swissroll_model.transition_matrix, decimal=2)