def test_simulate_stats(msm):
    """Check the statistics of the starting state of simulated trajectories.

    Simulates many one-step trajectories with distinct seeds and compares the
    empirical distribution of the resulting states against the stationary
    distribution computed from the model's transition matrix.
    """
    n_trajs = 5000
    # seeds run over 1..n_trajs, one single-step trajectory per seed
    simulated = [msm.simulate(1, seed=seed) for seed in range(1, n_trajs + 1)]
    first_states = np.concatenate(simulated).astype(int)

    expected = stationary_distribution(msm.transition_matrix)
    estimated = count_states(first_states) / float(n_trajs)

    np.testing.assert_allclose(estimated, expected, atol=0.025)
    assert_(msm.stationary)
def active_count_fraction(self):
    """Fraction of all state counts that belong to the active set.

    The histogram is taken over the full discrete trajectories; the counts of
    the states in ``self.active_set`` are divided by the total number of counts.
    """
    self._check_is_estimated()
    from pyemma.util.discrete_trajectories import count_states

    counts_full = count_states(self._dtrajs_full)
    counts_active = counts_full[self.active_set]
    n_active = float(_np.sum(counts_active))
    n_total = float(_np.sum(counts_full))
    return n_active / n_total
def test_active_state_indices(oom_msm_scenario):
    """Index states of the projected trajectories must match the state histogram.

    For every model of the scenario, the per-state index arrays must have one
    row per frame counted for the corresponding state symbol and two columns.
    """
    for model in oom_msm_scenario.msms:
        count_model = model.count_model
        projected = count_model.transform_discrete_trajectories_to_submodel(
            oom_msm_scenario.dtrajs)
        index_states = sample.compute_index_states(projected)
        np.testing.assert_equal(len(index_states), model.n_states)

        histogram = count_states(oom_msm_scenario.dtrajs)
        for state in range(model.n_states):
            state_indices = index_states[state]
            symbol = model.count_model.state_symbols[state]
            np.testing.assert_equal(state_indices.shape[0], histogram[symbol])
            np.testing.assert_equal(state_indices.shape[1], 2)
def test_active_state_indices(self, setting):
    """Index states restricted to the model's state symbols must match the histogram."""
    scenario = make_double_well(setting)
    from deeptime.markov import sample

    index_states = sample.compute_index_states(
        scenario.data.dtraj, subset=scenario.msm.count_model.state_symbols)
    assert len(index_states) == scenario.msm.n_states

    # reference: plain histogram over the discrete trajectory
    from deeptime.markov import count_states
    histogram = count_states(scenario.data.dtraj)

    # on the active subset the number of indexed frames must equal the histogram count
    symbols = scenario.msm.count_model.state_symbols
    for position, symbol in enumerate(symbols):
        frames = index_states[position]
        assert frames.shape[0] == histogram[symbol]
        assert frames.shape[1] == 2
def test_observable_state_indices(self):
    """Index states restricted to the observation symbols must match the histogram."""
    from deeptime.markov import sample

    hmsm = self.hmm_lag10_largest
    index_states = sample.compute_index_states(self.dtrajs, subset=hmsm.observation_symbols)
    np.testing.assert_equal(len(index_states), hmsm.n_observation_states)

    # reference: plain histogram over the discrete trajectories
    histogram = count_states(self.dtrajs)

    # on the observed subset the number of indexed frames must equal the histogram count
    symbols = hmsm.observation_symbols
    for position, symbol in enumerate(symbols):
        frames = index_states[position]
        np.testing.assert_equal(frames.shape[0], histogram[symbol])
        np.testing.assert_equal(frames.shape[1], 2)
def fit(self, data, *args, **kw):
    r""" Counts transitions at given lag time according to configuration of the estimator.

    If ``self.n_states`` is set and exceeds the number of states found in the
    data, the state histogram and the count matrix are zero-padded up to
    ``self.n_states``.

    Parameters
    ----------
    data : array_like or list of array_like
        discretized trajectories
    *args
        Not used.
    **kw
        Only ``n_jobs`` is read (it is popped and forwarded to the counting routine).

    Returns
    -------
    self
        The estimator; the result is stored in ``self._model``.
    """
    from deeptime.markov import count_states

    dtrajs = ensure_dtraj_list(data)

    # basic count statistics
    histogram = count_states(dtrajs, ignore_negative=True)

    # Compute count matrix
    count_mode = self.count_mode
    lagtime = self.lagtime
    count_matrix = TransitionCountEstimator.count(count_mode, dtrajs, lagtime, sparse=self.sparse,
                                                  n_jobs=kw.pop('n_jobs', None))

    n_counted = count_matrix.shape[0]
    if self.n_states is not None and self.n_states > n_counted:
        n_pad = self.n_states - n_counted
        histogram = np.pad(histogram, pad_width=[(0, n_pad)])
        if issparse(count_matrix):
            # Rebuild from COO triplets. Reusing the old CSR ``indptr`` with a larger
            # shape is rejected by scipy because ``indptr`` must have n_rows + 1 entries,
            # and going through COO also works for any sparse input format.
            coo = count_matrix.tocoo()
            count_matrix = scipy.sparse.csr_matrix((coo.data, (coo.row, coo.col)),
                                                   shape=(self.n_states, self.n_states))
        else:
            count_matrix = np.pad(count_matrix, pad_width=[(0, n_pad), (0, n_pad)])

    # initially state symbols, full count matrix, and full histogram can be left None because they coincide
    # with the input arguments
    self._model = TransitionCountModel(count_matrix=count_matrix, counting_mode=count_mode, lagtime=lagtime,
                                       state_histogram=histogram)
    return self
def __init__(self, dtrajs):
    """Set up count statistics for a set of discrete trajectories.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input that is normalized via ``ensure_dtraj_list``.

    Raises
    ------
    ValueError
        If any trajectory contains an element smaller than -1.
    """
    from pyemma.util.types import ensure_dtraj_list

    # discrete trajectories
    self._dtrajs = ensure_dtraj_list(dtrajs)

    # TODO: extensive input checking!
    invalid = [dtraj for dtraj in self._dtrajs if np.any(dtraj < -1)]
    if invalid:
        raise ValueError('Discrete trajectory contains elements < -1.')

    # basic count statistics: histogram (negative entries ignored) and its total
    histogram = count_states(self._dtrajs, ignore_negative=True)
    self._hist = histogram
    self._total_count = np.sum(histogram)

    # number of states
    self._nstates = number_of_states(dtrajs)

    # nothing has been counted at a lag time yet
    self._counted_at_lag = False
def nonempty_obs(self, dtrajs) -> np.ndarray:
    r""" Determines which observable states are visited in the given discrete trajectories.

    The trajectories are first converted to effective (lagged and strided)
    trajectories; a state counts as visited if its histogram entry is positive.

    Parameters
    ----------
    dtrajs : array_like
        observable trajectory

    Returns
    -------
    symbols : np.ndarray
        The observation symbols which are visited.

    Raises
    ------
    ValueError
        If ``dtrajs`` is None.
    """
    from deeptime.markov import compute_dtrajs_effective, count_states

    if dtrajs is None:
        raise ValueError("Needs nonempty dtrajs to evaluate nonempty obs.")

    dtraj_list = ensure_dtraj_list(dtrajs)
    effective = compute_dtrajs_effective(
        dtraj_list,
        self.transition_model.lagtime,
        self.transition_model.count_model.n_states_full,
        self.stride,
    )
    visited = count_states(effective) > 0
    return np.nonzero(visited)[0]
def trajectory_weights(self):
    r"""Assign a probability weight to every frame of the input trajectories.

    Each frame receives the stationary probability of its discrete state
    divided by the number of frames observed in that state; the weights are
    then normalized over all trajectories. States outside the active set have
    stationary probability zero here.

    Given :math:`N` trajectories of lengths :math:`T_1` to :math:`T_N`, the
    returned weights

    .. math::
        (w_{1,1}, ..., w_{1,T_1}), (w_{N,1}, ..., w_{N,T_N})

    satisfy

    .. math::
        \sum_{i=1}^N \sum_{t=1}^{T_i} w_{i,t} = 1

    They can be used to estimate the expectation of a function :math:`a(x)`
    of the input configurations :math:`x`:

    .. math::
        \langle a \rangle = \sum_{i=1}^N \sum_{t=1}^{T_i} w_{i,t} a(x_{i,t})

    or the time-lagged correlation between two functions :math:`a(x)` and
    :math:`b(x)`:

    .. math::
        \langle a(t) b(t+\tau) \rangle_t = \sum_{i=1}^N \sum_{t=1}^{T_i} w_{i,t} a(x_{i,t}) b(x_{i,t+\tau})

    Returns
    -------
    weights : list of ndarray
        The normalized trajectory weights, one array per trajectory with one
        entry per frame.
    """
    self._check_is_estimated()

    # stationary distribution on the full state space (zero outside the active set)
    pi_full = _np.zeros([self._nstates_full])
    pi_full[self.active_set] = self.stationary_distribution

    # frame counts per state, as floats
    counts = 1.0 * count_states(self.discrete_trajectories_full)

    # per-frame weights before normalization
    raw_weights = [pi_full[dtraj] / counts[dtraj]
                   for dtraj in self.discrete_trajectories_full]

    # accumulate the total weight trajectory by trajectory
    total = 0.0
    for frame_weights in raw_weights:
        total += _np.sum(frame_weights)

    # normalize each trajectory's weights in place
    for frame_weights in raw_weights:
        frame_weights /= total

    return raw_weights
def submodel(self, states=None, obs=None, mincount_connectivity='1/n', inplace=False):
    """Returns a HMM with restricted state space

    If ``states`` and ``obs`` are both None and ``mincount_connectivity`` is 0,
    ``self`` is returned unchanged.

    Parameters
    ----------
    states : None, str or int-array
        Hidden states to restrict the model to. In addition to specifying
        the subset, possible options are:

        * None : all states - don't restrict
        * 'populous-strong' : strongly connected subset with maximum counts
        * 'populous-weak' : weakly connected subset with maximum counts
        * 'largest-strong' : strongly connected subset with maximum size
        * 'largest-weak' : weakly connected subset with maximum size
    obs : None, str or int-array
        Observed states to restrict the model to. In addition to specifying
        an array with the state labels to be observed, possible options are:

        * None : all states - don't restrict
        * 'nonempty' : all states with at least one observation in the estimator
    mincount_connectivity : float or '1/n'
        minimum number of counts to consider a connection between two states.
        Counts lower than that will count zero in the connectivity check and
        may thus separate the resulting transition matrix. Default value:
        1/nstates.
    inplace : Bool
        if True, submodel is estimated in-place, overwriting the original
        estimator and possibly discarding information. Default value: False

    Returns
    -------
    hmm : HMM
        The restricted HMM. This is ``self`` when ``inplace`` is True,
        otherwise a deep copy of ``self`` with the restricted quantities set.

    """
    # nothing to restrict: return the estimator itself
    if states is None and obs is None and mincount_connectivity == 0:
        return self
    # None means "all states": replace by the full index ranges
    if states is None:
        states = _np.arange(self.nstates)
    if obs is None:
        obs = _np.arange(self.nstates_obs)

    # resolve the symbolic default threshold to 1 / (number of hidden states)
    if str(mincount_connectivity) == '1/n':
        mincount_connectivity = 1.0 / float(self.nstates)

    # handle new connectivity
    cm = TransitionCountModel(self.count_matrix)
    S = cm.connected_sets(connectivity_threshold=mincount_connectivity, directed=True)
    if inplace:
        submodel_estimator = self
    else:
        from copy import deepcopy
        submodel_estimator = deepcopy(self)

    from deeptime.markov._transition_matrix import stationary_distribution
    if len(S) > 1:
        # keep only non-negligible transitions
        C = _np.zeros(self.count_matrix.shape)
        large = _np.where(self.count_matrix >= mincount_connectivity)
        C[large] = self.count_matrix[large]
        for s in S:  # keep all (also small) transition counts within strongly connected subsets
            C[_np.ix_(s, s)] = self.count_matrix[_np.ix_(s, s)]
        # re-estimate transition matrix with disc.
        from deeptime.markov.msm import MaximumLikelihoodMSM
        msmest = MaximumLikelihoodMSM(allow_disconnected=True, reversible=self.reversible,
                                      connectivity_threshold=0)
        msm = msmest.fit_fetch(C)
        P = msm.transition_matrix
        pi = stationary_distribution(P, C, mincount_connectivity=0)
    else:
        # a single connected set: reuse the quantities already stored on the estimator
        C = self.count_matrix
        P = self.transition_matrix
        pi = self.stationary_distribution

    # determine substates
    if isinstance(states, str):
        # string options are matched by substring: '...strong' selects directed
        # connectivity, 'largest...' scores sets by size instead of by counts
        strong = 'strong' in states
        largest = 'largest' in states
        S = cm.connected_sets(connectivity_threshold=mincount_connectivity, directed=strong)
        if largest:
            score = [len(s) for s in S]
        else:
            score = [self.count_matrix[_np.ix_(s, s)].sum() for s in S]
        states = _np.array(S[_np.argmax(score)])
    # NOTE(review): states was defaulted to an index array above, so it is never None here
    if states is not None:  # sub-transition matrix
        submodel_estimator._active_set = states
        C = C[_np.ix_(states, states)].copy()
        P = P[_np.ix_(states, states)].copy()
        # renormalize rows of the restricted transition matrix
        P /= P.sum(axis=1)[:, None]
        pi = stationary_distribution(P, C)
        submodel_estimator.initial_count = self.initial_count[states]
        # restricted initial distribution, renormalized to sum to one
        submodel_estimator.initial_distribution = self.initial_distribution[
            states] / self.initial_distribution[states].sum()

    # determine observed states
    # presumably str() is applied so the comparison also works when obs is an array
    if str(obs) == 'nonempty':
        obs = _np.where(
            count_states(self.discrete_trajectories_lagged) > 0)[0]
    # NOTE(review): obs was defaulted to an index array above, so it is never None here
    # and the else branch below is not reached
    if obs is not None:
        # set observable set
        submodel_estimator._observable_set = obs
        submodel_estimator._nstates_obs = obs.size
        # full2active mapping
        # (-1 marks full-space observable states that are not in obs)
        _full2obs = -1 * _np.ones(self._nstates_obs_full, dtype=int)
        _full2obs[obs] = _np.arange(len(obs), dtype=int)
        # observable trajectories
        submodel_estimator._dtrajs_obs = []
        for dtraj in self.discrete_trajectories_full:
            submodel_estimator._dtrajs_obs.append(_full2obs[dtraj])
        # observation matrix
        B = self.observation_probabilities[_np.ix_(states, obs)].copy()
        # renormalize rows of the restricted observation matrix
        B /= B.sum(axis=1)[:, None]
    else:
        B = self.observation_probabilities

    # set quantities back.
    submodel_estimator.update_model_params(P=P, pobs=B, pi=pi)
    submodel_estimator.count_matrix_EM = self.count_matrix[_np.ix_(
        states, states)]  # unchanged count matrix
    submodel_estimator.count_matrix = C  # count matrix consistent with P
    return submodel_estimator
def __init__(self, complete: bool = True):
    """Build the OOM-reweighted MSM test scenario together with its reference quantities.

    Loads discrete trajectories and reference count matrices from
    ``resources/TestData_OOM_MSM.npz`` next to this file, fits four
    ``OOMReweightedMSM`` estimators (reversible / non-reversible, dense /
    sparse) and computes reference models and observables from the reference
    count matrices.

    Parameters
    ----------
    complete : bool, default=True
        If True, all 1000 trajectories are used, together with the ``C2t``
        reference counts and rank 3. If False, a fixed list of trajectory
        indices is left out, the ``C2t_s`` reference counts and rank 2 are
        used, and the reference quantities are restricted to the active part.
    """
    self.complete = complete
    data_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'resources', 'TestData_OOM_MSM.npz')

    if complete:
        traj_ids = range(1000)
    else:
        excluded = [
            21, 25, 30, 40, 66, 72, 74, 91, 116, 158, 171, 175, 201, 239, 246, 280, 300, 301, 310, 318,
            322, 323, 339, 352, 365, 368, 407, 412, 444, 475, 486, 494, 510, 529, 560, 617, 623, 637,
            676, 689, 728, 731, 778, 780, 811, 828, 838, 845, 851, 859, 868, 874, 895, 933, 935, 938,
            958, 961, 968, 974, 984, 990, 999
        ]
        traj_ids = np.setdiff1d(np.arange(1000), excluded)

    # Read everything needed from the archive up front so the file handle is closed again.
    with np.load(data_path) as data:
        self.dtrajs = [data['arr_%d' % k] for k in traj_ids]
        # Reference count matrices at lag time tau and 2*tau:
        self.C2t = data['C2t'] if complete else data['C2t_s']

    # Number of states:
    self.N = 5
    # Lag time:
    self.tau = 5
    self.dtrajs_lag = [traj[:-self.tau] for traj in self.dtrajs]
    # Rank:
    self.rank = 3 if complete else 2

    # Build models:
    self.msmrev = OOMReweightedMSM(lagtime=self.tau, rank_mode='bootstrap_trajs').fit(self.dtrajs)
    self.msmrev_sparse = OOMReweightedMSM(lagtime=self.tau, sparse=True, rank_mode='bootstrap_trajs') \
        .fit(self.dtrajs)
    self.msm = OOMReweightedMSM(lagtime=self.tau, reversible=False, rank_mode='bootstrap_trajs').fit(self.dtrajs)
    self.msm_sparse = OOMReweightedMSM(lagtime=self.tau, reversible=False, sparse=True,
                                       rank_mode='bootstrap_trajs').fit(self.dtrajs)
    self.estimators = [self.msmrev, self.msm, self.msmrev_sparse, self.msm_sparse]
    self.msms = [est.fetch_model() for est in self.estimators]

    self.Ct = np.sum(self.C2t, axis=1)

    if complete:
        self.Ct_active = self.Ct
        self.C2t_active = self.C2t
        # Previously only the misspelled ``active_faction`` was set in this branch,
        # leaving ``active_fraction`` undefined for the complete scenario.
        self.active_fraction = 1.
        # Misspelled alias kept so existing readers of the old name keep working.
        self.active_faction = self.active_fraction
    else:
        lcc = msmest.largest_connected_set(self.Ct)
        self.Ct_active = msmest.largest_connected_submatrix(self.Ct, lcc=lcc)
        self.C2t_active = self.C2t[:4, :4, :4]
        self.active_fraction = np.sum(self.Ct_active) / np.sum(self.Ct)

    # Compute OOM-components:
    self.Xi, self.omega, self.sigma, self.l = oom_transformations(self.Ct_active, self.C2t_active, self.rank)

    # Compute corrected transition matrix:
    Tt_rev = compute_transition_matrix(self.Xi, self.omega, self.sigma, reversible=True)
    Tt = compute_transition_matrix(self.Xi, self.omega, self.sigma, reversible=False)

    # Build reference models:
    self.rmsmrev = MarkovStateModel(Tt_rev)
    self.rmsm = MarkovStateModel(Tt)

    # Active count fraction:
    self.hist = count_states(self.dtrajs)
    self.active_hist = self.hist[:-1] if not complete else self.hist

    self.active_count_frac = float(np.sum(self.active_hist)) / np.sum(self.hist) if not complete else 1.
    self.active_state_frac = 0.8 if not complete else 1.

    # Committor and MFPT:
    a = np.array([0, 1])
    b = np.array([4]) if complete else np.array([3])
    self.comm_forward = self.rmsm.committor_forward(a, b)
    self.comm_forward_rev = self.rmsmrev.committor_forward(a, b)
    self.comm_backward = self.rmsm.committor_backward(a, b)
    self.comm_backward_rev = self.rmsmrev.committor_backward(a, b)
    self.mfpt = self.tau * self.rmsm.mfpt(a, b)
    self.mfpt_rev = self.tau * self.rmsmrev.mfpt(a, b)

    # PCCA:
    pcca = self.rmsmrev.pcca(3 if complete else 2)
    self.pcca_ass = pcca.assignments
    self.pcca_dist = pcca.metastable_distributions
    self.pcca_mem = pcca.memberships
    self.pcca_sets = pcca.sets

    # Experimental quantities:
    a = np.array([1, 2, 3, 4, 5])
    b = np.array([1, -1, 0, -2, 4])
    p0 = np.array([0.5, 0.2, 0.2, 0.1, 0.0])
    if not complete:
        # drop the last state, which is not part of the restricted reference models
        a = a[:-1]
        b = b[:-1]
        p0 = p0[:-1]
    pi = self.rmsm.stationary_distribution
    pi_rev = self.rmsmrev.stationary_distribution
    _, _, L_rev = ma.rdl_decomposition(Tt_rev)
    self.exp = np.dot(pi, a)
    self.exp_rev = np.dot(pi_rev, a)
    self.corr_rev = np.zeros(10)
    self.rel = np.zeros(10)
    self.rel_rev = np.zeros(10)
    for k in range(10):
        Ck_rev = np.dot(np.diag(pi_rev), np.linalg.matrix_power(Tt_rev, k))
        self.corr_rev[k] = np.dot(a.T, np.dot(Ck_rev, b))
        self.rel[k] = np.dot(p0.T, np.dot(np.linalg.matrix_power(Tt, k), a))
        self.rel_rev[k] = np.dot(p0.T, np.dot(np.linalg.matrix_power(Tt_rev, k), a))

    self.fing_cor = np.dot(a.T, L_rev.T) * np.dot(b.T, L_rev.T)
    self.fing_rel = np.dot(a.T, L_rev.T) * np.dot((p0 / pi_rev).T, L_rev.T)