def _find_largest_connected_set(self, connectivity, connectivity_factor, progress_bar=None):
    """Restrict the counts over all trajectories to the largest connected set of Markov states.

    Parameters
    ----------
    connectivity : str or None
        Connectivity criterion. Handled values are ``None``, ``'summed_count_matrix'``,
        ``'post_hoc_RE'`` and ``'BAR_variance'``.
    connectivity_factor : float
        Forwarded as ``connectivity_threshold`` to ``submodel_largest`` for
        ``'summed_count_matrix'``, and passed on to the ``tram`` overlap routine for
        ``'post_hoc_RE'`` / ``'BAR_variance'``.
    progress_bar : object, optional
        Handed to ``callbacks.Callback`` while the overlap between thermodynamic states is computed.

    Returns
    -------
    counts_model
        The counts model fitted on ``self.dtrajs``: unrestricted when ``connectivity`` is ``None``,
        otherwise the submodel on the largest connected set.
        NOTE(review): any other ``connectivity`` value falls through all branches and
        ``None`` is returned implicitly - confirm callers validate the value beforehand.
    """
    estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)

    # make a counts model over all observed samples.
    full_counts_model = estimator.fit_fetch(self.dtrajs)

    if connectivity is None:
        warnings.warn("connectivity type is None. Data has not been restricted to the largest connected set. "
                      "The full counts model has been returned.")
        return full_counts_model

    if connectivity == 'summed_count_matrix':
        # We assume the thermodynamic states have overlap when they contain counts from the same markov state.
        # Full counts model contains the sum of the state counts over all therm. states, and we simply ignore
        # the thermodynamic state indices.
        return full_counts_model.submodel_largest(connectivity_threshold=connectivity_factor, directed=True)

    if connectivity in ['post_hoc_RE', 'BAR_variance']:
        # get state counts for each trajectory (=for each therm. state)
        all_state_counts = np.asarray([estimator.fit_fetch(dtraj).state_histogram for dtraj in self.dtrajs],
                                      dtype=object)

        # pad with zeros so they are all the same size and easier for the cpp module to handle
        all_state_counts = to_zero_padded_array(all_state_counts, self.n_markov_states)

        # get list of all possible transitions between thermodynamic states. A transition is only possible when
        # two thermodynamic states have an overlapping markov state. Whether the markov state overlaps depends on
        # the sampled data and the connectivity settings and is computed in find_state_transitions:
        if connectivity == 'post_hoc_RE':
            connectivity_fn = tram.find_state_transitions_post_hoc_RE
        else:
            connectivity_fn = tram.find_state_transitions_BAR_variance

        with callbacks.Callback(progress_bar, self.n_therm_states * self.n_markov_states,
                                "Finding connected sets") as callback:
            (i_s, j_s) = connectivity_fn(self.ttrajs, self.dtrajs, self.bias_matrices, all_state_counts,
                                         self.n_therm_states, self.n_markov_states, connectivity_factor,
                                         callback)

        # add transitions that occurred within each thermodynamic state. These are simply the connected sets.
        # Markov state i of thermodynamic state k is addressed by the ravelled index k * n_markov_states + i.
        for k in range(self.n_therm_states):
            offset = k * self.n_markov_states
            for cset in estimator.fit_fetch(self.dtrajs[k]).connected_sets():
                i_s.extend(cset[0:-1] + offset)
                j_s.extend(cset[1:] + offset)

        # turn the list of transitions into a boolean matrix that has a one whenever a transition has occurred
        data = np.ones(len(i_s), dtype=np.int32)
        dim = self.n_therm_states * self.n_markov_states
        sparse_transition_counts = sp.sparse.coo_matrix((data, (i_s, j_s)), shape=(dim, dim))

        # Now we have all possible paths in the list of transitions. Get the connected set of that
        overlap_counts_model = TransitionCountModel(sparse_transition_counts)
        connected_states_ravelled = overlap_counts_model.submodel_largest(directed=False).state_symbols

        # unravel the index and combine all separate csets to one cset
        connected_states = np.unravel_index(connected_states_ravelled,
                                            (self.n_therm_states, self.n_markov_states), order='C')
        return full_counts_model.submodel(np.unique(connected_states[1]))
def _construct_count_models(self, dtraj_fragments):
    """Construct a TransitionCountModel for each thermodynamic state based on the dtraj_fragments.

    Parameters
    ----------
    dtraj_fragments: list(list(int))
        A list that contains for each thermodynamic state the fragments from all trajectories that were sampled at
        that thermodynamic state. dtraj_fragments[k][i] defines the i-th fragment sampled at thermodynamic state k.
        The fragments should be restricted to the largest connected set and not contain any negative state indices.

    Returns
    -------
    count_models : list(TransitionCountModel)
        One count model per thermodynamic state, in order of the state index. States without any fragment
        longer than the lag time get a count model over an all-zero count matrix and trigger a warning.
    """
    estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)

    count_models = []
    for k in range(self.n_therm_states):
        fragments = dtraj_fragments[k]
        if len(fragments) == 0 or all(len(frag) <= self.lagtime for frag in fragments):
            warnings.warn(f"No transitions for thermodynamic state {k} after cutting the trajectories into "
                          f"fragments that start at each replica exchange swap. Replica exchanges possibly occur "
                          f"within the span of the lag time.")
            # there are no samples from this state that belong to the connected set. Make an empty count model.
            # The shape has to be passed as one tuple: a second positional argument of np.zeros is the dtype.
            empty_counts = np.zeros((self.n_markov_states, self.n_markov_states))
            count_models.append(TransitionCountModel(empty_counts))
        else:
            # make a counts model for the samples that belong to the connected set.
            count_models.append(estimator.fit_fetch(fragments))
    return count_models
def make_random_model(n_therm_states, n_markov_states, transition_matrices=None):
    """Create a TRAMModel with random parameters and all-zero transition counts.

    Parameters
    ----------
    n_therm_states : int
        Number of thermodynamic states.
    n_markov_states : int
        Number of Markov states.
    transition_matrices : ndarray, optional
        Transition matrices to use. If None, one random row-stochastic matrix is drawn per
        thermodynamic state.

    Returns
    -------
    TRAMModel
        Model built from the zero-count models and the randomly drawn parameters.
    """
    per_state_shape = (n_therm_states, n_markov_states)

    if transition_matrices is None:
        # draw uniform random entries and normalize every row to sum to one
        unnormalized = np.random.rand(*per_state_shape, n_markov_states)
        transition_matrices = unnormalized / unnormalized.sum(axis=-1, keepdims=True)

    energies = np.random.rand(*per_state_shape)
    lagrangian_mult_log = np.random.rand(*per_state_shape)
    log_state_counts = np.log(np.random.rand(*per_state_shape))

    # one count model per thermodynamic state, each over an all-zero count matrix
    zero_counts = np.zeros((n_therm_states, n_markov_states, n_markov_states))
    count_models = []
    for counts in zero_counts:
        count_models.append(TransitionCountModel(counts))

    return TRAMModel(count_models, transition_matrices,
                     biased_conf_energies=energies,
                     lagrangian_mult_log=lagrangian_mult_log,
                     modified_state_counts_log=log_state_counts)
def _append_sample(self, models, prior, sample_model):
    """Append a HiddenMarkovModel built from a deep copy of ``sample_model`` to ``models``.

    Parameters
    ----------
    models : list
        List that receives the newly created HiddenMarkovModel (modified in place).
    prior : object
        Only its ``lagtime`` is read; it becomes the lag time of the new count model.
    sample_model : object
        Current sample. It is deep-copied first, so the caller's instance is left untouched.
    """
    # Work on a private copy of the current sample.
    snapshot = deepcopy(sample_model)

    # the Viterbi path is needed to get a new transition matrix for each model,
    # but it is dropped from the stored copy unless requested.
    if not self.store_hidden:
        snapshot.hidden_trajs.clear()

    # potentially restrict sampled models to observed space
    # since the copy is defined on full space, observation_symbols are also observation states
    counts = TransitionCountModel(snapshot.counts, lagtime=prior.lagtime)
    transition_model = MarkovStateModel(
        snapshot.transition_matrix,
        stationary_distribution=snapshot.stationary_distribution,
        reversible=self.reversible,
        count_model=counts,
    )
    hmm = HiddenMarkovModel(
        transition_model=transition_model,
        output_model=snapshot.output_model,
        initial_distribution=snapshot.initial_distribution,
        hidden_state_trajectories=snapshot.hidden_trajs,
    )
    models.append(hmm)
def test_invalid_arguments():
    """Check that invalid estimator inputs and inconsistent collections raise ValueError."""
    negative_counts = -1 * np.ones((5, 5))
    rectangular_counts = np.ones((3, 5))
    three_state_statdist = np.array([1 / 3, 1 / 3, 1 / 3])
    two_state_statdist = np.array([.5, .5])
    timeseries = np.array([0, 1, 2, 3, 4, 5, 6])

    # negative counts
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(negative_counts).fetch_model()

    # non quadratic count matrix
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(rectangular_counts).fetch_model()

    # stationary distribution not over whole state space
    with assert_raises(ValueError):
        MaximumLikelihoodMSM(stationary_distribution_constraint=three_state_statdist).fit(np.ones((5, 5)))

    # no counts but statdist constraint
    with assert_raises(ValueError):
        MaximumLikelihoodMSM(stationary_distribution_constraint=two_state_statdist).fit(np.zeros((2, 2)))

    # fit with transition count estimator that hasn't been fit
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(TransitionCountEstimator(1, "sliding"))

    # fit with bogus object
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(object())

    # fit from timeseries without lagtime
    with assert_raises(ValueError):
        MaximumLikelihoodMSM().fit(timeseries)

    # empty collection is not allowed
    with assert_raises(ValueError):
        MarkovStateModelCollection([], [], False, [], 1.)

    # number of elements in lists must match
    with assert_raises(ValueError):
        MarkovStateModelCollection([np.array([[.5, .5], [.5, .5]])], [], False, [], 1.)

    # number of states in lists must match
    with assert_raises(ValueError):
        MarkovStateModelCollection([np.array([[.5, .5], [.5, .5]])], [None], False,
                                   [TransitionCountModel(np.ones((3, 3)))], 1.)
def fit(self, data, *args, **kw):
    r""" Fits an AMM.

    Parameters
    ----------
    data : TransitionCountModel or (N, N) ndarray
        Count matrix over data.
    *args
        scikit-learn compatibility argument
    **kw
        scikit-learn compatibility argument

    Returns
    -------
    self : AugmentedMSMEstimator
        Reference to self.

    Raises
    ------
    ValueError
        If ``data`` is neither a TransitionCountModel nor an ndarray, if an ndarray is not a non-negative
        square matrix, or if the number of experimental measurements or weights differs from the number of
        columns of ``expectations_by_state``.
    """
    if not isinstance(data, (TransitionCountModel, np.ndarray)):
        raise ValueError("Can only fit on a TransitionCountModel or a count matrix directly.")

    if isinstance(data, np.ndarray):
        if data.ndim != 2 or data.shape[0] != data.shape[1] or np.any(data < 0.):
            raise ValueError("If fitting a count matrix directly, only non-negative square matrices can be used.")
        count_model = TransitionCountModel(data)
    else:
        count_model = data

    if len(self.experimental_measurement_weights) != self.expectations_by_state.shape[1]:
        raise ValueError("Experimental weights must span full observable space.")
    if len(self.experimental_measurements) != self.expectations_by_state.shape[1]:
        raise ValueError("Experimental measurements must span full observable state space.")

    count_matrix = count_model.count_matrix
    if issparse(count_matrix):
        count_matrix = count_matrix.toarray()

    # slice out active states from E matrix
    expectations_selected = self.expectations_by_state[count_model.state_symbols]
    count_matrix_symmetric = 0.5 * (count_matrix + count_matrix.T)
    nonzero_counts = np.nonzero(count_matrix_symmetric)
    counts_row_sums = np.sum(count_matrix, axis=1)
    expectations_confidence_interval = confidence_interval(expectations_selected, conf=self.support_confidence)

    measurements = self.experimental_measurements
    measurement_weights = self.experimental_measurement_weights

    # Determine which experimental values are outside the support as defined by the confidence interval.
    # Only the indices of the outliers are needed below; the weights take part in the zip merely so that
    # the iteration stops at the shortest of the four sequences.
    count_outside = []
    for index, (confidence_lower, confidence_upper, measurement, _weight) in enumerate(zip(
            expectations_confidence_interval[0], expectations_confidence_interval[1],
            measurements, measurement_weights)):
        if measurement < confidence_lower or confidence_upper < measurement:
            self._log.info(f"Experimental value {measurement} is outside the "
                           f"support ({confidence_lower, confidence_upper})")
            count_outside.append(index)

    # A number of initializations
    transition_matrix, stationary_distribution = msmest.transition_matrix(count_matrix, reversible=True,
                                                                          return_statdist=True)
    if issparse(transition_matrix):
        transition_matrix = transition_matrix.toarray()
    # Determine number of slices of R-tensors computable at once with the given cache size
    slices_z = np.floor(self.max_cache / (transition_matrix.nbytes / 1.e6)).astype(int)
    # Optimizer state
    state = AMMOptimizerState(expectations_selected, measurements, measurement_weights,
                              stationary_distribution, slices_z, count_matrix_symmetric, counts_row_sums)
    ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)

    state.log_likelihoods.append(ll_old)
    # make sure everything is initialized
    state.update_pi_hat()
    state.update_m_hat()
    state.update_Q()
    state.update_X_and_pi()

    ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)
    state.log_likelihood_prev = ll_old
    state.update_G()

    #
    # Main estimation algorithm
    # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria
    # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged
    # (changes are smaller than 1e-3).
    # These do not always converge together, but usually within a few steps of each other.
    # A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1])
    # this yielded results very similar to those with more stringent convergence criteria
    # (changes smaller than 1e-9) with convergence times
    # which are seconds instead of tens of minutes.
    #
    converged = False  # Convergence flag for lagrange multipliers
    i = 0
    die = False
    while i <= self.maxiter:
        pi_hat_old = state.pi_hat.copy()
        state.update_pi_hat()
        if not np.all(state.pi_hat > 0):
            state.pi_hat = pi_hat_old.copy()
            die = True
            self._log.warning("pihat does not have a finite probability for all states, terminating")
        state.update_m_hat()
        state.update_Q()

        # Always keep the previous X: the revert below is reachable from i == 1 on, so the snapshot
        # must exist from the first iterations (it used to be taken only for i > 1).
        X_old = np.copy(state.X)

        state.update_X_and_pi()

        if np.any(state.X[nonzero_counts] < 0) and i > 0:
            die = True
            self._log.warning(
                "Warning: new X is not proportional to C... reverting to previous step and terminating")
            state.X = X_old

        if not converged:
            self._newton_lagrange(state, count_matrix)
        else:
            # once Lagrange multipliers are converged compute likelihood here
            transition_matrix = state.X / state.pi[:, None]
            _ll_new = state.log_likelihood_biased(count_matrix, transition_matrix)
            state.log_likelihoods.append(_ll_new)

        # General case fixed-point iteration
        if len(count_outside) > 0:
            if i > 1 and np.all((np.abs(state.delta_m_hat) / self.uncertainties)
                                < self.convergence_criterion_lagrange) and not converged:
                self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                converged = True
        # Special case
        else:
            if np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
                self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                converged = True
        # if Lagrange multipliers are converged, check whether log-likelihood has converged
        if converged and np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
            self._log.info(f"Converged pihat after {i} steps...")
            die = True
        if die:
            break
        if i == self.maxiter:
            ll_diff = np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1])
            # report the same attribute that bounds the loop (self.maxiter)
            self._log.info(f"Failed to converge within {i} iterations. Log-likelihoods lastly changed by {ll_diff}."
                           f" Consider increasing maxiter (now={self.maxiter})")
        i += 1

    transition_matrix = msmest.transition_matrix(count_matrix, reversible=True, mu=state.pi_hat)
    self._model = AugmentedMSM(transition_matrix=transition_matrix, stationary_distribution=state.pi_hat,
                               reversible=True, count_model=count_model, amm_optimizer_state=state)
    return self
def msm():
    """Return a two-state MarkovStateModel together with a matching 2x2 count model."""
    count_model = TransitionCountModel([[90, 10], [10, 90]])
    transition_matrix = [[0.9, 0.1], [0.1, 0.9]]
    return MarkovStateModel(transition_matrix, count_model=count_model)
def _test_submodel(self, histogram):
    """Check a four-state count model, its two-state submodel and a one-state sub-submodel.

    Parameters
    ----------
    histogram : array_like or None
        State histogram handed to the count model. With ``None``, the histogram-derived
        properties are expected to raise ``RuntimeError``.
    """

    def assert_histogram_properties_raise(count_model):
        # properties that need a state histogram must raise when none was provided
        with assert_raises(RuntimeError):
            print(count_model.selected_count_fraction)
        with assert_raises(RuntimeError):
            print(count_model.total_count)
        with assert_raises(RuntimeError):
            print(count_model.visited_set)

    have_histogram = histogram is not None
    cset_kwargs = dict(connectivity_threshold=0, directed=True, probability_constraint=None)

    # ---- full model; three connected components: ((1, 2), (0), (3)) ----
    count_matrix = np.array([[10., 0., 0., 0.],
                             [0., 1., 1., 0.],
                             [0., 1., 1., 0.],
                             [0., 0., 0., 1]])
    model = TransitionCountModel(count_matrix, counting_mode="effective", state_histogram=histogram)
    self._check_submodel_transitive_properties(histogram, count_matrix, model)

    if have_histogram:
        assert_equal(model.selected_count_fraction, 1.)
        assert_equal(model.total_count, 100 + 10 + 10 + 10)
        assert_equal(model.visited_set, [0, 1, 2, 3])
    else:
        assert_histogram_properties_raise(model)

    assert_equal(model.count_matrix, count_matrix)
    assert_equal(model.selected_state_fraction, 1.)
    sets = model.connected_sets(**cset_kwargs)
    assert_equal(len(sets), 3)
    assert_equal(len(sets[0]), 2)
    assert_equal(len(sets[1]), 1)
    assert_equal(len(sets[2]), 1)
    assert_equal(model.state_symbols, [0, 1, 2, 3])
    assert_(model.is_full_model)
    assert_equal(model.state_histogram, histogram)
    assert_equal(model.n_states, 4)
    largest = sets[0]
    assert 1 in largest and 2 in largest, \
        "expected states 1 and 2 in largest connected set, got {}".format(sets[0])

    # ---- submodel restricted to the largest connected set ----
    submodel = model.submodel(largest)
    self._check_submodel_transitive_properties(histogram, count_matrix, submodel)

    if have_histogram:
        assert_equal(submodel.state_histogram, [10, 10])
        assert_equal(submodel.selected_count_fraction, 20. / 130.)
        assert_equal(submodel.total_count, 20)
        assert_equal(submodel.visited_set, [0, 1])
    else:
        assert_equal(submodel.state_histogram, None)
        assert_histogram_properties_raise(submodel)

    assert_equal(submodel.count_matrix, np.array([[1, 1], [1, 1]]))
    assert_equal(submodel.selected_state_fraction, 0.5)
    sets = submodel.connected_sets(**cset_kwargs)
    assert_equal(len(sets), 1)
    assert_equal(len(sets[0]), 2)
    assert 0 in sets[0] and 1 in sets[0], \
        "states 0 and 1 should be in the connected set, but got {}".format(sets[0])
    assert_equal(submodel.state_symbols, [1, 2])
    assert_(not submodel.is_full_model)
    assert_equal(submodel.n_states, 2)

    # ---- submodel of the submodel: a single remaining state ----
    subsubmodel = submodel.submodel([1])
    self._check_submodel_transitive_properties(histogram, count_matrix, subsubmodel)

    if have_histogram:
        assert_equal(subsubmodel.state_histogram, [10])
        assert_equal(subsubmodel.selected_count_fraction, 10. / 130.)
        assert_equal(subsubmodel.total_count, 10)
        assert_equal(subsubmodel.visited_set, [0])
    else:
        assert_equal(subsubmodel.state_histogram, None)
        assert_histogram_properties_raise(subsubmodel)

    assert_equal(subsubmodel.count_matrix, np.array([[1]]))
    assert_equal(subsubmodel.selected_state_fraction, 0.25)
    sets = subsubmodel.connected_sets(**cset_kwargs)
    assert_equal(len(sets), 1)
    assert_equal(len(sets[0]), 1)
    assert 0 in sets[0], "state 0 should be in the connected set, but got {}".format(sets[0])
    assert_equal(subsubmodel.state_symbols, [2])
    assert_(not subsubmodel.is_full_model)
    assert_equal(subsubmodel.n_states, 1)
def fit(self, dtrajs, initial_model=None, **kwargs):
    r""" Fits a new :class:`HMM <HiddenMarkovModel>` to data.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        Timeseries data.
    initial_model : HiddenMarkovModel, optional, default=None
        Override for :attr:`initial_transition_model`.
    **kwargs
        Ignored kwargs for scikit-learn compatibility.

    Returns
    -------
    self : MaximumLikelihoodHMM
        Reference to self.

    Raises
    ------
    ValueError
        If neither the argument nor :attr:`initial_transition_model` provides a HiddenMarkovModel.
    """
    if initial_model is None:
        initial_model = self.initial_transition_model
    # isinstance is False for None as well, so a single check covers both failure cases
    if not isinstance(initial_model, HiddenMarkovModel):
        raise ValueError("For estimation, an initial model of type "
                         "`deeptime.markov.hmm.HiddenMarkovModel` is required.")

    # work on a dense private copy of the initial transition matrix (toarray and np.copy both copy)
    initial_tmatrix = initial_model.transition_model.transition_matrix
    if issparse(initial_tmatrix):
        dense_tmatrix = initial_tmatrix.toarray()
    else:
        dense_tmatrix = np.copy(initial_tmatrix)

    hmm_data = MaximumLikelihoodHMM._HMMModelStorage(
        transition_matrix=dense_tmatrix,
        output_model=initial_model.output_model.copy(),
        initial_distribution=initial_model.initial_distribution.copy())

    dtrajs = ensure_timeseries_data(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs, lagtime=self.lagtime, n_states=initial_model.n_hidden_states,
                                      stride=self.stride)

    # pre-construct hidden variables; alpha/beta are shared buffers sized for the longest trajectory
    n_hidden = initial_model.n_hidden_states
    longest = max(len(obs) for obs in dtrajs)
    alpha = np.zeros((longest, n_hidden))
    beta = np.zeros((longest, n_hidden))
    gammas = [np.zeros((len(obs), n_hidden)) for obs in dtrajs]
    count_matrices = [np.zeros((n_hidden, n_hidden)) for _ in dtrajs]

    likelihoods = np.empty(self.maxit)
    # pattern of nonzero transition matrix entries. If connectivity has changed (e.g. state lost) the
    # likelihood is discontinuous and can't be used as a convergence criterion in that iteration.
    nonzero_pattern = hmm_data.transition_matrix.nonzero()

    it = 0
    converged = False
    while it < self.maxit and not converged:
        loglik = 0.0
        for obs, gamma, counts in zip(dtrajs, gammas, count_matrices):
            loglik += self._forward_backward(hmm_data, obs, alpha, beta, gamma, counts)[0]
        assert np.isfinite(loglik), it

        # convergence check against the likelihood of the previous iteration
        if it > 0 and loglik - likelihoods[it - 1] < self.accuracy:
            converged = True

        # update model
        self._update_model(hmm_data, dtrajs, gammas, count_matrices, maxiter=self.maxit_reversible)

        # connectivity change check
        new_pattern = hmm_data.transition_matrix.nonzero()
        if not np.array_equal(nonzero_pattern, new_pattern):
            converged = False  # unset converged
            nonzero_pattern = new_pattern

        # end of iteration
        likelihoods[it] = loglik
        it += 1

    # keep only the likelihoods of the iterations that actually ran
    likelihoods = np.resize(likelihoods, it)

    count_model = TransitionCountModel(count_matrix=self._reduce_transition_counts(count_matrices),
                                       lagtime=self.lagtime)
    transition_model = MarkovStateModel(hmm_data.transition_matrix, reversible=self.reversible,
                                        count_model=count_model)
    hidden_state_trajs = []
    for obs in dtrajs:
        state_probabilities = hmm_data.output_model.to_state_probability_trajectory(obs)
        hidden_state_trajs.append(
            viterbi(hmm_data.transition_matrix, state_probabilities, hmm_data.initial_distribution))

    self._model = HiddenMarkovModel(
        transition_model=transition_model,
        output_model=hmm_data.output_model,
        initial_distribution=hmm_data.initial_distribution,
        likelihoods=likelihoods,
        state_probabilities=gammas,
        initial_count=self._init_counts(gammas),
        hidden_state_trajectories=hidden_state_trajs,
        stride=self.stride)
    return self
def submodel(self, states=None, obs=None, mincount_connectivity='1/n', inplace=False):
    """Returns a HMM with restricted state space

    Parameters
    ----------
    states : None, str or int-array
        Hidden states to restrict the model to. In addition to specifying
        the subset, possible options are:

        * None : all states - don't restrict
        * 'populous-strong' : strongly connected subset with maximum counts
        * 'populous-weak' : weakly connected subset with maximum counts
        * 'largest-strong' : strongly connected subset with maximum size
        * 'largest-weak' : weakly connected subset with maximum size
    obs : None, str or int-array
        Observed states to restrict the model to. In addition to specifying
        an array with the state labels to be observed, possible options are:

        * None : all states - don't restrict
        * 'nonempty' : all states with at least one observation in the estimator
    mincount_connectivity : float or '1/n'
        minimum number of counts to consider a connection between two states.
        Counts lower than that will count zero in the connectivity check and
        may thus separate the resulting transition matrix. Default value:
        1/nstates.
    inplace : Bool
        if True, submodel is estimated in-place, overwriting the original
        estimator and possibly discarding information. Default value: False

    Returns
    -------
    hmm : HMM
        The restricted HMM. This is ``self`` when ``inplace`` is True or when nothing is
        restricted (``states`` and ``obs`` are None and ``mincount_connectivity == 0``);
        otherwise it is a deep copy of ``self`` with the restricted quantities set.

    """
    # nothing to restrict and no connectivity threshold: the estimator itself is the submodel
    if states is None and obs is None and mincount_connectivity == 0:
        return self
    # None means "all": expand to explicit index ranges. From here on neither `states` nor `obs`
    # is None, so the `is not None` checks further down always pass.
    if states is None:
        states = _np.arange(self.nstates)
    if obs is None:
        obs = _np.arange(self.nstates_obs)

    if str(mincount_connectivity) == '1/n':
        mincount_connectivity = 1.0 / float(self.nstates)

    # handle new connectivity
    cm = TransitionCountModel(self.count_matrix)
    S = cm.connected_sets(connectivity_threshold=mincount_connectivity, directed=True)

    # NOTE: with inplace=True `submodel_estimator` is `self`, so every assignment to
    # `submodel_estimator` below also changes what later reads of `self` return. The statement
    # order in the rest of this method matters for that reason.
    if inplace:
        submodel_estimator = self
    else:
        from copy import deepcopy
        submodel_estimator = deepcopy(self)

    from deeptime.markov._transition_matrix import stationary_distribution
    if len(S) > 1:
        # keep only non-negligible transitions
        C = _np.zeros(self.count_matrix.shape)
        large = _np.where(self.count_matrix >= mincount_connectivity)
        C[large] = self.count_matrix[large]
        for s in S:
            # keep all (also small) transition counts within strongly connected subsets
            C[_np.ix_(s, s)] = self.count_matrix[_np.ix_(s, s)]
        # re-estimate transition matrix on the thresholded, disconnected count matrix
        from deeptime.markov.msm import MaximumLikelihoodMSM
        msmest = MaximumLikelihoodMSM(allow_disconnected=True, reversible=self.reversible,
                                      connectivity_threshold=0)
        msm = msmest.fit_fetch(C)
        P = msm.transition_matrix
        pi = stationary_distribution(P, C, mincount_connectivity=0)
    else:
        # a single connected set: reuse the estimator's current quantities unchanged
        C = self.count_matrix
        P = self.transition_matrix
        pi = self.stationary_distribution

    # determine substates
    if isinstance(states, str):
        # the option string is decoded by substring: '...-strong' vs. not, 'largest-...' vs. not
        strong = 'strong' in states
        largest = 'largest' in states
        S = cm.connected_sets(connectivity_threshold=mincount_connectivity, directed=strong)
        if largest:
            # score connected sets by their number of states
            score = [len(s) for s in S]
        else:
            # score connected sets by the total counts inside the set
            score = [self.count_matrix[_np.ix_(s, s)].sum() for s in S]
        states = _np.array(S[_np.argmax(score)])
    if states is not None:
        # sub-transition matrix
        submodel_estimator._active_set = states
        C = C[_np.ix_(states, states)].copy()
        P = P[_np.ix_(states, states)].copy()
        # renormalize rows after cutting out the selected states
        P /= P.sum(axis=1)[:, None]
        pi = stationary_distribution(P, C)
        submodel_estimator.initial_count = self.initial_count[states]
        submodel_estimator.initial_distribution = self.initial_distribution[
            states] / self.initial_distribution[states].sum()

    # determine observed states
    if str(obs) == 'nonempty':
        obs = _np.where(
            count_states(self.discrete_trajectories_lagged) > 0)[0]
    if obs is not None:
        # set observable set
        # NOTE(review): `obs.size` requires an ndarray; a plain list passed as `obs` would fail
        # here - confirm callers always pass arrays.
        submodel_estimator._observable_set = obs
        submodel_estimator._nstates_obs = obs.size
        # full2active mapping; observed states outside `obs` map to -1
        _full2obs = -1 * _np.ones(self._nstates_obs_full, dtype=int)
        _full2obs[obs] = _np.arange(len(obs), dtype=int)
        # observable trajectories
        submodel_estimator._dtrajs_obs = []
        for dtraj in self.discrete_trajectories_full:
            submodel_estimator._dtrajs_obs.append(_full2obs[dtraj])
        # observation matrix, restricted to the selected hidden/observed states and row-normalized
        B = self.observation_probabilities[_np.ix_(states, obs)].copy()
        B /= B.sum(axis=1)[:, None]
    else:
        B = self.observation_probabilities

    # set quantities back.
    submodel_estimator.update_model_params(P=P, pobs=B, pi=pi)
    # self.count_matrix is read here before count_matrix is overwritten on the next line
    submodel_estimator.count_matrix_EM = self.count_matrix[_np.ix_(
        states, states)]  # unchanged count matrix
    submodel_estimator.count_matrix = C  # count matrix consistent with P
    return submodel_estimator
def _estimate(self, dtrajs):
    """Estimate a discrete HMM from ``dtrajs`` and store the results on this estimator.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; converted with ``_types.ensure_dtraj_list``.

    Returns
    -------
    The result of ``self.submodel(..., inplace=True)`` for the configured connectivity and observation set.

    Raises
    ------
    ValueError
        If the lag time is not shorter than the longest trajectory, or if ``msm_init`` is not a
        recognised option.
    """
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # CHECK LAG
    traj_lengths = [_np.size(traj) for traj in dtrajs]
    if self.lag >= _np.max(traj_lengths):
        raise ValueError('Illegal lag time ' + str(self.lag) + ' exceeds longest trajectory length')
    mean_length = _np.mean(traj_lengths)
    if self.lag > mean_length:
        self.logger.warning(
            'Lag time ' + str(self.lag) + ' is on the order of mean trajectory length '
            + str(mean_length) + '. It is recommended to fit four lag times in each '
            + 'trajectory. HMM might be inaccurate.')

    # EVALUATE STRIDE
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
        # how many uncorrelated counts we can make
        self.stride = self.lag
        # get a quick estimate from the spectral radius of the non-reversible
        from pyemma.msm import estimate_markov_model
        msm_nonrev = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                           connectivity='largest', dt_traj=self.timestep_traj)
        # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
        # estimate of the decorrelation time
        if msm_nonrev.nstates > self.nstates:
            # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
            import warnings
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning,
                                        module='deeptime.markov.tools.analysis.dense.decomposition')
                corrtime = max(1, msm_nonrev.timescales()[self.nstates - 1])
            # use the smaller of these two pessimistic estimates
            self.stride = int(min(self.lag, 2 * corrtime))

    # LAG AND STRIDE DATA
    from deeptime.markov import compute_dtrajs_effective
    dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs, self.lag, n_states=-1, stride=self.stride)

    # OBSERVATION SET
    observe_subset = 'nonempty' if self.observe_nonempty else None

    # INIT HMM
    from deeptime.markov.hmm import init
    from pyemma.msm.estimators import MaximumLikelihoodMSM, OOMReweightedMSM

    def init_from_data(mode):
        # shared call for the two data-driven initialisations; only `mode` differs between them
        return init.discrete.metastable_from_data(
            dtrajs, n_hidden_states=self.nstates, lagtime=self.lag, stride=self.stride, mode=mode,
            reversible=self.reversible, stationary=True, separate_symbols=self.separate)

    if self.msm_init == 'largest-strong':
        hmm_init = init_from_data('largest-regularized')
    elif self.msm_init == 'all':
        hmm_init = init_from_data('all-regularized')
    elif isinstance(self.msm_init, (MaximumLikelihoodMSM, OOMReweightedMSM)):
        # initial MSM given.
        init_counts = TransitionCountModel(self.msm_init.count_matrix_active)
        init_msm = MarkovStateModel(transition_matrix=self.msm_init.P, count_model=init_counts)
        hmm_init = init.discrete.metastable_from_msm(
            init_msm, n_hidden_states=self.nstates, reversible=self.reversible, stationary=True,
            separate_symbols=self.separate)
        observe_subset = self.msm_init.active_set  # override observe_subset.
    else:
        raise ValueError('Unknown MSM initialization option: ' + str(self.msm_init))

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------

    # run EM
    from deeptime.markov.hmm import MaximumLikelihoodHMM
    em_estimator = MaximumLikelihoodHMM(hmm_init, lagtime=self.lag, stride=self.stride,
                                        reversible=self.reversible, stationary=self.stationary,
                                        accuracy=self.accuracy, maxit=self.maxit)
    em_estimator.fit(dtrajs)
    # package in discrete HMM
    self.hmm = em_estimator.fetch_model()

    # get model parameters
    self.initial_distribution = self.hmm.initial_distribution
    hidden_tmatrix = self.hmm.transition_model.transition_matrix
    output_probabilities = self.hmm.output_probabilities

    # get estimation parameters
    self.likelihoods = self.hmm.likelihoods  # Likelihood history
    self.likelihood = self.likelihoods[-1]
    self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
    self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
    self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
    self.initial_count = self.hmm.initial_count  # hidden init count
    self._active_set = _np.arange(self.nstates)

    # TODO: it can happen that we loose states due to striding. Should we lift the output probabilities afterwards?
    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged_strided
    self._nstates_obs_full = number_of_states(dtrajs)
    self._nstates_obs = number_of_states(dtrajs_lagged_strided)
    self._observable_set = _np.arange(self._nstates_obs)
    self._dtrajs_obs = dtrajs
    self.set_model_params(P=hidden_tmatrix, pobs=output_probabilities, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    # TODO: perhaps remove connectivity and just rely on .submodel()?
    # deal with connectivity
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'
    else:
        states_subset = None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity, inplace=True)
def test_dt_model(self):
    """The lag time of the count model must be reported as the lag time of the MSM."""
    lagtime = 5
    counts = TransitionCountModel(np.array([[0.1, 0.9], [0.9, 0.1]]), lagtime=lagtime)
    model = MarkovStateModel(counts.count_matrix, count_model=counts)
    # the reactive flux is only computed to make sure the call succeeds; its result is not inspected
    model.reactive_flux([0], [1])
    np.testing.assert_equal(model.lagtime, lagtime)
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None,
                 milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps
    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a Bayesian MSM.
        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        minimum count that is treated as a connection. The string '1/n' is
        replaced by one over the number of rows of the count matrix.
    show_progress : bool, default=True
        show the progress for the expensive effective count mode computation.
        Not used by this implementation; kept for interface compatibility.
    n_jobs : int or None
        Not used by this implementation; kept for interface compatibility.
    name : str
        Not used by this implementation; kept for interface compatibility.
    core_set : object or None, optional, default=None
        if not None and ``count_mode`` is 'sliding' or 'sample', milestone
        counting is used. Only tested against None here.
    milestoning_method : str, optional, default='last_core'
        milestoning scheme; only 'last_core' is implemented.

    Raises
    ------
    NotImplementedError
        if milestone counting is requested with a method other than 'last_core'.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method != 'last_core':
            raise NotImplementedError(
                'Milestoning method {} not implemented.'.format(milestoning_method))
        # assign -1 frames to last visited core
        # NOTE: this rewrites the stored trajectories in place.
        for d in self._dtrajs:
            # A leading -1 has no previously visited core to inherit from.
            assert d[0] != -1
            while -1 in d:
                mask = (d == -1)
                # np.roll(mask, -1) selects the predecessor of every -1 frame,
                # so each pass copies one step forward; a run of k unassigned
                # frames needs k passes.
                d[mask] = d[np.roll(mask, -1)]
        self._C = count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')
    else:
        cm = TransitionCountEstimator(lag, count_mode=count_mode, sparse=True).fit(
            self._dtrajs).fetch_model()
        self._C = cm.count_matrix

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    self._count_model_full = TransitionCountModel(self._C)
    self._connected_sets = self._count_model_full.connected_sets(
        connectivity_threshold=self._mincount_connectivity)
    self._count_model = self._count_model_full.submodel_largest(
        connectivity_threshold=self._mincount_connectivity)

    # Sizes of the connected sets. A list is required here: np.array() on a
    # bare generator yields a 0-d object array, not an array of sizes.
    self._connected_set_sizes = np.array(
        [len(cs) for cs in self._connected_sets])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs (-1 marks states outside the lcs)
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
class DiscreteTrajectoryStats(object):
    r""" Statistics, count matrices and connectivity from discrete trajectories

    Operates sparse by default.

    Parameters
    ----------
    dtrajs: list containing ndarrays(dtype=int) or ndarray(n, dtype=int)
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory. Elements must be
        non-negative; -1 elements denote unassigned states (milestone counting).

    Raises
    ------
    ValueError
        if any trajectory contains an element smaller than -1.

    """

    def __init__(self, dtrajs):
        from pyemma.util.types import ensure_dtraj_list

        # discrete trajectories
        self._dtrajs = ensure_dtraj_list(dtrajs)

        # TODO: extensive input checking!
        if any(np.any(d < -1) for d in self._dtrajs):
            raise ValueError('Discrete trajectory contains elements < -1.')

        ## basic count statistics
        # histogram
        self._hist = count_states(self._dtrajs, ignore_negative=True)
        # total counts
        self._total_count = np.sum(self._hist)
        # number of states
        self._nstates = number_of_states(dtrajs)

        # not yet estimated
        self._counted_at_lag = False

    @staticmethod
    def _compute_connected_sets(C, mincount_connectivity, strong=True):
        """ Computes the connected sets of C.

        Parameters
        ----------
        C : count matrix (dense array or scipy sparse matrix)
        mincount_connectivity : float
            Minimum count which counts as a connection.
        strong : boolean
            True: Seek strongly connected sets. False: Seek weakly connected sets.

        Returns
        -------
        S : the connected sets of the thresholded copy of C, as returned by
            ``connected_sets``. C itself is not modified.

        """
        import scipy.sparse as scs
        if scs.issparse(C):
            Cconn = C.tocsr(copy=True)
            Cconn.data[Cconn.data < mincount_connectivity] = 0
            Cconn.eliminate_zeros()
        else:
            Cconn = C.copy()
            Cconn[Cconn < mincount_connectivity] = 0

        # treat each connected set separately
        S = connected_sets(Cconn, directed=strong)
        return S

    def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                     show_progress=True, n_jobs=None, name='', core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps
        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a Bayesian MSM.
            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        mincount_connectivity : float or '1/n', optional, default='1/n'
            minimum count that is treated as a connection. The string '1/n' is
            replaced by one over the number of rows of the count matrix.
        show_progress : bool, default=True
            show the progress for the expensive effective count mode computation.
            Not used by this implementation; kept for interface compatibility.
        n_jobs : int or None
            Not used by this implementation; kept for interface compatibility.
        name : str
            Not used by this implementation; kept for interface compatibility.
        core_set : object or None, optional, default=None
            if not None and ``count_mode`` is 'sliding' or 'sample', milestone
            counting is used. Only tested against None here.
        milestoning_method : str, optional, default='last_core'
            milestoning scheme; only 'last_core' is implemented.

        Raises
        ------
        NotImplementedError
            if milestone counting is requested with a method other than 'last_core'.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method != 'last_core':
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(milestoning_method))
            # assign -1 frames to last visited core
            # NOTE: this rewrites the stored trajectories in place.
            for d in self._dtrajs:
                # A leading -1 has no previously visited core to inherit from.
                assert d[0] != -1
                while -1 in d:
                    mask = (d == -1)
                    # np.roll(mask, -1) selects the predecessor of every -1
                    # frame, so each pass copies one step forward; a run of k
                    # unassigned frames needs k passes.
                    d[mask] = d[np.roll(mask, -1)]
            self._C = count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')
        else:
            cm = TransitionCountEstimator(lag, count_mode=count_mode, sparse=True).fit(
                self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # Sizes of the connected sets. A list is required here: np.array() on
        # a bare generator yields a 0-d object array, not an array of sizes.
        self._connected_set_sizes = np.array(
            [len(cs) for cs in self._connected_sets])

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs (-1 marks states outside the lcs)
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True

    # ==================================
    # Permanent properties
    # ==================================

    def _assert_counted_at_lag(self):
        """ Checks if count_lagged has been run """
        assert self._counted_at_lag, \
            "You haven't run count_lagged yet. Do that first before accessing lag-based quantities"

    def _assert_subset(self, A):
        """ Checks if set A is a subset of states

        Parameters
        ----------
        A : int or int array
            set of states

        Returns
        -------
        True for an empty set, None otherwise.

        """
        if np.size(A) == 0:
            return True  # empty set is always contained
        assert np.max(A) < self._nstates, \
            'Chosen set contains states that are not included in the data.'

    @property
    def nstates(self):
        """ Number (int) of states """
        return self._nstates

    @property
    @alias('dtrajs')
    def discrete_trajectories(self):
        """ A list of integer arrays with the original (unmapped) discrete trajectories """
        return self._dtrajs

    @property
    def total_count(self):
        """ Total number of counts (sum of the state histogram) """
        return self._hist.sum()

    @property
    @alias('hist')
    def histogram(self):
        r""" Histogram of discrete state counts """
        return self._hist

    # ==================================
    # Estimated properties
    # ==================================

    @property
    def lag(self):
        """ The lag time (in trajectory steps) passed to the last count_lagged call """
        self._assert_counted_at_lag()
        return self._lag

    def count_matrix(self, connected_set=None, subset=None):
        r"""The count matrix

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index. See :func:`connected_sets` to get a sorted list
            of connected sets.
            This parameter is exclusive with subset.
        subset : array-like of int or None, optional, default=None
            subset of states to compute the count matrix on.
            This parameter is exclusive with connected_set.

        Raises
        ------
        ValueError
            if both connected_set and subset are given.

        References
        ----------
        ..[1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
            Reversible Markov models of molecular kinetics: Estimation and
            uncertainty. in preparation.

        """
        self._assert_counted_at_lag()
        if subset is not None and connected_set is not None:
            raise ValueError('Can\'t set both connected_set and subset.')
        if subset is not None:
            self._assert_subset(subset)
            C = submatrix(self._C, subset)
        elif connected_set is not None:
            C = submatrix(self._C, self._connected_sets[connected_set])
        else:  # full matrix wanted
            C = self._C
        return C

    @alias('hist_lagged')
    def histogram_lagged(self, connected_set=None, subset=None, effective=False):
        r""" Histogram of discrete state counts (row sums of the count matrix)

        Parameters
        ----------
        connected_set, subset :
            forwarded to :func:`count_matrix`.
        effective : bool, optional, default=False
            accepted for backward compatibility only and ignored:
            :func:`count_matrix` has no such option, and forwarding it made
            every call fail with a TypeError. The counting mode is fixed by
            the ``count_mode`` given to :func:`count_lagged`.

        """
        C = self.count_matrix(connected_set=connected_set, subset=subset)
        return C.sum(axis=1)

    @property
    def total_count_lagged(self, connected_set=None, subset=None, effective=False):
        """ Total number of lagged counts

        Because this is a property, only the default argument values are
        reachable through attribute access.
        """
        h = self.histogram_lagged(connected_set=connected_set, subset=subset,
                                  effective=effective)
        return h.sum()

    @property
    def count_matrix_largest(self):
        """The count matrix on the largest connected set """
        return self.count_matrix(connected_set=0)

    @property
    def largest_connected_set(self):
        """ The largest reversible connected set of states """
        self._assert_counted_at_lag()
        return self._lcs

    @property
    def visited_set(self):
        r""" The set of visited states """
        return visited_set(self._dtrajs)

    @property
    def connected_sets(self):
        """ The reversible connected sets of states, sorted by size (descending) """
        self._assert_counted_at_lag()
        return self._connected_sets

    @property
    def connected_set_sizes(self):
        """The numbers of states for each connected set """
        self._assert_counted_at_lag()
        return self._connected_set_sizes