Python TransitionCountModel Examples

Programming Language: Python

Namespace/Package Name: deeptime.markov

Examples at hotexamples.com: 14

Python TransitionCountModel - 14 examples found. These are the top-rated real-world Python examples of deeptime.markov.TransitionCountModel, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

TransitionCountModel(13)

connected_sets(3)

submodel_largest(2)

submodel(1)

Example #1

Show file

    def _find_largest_connected_set(self, connectivity, connectivity_factor, progress_bar=None):
        """Build the count model over all trajectories and restrict it according to ``connectivity``.

        Parameters
        ----------
        connectivity : str or None
            Connectivity criterion. ``None`` returns the unrestricted count model (and emits a warning).
            ``'summed_count_matrix'`` returns the largest connected set of the count model over all
            trajectories. ``'post_hoc_RE'`` and ``'BAR_variance'`` additionally link thermodynamic states
            through the matching ``tram.find_state_transitions_*`` routine.
        connectivity_factor
            Used as ``connectivity_threshold`` for ``'summed_count_matrix'``; forwarded unchanged to the
            ``tram`` routine for ``'post_hoc_RE'`` and ``'BAR_variance'``.
        progress_bar : optional, default=None
            Handed to ``callbacks.Callback`` while the ``tram`` routine runs.

        Returns
        -------
        TransitionCountModel or None
            The (possibly restricted) count model. ``None`` is returned when ``connectivity`` is not one
            of the values listed above.
        """
        estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)

        # make a counts model over all observed samples.
        full_counts_model = estimator.fit_fetch(self.dtrajs)

        if connectivity is None:
            warnings.warn("connectivity type is None. Data has not been restricted to the largest connected set. "
                          "The full counts model has been returned.")
            return full_counts_model

        if connectivity == 'summed_count_matrix':
            # We assume the thermodynamic states have overlap when they contain counts from the same markov state.
            # Full counts model contains the sum of the state counts over all therm. states., and we simply ignore
            # the thermodynamic state indices.
            return full_counts_model.submodel_largest(
                connectivity_threshold=connectivity_factor,
                directed=True)

        if connectivity in ['post_hoc_RE', 'BAR_variance']:
            # size of the ravelled (thermodynamic state, markov state) index space
            dim = self.n_therm_states * self.n_markov_states

            # get state counts for each trajectory (=for each therm. state)
            all_state_counts = np.asarray([estimator.fit_fetch(dtraj).state_histogram for dtraj in self.dtrajs],
                                          dtype=object)
            # pad with zero's so they are all the same size and easier for the cpp module to handle
            all_state_counts = to_zero_padded_array(all_state_counts, self.n_markov_states)

            # get list of all possible transitions between thermodynamic states. A transition is only possible when two
            # thermodynamic states have an overlapping markov state. Whether the markov state overlaps depends on the
            # sampled data and the connectivity settings and is computed in find_state_transitions:
            if connectivity == 'post_hoc_RE':
                connectivity_fn = tram.find_state_transitions_post_hoc_RE
            else:
                connectivity_fn = tram.find_state_transitions_BAR_variance

            with callbacks.Callback(progress_bar, dim, "Finding connected sets") as callback:
                (i_s, j_s) = connectivity_fn(self.ttrajs, self.dtrajs, self.bias_matrices, all_state_counts,
                                             self.n_therm_states, self.n_markov_states, connectivity_factor,
                                             callback)

            # add transitions that occurred within each thermodynamic state. These are simply the connected sets:
            for k in range(self.n_therm_states):
                offset = k * self.n_markov_states
                for cset in estimator.fit_fetch(self.dtrajs[k]).connected_sets():
                    # chain consecutive members of the set so that the whole set ends up linked
                    i_s.extend(list(cset[0:-1] + offset))
                    j_s.extend(list(cset[1:] + offset))

            # turn the list of transitions into a boolean matrix that has a one whenever a transition has occurred
            data = np.ones(len(i_s), dtype=np.int32)
            sparse_transition_counts = sp.sparse.coo_matrix((data, (i_s, j_s)), shape=(dim, dim))

            # Now we have all possible paths in the list of transitions. Get the connected set of that
            overlap_counts_model = TransitionCountModel(sparse_transition_counts)
            connected_states_ravelled = overlap_counts_model.submodel_largest(directed=False).state_symbols

            # unravel the index and combine all separate csets to one cset
            connected_states = np.unravel_index(connected_states_ravelled, (self.n_therm_states, self.n_markov_states),
                                                order='C')

            # only the markov state component (second index) selects the submodel
            return full_counts_model.submodel(np.unique(connected_states[1]))

        # NOTE(review): any other connectivity value is not handled here and yields None, as before.
        return None

Example #2

Show file

    def _construct_count_models(self, dtraj_fragments):
        """ Construct a TransitionCountModel for each thermodynamic state based on the dtraj_fragments.

        Parameters
        ----------
        dtraj_fragments: list(list(int))
           A list that contains for each thermodynamic state the fragments from all trajectories that were sampled at
           that thermodynamic state. dtraj_fragments[k][i] defines the i-th fragment sampled at thermodynamic state k.
           The fragments should be restricted to the largest connected set and not contain any negative state indices.

        Returns
        -------
        count_models : list(TransitionCountModel)
            One count model per thermodynamic state. States without any fragment longer than the lag time get
            a count model with an all-zero count matrix.
        """

        estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)
        count_models = []

        for k in range(self.n_therm_states):
            fragments = dtraj_fragments[k]

            # a fragment can only contribute a transition if it is longer than the lag time
            if len(fragments) == 0 or all(len(frag) <= self.lagtime for frag in fragments):
                warnings.warn(f"No transitions for thermodynamic state {k} after cutting the trajectories into "
                              f"fragments that start at each replica exchange swap. Replica exchanges possibly occur "
                              f"within the span of the lag time.")
                # there are no samples from this state that belong to the connected set. Make an empty count model.
                # The shape has to be passed as a tuple; a second positional argument would be read as the dtype.
                count_models.append(TransitionCountModel(np.zeros((self.n_markov_states, self.n_markov_states))))
            else:
                # make a counts model for the samples that belong to the connected set.
                count_models.append(estimator.fit_fetch(fragments))
        return count_models

Example #3

Show file

File: test_tram_model.py Project: thempel/scikit-time

def make_random_model(n_therm_states, n_markov_states, transition_matrices=None):
    """Create a TRAMModel with all-zero transition counts and randomly drawn parameters.

    If no ``transition_matrices`` are given, random row-stochastic matrices of shape
    ``(n_therm_states, n_markov_states, n_markov_states)`` are generated.
    """
    per_state_shape = (n_therm_states, n_markov_states)

    if transition_matrices is None:
        # random non-negative entries, rows rescaled to sum to one
        raw = np.random.rand(n_therm_states, n_markov_states, n_markov_states)
        transition_matrices = raw / np.sum(raw, axis=-1, keepdims=True)

    # the order of the random draws below is kept as is so that seeded runs stay reproducible
    energies = np.random.rand(*per_state_shape)
    lagrangian_mult = np.random.rand(*per_state_shape)
    log_modified_counts = np.log(np.random.rand(*per_state_shape))

    # one (empty) count model per thermodynamic state
    zero_counts = np.zeros((n_therm_states, n_markov_states, n_markov_states))
    count_models = []
    for counts in zero_counts:
        count_models.append(TransitionCountModel(counts))

    return TRAMModel(
        count_models,
        transition_matrices,
        biased_conf_energies=energies,
        lagrangian_mult_log=lagrangian_mult,
        modified_state_counts_log=log_modified_counts,
    )

Example #4

Show file

File: _bayesian_hmm.py Project: thempel/scikit-time

 def _append_sample(self, models, prior, sample_model):
     """Append a HiddenMarkovModel built from a deep copy of ``sample_model`` to ``models``.

     The count model of the appended HMM uses the copied sample's counts and the lag time of ``prior``.
     """
     snapshot = deepcopy(sample_model)

     # The Viterbi path is needed to get a new transition matrix for each model,
     # but it is dropped from the stored copy unless hidden trajectories are kept.
     if not self.store_hidden:
         snapshot.hidden_trajs.clear()

     # snapshot is defined on the full space, so observation symbols are also observation states
     counts = TransitionCountModel(snapshot.counts, lagtime=prior.lagtime)
     transition_model = MarkovStateModel(
         snapshot.transition_matrix,
         stationary_distribution=snapshot.stationary_distribution,
         reversible=self.reversible,
         count_model=counts,
     )
     hmm = HiddenMarkovModel(
         transition_model=transition_model,
         output_model=snapshot.output_model,
         initial_distribution=snapshot.initial_distribution,
         hidden_state_trajectories=snapshot.hidden_trajs,
     )
     models.append(hmm)

Example #5

Show file

def test_invalid_arguments():
    """Every malformed input listed below has to be rejected with a ValueError."""
    # Each entry is deferred so that construction as well as fitting happens inside assert_raises.
    failing_calls = (
        # negative counts
        lambda: MaximumLikelihoodMSM().fit(-1 * np.ones((5, 5))).fetch_model(),
        # non quadratic count matrix
        lambda: MaximumLikelihoodMSM().fit(np.ones((3, 5))).fetch_model(),
        # stationary distribution not over whole state space
        lambda: MaximumLikelihoodMSM(
            stationary_distribution_constraint=np.array([1 / 3, 1 / 3, 1 / 3])
        ).fit(np.ones((5, 5))),
        # no counts but statdist constraint
        lambda: MaximumLikelihoodMSM(
            stationary_distribution_constraint=np.array([.5, .5])
        ).fit(np.zeros((2, 2))),
        # fit with transition count estimator that hasn't been fit
        lambda: MaximumLikelihoodMSM().fit(TransitionCountEstimator(1, "sliding")),
        # fit with bogus object
        lambda: MaximumLikelihoodMSM().fit(object()),
        # fit from timeseries without lagtime
        lambda: MaximumLikelihoodMSM().fit(np.array([0, 1, 2, 3, 4, 5, 6])),
        # empty collection is not allowed
        lambda: MarkovStateModelCollection([], [], False, [], 1.),
        # number of elements in lists must match
        lambda: MarkovStateModelCollection(
            [np.array([[.5, .5], [.5, .5]])], [], False, [], 1.
        ),
        # number of states in lists must match
        lambda: MarkovStateModelCollection(
            [np.array([[.5, .5], [.5, .5]])], [None], False,
            [TransitionCountModel(np.ones((3, 3)))], 1.
        ),
    )

    for failing_call in failing_calls:
        with assert_raises(ValueError):
            failing_call()

Example #6

Show file

File: augmented_msm.py Project: sklus/scikit-time

    def fit(self, data, *args, **kw):
        r""" Fits an AMM.

        Parameters
        ----------
        data : TransitionCountModel or (N, N) ndarray
            Count matrix over data.
        *args
            scikit-learn compatibility argument
        **kw
            scikit-learn compatibility argument

        Returns
        -------
        self : AugmentedMSMEstimator
            Reference to self.

        Raises
        ------
        ValueError
            If ``data`` is neither a TransitionCountModel nor an ndarray, if a directly passed count matrix is
            not a non-negative square matrix, or if the number of experimental measurements or weights does not
            match the number of columns of ``expectations_by_state``.
        """
        if not isinstance(data, (TransitionCountModel, np.ndarray)):
            raise ValueError("Can only fit on a TransitionCountModel or a count matrix directly.")

        if isinstance(data, np.ndarray):
            if data.ndim != 2 or data.shape[0] != data.shape[1] or np.any(data < 0.):
                raise ValueError("If fitting a count matrix directly, only non-negative square matrices can be used.")
            count_model = TransitionCountModel(data)
        else:
            count_model = data

        if len(self.experimental_measurement_weights) != self.expectations_by_state.shape[1]:
            raise ValueError("Experimental weights must span full observable space.")
        if len(self.experimental_measurements) != self.expectations_by_state.shape[1]:
            raise ValueError("Experimental measurements must span full observable state space.")

        count_matrix = count_model.count_matrix
        if issparse(count_matrix):
            count_matrix = count_matrix.toarray()

        # slice out active states from E matrix
        expectations_selected = self.expectations_by_state[count_model.state_symbols]
        count_matrix_symmetric = 0.5 * (count_matrix + count_matrix.T)
        nonzero_counts = np.nonzero(count_matrix_symmetric)
        counts_row_sums = np.sum(count_matrix, axis=1)
        expectations_confidence_interval = confidence_interval(expectations_selected, conf=self.support_confidence)

        measurements = self.experimental_measurements
        measurement_weights = self.experimental_measurement_weights

        # Determine which experimental values are outside the support as defined by the Confidence interval.
        # Only the indices outside of the support are needed further below.
        count_outside = []
        for index, (confidence_lower, confidence_upper, measurement) in enumerate(zip(
                expectations_confidence_interval[0], expectations_confidence_interval[1], measurements)):
            if measurement < confidence_lower or confidence_upper < measurement:
                self._log.info(f"Experimental value {measurement} is outside the "
                               f"support ({confidence_lower, confidence_upper})")
                count_outside.append(index)

        # A number of initializations
        transition_matrix, stationary_distribution = msmest.transition_matrix(count_matrix, reversible=True,
                                                                              return_statdist=True)
        if issparse(transition_matrix):
            transition_matrix = transition_matrix.toarray()
        # Determine number of slices of R-tensors computable at once with the given cache size
        slices_z = np.floor(self.max_cache / (transition_matrix.nbytes / 1.e6)).astype(int)
        # Optimizer state
        state = AMMOptimizerState(expectations_selected, measurements, measurement_weights,
                                  stationary_distribution, slices_z, count_matrix_symmetric, counts_row_sums)
        ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)

        state.log_likelihoods.append(ll_old)
        # make sure everything is initialized
        state.update_pi_hat()
        state.update_m_hat()
        state.update_Q()
        state.update_X_and_pi()

        ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)
        state.log_likelihood_prev = ll_old
        state.update_G()

        #
        # Main estimation algorithm
        # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria
        # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged
        # (changes are smaller than 1e-3).
        # These do not always converge together, but usually within a few steps of each other.
        # A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1])
        # this yielded results very similar to those with more stringent convergence criteria
        # (changes smaller than 1e-9) with convergence times
        # which are seconds instead of tens of minutes.
        #

        converged = False  # Convergence flag for lagrange multipliers
        i = 0
        die = False  # set when the iteration has to stop after finishing the current step
        while i <= self.maxiter:
            pi_hat_old = state.pi_hat.copy()
            state.update_pi_hat()
            if not np.all(state.pi_hat > 0):
                # revert to the last valid pi_hat and stop after this step
                state.pi_hat = pi_hat_old.copy()
                die = True
                self._log.warning("pihat does not have a finite probability for all states, terminating")
            state.update_m_hat()
            state.update_Q()

            if i > 1:
                X_old = np.copy(state.X)
                state.update_X_and_pi()
                if np.any(state.X[nonzero_counts] < 0):
                    die = True
                    self._log.warning(
                        "Warning: new X is not proportional to C... reverting to previous step and terminating")
                    state.X = X_old

            if not converged:
                self._newton_lagrange(state, count_matrix)
            else:  # once Lagrange multipliers are converged compute likelihood here
                transition_matrix = state.X / state.pi[:, None]
                _ll_new = state.log_likelihood_biased(count_matrix, transition_matrix)
                state.log_likelihoods.append(_ll_new)

            # General case fixed-point iteration
            if len(count_outside) > 0:
                if i > 1 and np.all((np.abs(state.delta_m_hat) / self.uncertainties) < self.convergence_criterion_lagrange)\
                        and not converged:
                    self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                    converged = True
            # Special case
            else:
                if np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
                    self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                    converged = True
            # if Lagrange multipliers are converged, check whether log-likelihood has converged
            if converged and np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
                self._log.info(f"Converged pihat after {i} steps...")
                die = True
            if die:
                break
            if i == self.maxiter:
                ll_diff = np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1])
                # report the same attribute that bounds the loop above
                self._log.info(f"Failed to converge within {i} iterations. Log-likelihoods lastly changed by {ll_diff}."
                               f" Consider increasing maxiter (now={self.maxiter})")
            i += 1

        transition_matrix = msmest.transition_matrix(count_matrix, reversible=True, mu=state.pi_hat)
        self._model = AugmentedMSM(transition_matrix=transition_matrix, stationary_distribution=state.pi_hat,
                                   reversible=True, count_model=count_model, amm_optimizer_state=state)
        return self

Example #7

Show file

def msm():
    """Return a two-state MarkovStateModel together with a matching 2x2 count model."""
    counts = TransitionCountModel([[90, 10],
                                   [10, 90]])
    transition_matrix = [[0.9, 0.1], [0.1, 0.9]]
    return MarkovStateModel(transition_matrix, count_model=counts)

Example #8

Show file

File: test_transition_counts.py Project: thempel/scikit-time

    def _test_submodel(self, histogram):
        """Check a full count model, a two-state submodel of it and a one-state submodel of that submodel.

        ``histogram`` may be None, in which case the histogram-dependent properties are expected to raise
        a RuntimeError.
        """
        # three connected components: ((1, 2), (0), (3))
        count_matrix = np.array([
            [10., 0., 0., 0.],
            [0., 1., 1., 0.],
            [0., 1., 1., 0.],
            [0., 0., 0., 1],
        ])
        has_histogram = histogram is not None
        cset_kwargs = dict(connectivity_threshold=0, directed=True, probability_constraint=None)

        def expect_histogram_properties_to_raise(count_model):
            # without a state histogram these properties cannot be evaluated
            for prop in ("selected_count_fraction", "total_count", "visited_set"):
                with assert_raises(RuntimeError):
                    print(getattr(count_model, prop))

        # ---- full model ----
        model = TransitionCountModel(count_matrix, counting_mode="effective", state_histogram=histogram)
        self._check_submodel_transitive_properties(histogram, count_matrix, model)

        if has_histogram:
            assert_equal(model.selected_count_fraction, 1.)
            assert_equal(model.total_count, 100 + 10 + 10 + 10)
            assert_equal(model.visited_set, [0, 1, 2, 3])
        else:
            expect_histogram_properties_to_raise(model)

        assert_equal(model.count_matrix, count_matrix)
        assert_equal(model.selected_state_fraction, 1.)

        sets = model.connected_sets(**cset_kwargs)
        assert_equal(len(sets), 3)
        assert_equal(len(sets[0]), 2)
        assert_equal(len(sets[1]), 1)
        assert_equal(len(sets[2]), 1)
        assert_equal(model.state_symbols, [0, 1, 2, 3])
        assert_(model.is_full_model)
        assert_equal(model.state_histogram, histogram)
        assert_equal(model.n_states, 4)
        assert 1 in sets[0] and 2 in sets[0], \
            "expected states 1 and 2 in largest connected set, got {}".format(sets[0])

        # ---- submodel on the largest connected set ----
        submodel = model.submodel(sets[0])
        self._check_submodel_transitive_properties(histogram, count_matrix, submodel)
        if has_histogram:
            assert_equal(submodel.state_histogram, [10, 10])
            assert_equal(submodel.selected_count_fraction, 20. / 130.)
            assert_equal(submodel.total_count, 20)
            assert_equal(submodel.visited_set, [0, 1])
        else:
            assert_equal(submodel.state_histogram, None)
            expect_histogram_properties_to_raise(submodel)
        assert_equal(submodel.count_matrix, np.array([[1, 1], [1, 1]]))
        assert_equal(submodel.selected_state_fraction, 0.5)

        sets = submodel.connected_sets(**cset_kwargs)
        assert_equal(len(sets), 1)
        assert_equal(len(sets[0]), 2)
        assert 0 in sets[0] and 1 in sets[0], \
            "states 0 and 1 should be in the connected set, but got {}".format(sets[0])
        assert_equal(submodel.state_symbols, [1, 2])
        assert_(not submodel.is_full_model)
        assert_equal(submodel.n_states, 2)

        # ---- submodel of the submodel ----
        subsubmodel = submodel.submodel([1])
        self._check_submodel_transitive_properties(histogram, count_matrix, subsubmodel)
        if has_histogram:
            assert_equal(subsubmodel.state_histogram, [10])
            assert_equal(subsubmodel.selected_count_fraction, 10. / 130.)
            assert_equal(subsubmodel.total_count, 10)
            assert_equal(subsubmodel.visited_set, [0])
        else:
            assert_equal(subsubmodel.state_histogram, None)
            expect_histogram_properties_to_raise(subsubmodel)
        assert_equal(subsubmodel.count_matrix, np.array([[1]]))
        assert_equal(subsubmodel.selected_state_fraction, 0.25)

        sets = subsubmodel.connected_sets(**cset_kwargs)
        assert_equal(len(sets), 1)
        assert_equal(len(sets[0]), 1)
        assert 0 in sets[0], \
            "state 0 should be in the connected set, but got {}".format(sets[0])
        assert_equal(subsubmodel.state_symbols, [2])
        assert_(not subsubmodel.is_full_model)
        assert_equal(subsubmodel.n_states, 1)

Example #9

Show file

    def fit(self, dtrajs, initial_model=None, **kwargs):
        r""" Estimates a new :class:`HMM <HiddenMarkovModel>` from timeseries data.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Timeseries data.
        initial_model : HiddenMarkovModel, optional, default=None
            Takes precedence over :attr:`initial_transition_model` when given.
        **kwargs
            Ignored; present for scikit-learn compatibility.

        Returns
        -------
        self : MaximumLikelihoodHMM
            Reference to self.

        Raises
        ------
        ValueError
            If no initial model of type HiddenMarkovModel is available.
        """
        if initial_model is None:
            initial_model = self.initial_transition_model
        # isinstance is False for None, so this also covers a missing initial model
        if not isinstance(initial_model, HiddenMarkovModel):
            raise ValueError(
                "For estimation, an initial model of type "
                "`deeptime.markov.hmm.HiddenMarkovModel` is required.")

        # work on a dense copy of the initial transition matrix (toarray already returns a copy)
        initial_tmatrix = initial_model.transition_model.transition_matrix
        if issparse(initial_tmatrix):
            dense_tmatrix = initial_tmatrix.toarray()
        else:
            dense_tmatrix = np.copy(initial_tmatrix)

        hmm_data = MaximumLikelihoodHMM._HMMModelStorage(
            transition_matrix=dense_tmatrix,
            output_model=initial_model.output_model.copy(),
            initial_distribution=initial_model.initial_distribution.copy())

        dtrajs = ensure_timeseries_data(dtrajs)
        dtrajs = compute_dtrajs_effective(dtrajs, lagtime=self.lagtime,
                                          n_states=initial_model.n_hidden_states, stride=self.stride)

        # pre-construct hidden variables
        longest = max(len(obs) for obs in dtrajs)
        n_hidden = initial_model.n_hidden_states
        alpha = np.zeros((longest, n_hidden))
        beta = np.zeros((longest, n_hidden))
        gammas = [np.zeros((len(obs), n_hidden)) for obs in dtrajs]
        count_matrices = [np.zeros((n_hidden, n_hidden)) for _ in dtrajs]

        likelihoods = np.empty(self.maxit)
        # Sparsity pattern of the transition matrix. If it changes (e.g. state lost), the likelihood
        # is discontinuous and can't be used as a convergence criterion in that iteration.
        nonzero_pattern = hmm_data.transition_matrix.nonzero()

        n_iterations = 0
        converged = False
        while not converged and n_iterations < self.maxit:
            # accumulate the log-likelihood over all trajectories
            loglik = 0.0
            for obs, gamma, counts in zip(dtrajs, gammas, count_matrices):
                loglik_update, _ = self._forward_backward(hmm_data, obs, alpha, beta, gamma, counts)
                loglik += loglik_update
            assert np.isfinite(loglik), n_iterations

            # convergence check against the previous iteration
            if n_iterations > 0 and loglik - likelihoods[n_iterations - 1] < self.accuracy:
                converged = True

            # update model
            self._update_model(hmm_data, dtrajs, gammas, count_matrices, maxiter=self.maxit_reversible)

            # connectivity change check
            new_pattern = hmm_data.transition_matrix.nonzero()
            if not np.array_equal(nonzero_pattern, new_pattern):
                converged = False  # unset converged
                nonzero_pattern = new_pattern

            # end of iteration
            likelihoods[n_iterations] = loglik
            n_iterations += 1

        # keep only the entries that were actually written
        likelihoods = np.resize(likelihoods, n_iterations)

        count_model = TransitionCountModel(count_matrix=self._reduce_transition_counts(count_matrices),
                                           lagtime=self.lagtime)
        transition_model = MarkovStateModel(hmm_data.transition_matrix, reversible=self.reversible,
                                            count_model=count_model)

        hidden_state_trajs = []
        for obs in dtrajs:
            state_probabilities = hmm_data.output_model.to_state_probability_trajectory(obs)
            hidden_state_trajs.append(
                viterbi(hmm_data.transition_matrix, state_probabilities, hmm_data.initial_distribution))

        self._model = HiddenMarkovModel(
            transition_model=transition_model,
            output_model=hmm_data.output_model,
            initial_distribution=hmm_data.initial_distribution,
            likelihoods=likelihoods,
            state_probabilities=gammas,
            initial_count=self._init_counts(gammas),
            hidden_state_trajectories=hidden_state_trajs,
            stride=self.stride)
        return self

Example #10

Show file

    def submodel(self,
                 states=None,
                 obs=None,
                 mincount_connectivity='1/n',
                 inplace=False):
        """Returns a HMM with restricted state space

        Parameters
        ----------
        states : None, str or int-array
            Hidden states to restrict the model to. In addition to specifying
            the subset, possible options are:
            * None : all states - don't restrict
            * 'populous-strong' : strongly connected subset with maximum counts
            * 'populous-weak' : weakly connected subset with maximum counts
            * 'largest-strong' : strongly connected subset with maximum size
            * 'largest-weak' : weakly connected subset with maximum size
        obs : None, str or int-array
            Observed states to restrict the model to. In addition to specifying
            an array with the state labels to be observed, possible options are:
            * None : all states - don't restrict
            * 'nonempty' : all states with at least one observation in the estimator
        mincount_connectivity : float or '1/n'
            minimum number of counts to consider a connection between two states.
            Counts lower than that will count zero in the connectivity check and
            may thus separate the resulting transition matrix. Default value:
            1/nstates.
        inplace : Bool
            if True, submodel is estimated in-place, overwriting the original
            estimator and possibly discarding information. Default value: False

        Returns
        -------
        hmm : HMM
            The restricted HMM. This is ``self`` when ``inplace`` is True or when
            nothing is to be restricted (``states`` and ``obs`` are None and
            ``mincount_connectivity`` equals 0); otherwise a deep copy of ``self``.

        """
        # nothing to restrict: hand back this very object, not a copy
        if states is None and obs is None and mincount_connectivity == 0:
            return self
        # None means "all": replace by the full index ranges
        if states is None:
            states = _np.arange(self.nstates)
        if obs is None:
            obs = _np.arange(self.nstates_obs)

        # resolve the symbolic default to the actual threshold 1/nstates
        if str(mincount_connectivity) == '1/n':
            mincount_connectivity = 1.0 / float(self.nstates)

        # handle new connectivity
        cm = TransitionCountModel(self.count_matrix)
        S = cm.connected_sets(connectivity_threshold=mincount_connectivity,
                              directed=True)

        # with inplace=True, submodel_estimator and self are the same object from here on
        if inplace:
            submodel_estimator = self
        else:
            from copy import deepcopy
            submodel_estimator = deepcopy(self)
        from deeptime.markov._transition_matrix import stationary_distribution
        if len(S) > 1:
            # keep only non-negligible transitions
            C = _np.zeros(self.count_matrix.shape)
            large = _np.where(self.count_matrix >= mincount_connectivity)
            C[large] = self.count_matrix[large]
            for s in S:  # keep all (also small) transition counts within strongly connected subsets
                C[_np.ix_(s, s)] = self.count_matrix[_np.ix_(s, s)]
            # re-estimate transition matrix with disc.
            from deeptime.markov.msm import MaximumLikelihoodMSM
            msmest = MaximumLikelihoodMSM(allow_disconnected=True,
                                          reversible=self.reversible,
                                          connectivity_threshold=0)
            msm = msmest.fit_fetch(C)
            P = msm.transition_matrix
            pi = stationary_distribution(P, C, mincount_connectivity=0)
        else:
            # a single connected set: reuse the current estimates unchanged
            C = self.count_matrix
            P = self.transition_matrix
            pi = self.stationary_distribution

        # determine substates
        if isinstance(states, str):
            # the option string encodes connectivity ('strong' or weak) and the score ('largest' or counts)
            strong = 'strong' in states
            largest = 'largest' in states
            S = cm.connected_sets(connectivity_threshold=mincount_connectivity,
                                  directed=strong)
            if largest:
                score = [len(s) for s in S]
            else:
                score = [self.count_matrix[_np.ix_(s, s)].sum() for s in S]
            states = _np.array(S[_np.argmax(score)])
        # NOTE(review): states was replaced by an index range above when it was None,
        # so this condition looks always true at this point - confirm.
        if states is not None:  # sub-transition matrix
            submodel_estimator._active_set = states
            C = C[_np.ix_(states, states)].copy()
            P = P[_np.ix_(states, states)].copy()
            # renormalize the rows of the restricted transition matrix
            P /= P.sum(axis=1)[:, None]
            pi = stationary_distribution(P, C)
            submodel_estimator.initial_count = self.initial_count[states]
            submodel_estimator.initial_distribution = self.initial_distribution[
                states] / self.initial_distribution[states].sum()

        # determine observed states
        if str(obs) == 'nonempty':
            obs = _np.where(
                count_states(self.discrete_trajectories_lagged) > 0)[0]
        # NOTE(review): obs was replaced by an index range above when it was None,
        # so the else branch below looks unreachable - confirm.
        if obs is not None:
            # set observable set
            # NOTE(review): obs.size assumes an ndarray-like obs; a plain list would fail here - confirm callers.
            submodel_estimator._observable_set = obs
            submodel_estimator._nstates_obs = obs.size
            # full2active mapping
            # states outside of obs map to -1
            _full2obs = -1 * _np.ones(self._nstates_obs_full, dtype=int)
            _full2obs[obs] = _np.arange(len(obs), dtype=int)
            # observable trajectories
            submodel_estimator._dtrajs_obs = []
            for dtraj in self.discrete_trajectories_full:
                submodel_estimator._dtrajs_obs.append(_full2obs[dtraj])

            # observation matrix
            B = self.observation_probabilities[_np.ix_(states, obs)].copy()
            B /= B.sum(axis=1)[:, None]
        else:
            B = self.observation_probabilities

        # set quantities back.
        submodel_estimator.update_model_params(P=P, pobs=B, pi=pi)
        submodel_estimator.count_matrix_EM = self.count_matrix[_np.ix_(
            states, states)]  # unchanged count matrix
        submodel_estimator.count_matrix = C  # count matrix consistent with P
        return submodel_estimator

Example #11

Show file

    def _estimate(self, dtrajs):
        """Estimate a discrete hidden Markov model from ``dtrajs``.

        The method validates the lag time against the trajectory lengths,
        resolves an ``'effective'`` stride into a number, builds an initial
        HMM according to ``self.msm_init``, runs ``MaximumLikelihoodHMM`` on
        the data, copies the fitted quantities onto ``self`` and finally
        delegates to ``self.submodel(..., inplace=True)``.

        Parameters
        ----------
        dtrajs : discrete trajectories
            Converted with ``_types.ensure_dtraj_list`` before use.

        Returns
        -------
        The value returned by ``self.submodel(..., inplace=True)``.

        Raises
        ------
        ValueError
            If ``self.lag`` is not smaller than the longest trajectory, or if
            ``self.msm_init`` is neither a known string option nor an MSM
            estimator instance.
        """
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # CHECK LAG
        trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
        if self.lag >= _np.max(trajlengths):
            raise ValueError('Illegal lag time ' + str(self.lag) +
                             ' exceeds longest trajectory length')
        # a lag above the mean trajectory length is allowed but only warned about
        if self.lag > _np.mean(trajlengths):
            self.logger.warning(
                'Lag time ' + str(self.lag) +
                ' is on the order of mean trajectory length ' +
                str(_np.mean(trajlengths)) +
                '. It is recommended to fit four lag times in each ' +
                'trajectory. HMM might be inaccurate.')

        # EVALUATE STRIDE
        # NOTE: this branch overwrites the 'effective' setting stored in
        # self.stride with a concrete number.
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the non-reversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs,
                                           lag=self.lag,
                                           reversible=False,
                                           sparse=False,
                                           connectivity='largest',
                                           dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        category=ImaginaryEigenValueWarning,
                        module=
                        'deeptime.markov.tools.analysis.dense.decomposition')
                    # the correlation time estimate is never taken below 1
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # LAG AND STRIDE DATA
        # NOTE(review): n_states=-1 presumably lets the helper infer the number of states - confirm
        from deeptime.markov import compute_dtrajs_effective
        dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs,
                                                         self.lag,
                                                         n_states=-1,
                                                         stride=self.stride)

        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # INIT HMM
        # Three ways to obtain the initial HMM: two string options that build
        # it from the data, or an already estimated MSM estimator instance.
        # All of them pass stationary=True to the initialisation routine.
        from deeptime.markov.hmm import init
        from pyemma.msm.estimators import MaximumLikelihoodMSM
        from pyemma.msm.estimators import OOMReweightedMSM
        if self.msm_init == 'largest-strong':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                mode='largest-regularized',
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
        elif self.msm_init == 'all':
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate,
                mode='all-regularized')
        elif isinstance(
                self.msm_init,
            (MaximumLikelihoodMSM, OOMReweightedMSM)):  # initial MSM given.
            # wrap transition matrix and active count matrix of the given
            # estimator into deeptime model objects
            msm = MarkovStateModel(transition_matrix=self.msm_init.P,
                                   count_model=TransitionCountModel(
                                       self.msm_init.count_matrix_active))
            hmm_init = init.discrete.metastable_from_msm(
                msm,
                n_hidden_states=self.nstates,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
            observe_subset = self.msm_init.active_set  # override observe_subset.
        else:
            raise ValueError('Unknown MSM initialization option: ' +
                             str(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------

        # run EM
        # unlike the initialisation above, the estimator uses self.stationary
        from deeptime.markov.hmm import MaximumLikelihoodHMM
        hmm_est = MaximumLikelihoodHMM(hmm_init,
                                       lagtime=self.lag,
                                       stride=self.stride,
                                       reversible=self.reversible,
                                       stationary=self.stationary,
                                       accuracy=self.accuracy,
                                       maxit=self.maxit)
        # run
        hmm_est.fit(dtrajs)
        # package in discrete HMM
        self.hmm = hmm_est.fetch_model()

        # get model parameters
        self.initial_distribution = self.hmm.initial_distribution
        transition_matrix = self.hmm.transition_model.transition_matrix
        observation_probabilities = self.hmm.output_probabilities

        # get estimation parameters
        self.likelihoods = self.hmm.likelihoods  # Likelihood history
        self.likelihood = self.likelihoods[-1]
        self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
        self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
        self.initial_count = self.hmm.initial_count  # hidden init count
        # before any submodel restriction all hidden states are active
        self._active_set = _np.arange(self.nstates)

        # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged_strided
        # the full count is taken from the input data, the observed count
        # from the lagged and strided data
        self._nstates_obs_full = number_of_states(dtrajs)
        self._nstates_obs = number_of_states(dtrajs_lagged_strided)
        self._observable_set = _np.arange(self._nstates_obs)
        self._dtrajs_obs = dtrajs
        self.set_model_params(P=transition_matrix,
                              pobs=observation_probabilities,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        # TODO: perhaps remove connectivity and just rely on .submodel()?
        # deal with connectivity
        # any other value of self.connectivity leaves states_subset at None
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity,
                             inplace=True)

Example #12

Show file

File: test_reactive_flux.py Project: thempel/scikit-time

 def test_dt_model(self):
     """The lagtime given to the count model is reported by the MSM built on it."""
     counts = np.array([[0.1, 0.9], [0.9, 0.1]])
     count_model = TransitionCountModel(counts, lagtime=5)
     model = MarkovStateModel(count_model.count_matrix, count_model=count_model)
     # the flux computation only has to run; its result is not inspected
     model.reactive_flux([0], [1])
     np.testing.assert_equal(model.lagtime, 5)

Example #13

Show file

    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets (with their
        sizes) and the mapping from full state indices to the largest
        connected set on the instance.

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. It is
            lower-cased before use. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        mincount_connectivity : float or '1/n', optional, default='1/n'
            Connectivity threshold handed to the count model when determining
            connected sets. The string '1/n' is replaced by one over the
            number of rows of the count matrix.

        show_progress: bool, default=True
            Accepted for backward compatibility; not used by this method.

        n_jobs: int or None
            Accepted for backward compatibility; not used by this method.

        name : str, optional, default=''
            Accepted for backward compatibility; not used by this method.

        core_set : optional, default=None
            Only tested against None. If given and ``count_mode`` is
            'sliding' or 'sample', frames labelled -1 are reassigned
            according to ``milestoning_method`` before counting.

        milestoning_method : str, optional, default='last_core'
            Only 'last_core' is implemented.

        Raises
        ------
        NotImplementedError
            If milestone counting is requested with a method other than
            'last_core'.
        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                # NOTE: this overwrites the stored trajectories in place.
                for d in self._dtrajs:
                    # a leading -1 has no previously visited core to fall back to
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        # np.roll(mask, -1) selects the frame directly before
                        # each -1 frame; runs of -1 shrink by one per pass
                        d[mask] = d[np.roll(mask, -1)]
                self._C = count_matrix(self._dtrajs,
                                       lag,
                                       sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(
                        milestoning_method))
        else:
            cm = TransitionCountEstimator(lag,
                                          count_mode=count_mode,
                                          sparse=True).fit(
                                              self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # set sizes and count matrices on reversibly connected sets
        # A list is required here: np.array() applied to a generator
        # expression yields a 0-d object array wrapping the generator
        # instead of an array of sizes.
        self._connected_set_sizes = np.array(
            [len(cs) for cs in self._connected_sets], dtype=int)
        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        # states outside the largest connected set map to -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True

Example #14

Show file

class DiscreteTrajectoryStats(object):
    r""" Statistics, count matrices and connectivity from discrete trajectories

    Operates sparse by default.

    Parameters
    ----------
    dtrajs: list containing ndarrays(dtype=int) or ndarray(n, dtype=int)
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory. Elements must be
        non-negative; -1 elements denote unassigned states (milestone
        counting).

    """
    def __init__(self, dtrajs):
        """Store the trajectories and compute the lag-independent statistics.

        Raises ValueError if any trajectory contains an element below -1.
        """
        from pyemma.util.types import ensure_dtraj_list

        # normalise the input to a list of discrete trajectories
        self._dtrajs = ensure_dtraj_list(dtrajs)

        # TODO: extensive input checking!
        # -1 is tolerated, anything smaller is rejected
        has_illegal_entries = any(np.any(traj < -1) for traj in self._dtrajs)
        if has_illegal_entries:
            raise ValueError('Discrete trajectory contains elements < -1.')

        ## basic count statistics
        # state histogram, negative entries are not counted
        self._hist = count_states(self._dtrajs, ignore_negative=True)
        # sum over the histogram
        self._total_count = np.sum(self._hist)
        # number of states, determined from the trajectories as passed in
        self._nstates = number_of_states(dtrajs)

        # lag-based quantities are only available after count_lagged()
        self._counted_at_lag = False

    @staticmethod
    def _compute_connected_sets(C, mincount_connectivity, strong=True):
        """ Computes the connected sets of C after discarding weak counts.

        Entries below the threshold are set to zero on a copy of C; the
        matrix passed in is not modified.

        Parameters
        ----------
        C : count matrix
            Dense array or scipy sparse matrix.
        mincount_connectivity : float
            Minimum count which counts as a connection.
        strong : boolean
            True: Seek strongly connected sets. False: Seek weakly connected sets.

        Returns
        -------
        S
            The result of ``connected_sets`` on the thresholded matrix.
        """
        import scipy.sparse as scs
        if not scs.issparse(C):
            thresholded = C.copy()
            thresholded[thresholded < mincount_connectivity] = 0
        else:
            thresholded = C.tocsr(copy=True)
            weak = thresholded.data < mincount_connectivity
            thresholded.data[weak] = 0
            # drop the explicitly stored zeros again
            thresholded.eliminate_zeros()

        # treat each connected set separately
        return connected_sets(thresholded, directed=strong)

    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets (with their
        sizes) and the mapping from full state indices to the largest
        connected set on the instance.

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. It is
            lower-cased before use. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        mincount_connectivity : float or '1/n', optional, default='1/n'
            Connectivity threshold handed to the count model when determining
            connected sets. The string '1/n' is replaced by one over the
            number of rows of the count matrix.

        show_progress: bool, default=True
            Accepted for backward compatibility; not used by this method.

        n_jobs: int or None
            Accepted for backward compatibility; not used by this method.

        name : str, optional, default=''
            Accepted for backward compatibility; not used by this method.

        core_set : optional, default=None
            Only tested against None. If given and ``count_mode`` is
            'sliding' or 'sample', frames labelled -1 are reassigned
            according to ``milestoning_method`` before counting.

        milestoning_method : str, optional, default='last_core'
            Only 'last_core' is implemented.

        Raises
        ------
        NotImplementedError
            If milestone counting is requested with a method other than
            'last_core'.
        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                # NOTE: this overwrites the stored trajectories in place.
                for d in self._dtrajs:
                    # a leading -1 has no previously visited core to fall back to
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        # np.roll(mask, -1) selects the frame directly before
                        # each -1 frame; runs of -1 shrink by one per pass
                        d[mask] = d[np.roll(mask, -1)]
                self._C = count_matrix(self._dtrajs,
                                       lag,
                                       sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(
                        milestoning_method))
        else:
            cm = TransitionCountEstimator(lag,
                                          count_mode=count_mode,
                                          sparse=True).fit(
                                              self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # set sizes and count matrices on reversibly connected sets
        # A list is required here: np.array() applied to a generator
        # expression yields a 0-d object array wrapping the generator
        # instead of an array of sizes.
        self._connected_set_sizes = np.array(
            [len(cs) for cs in self._connected_sets], dtype=int)
        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        # states outside the largest connected set map to -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True

    # ==================================
    # Permanent properties
    # ==================================

    def _assert_counted_at_lag(self):
        """
        Asserts that count_lagged has been run on this instance
        """
        not_counted_msg = "You haven't run count_lagged yet. Do that first before accessing lag-based quantities"
        assert self._counted_at_lag, not_counted_msg

    def _assert_subset(self, A):
        """
        Asserts that no state in A exceeds the largest known state index

        Only the maximum of A is compared against the number of states.

        Parameters
        ----------
        A : int or int array
            set of states

        Returns
        -------
        True for an empty A, None otherwise.
        """
        is_empty = np.size(A) == 0
        if is_empty:
            # empty set is always contained
            return True
        largest_state = np.max(A)
        assert largest_state < self._nstates, \
            'Chosen set contains states that are not included in the data.'

    @property
    def nstates(self):
        """
        Number (int) of states, as determined by ``number_of_states`` on the
        input trajectories when the object was constructed
        """
        return self._nstates

    @property
    @alias('dtrajs')
    def discrete_trajectories(self):
        """
        A list of integer arrays with the original (unmapped) discrete trajectories.

        This is the list stored by ``__init__``, not a copy. Note that
        :func:`count_lagged` with milestone counting overwrites -1 frames of
        these arrays in place.
        """
        return self._dtrajs

    @property
    def total_count(self):
        """
        Total number of counts, i.e. the sum over the state histogram

        """
        return self._hist.sum()

    @property
    @alias('hist')
    def histogram(self):
        r""" Histogram of discrete state counts

        Computed once in ``__init__`` with ``count_states(...,
        ignore_negative=True)``.
        """
        return self._hist

    # ==================================
    # Estimated properties
    # ==================================

    @property
    def lag(self):
        """
        The lag time that was passed to the last call of :func:`count_lagged`

        Asserts that :func:`count_lagged` has been run.
        """
        self._assert_counted_at_lag()
        return self._lag

    def count_matrix(self, connected_set=None, subset=None):
        r"""The count matrix

        Without arguments the full count matrix is returned; otherwise it is
        restricted to the requested states. Asserts that :func:`count_lagged`
        has been run.

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index. See :func:`connected_sets` to get a sorted list of connected sets.
            This parameter is exclusive with subset.
        subset : array-like of int or None, optional, default=None
            subset of states to compute the count matrix on. This parameter is exclusive with
            connected_set.

        Raises
        ------
        ValueError
            If both connected_set and subset are given.

        References
        ----------

        ..[1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
            Reversible Markov models of molecular kinetics: Estimation and uncertainty.
            in preparation.
        """
        self._assert_counted_at_lag()
        if connected_set is not None and subset is not None:
            raise ValueError('Can\'t set both connected_set and subset.')

        if subset is not None:
            # explicit state selection, checked against the known states first
            self._assert_subset(subset)
            return submatrix(self._C, subset)
        if connected_set is not None:
            chosen_states = self._connected_sets[connected_set]
            return submatrix(self._C, chosen_states)
        # full matrix wanted
        return self._C

    @alias('hist_lagged')
    def histogram_lagged(self,
                         connected_set=None,
                         subset=None,
                         effective=False):
        r""" Histogram of lagged state counts (row sums of the count matrix)

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index, forwarded to :func:`count_matrix`.
        subset : array-like of int or None, optional, default=None
            subset of states, forwarded to :func:`count_matrix`. Exclusive
            with connected_set.
        effective : bool, optional, default=False
            Retained for backward compatibility and ignored. The kind of
            counts is fixed by the ``count_mode`` given to
            :func:`count_lagged`.

        Returns
        -------
        The sum of the selected count matrix over axis 1.
        """
        # count_matrix() has no ``effective`` parameter; forwarding it made
        # every call of this method fail with a TypeError.
        C = self.count_matrix(connected_set=connected_set, subset=subset)
        return C.sum(axis=1)

    @property
    def total_count_lagged(self,
                           connected_set=None,
                           subset=None,
                           effective=False):
        """Total number of lagged counts, i.e. the sum over the count matrix

        This is a property, so the getter is always invoked with the default
        arguments and the value refers to the full count matrix. The extra
        parameters are kept only so that the signature stays unchanged;
        ``effective`` is ignored.
        """
        # Sum the count matrix directly: count_matrix() accepts no
        # ``effective`` argument, so passing it along raised a TypeError on
        # every access of this property.
        C = self.count_matrix(connected_set=connected_set, subset=subset)
        return C.sum()

    @property
    def count_matrix_largest(self):
        """The count matrix on the largest connected set

        Equivalent to ``count_matrix(connected_set=0)``, i.e. the first entry
        of the stored connected sets is used.
        """
        return self.count_matrix(connected_set=0)

    @property
    def largest_connected_set(self):
        """
        The largest reversible connected set of states

        This is the value stored by :func:`count_lagged`: the first connected
        set, or an empty integer array if that set carries no counts.
        Asserts that :func:`count_lagged` has been run.
        """
        self._assert_counted_at_lag()
        return self._lcs

    @property
    def visited_set(self):
        r""" The set of visited states

        Computed on every access by the module-level ``visited_set`` helper
        applied to the stored trajectories.
        """
        return visited_set(self._dtrajs)

    @property
    def connected_sets(self):
        """
        The reversible connected sets of states, as computed by
        :func:`count_lagged`

        NOTE(review): presumably sorted by size (descending), since
        count_lagged treats the first entry as the largest set - confirm
        against the count model's ``connected_sets``.
        Asserts that :func:`count_lagged` has been run.
        """
        self._assert_counted_at_lag()
        return self._connected_sets

    @property
    def connected_set_sizes(self):
        """The numbers of states for each connected set

        Returns the value stored by :func:`count_lagged`. Asserts that
        :func:`count_lagged` has been run.
        """
        self._assert_counted_at_lag()
        return self._connected_set_sizes