Example #1
0
    def _find_largest_connected_set(self, connectivity, connectivity_factor, progress_bar=None):
        """ Build a count model over all trajectories and restrict it according to the connectivity mode.

        Parameters
        ----------
        connectivity : str or None
            One of None, 'summed_count_matrix', 'post_hoc_RE' or 'BAR_variance'. With None, the unrestricted
            count model is returned and a warning is issued.
        connectivity_factor : float
            Forwarded as `connectivity_threshold` to `submodel_largest` for 'summed_count_matrix', and as the
            connectivity setting to the `tram.find_state_transitions_*` function for the other two modes.
        progress_bar : object, optional, default=None
            Handed to `callbacks.Callback` while the transitions between thermodynamic states are searched.

        Returns
        -------
        counts_model : TransitionCountModel or None
            The (restricted) count model. If `connectivity` matches none of the values above, no branch is
            taken and the method implicitly returns None.
        """
        estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)

        # make a counts model over all observed samples.
        full_counts_model = estimator.fit_fetch(self.dtrajs)

        if connectivity is None:
            warnings.warn("connectivity type is None. Data has not been restricted to the largest connected set. "
                          "The full counts model has been returned.")
            return full_counts_model

        if connectivity == 'summed_count_matrix':
            # We assume the thermodynamic states have overlap when they contain counts from the same markov state.
            # Full counts model contains the sum of the state counts over all therm. states., and we simply ignore
            # the thermodynamic state indices.
            return full_counts_model.submodel_largest(
                connectivity_threshold=connectivity_factor,
                directed=True)

        if connectivity in ['post_hoc_RE', 'BAR_variance']:
            # get state counts for each trajectory (=for each therm. state)
            all_state_counts = np.asarray([estimator.fit_fetch(dtraj).state_histogram for dtraj in self.dtrajs],
                                          dtype=object)
            # pad with zero's so they are all the same size and easier for the cpp module to handle
            all_state_counts = to_zero_padded_array(all_state_counts, self.n_markov_states)

            # get list of all possible transitions between thermodynamic states. A transition is only possible when two
            # thermodynamic states have an overlapping markov state. Whether the markov state overlaps depends on the
            # sampled data and the connectivity settings and is computed in find_state_transitions:
            if connectivity == 'post_hoc_RE':
                connectivity_fn = tram.find_state_transitions_post_hoc_RE
            else:
                connectivity_fn = tram.find_state_transitions_BAR_variance

            with callbacks.Callback(progress_bar, self.n_therm_states * self.n_markov_states,
                                    "Finding connected sets") as callback:
                (i_s, j_s) = connectivity_fn(self.ttrajs, self.dtrajs, self.bias_matrices, all_state_counts,
                                             self.n_therm_states, self.n_markov_states, connectivity_factor,
                                             callback)

            # add transitions that occurred within each thermodynamic state. These are simply the connected sets.
            # Each set is chained (state m -> state m+1) in the ravelled (therm. state, markov state) index space.
            for k in range(self.n_therm_states):
                for cset in estimator.fit_fetch(self.dtrajs[k]).connected_sets():
                    i_s.extend(list(cset[0:-1] + k * self.n_markov_states))
                    j_s.extend(list(cset[1:] + k * self.n_markov_states))

            # turn the list of transitions into a boolean matrix that has a one whenever a transition has occurred
            data = np.ones(len(i_s), dtype=np.int32)
            dim = self.n_therm_states * self.n_markov_states
            sparse_transition_counts = sp.sparse.coo_matrix((data, (i_s, j_s)), shape=(dim, dim))

            # Now we have all possible paths in the list of transitions. Get the connected set of that
            overlap_counts_model = TransitionCountModel(sparse_transition_counts)
            connected_states_ravelled = overlap_counts_model.submodel_largest(directed=False).state_symbols

            # unravel the index and combine all separate csets to one cset
            connected_states = np.unravel_index(connected_states_ravelled, (self.n_therm_states, self.n_markov_states),
                                                order='C')

            # index 1 of the unravelled tuple holds the markov state indices; the therm. state index is dropped.
            return full_counts_model.submodel(np.unique(connected_states[1]))
Example #2
0
    def _construct_count_models(self, dtraj_fragments):
        """ Construct a TransitionCountModel for each thermodynamic state based on the dtraj_fragments.

        Parameters
        ----------
        dtraj_fragments: list(list(int))
           A list that contains for each thermodynamic state the fragments from all trajectories that were sampled at
           that thermodynamic state. dtraj_fragments[k][i] defines the i-th fragment sampled at thermodynamic state k.
           The fragments should be restricted to the largest connected set and not contain any negative state indices.

        Returns
        -------
        count_models : list(TransitionCountModel)
            One count model per thermodynamic state. States without any fragment longer than the lag time get a
            count model with an all-zero (n_markov_states, n_markov_states) count matrix.
        """

        estimator = TransitionCountEstimator(lagtime=self.lagtime, count_mode=self.count_mode)
        count_models = []

        for k in range(self.n_therm_states):
            fragments = dtraj_fragments[k]

            if len(fragments) == 0 or all(len(frag) <= self.lagtime for frag in fragments):
                warnings.warn(f"No transitions for thermodynamic state {k} after cutting the trajectories into "
                              f"fragments that start at each replica exchange swap. Replica exchanges possibly occur "
                              f"within the span of the lag time.")
                # there are no samples from this state that belong to the connected set. Make an empty count model.
                # The shape has to be passed as a tuple: a second positional argument to np.zeros is the dtype.
                count_models.append(TransitionCountModel(np.zeros((self.n_markov_states, self.n_markov_states))))
            else:
                # make a counts model for the samples that belong to the connected set.
                count_models.append(estimator.fit_fetch(fragments))
        return count_models
Example #3
0
def make_random_model(n_therm_states, n_markov_states, transition_matrices=None):
    """Create a TRAMModel with empty count models and randomly drawn parameters.

    If no transition matrices are passed, one row-stochastic random matrix is drawn per thermodynamic state.
    Count matrices are all zero; energies, lagrangian multipliers and modified state counts are random.
    """
    per_state_shape = (n_therm_states, n_markov_states)
    empty_counts = np.zeros(per_state_shape + (n_markov_states,))

    if transition_matrices is None:
        # draw random non-negative entries and normalize each row so it sums to one
        transition_matrices = np.random.rand(*per_state_shape, n_markov_states)
        transition_matrices /= np.sum(transition_matrices, axis=-1, keepdims=True)

    # the random draws below are kept in this order so that seeded runs stay reproducible
    energies = np.random.rand(*per_state_shape)
    multipliers = np.random.rand(*per_state_shape)
    log_modified_counts = np.log(np.random.rand(*per_state_shape))

    count_models = []
    for counts in empty_counts:
        count_models.append(TransitionCountModel(counts))

    return TRAMModel(count_models,
                     transition_matrices,
                     biased_conf_energies=energies,
                     lagrangian_mult_log=multipliers,
                     modified_state_counts_log=log_modified_counts)
Example #4
0
 def _append_sample(self, models, prior, sample_model):
     """Append a HiddenMarkovModel built from a deep copy of `sample_model` to `models`.

     The lag time of the attached count model is taken from `prior`. Unless `self.store_hidden` is set,
     the hidden trajectories of the copy are cleared before the model is assembled.
     """
     # Work on a snapshot so that later changes to the sampler state do not leak into the stored model.
     snapshot = deepcopy(sample_model)

     # the Viterbi path is discarded, but is needed to get a new transition matrix for each model.
     if not self.store_hidden:
         snapshot.hidden_trajs.clear()

     # potentially restrict sampled models to observed space
     # since the snapshot is defined on full space, observation_symbols are also observation states
     counts = TransitionCountModel(snapshot.counts, lagtime=prior.lagtime)
     hidden_msm = MarkovStateModel(snapshot.transition_matrix,
                                   stationary_distribution=snapshot.stationary_distribution,
                                   reversible=self.reversible,
                                   count_model=counts)
     sampled_hmm = HiddenMarkovModel(transition_model=hidden_msm,
                                     output_model=snapshot.output_model,
                                     initial_distribution=snapshot.initial_distribution,
                                     hidden_state_trajectories=snapshot.hidden_trajs)
     models.append(sampled_hmm)
Example #5
0
def test_invalid_arguments():
    """Each of the listed calls has to be rejected with a ValueError."""
    invalid_calls = (
        # negative counts
        lambda: MaximumLikelihoodMSM().fit(-1 * np.ones((5, 5))).fetch_model(),
        # non quadratic count matrix
        lambda: MaximumLikelihoodMSM().fit(np.ones((3, 5))).fetch_model(),
        # stationary distribution not over whole state space
        lambda: MaximumLikelihoodMSM(
            stationary_distribution_constraint=np.array([1 / 3, 1 / 3, 1 / 3])
        ).fit(np.ones((5, 5))),
        # no counts but statdist constraint
        lambda: MaximumLikelihoodMSM(
            stationary_distribution_constraint=np.array([.5, .5])
        ).fit(np.zeros((2, 2))),
        # fit with transition count estimator that hasn't been fit
        lambda: MaximumLikelihoodMSM().fit(TransitionCountEstimator(1, "sliding")),
        # fit with bogus object
        lambda: MaximumLikelihoodMSM().fit(object()),
        # fit from timeseries without lagtime
        lambda: MaximumLikelihoodMSM().fit(np.array([0, 1, 2, 3, 4, 5, 6])),
        # empty collection is not allowed
        lambda: MarkovStateModelCollection([], [], False, [], 1.),
        # number of elements in lists must match
        lambda: MarkovStateModelCollection([np.array([[.5, .5], [.5, .5]])], [], False, [], 1.),
        # number of states in lists must match
        lambda: MarkovStateModelCollection([np.array([[.5, .5], [.5, .5]])], [None], False,
                                           [TransitionCountModel(np.ones((3, 3)))], 1.),
    )

    # the cases run in the listed order; the first one that does not raise fails the test
    for invalid_call in invalid_calls:
        with assert_raises(ValueError):
            invalid_call()
Example #6
0
    def fit(self, data, *args, **kw):
        r""" Fits an AMM.

        Parameters
        ----------
        data : TransitionCountModel or (N, N) ndarray
            Count matrix over data.
        *args
            scikit-learn compatibility argument
        **kw
            scikit-learn compatibility argument

        Returns
        -------
        self : AugmentedMSMEstimator
            Reference to self.

        Raises
        ------
        ValueError
            If `data` is neither a TransitionCountModel nor an ndarray, if a directly passed count matrix is not
            a non-negative square matrix, or if the number of experimental measurements or weights does not
            match the number of columns of `expectations_by_state`.
        """
        if not isinstance(data, (TransitionCountModel, np.ndarray)):
            raise ValueError("Can only fit on a TransitionCountModel or a count matrix directly.")

        if isinstance(data, np.ndarray):
            if data.ndim != 2 or data.shape[0] != data.shape[1] or np.any(data < 0.):
                raise ValueError("If fitting a count matrix directly, only non-negative square matrices can be used.")
            count_model = TransitionCountModel(data)
        else:
            count_model = data

        if len(self.experimental_measurement_weights) != self.expectations_by_state.shape[1]:
            raise ValueError("Experimental weights must span full observable space.")
        if len(self.experimental_measurements) != self.expectations_by_state.shape[1]:
            raise ValueError("Experimental measurements must span full observable state space.")

        count_matrix = count_model.count_matrix
        if issparse(count_matrix):
            count_matrix = count_matrix.toarray()

        # slice out active states from E matrix
        expectations_selected = self.expectations_by_state[count_model.state_symbols]
        count_matrix_symmetric = 0.5 * (count_matrix + count_matrix.T)
        nonzero_counts = np.nonzero(count_matrix_symmetric)
        counts_row_sums = np.sum(count_matrix, axis=1)
        expectations_confidence_interval = confidence_interval(expectations_selected, conf=self.support_confidence)

        measurements = self.experimental_measurements
        measurement_weights = self.experimental_measurement_weights

        count_outside = []
        count_inside = []

        # Determine which experimental values are outside the support as defined by the Confidence interval
        for index, (confidence_lower, confidence_upper, measurement, _weight) in enumerate(zip(
                expectations_confidence_interval[0], expectations_confidence_interval[1],
                measurements, measurement_weights)):
            if measurement < confidence_lower or confidence_upper < measurement:
                self._log.info(f"Experimental value {measurement} is outside the "
                               f"support ({confidence_lower, confidence_upper})")
                count_outside.append(index)
            else:
                count_inside.append(index)

        # A number of initializations
        transition_matrix, stationary_distribution = msmest.transition_matrix(count_matrix, reversible=True,
                                                                              return_statdist=True)
        if issparse(transition_matrix):
            transition_matrix = transition_matrix.toarray()
        # Determine number of slices of R-tensors computable at once with the given cache size
        slices_z = np.floor(self.max_cache / (transition_matrix.nbytes / 1.e6)).astype(int)
        # Optimizer state
        state = AMMOptimizerState(expectations_selected, measurements, measurement_weights,
                                  stationary_distribution, slices_z, count_matrix_symmetric, counts_row_sums)
        ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)

        state.log_likelihoods.append(ll_old)
        # make sure everything is initialized
        state.update_pi_hat()
        state.update_m_hat()
        state.update_Q()
        state.update_X_and_pi()

        ll_old = state.log_likelihood_biased(count_matrix, transition_matrix)
        state.log_likelihood_prev = ll_old
        state.update_G()

        #
        # Main estimation algorithm
        # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria
        # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged
        # (changes are smaller than 1e-3).
        # These do not always converge together, but usually within a few steps of each other.
        # A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1])
        # this yielded results very similar to those with more stringent convergence criteria
        # (changes smaller than 1e-9) with convergence times
        # which are seconds instead of tens of minutes.
        #

        converged = False  # Convergence flag for lagrange multipliers
        i = 0
        die = False
        while i <= self.maxiter:
            pi_hat_old = state.pi_hat.copy()
            state.update_pi_hat()
            if not np.all(state.pi_hat > 0):
                # roll back to the last valid estimate and stop after this iteration
                state.pi_hat = pi_hat_old.copy()
                die = True
                self._log.warning("pihat does not have a finite probability for all states, terminating")
            state.update_m_hat()
            state.update_Q()

            if i > 1:
                X_old = np.copy(state.X)
                state.update_X_and_pi()
                if np.any(state.X[nonzero_counts] < 0):
                    die = True
                    self._log.warning(
                        "Warning: new X is not proportional to C... reverting to previous step and terminating")
                    state.X = X_old

            if not converged:
                self._newton_lagrange(state, count_matrix)
            else:  # once Lagrange multipliers are converged compute likelihood here
                transition_matrix = state.X / state.pi[:, None]
                _ll_new = state.log_likelihood_biased(count_matrix, transition_matrix)
                state.log_likelihoods.append(_ll_new)

            # General case fixed-point iteration
            if len(count_outside) > 0:
                if i > 1 and np.all((np.abs(state.delta_m_hat) / self.uncertainties) < self.convergence_criterion_lagrange)\
                        and not converged:
                    self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                    converged = True
            # Special case
            else:
                if np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
                    self._log.info(f"Converged Lagrange multipliers after {i} steps...")
                    converged = True
            # if Lagrange multipliers are converged, check whether log-likelihood has converged
            if converged and np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1]) < 1e-8:
                self._log.info(f"Converged pihat after {i} steps...")
                die = True
            if die:
                break
            if i == self.maxiter:
                ll_diff = np.abs(state.log_likelihoods[-2] - state.log_likelihoods[-1])
                # report the same attribute that bounds the loop above
                self._log.info(f"Failed to converge within {i} iterations. Log-likelihoods lastly changed by {ll_diff}."
                               f" Consider increasing maxiter (now={self.maxiter})")
            i += 1

        # the final transition matrix is re-estimated from the counts with pi_hat as fixed stationary distribution
        transition_matrix = msmest.transition_matrix(count_matrix, reversible=True, mu=state.pi_hat)
        self._model = AugmentedMSM(transition_matrix=transition_matrix, stationary_distribution=state.pi_hat,
                                   reversible=True, count_model=count_model, amm_optimizer_state=state)
        return self
Example #7
0
def msm():
    """Return a two-state MarkovStateModel that carries a matching TransitionCountModel."""
    transition_matrix = [[0.9, 0.1], [0.1, 0.9]]
    counts = TransitionCountModel([[90, 10], [10, 90]])
    return MarkovStateModel(transition_matrix, count_model=counts)
    def _test_submodel(self, histogram):
        """Check a count model, its submodel on states (1, 2) and a further one-state sub-submodel.

        The properties that depend on the state histogram are expected to raise a RuntimeError
        whenever `histogram` is None.
        """
        have_histogram = histogram is not None
        cset_options = dict(connectivity_threshold=0, directed=True, probability_constraint=None)

        def expect_histogram_properties_to_raise(count_model):
            # accessing any of these without a histogram has to fail
            for attribute in ("selected_count_fraction", "total_count", "visited_set"):
                with assert_raises(RuntimeError):
                    print(getattr(count_model, attribute))

        # three connected components: ((1, 2), (0), (3))
        count_matrix = np.array([[10., 0., 0., 0.],
                                 [0., 1., 1., 0.],
                                 [0., 1., 1., 0.],
                                 [0., 0., 0., 1]])
        model = TransitionCountModel(count_matrix, counting_mode="effective", state_histogram=histogram)

        # ---- full model ----
        self._check_submodel_transitive_properties(histogram, count_matrix, model)

        if have_histogram:
            assert_equal(model.selected_count_fraction, 1.)
            assert_equal(model.total_count, 100 + 10 + 10 + 10)
            assert_equal(model.visited_set, [0, 1, 2, 3])
        else:
            expect_histogram_properties_to_raise(model)

        assert_equal(model.count_matrix, count_matrix)
        assert_equal(model.selected_state_fraction, 1.)

        sets = model.connected_sets(**cset_options)
        assert_equal(len(sets), 3)
        for component, expected_size in zip(sets, (2, 1, 1)):
            assert_equal(len(component), expected_size)
        assert_equal(model.state_symbols, [0, 1, 2, 3])
        assert_(model.is_full_model)
        assert_equal(model.state_histogram, histogram)
        assert_equal(model.n_states, 4)
        assert 1 in sets[0] and 2 in sets[0], \
            "expected states 1 and 2 in largest connected set, got {}".format(sets[0])

        # ---- submodel restricted to the largest connected set ----
        submodel = model.submodel(sets[0])
        self._check_submodel_transitive_properties(histogram, count_matrix, submodel)
        if have_histogram:
            assert_equal(submodel.state_histogram, [10, 10])
            assert_equal(submodel.selected_count_fraction, 20. / 130.)
            assert_equal(submodel.total_count, 20)
            assert_equal(submodel.visited_set, [0, 1])
        else:
            assert_equal(submodel.state_histogram, None)
            expect_histogram_properties_to_raise(submodel)
        assert_equal(submodel.count_matrix, np.array([[1, 1], [1, 1]]))
        assert_equal(submodel.selected_state_fraction, 0.5)
        sets = submodel.connected_sets(**cset_options)
        assert_equal(len(sets), 1)
        assert_equal(len(sets[0]), 2)
        assert 0 in sets[0] and 1 in sets[0], \
            "states 0 and 1 should be in the connected set, but got {}".format(sets[0])
        assert_equal(submodel.state_symbols, [1, 2])
        assert_(not submodel.is_full_model)
        assert_equal(submodel.n_states, 2)

        # ---- sub-submodel restricted to a single state of the submodel ----
        subsubmodel = submodel.submodel([1])
        self._check_submodel_transitive_properties(histogram, count_matrix, subsubmodel)
        if have_histogram:
            assert_equal(subsubmodel.state_histogram, [10])
            assert_equal(subsubmodel.selected_count_fraction, 10. / 130.)
            assert_equal(subsubmodel.total_count, 10)
            assert_equal(subsubmodel.visited_set, [0])
        else:
            assert_equal(subsubmodel.state_histogram, None)
            expect_histogram_properties_to_raise(subsubmodel)
        assert_equal(subsubmodel.count_matrix, np.array([[1]]))
        assert_equal(subsubmodel.selected_state_fraction, 0.25)
        sets = subsubmodel.connected_sets(**cset_options)
        assert_equal(len(sets), 1)
        assert_equal(len(sets[0]), 1)
        assert 0 in sets[0], "state 0 should be in the connected set, but got {}".format(sets[0])
        assert_equal(subsubmodel.state_symbols, [2])
        assert_(not subsubmodel.is_full_model)
        assert_equal(subsubmodel.n_states, 1)
Example #9
0
    def fit(self, dtrajs, initial_model=None, **kwargs):
        r""" Estimates a :class:`HMM <HiddenMarkovModel>` from timeseries data, starting at an initial model.

        Parameters
        ----------
        dtrajs : array_like or list of array_like
            Timeseries data.
        initial_model : HiddenMarkovModel, optional, default=None
            Starting point of the estimation; falls back to :attr:`initial_transition_model` if None.
        **kwargs
            Unused, accepted for scikit-learn compatibility.

        Returns
        -------
        self : MaximumLikelihoodHMM
            Reference to self.

        Raises
        ------
        ValueError
            If no initial model of type HiddenMarkovModel is available.
        """
        if initial_model is None:
            initial_model = self.initial_transition_model
        # None is not a HiddenMarkovModel instance, so this also rejects a missing initial model
        if not isinstance(initial_model, HiddenMarkovModel):
            raise ValueError(
                "For estimation, an initial model of type "
                "`deeptime.markov.hmm.HiddenMarkovModel` is required.")

        # work on a private dense copy of the initial transition matrix (toarray and np.copy both copy)
        initial_tmatrix = initial_model.transition_model.transition_matrix
        transition_matrix = initial_tmatrix.toarray() if issparse(initial_tmatrix) else np.copy(initial_tmatrix)

        hmm_data = MaximumLikelihoodHMM._HMMModelStorage(
            transition_matrix=transition_matrix,
            output_model=initial_model.output_model.copy(),
            initial_distribution=initial_model.initial_distribution.copy())

        dtrajs = ensure_timeseries_data(dtrajs)
        dtrajs = compute_dtrajs_effective(dtrajs, lagtime=self.lagtime,
                                          n_states=initial_model.n_hidden_states, stride=self.stride)

        # pre-construct hidden variables
        n_hidden = initial_model.n_hidden_states
        longest = max(len(obs) for obs in dtrajs)
        alpha = np.zeros((longest, n_hidden))
        beta = np.zeros((longest, n_hidden))
        gammas = [np.zeros((len(obs), n_hidden)) for obs in dtrajs]
        count_matrices = [np.zeros((n_hidden, n_hidden)) for _ in dtrajs]

        likelihoods = np.empty(self.maxit)
        # sparsity pattern of the transition matrix; if it changes (e.g. state lost) the likelihood
        # is discontinuous and can't be used as a convergence criterion in that iteration.
        tmatrix_nonzeros = hmm_data.transition_matrix.nonzero()

        it = 0
        converged = False
        while it < self.maxit and not converged:
            # accumulate the log-likelihood over all trajectories
            loglik = 0.0
            for obs, gamma, counts in zip(dtrajs, gammas, count_matrices):
                contribution, _ = self._forward_backward(hmm_data, obs, alpha, beta, gamma, counts)
                loglik += contribution
            assert np.isfinite(loglik), it

            # convergence check against the previous iteration
            if it > 0 and loglik - likelihoods[it - 1] < self.accuracy:
                converged = True

            # update model
            self._update_model(hmm_data, dtrajs, gammas, count_matrices, maxiter=self.maxit_reversible)

            # connectivity change check
            current_nonzeros = hmm_data.transition_matrix.nonzero()
            if not np.array_equal(tmatrix_nonzeros, current_nonzeros):
                converged = False  # unset converged
                tmatrix_nonzeros = current_nonzeros

            # end of iteration
            likelihoods[it] = loglik
            it += 1

        # keep only the likelihoods of the iterations that were actually run
        likelihoods = np.resize(likelihoods, it)

        transition_counts = self._reduce_transition_counts(count_matrices)
        count_model = TransitionCountModel(count_matrix=transition_counts, lagtime=self.lagtime)
        transition_model = MarkovStateModel(hmm_data.transition_matrix,
                                            reversible=self.reversible,
                                            count_model=count_model)

        # most likely hidden path for each trajectory under the estimated model
        hidden_state_trajs = []
        for obs in dtrajs:
            state_probabilities = hmm_data.output_model.to_state_probability_trajectory(obs)
            hidden_state_trajs.append(
                viterbi(hmm_data.transition_matrix, state_probabilities, hmm_data.initial_distribution))

        self._model = HiddenMarkovModel(
            transition_model=transition_model,
            output_model=hmm_data.output_model,
            initial_distribution=hmm_data.initial_distribution,
            likelihoods=likelihoods,
            state_probabilities=gammas,
            initial_count=self._init_counts(gammas),
            hidden_state_trajectories=hidden_state_trajs,
            stride=self.stride)
        return self
Example #10
0
    def submodel(self,
                 states=None,
                 obs=None,
                 mincount_connectivity='1/n',
                 inplace=False):
        """Returns a HMM with restricted state space

        The estimator itself is returned untouched only when `states` and `obs` are both None and
        `mincount_connectivity` equals 0. In every other case the restricted quantities are written to
        either `self` (`inplace=True`) or a deep copy of `self`.

        Parameters
        ----------
        states : None, str or int-array
            Hidden states to restrict the model to. In addition to specifying
            the subset, possible options are:
            * None : all states - don't restrict
            * 'populous-strong' : strongly connected subset with maximum counts
            * 'populous-weak' : weakly connected subset with maximum counts
            * 'largest-strong' : strongly connected subset with maximum size
            * 'largest-weak' : weakly connected subset with maximum size
            Any string is accepted: only the substrings 'strong' and 'largest' are tested.
        obs : None, str or int-array
            Observed states to restrict the model to. In addition to specifying
            an array with the state labels to be observed, possible options are:
            * None : all states - don't restrict
            * 'nonempty' : all states with at least one observation in the estimator
        mincount_connectivity : float or '1/n'
            minimum number of counts to consider a connection between two states.
            Counts lower than that will count zero in the connectivity check and
            may thus separate the resulting transition matrix. Default value:
            1/nstates.
        inplace : Bool
            if True, submodel is estimated in-place, overwriting the original
            estimator and possibly discarding information. Default value: False

        Returns
        -------
        hmm : HMM
            The restricted HMM.

        """
        # Nothing to restrict and no connectivity threshold: hand back the estimator itself.
        # With the default '1/n' this shortcut is never taken.
        if states is None and obs is None and mincount_connectivity == 0:
            return self
        # Replace the "no restriction" defaults by explicit index arrays.
        if states is None:
            states = _np.arange(self.nstates)
        if obs is None:
            obs = _np.arange(self.nstates_obs)

        if str(mincount_connectivity) == '1/n':
            mincount_connectivity = 1.0 / float(self.nstates)

        # handle new connectivity
        cm = TransitionCountModel(self.count_matrix)
        S = cm.connected_sets(connectivity_threshold=mincount_connectivity,
                              directed=True)

        # With inplace=True, submodel_estimator is an alias of self: every assignment below then modifies self.
        if inplace:
            submodel_estimator = self
        else:
            from copy import deepcopy
            submodel_estimator = deepcopy(self)
        from deeptime.markov._transition_matrix import stationary_distribution
        if len(S) > 1:
            # keep only non-negligible transitions
            C = _np.zeros(self.count_matrix.shape)
            large = _np.where(self.count_matrix >= mincount_connectivity)
            C[large] = self.count_matrix[large]
            for s in S:  # keep all (also small) transition counts within strongly connected subsets
                C[_np.ix_(s, s)] = self.count_matrix[_np.ix_(s, s)]
            # re-estimate transition matrix with disc.
            from deeptime.markov.msm import MaximumLikelihoodMSM
            msmest = MaximumLikelihoodMSM(allow_disconnected=True,
                                          reversible=self.reversible,
                                          connectivity_threshold=0)
            msm = msmest.fit_fetch(C)
            P = msm.transition_matrix
            pi = stationary_distribution(P, C, mincount_connectivity=0)
        else:
            C = self.count_matrix
            P = self.transition_matrix
            pi = self.stationary_distribution

        # determine substates
        if isinstance(states, str):
            strong = 'strong' in states
            largest = 'largest' in states
            S = cm.connected_sets(connectivity_threshold=mincount_connectivity,
                                  directed=strong)
            # score each connected set by its size ('largest') or by the counts it contains (otherwise)
            if largest:
                score = [len(s) for s in S]
            else:
                score = [self.count_matrix[_np.ix_(s, s)].sum() for s in S]
            states = _np.array(S[_np.argmax(score)])
        # NOTE(review): `states` was replaced by an index array above whenever it was None, so this
        # condition always holds here and the `pi` computed in the connectivity branch is overwritten.
        if states is not None:  # sub-transition matrix
            submodel_estimator._active_set = states
            C = C[_np.ix_(states, states)].copy()
            P = P[_np.ix_(states, states)].copy()
            # renormalize the rows of the restricted transition matrix
            P /= P.sum(axis=1)[:, None]
            pi = stationary_distribution(P, C)
            submodel_estimator.initial_count = self.initial_count[states]
            submodel_estimator.initial_distribution = self.initial_distribution[
                states] / self.initial_distribution[states].sum()

        # determine observed states
        if str(obs) == 'nonempty':
            obs = _np.where(
                count_states(self.discrete_trajectories_lagged) > 0)[0]
        # NOTE(review): `obs` was replaced by an index array above whenever it was None, so the else
        # branch below is never reached. `obs.size` assumes a numpy array - confirm callers never pass a list.
        if obs is not None:
            # set observable set
            submodel_estimator._observable_set = obs
            submodel_estimator._nstates_obs = obs.size
            # mapping from full observable state indices to restricted ones; states outside `obs` map to -1
            _full2obs = -1 * _np.ones(self._nstates_obs_full, dtype=int)
            _full2obs[obs] = _np.arange(len(obs), dtype=int)
            # observable trajectories
            submodel_estimator._dtrajs_obs = []
            for dtraj in self.discrete_trajectories_full:
                submodel_estimator._dtrajs_obs.append(_full2obs[dtraj])

            # observation matrix
            B = self.observation_probabilities[_np.ix_(states, obs)].copy()
            # renormalize the rows of the restricted observation matrix
            B /= B.sum(axis=1)[:, None]
        else:
            B = self.observation_probabilities

        # set quantities back.
        submodel_estimator.update_model_params(P=P, pobs=B, pi=pi)
        submodel_estimator.count_matrix_EM = self.count_matrix[_np.ix_(
            states, states)]  # unchanged count matrix
        submodel_estimator.count_matrix = C  # count matrix consistent with P
        return submodel_estimator
Example #11
0
    def _estimate(self, dtrajs):
        """Estimate a discrete hidden Markov model from discrete trajectories.

        The method checks the lag time against the trajectory lengths,
        resolves an ``'effective'`` stride into a concrete number, builds an
        initial HMM according to ``self.msm_init``, runs
        ``MaximumLikelihoodHMM`` on the trajectories and stores the fitted
        quantities on ``self``. Finally the model is restricted through
        ``self.submodel(..., inplace=True)``.

        Parameters
        ----------
        dtrajs
            Discrete trajectories; converted with
            ``_types.ensure_dtraj_list`` before use.

        Returns
        -------
        The value returned by ``self.submodel(..., inplace=True)``.

        Raises
        ------
        ValueError
            If ``self.lag`` is not smaller than the longest trajectory, or if
            ``self.msm_init`` is none of the supported options.
        """
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)

        # CHECK LAG
        trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
        # a lag that is not shorter than every trajectory cannot be used at all
        if self.lag >= _np.max(trajlengths):
            raise ValueError('Illegal lag time ' + str(self.lag) +
                             ' exceeds longest trajectory length')
        # a lag above the mean trajectory length is allowed, but only with a warning
        if self.lag > _np.mean(trajlengths):
            self.logger.warning(
                'Lag time ' + str(self.lag) +
                ' is on the order of mean trajectory length ' +
                str(_np.mean(trajlengths)) +
                '. It is recommended to fit four lag times in each ' +
                'trajectory. HMM might be inaccurate.')

        # EVALUATE STRIDE
        # NOTE: the string 'effective' stored in self.stride is replaced by a
        # concrete value here, so the attribute no longer reads 'effective' afterwards.
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the non-reversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs,
                                           lag=self.lag,
                                           reversible=False,
                                           sparse=False,
                                           connectivity='largest',
                                           dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                # because we use non-reversible msm, we want to silence the ImaginaryEigenvalueWarning
                import warnings
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        'ignore',
                        category=ImaginaryEigenValueWarning,
                        module=
                        'deeptime.markov.tools.analysis.dense.decomposition')
                    # the correlation time estimate is never taken below 1
                    corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # LAG AND STRIDE DATA
        from deeptime.markov import compute_dtrajs_effective
        dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs,
                                                         self.lag,
                                                         n_states=-1,
                                                         stride=self.stride)

        # OBSERVATION SET
        if self.observe_nonempty:
            observe_subset = 'nonempty'
        else:
            observe_subset = None

        # INIT HMM
        # NOTE(review): every initializer below is called with stationary=True,
        # while the EM estimator further down uses self.stationary -- confirm
        # that this difference is intended.
        from deeptime.markov.hmm import init
        from pyemma.msm.estimators import MaximumLikelihoodMSM
        from pyemma.msm.estimators import OOMReweightedMSM
        if self.msm_init == 'largest-strong':
            # 'largest-strong' is translated to the 'largest-regularized' mode
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                mode='largest-regularized',
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
        elif self.msm_init == 'all':
            # 'all' is translated to the 'all-regularized' mode
            hmm_init = init.discrete.metastable_from_data(
                dtrajs,
                n_hidden_states=self.nstates,
                lagtime=self.lag,
                stride=self.stride,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate,
                mode='all-regularized')
        elif isinstance(
                self.msm_init,
            (MaximumLikelihoodMSM, OOMReweightedMSM)):  # initial MSM given.
            # wrap the given MSM's transition matrix and active count matrix
            # in the model types expected by the initializer
            msm = MarkovStateModel(transition_matrix=self.msm_init.P,
                                   count_model=TransitionCountModel(
                                       self.msm_init.count_matrix_active))
            hmm_init = init.discrete.metastable_from_msm(
                msm,
                n_hidden_states=self.nstates,
                reversible=self.reversible,
                stationary=True,
                separate_symbols=self.separate)
            observe_subset = self.msm_init.active_set  # override observe_subset.
        else:
            raise ValueError('Unknown MSM initialization option: ' +
                             str(self.msm_init))

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------

        # run EM
        from deeptime.markov.hmm import MaximumLikelihoodHMM
        hmm_est = MaximumLikelihoodHMM(hmm_init,
                                       lagtime=self.lag,
                                       stride=self.stride,
                                       reversible=self.reversible,
                                       stationary=self.stationary,
                                       accuracy=self.accuracy,
                                       maxit=self.maxit)
        # run
        hmm_est.fit(dtrajs)
        # package in discrete HMM
        self.hmm = hmm_est.fetch_model()

        # get model parameters
        self.initial_distribution = self.hmm.initial_distribution
        transition_matrix = self.hmm.transition_model.transition_matrix
        observation_probabilities = self.hmm.output_probabilities

        # get estimation parameters
        self.likelihoods = self.hmm.likelihoods  # Likelihood history
        self.likelihood = self.likelihoods[-1]  # last entry of the history
        self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
        self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
        self.initial_count = self.hmm.initial_count  # hidden init count
        self._active_set = _np.arange(self.nstates)

        # TODO: it can happen that we lose states due to striding. Should we lift the output probabilities afterwards?
        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged_strided
        # the "full" state count comes from the input trajectories, the
        # observed state count from the lagged and strided ones
        self._nstates_obs_full = number_of_states(dtrajs)
        self._nstates_obs = number_of_states(dtrajs_lagged_strided)
        self._observable_set = _np.arange(self._nstates_obs)
        self._dtrajs_obs = dtrajs
        self.set_model_params(P=transition_matrix,
                              pobs=observation_probabilities,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        # TODO: perhaps remove connectivity and just rely on .submodel()?
        # deal with connectivity
        # any connectivity value other than 'largest' / 'populous' leaves the state set unrestricted
        states_subset = None
        if self.connectivity == 'largest':
            states_subset = 'largest-strong'
        elif self.connectivity == 'populous':
            states_subset = 'populous-strong'

        # return submodel (will return self if all None)
        return self.submodel(states=states_subset,
                             obs=observe_subset,
                             mincount_connectivity=self.mincount_connectivity,
                             inplace=True)
Example #12
0
 def test_dt_model(self):
     """An MSM built on a count model with lag time 5 must report lagtime 5."""
     counts = np.array([[0.1, 0.9], [0.9, 0.1]])
     count_model = TransitionCountModel(counts, lagtime=5)
     msm = MarkovStateModel(count_model.count_matrix, count_model=count_model)
     # the reactive flux is constructed as part of the scenario; its value is not inspected
     msm.reactive_flux([0], [1])
     np.testing.assert_equal(msm.lagtime, 5)
Example #13
0
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets, the
        largest connected set and the full-to-largest-set index mapping on
        this object and marks it as counted.

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

            The value is lower-cased before it is interpreted.

        mincount_connectivity : float or str, optional, default='1/n'
            minimum count that is treated as a connection when the connected
            sets are determined. The string '1/n' is replaced by one over the
            number of rows of the count matrix.

        show_progress: bool, default=True
            currently not used by this method.

        n_jobs: int or None
            currently not used by this method.

        name : str, optional, default=''
            currently not used by this method.

        core_set : optional, default=None
            if not None and count_mode is 'sliding' or 'sample', milestone
            counting is used. Only its presence is checked here.

        milestoning_method : str, optional, default='last_core'
            how unassigned (-1) frames are treated in milestone counting.
            Only 'last_core' is implemented.

        Raises
        ------
        NotImplementedError
            if milestone counting is requested with a milestoning_method
            other than 'last_core'.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                # NOTE: this rewrites the arrays held in self._dtrajs in place.
                for d in self._dtrajs:
                    # a leading -1 frame has no previously visited core to inherit from
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        # np.roll(mask, -1) selects the frame preceding each
                        # unassigned frame, so every -1 takes over the value
                        # of its predecessor; a run of k unassigned frames
                        # needs k passes of this loop.
                        d[mask] = d[np.roll(mask, -1)]
                self._C = count_matrix(self._dtrajs,
                                       lag,
                                       sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(
                        milestoning_method))
        else:
            cm = TransitionCountEstimator(lag,
                                          count_mode=count_mode,
                                          sparse=True).fit(
                                              self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # set sizes and count matrices on reversibly connected sets
        # A list (not a generator) is required here: np.array() applied to a
        # generator yields a 0-d object array that merely wraps the generator.
        self._connected_set_sizes = np.array(
            [len(cs) for cs in self._connected_sets], dtype=int)
        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        # states outside the largest connected set keep the marker -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True
Example #14
0
class DiscreteTrajectoryStats(object):
    r""" Statistics, count matrices and connectivity from discrete trajectories

    Operates sparse by default.

    Parameters
    ----------
    dtrajs: list containing ndarrays(dtype=int) or ndarray(n, dtype=int)
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory. Elements must be
        non-negative; -1 elements denote unassigned states (milestone
        counting).

    """
    def __init__(self, dtrajs):
        """Store the trajectories and compute lag-independent count statistics.

        Parameters
        ----------
        dtrajs
            discrete trajectories; converted with ``ensure_dtraj_list``.

        Raises
        ------
        ValueError
            if any trajectory contains an element smaller than -1.
        """
        from pyemma.util.types import ensure_dtraj_list

        # discrete trajectories, normalized to a list
        trajectories = ensure_dtraj_list(dtrajs)
        self._dtrajs = trajectories

        # TODO: extensive input checking!
        for traj in trajectories:
            if np.any(traj < -1):
                raise ValueError('Discrete trajectory contains elements < -1.')

        # basic count statistics: state histogram (negative entries ignored)
        # and the total number of counts in it
        histogram = count_states(trajectories, ignore_negative=True)
        self._hist = histogram
        self._total_count = np.sum(histogram)
        # number of states, determined from the input as it was passed in
        self._nstates = number_of_states(dtrajs)

        # lag-dependent quantities are not available until count_lagged runs
        self._counted_at_lag = False

    @staticmethod
    def _compute_connected_sets(C, mincount_connectivity, strong=True):
        """ Computes the connected sets of a thresholded copy of C.

        Entries of C below ``mincount_connectivity`` are set to zero in a
        copy of C (C itself is left untouched) before the connected sets are
        determined.

        Parameters
        ----------
        C : count matrix (dense or scipy sparse)
        mincount_connectivity : float
            Minimum count which counts as a connection.
        strong : boolean
            True: Seek strongly connected sets. False: Seek weakly connected sets.

        Returns
        -------
        S
            the result of ``connected_sets`` on the thresholded matrix
        """
        from scipy.sparse import issparse

        if not issparse(C):
            # dense case: zero out weak entries with a boolean mask
            thresholded = C.copy()
            thresholded[thresholded < mincount_connectivity] = 0
        else:
            # sparse case: zero the weak stored values, then drop them
            thresholded = C.tocsr(copy=True)
            too_small = thresholded.data < mincount_connectivity
            thresholded.data[too_small] = 0
            thresholded.eliminate_zeros()

        # treat each connected set separately
        return connected_sets(thresholded, directed=strong)

    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets, the
        largest connected set and the full-to-largest-set index mapping on
        this object and marks it as counted.

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes

              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

            The value is lower-cased before it is interpreted.

        mincount_connectivity : float or str, optional, default='1/n'
            minimum count that is treated as a connection when the connected
            sets are determined. The string '1/n' is replaced by one over the
            number of rows of the count matrix.

        show_progress: bool, default=True
            currently not used by this method.

        n_jobs: int or None
            currently not used by this method.

        name : str, optional, default=''
            currently not used by this method.

        core_set : optional, default=None
            if not None and count_mode is 'sliding' or 'sample', milestone
            counting is used. Only its presence is checked here.

        milestoning_method : str, optional, default='last_core'
            how unassigned (-1) frames are treated in milestone counting.
            Only 'last_core' is implemented.

        Raises
        ------
        NotImplementedError
            if milestone counting is requested with a milestoning_method
            other than 'last_core'.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                # NOTE: this rewrites the arrays held in self._dtrajs in place.
                for d in self._dtrajs:
                    # a leading -1 frame has no previously visited core to inherit from
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        # np.roll(mask, -1) selects the frame preceding each
                        # unassigned frame, so every -1 takes over the value
                        # of its predecessor; a run of k unassigned frames
                        # needs k passes of this loop.
                        d[mask] = d[np.roll(mask, -1)]
                self._C = count_matrix(self._dtrajs,
                                       lag,
                                       sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(
                        milestoning_method))
        else:
            cm = TransitionCountEstimator(lag,
                                          count_mode=count_mode,
                                          sparse=True).fit(
                                              self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # set sizes and count matrices on reversibly connected sets
        # A list (not a generator) is required here: np.array() applied to a
        # generator yields a 0-d object array that merely wraps the generator.
        self._connected_set_sizes = np.array(
            [len(cs) for cs in self._connected_sets], dtype=int)
        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        # states outside the largest connected set keep the marker -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True

    # ==================================
    # Permanent properties
    # ==================================

    def _assert_counted_at_lag(self):
        """
        Guard for lag-based quantities: asserts that count_lagged has been run
        """
        not_counted_message = "You haven't run count_lagged yet. Do that first before accessing lag-based quantities"
        assert self._counted_at_lag, not_counted_message

    def _assert_subset(self, A):
        """
        Asserts that the largest index in A is below the number of states

        Parameters
        ----------
        A : int or int array
            set of states

        Returns
        -------
        True for an empty A; otherwise nothing is returned.
        """
        is_empty = np.size(A) == 0
        if is_empty:
            # empty set is always contained
            return True
        assert np.max(A) < self._nstates, \
            'Chosen set contains states that are not included in the data.'

    @property
    def nstates(self):
        """
        The number of states, as determined from the trajectories at construction
        """
        n_states = self._nstates
        return n_states

    @property
    @alias('dtrajs')
    def discrete_trajectories(self):
        """
        The list of discrete trajectories held by this object
        (not mapped to any connected set)

        """
        trajectories = self._dtrajs
        return trajectories

    @property
    def total_count(self):
        """
        Sum over the state histogram, i.e. the total number of counts

        """
        histogram = self._hist
        return histogram.sum()

    @property
    @alias('hist')
    def histogram(self):
        r""" The state histogram computed from the trajectories at construction

        """
        state_counts = self._hist
        return state_counts

    # ==================================
    # Estimated properties
    # ==================================

    @property
    def lag(self):
        """
        The lag time (in trajectory steps) stored by the last call to count_lagged

        """
        # only meaningful once count_lagged has been run
        self._assert_counted_at_lag()
        lag_time = self._lag
        return lag_time

    def count_matrix(self, connected_set=None, subset=None):
        r"""The count matrix, optionally restricted to a set of states

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index. See :func:`connected_sets` to get a sorted list of connected sets.
            This parameter is exclusive with subset.
        subset : array-like of int or None, optional, default=None
            subset of states to compute the count matrix on. This parameter is exclusive with
            connected_set.

        Returns
        -------
        The full count matrix if neither argument is given, otherwise the
        sub-matrix on the requested states.

        Raises
        ------
        ValueError
            if both connected_set and subset are given.

        References
        ----------

        ..[1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
            Reversible Markov models of molecular kinetics: Estimation and uncertainty.
            in preparation.
        """
        self._assert_counted_at_lag()

        subset_given = subset is not None
        connected_set_given = connected_set is not None
        if subset_given and connected_set_given:
            raise ValueError('Can\'t set both connected_set and subset.')

        if subset_given:
            # explicit state subset: validate it, then restrict
            self._assert_subset(subset)
            return submatrix(self._C, subset)
        if connected_set_given:
            # restrict to the connected set with the given index
            return submatrix(self._C, self._connected_sets[connected_set])
        # full matrix wanted
        return self._C

    @alias('hist_lagged')
    def histogram_lagged(self,
                         connected_set=None,
                         subset=None,
                         effective=False):
        r""" Histogram of lagged state counts (row sums of the count matrix)

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index, forwarded to :func:`count_matrix`.
            This parameter is exclusive with subset.
        subset : array-like of int or None, optional, default=None
            subset of states, forwarded to :func:`count_matrix`.
            This parameter is exclusive with connected_set.
        effective : bool, optional, default=False
            accepted for backward compatibility and ignored. The counting
            scheme is determined by the count_mode passed to count_lagged.

        Returns
        -------
        The count matrix summed along axis 1.

        """
        # count_matrix() has no `effective` parameter; forwarding the flag
        # made every call fail with a TypeError, so it is not passed on.
        C = self.count_matrix(connected_set=connected_set, subset=subset)
        return C.sum(axis=1)

    @property
    def total_count_lagged(self,
                           connected_set=None,
                           subset=None,
                           effective=False):
        """Total number of lagged counts (sum of all count matrix entries)

        Because this is a property, the parameters can not be supplied on
        attribute access and always take their defaults, so the total is
        computed on the full count matrix. ``effective`` is ignored; the
        counting scheme is determined by the count_mode passed to
        count_lagged.

        """
        # The total is computed from count_matrix() directly: the previous
        # route forwarded an `effective` keyword that count_matrix() does not
        # accept, so reading this property always raised a TypeError.
        C = self.count_matrix(connected_set=connected_set, subset=subset)
        # row sums first, then their total (same quantity as before)
        return C.sum(axis=1).sum()

    @property
    def count_matrix_largest(self):
        """The count matrix restricted to the connected set with index 0

        """
        largest_set_index = 0
        return self.count_matrix(connected_set=largest_set_index)

    @property
    def largest_connected_set(self):
        """
        The largest connected set of states as stored by count_lagged
        (empty if that set carries no counts)

        """
        # only available after count_lagged has been run
        self._assert_counted_at_lag()
        largest_set = self._lcs
        return largest_set

    @property
    def visited_set(self):
        r""" The set of states visited by the stored discrete trajectories
        """
        trajectories = self._dtrajs
        # the bare name resolves to the module-level function, not to this property
        return visited_set(trajectories)

    @property
    def connected_sets(self):
        """
        The connected sets of states as stored by count_lagged

        """
        # only available after count_lagged has been run
        self._assert_counted_at_lag()
        stored_sets = self._connected_sets
        return stored_sets

    @property
    def connected_set_sizes(self):
        """The sizes of the connected sets as stored by count_lagged

        """
        # only available after count_lagged has been run
        self._assert_counted_at_lag()
        sizes = self._connected_set_sizes
        return sizes