Ejemplo n.º 1
0
    def test_3state_prev(self):
        # Builds a 3-hidden-state initial HMM guess from a short trajectory over
        # five observed symbols and checks, for reversible and non-reversible
        # estimation, that the transition matrix is valid (and reversible when
        # requested) and that every row of the output probabilities sums to one.
        import msmtools.estimation as msmest
        dtraj = np.array([0, 1, 2, 0, 3, 4])

        # First pass: plain ``assert`` statements, output probabilities read
        # directly from the HMM object.
        for reversible in (True, False):
            model = initial_guess_discrete_from_data(
                dtraj, n_hidden_states=3, lagtime=1, reversible=reversible)
            transition_matrix = model.transition_model.transition_matrix
            assert msmana.is_transition_matrix(transition_matrix)
            if reversible:
                assert msmana.is_reversible(transition_matrix)
            row_sums = model.output_probabilities.sum(axis=1)
            assert np.allclose(row_sums, 1)

        # Second pass: same checks through ``np.testing``, output probabilities
        # read through the output model.
        for reversible in (True, False):
            count_model = TransitionCountEstimator(lagtime=1, count_mode="sliding").fit(dtraj).fetch_model()
            # NOTE(review): this regularized count matrix is built but never read
            # by any assertion below; it only exercises the counting/prior code.
            regularized_counts = count_model.count_matrix
            regularized_counts += msmest.prior_neighbor(regularized_counts, 0.001)

            model = initial_guess_discrete_from_data(
                dtraj, n_hidden_states=3, lagtime=1, reversible=reversible)
            transition_matrix = model.transition_model.transition_matrix
            np.testing.assert_(msmana.is_transition_matrix(transition_matrix))
            if reversible:
                np.testing.assert_(msmana.is_reversible(transition_matrix))
            row_sums = model.output_model.output_probabilities.sum(axis=1)
            np.testing.assert_allclose(row_sums, 1.)
Ejemplo n.º 2
0
    def test_prior_neighbor(self):
        # The neighbor prior of the fixture count matrix must equal the fixture
        # neighbor pattern scaled by alpha, both for the default alpha and for
        # an explicitly passed one. Comparison is done with allclose_sparse.

        # Default alpha.
        prior_default = prior_neighbor(self.C)
        expected_default = self.alpha_def * self.B_neighbor
        self.assertTrue(allclose_sparse(prior_default, expected_default))

        # Explicit alpha.
        prior_custom = prior_neighbor(self.C, alpha=self.alpha)
        expected_custom = self.alpha * self.B_neighbor
        self.assertTrue(allclose_sparse(prior_custom, expected_custom))
Ejemplo n.º 3
0
    def test_prior_neighbor(self):
        # The neighbor prior of the fixture count matrix must equal the fixture
        # neighbor pattern scaled by alpha, both for the default alpha and for
        # an explicitly passed one.

        # Default alpha.
        prior_default = prior_neighbor(self.C)
        expected_default = self.alpha_def * self.B_neighbor
        assert_allclose(prior_default, expected_default)

        # Explicit alpha.
        prior_custom = prior_neighbor(self.C, alpha=self.alpha)
        expected_custom = self.alpha * self.B_neighbor
        assert_allclose(prior_custom, expected_custom)
Ejemplo n.º 4
0
Archivo: api.py Proyecto: ChayaSt/bhmm
def init_discrete_hmm(observations, nstates, lag=1, reversible=True, stationary=True, regularize=True,
                      method='connect-spectral', separate=None):
    """Build an initial discrete-output HMM from observation data.

    A count matrix is computed from the observations at the given lag, an
    active set of observed states is chosen according to `method`, and the
    spectral initializer is run on it. The resulting model gets its ``_lag``
    attribute set to `lag`.

    Parameters
    ----------
    observations : list of ndarray((T_i))
        list of arrays of length T_i with observation data
    nstates : int
        The number of hidden states.
    lag : int
        Lag time at which the observations are counted.
    reversible : bool
        Estimate a reversible HMM transition matrix.
    stationary : bool
        p0 is the stationary distribution of P. Currently only
        stationary=True is implemented; False raises NotImplementedError.
    regularize : bool
        Regularize HMM probabilities to avoid 0's. If True, ``eps_A`` and
        ``eps_B`` are passed to the spectral initializer as None (presumably
        selecting its default regularization -- confirm against
        ``init_discrete_hmm_spectral``); if False they are passed as 0.
    method : str
        * 'lcs-spectral' : Does spectral clustering on the largest connected
            set of observed states.
        * 'connect-spectral' : Adds a weak neighbor prior (0.001) to the count
            matrix and lifts the diagonal entries of all nonempty states to at
            least 0.001, then does spectral clustering on the nonempty set.
        * 'spectral' : Does spectral clustering without an explicit active set.
            Separated observed states will end up in separate hidden states.
            This option is only recommended for small observation spaces. Use
            connect-spectral for large observation spaces.
    separate : None or iterable of int
        Force the given set of observed states to stay in a separate hidden
        state. The remaining nstates-1 states will be assigned by a metastable
        decomposition.

    Returns
    -------
    hmm0
        The model returned by ``discrete_hmm(p0, P, B)``.

    Raises
    ------
    NotImplementedError
        If `stationary` is False or `method` is not one of the names above.

    Examples
    --------

    Generate initial model for a discrete output model.

    >>> import bhmm
    >>> [model, observations, states] = bhmm.testsystems.generate_synthetic_observations(output='discrete')
    >>> initial_model = init_discrete_hmm(observations, model.nstates)

    """
    import msmtools.estimation as msmest
    from bhmm.init.discrete import init_discrete_hmm_spectral

    count_matrix = msmest.count_matrix(observations, lag).toarray()

    # None vs. 0 is forwarded as both eps_A and eps_B.
    eps = None if regularize else 0

    if not stationary:
        raise NotImplementedError('Discrete-HMM initialization with stationary=False is not yet implemented.')

    # The three methods differ only in how the active set is chosen (and, for
    # 'connect-spectral', in a prior added to the count matrix beforehand).
    if method == 'lcs-spectral':
        active_set = msmest.largest_connected_set(count_matrix)
    elif method == 'connect-spectral':
        # make sure we're strongly connected
        count_matrix += msmest.prior_neighbor(count_matrix, 0.001)
        occupied = _np.where(count_matrix.sum(axis=0) + count_matrix.sum(axis=1) > 0)[0]
        # Paired index arrays address the diagonal entries (i, i) of occupied states.
        count_matrix[occupied, occupied] = _np.maximum(count_matrix[occupied, occupied], 0.001)
        active_set = occupied
    elif method == 'spectral':
        active_set = None
    else:
        raise NotImplementedError('Unknown discrete-HMM initialization method ' + str(method))

    p0, P, B = init_discrete_hmm_spectral(count_matrix, nstates, reversible=reversible, stationary=stationary,
                                          active_set=active_set, separate=separate, eps_A=eps, eps_B=eps)

    hmm0 = discrete_hmm(p0, P, B)
    hmm0._lag = lag
    return hmm0
Ejemplo n.º 5
0
Archivo: api.py Proyecto: ongbe/bhmm
def init_discrete_hmm(observations,
                      nstates,
                      lag=1,
                      reversible=True,
                      stationary=True,
                      regularize=True,
                      method='connect-spectral',
                      separate=None):
    """Heuristically construct a starting discrete-output HMM.

    Counts transitions in `observations` at lag `lag`, selects the set of
    observed states to cluster on according to `method`, runs the spectral
    initializer, and returns the resulting model with ``_lag`` set to `lag`.

    Parameters
    ----------
    observations : list of ndarray((T_i))
        list of arrays of length T_i with observation data
    nstates : int
        The number of hidden states.
    lag : int
        Lag time at which the observations are counted.
    reversible : bool
        Estimate a reversible HMM transition matrix.
    stationary : bool
        p0 is the stationary distribution of P. Only stationary=True is
        currently implemented; passing False raises NotImplementedError.
    regularize : bool
        Regularize HMM probabilities to avoid 0's. True forwards
        ``eps_A=None, eps_B=None`` to the spectral initializer (presumably its
        default regularization -- confirm there); False forwards zeros.
    method : str
        * 'lcs-spectral' : Spectral clustering on the largest connected set
            of observed states.
        * 'connect-spectral' : A weak neighbor prior (0.001) is added to the
            count matrix and the diagonal entries of nonempty states are
            raised to at least 0.001; spectral clustering then runs on the
            nonempty set.
        * 'spectral' : Spectral clustering with no explicit active set.
            Separated observed states will end up in separate hidden states.
            This option is only recommended for small observation spaces. Use
            connect-spectral for large observation spaces.
    separate : None or iterable of int
        Force the given set of observed states to stay in a separate hidden
        state. The remaining nstates-1 states will be assigned by a metastable
        decomposition.

    Returns
    -------
    hmm0
        The model returned by ``discrete_hmm(p0, P, B)``.

    Raises
    ------
    NotImplementedError
        If `stationary` is False or `method` is unknown.

    Examples
    --------

    Generate initial model for a discrete output model.

    >>> import bhmm
    >>> [model, observations, states] = bhmm.testsystems.generate_synthetic_observations(output='discrete')
    >>> initial_model = init_discrete_hmm(observations, model.nstates)

    """
    import msmtools.estimation as msmest
    from bhmm.init.discrete import init_discrete_hmm_spectral

    counts = msmest.count_matrix(observations, lag).toarray()

    # regularization: None when regularizing, 0 otherwise
    if regularize:
        eps_A, eps_B = None, None
    else:
        eps_A, eps_B = 0, 0

    if not stationary:
        raise NotImplementedError(
            'Discrete-HMM initialization with stationary=False is not yet implemented.'
        )

    # Keyword arguments shared by every initialization method; only the
    # active set differs between them.
    shared_kwargs = dict(
        reversible=reversible,
        stationary=stationary,
        separate=separate,
        eps_A=eps_A,
        eps_B=eps_B,
    )

    if method == 'lcs-spectral':
        selected = msmest.largest_connected_set(counts)
    elif method == 'connect-spectral':
        # make sure we're strongly connected
        counts += msmest.prior_neighbor(counts, 0.001)
        total_in_out = counts.sum(axis=0) + counts.sum(axis=1)
        selected = _np.where(total_in_out > 0)[0]
        # Paired index arrays pick the diagonal entries of the selected states.
        diagonal = counts[selected, selected]
        counts[selected, selected] = _np.maximum(diagonal, 0.001)
    elif method == 'spectral':
        selected = None
    else:
        raise NotImplementedError(
            'Unknown discrete-HMM initialization method ' + str(method))

    p0, P, B = init_discrete_hmm_spectral(counts,
                                          nstates,
                                          active_set=selected,
                                          **shared_kwargs)

    hmm0 = discrete_hmm(p0, P, B)
    hmm0._lag = lag
    return hmm0
Ejemplo n.º 6
0
def initial_guess_discrete_from_data(
        dtrajs,
        n_hidden_states,
        lagtime,
        stride=1,
        mode='largest-regularized',
        reversible: bool = True,
        stationary: bool = False,
        separate_symbols=None,
        states: Optional[np.ndarray] = None,
        regularize: bool = True,
        connectivity_threshold: Union[str, float] = 0.):
    r"""Estimates an initial guess :class:`HMM <sktime.markov.hmm.HiddenMarkovStateModel>` from given
    discrete trajectories.

    Following the procedure described in [1]_: First a :class:`MSM <sktime.markov.msm.MarkovStateModel>` is
    estimated, which is then subsequently coarse-grained with PCCA+ [2]_. After estimation of the MSM, this method
    calls :meth:`initial_guess_discrete_from_msm`.

    Parameters
    ----------
    dtrajs : array_like or list of array_like
        A discrete trajectory or a list of discrete trajectories.
    n_hidden_states : int
        Number of hidden states.
    lagtime : int
        The lagtime at which transitions are counted.
    stride : int or str, optional, default=1
        stride between two lagged trajectories extracted from the input trajectories. Given trajectory :code:`s[t]`,
        stride and lag will result in trajectories

            :code:`s[0], s[lag], s[2 lag], ...`

            :code:`s[stride], s[stride + lag], s[stride + 2 lag], ...`

        Setting stride = 1 will result in using all data (useful for maximum likelihood estimator), while a Bayesian
        estimator requires a longer stride in order to have statistically uncorrelated trajectories. Setting
        :code:`stride='effective'` uses the largest neglected timescale as an estimate for the correlation time
        and sets the stride accordingly.
    mode : str, optional, default='largest-regularized'
        The mode at which the markov state model is estimated. Since the process is assumed to be reversible and
        finite statistics might lead to unconnected regions in state space, a subselection can automatically be made
        and the count matrix can be regularized. The following options are available:

        * 'all': all available states are taken into account
        * 'largest': the largest connected state set is selected, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`.
        * 'populous': the connected set with the largest population in the data, see
          :meth:`TransitionCountModel.submodel_largest <sktime.markov.TransitionCountModel.submodel_largest>`.

        For regularization, each of the options can be suffixed by a '-regularized', e.g., 'largest-regularized'.
        This means that the count matrix has no zero entries and everything is reversibly connected. In particular,
        a prior of the form

        .. math:: b_{ij}=\left \{ \begin{array}{rl}
                     \alpha & \text{, if }c_{ij}+c_{ji}>0, \\
                     0      & \text{, otherwise,}
                     \end{array} \right .

        with :math:`\alpha=10^{-3}` is added and all non-reversibly connected components are artificially connected
        by adding backward paths.
    reversible : bool, optional, default=True
        Whether the HMM transition matrix is estimated so that it is reversible.
    stationary : bool, optional, default=False
        If True, the initial distribution of hidden states is self-consistently computed as the stationary
        distribution of the transition matrix. If False, it will be estimated from the starting states.
        Only set this to true if you're sure that the observation trajectories are initiated from a global
        equilibrium distribution.
    separate_symbols : array_like, optional, default=None
        Force the given set of observed states to stay in a separate hidden state.
        The remaining nstates-1 states will be assigned by a metastable decomposition.
    states : (dtype=int) ndarray, optional, default=None
        Artificially restrict count model to selection of states, even before regularization.
    regularize : bool, optional, default=True
        If set to True, makes sure that the hidden initial distribution and transition matrix have nonzero probabilities
        by setting them to eps and then renormalizing. Avoids zeros that would cause estimation algorithms to crash or
        get stuck in suboptimal states.
    connectivity_threshold : float or '1/n', optional, default=0.
        Connectivity threshold. counts that are below the specified value are disregarded when finding connected
        sets. In case of '1/n', the threshold gets resolved to :math:`1 / \mathrm{n\_states\_full}`.

    Returns
    -------
    hmm_init : HiddenMarkovStateModel
        An initial guess for the HMM

    Raises
    ------
    ValueError
        If `mode` is neither one of the entries of ``initial_guess_discrete_from_data.VALID_MODES`` nor one of
        those entries suffixed by '-regularized'.

    See Also
    --------
    DiscreteOutputModel : The type of output model this heuristic uses.
    initial_guess_discrete_from_msm : Initial guess from an already existing :class:`MSM <sktime.markov.msm.MarkovStateModel>`.
    initial_guess_gaussian_from_data : Initial guess with :class:`Gaussian output model <sktime.markov.hmm.GaussianOutputModel>`.

    References
    ----------
    .. [1] F. Noe, H. Wu, J.-H. Prinz and N. Plattner: Projected and hidden Markov models for calculating kinetics and
       metastable states of complex molecules. J. Chem. Phys. 139, 184114 (2013)
    .. [2] S. Roeblitz and M. Weber, Fuzzy spectral clustering by PCCA+:
       application to Markov state models and data classification.
       Adv Data Anal Classif 7, 147-179 (2013).
    """
    # Accepted modes are the base modes plus their '-regularized' variants.
    # The error message lists all of them so that it does not contradict the
    # default value ('largest-regularized').
    base_modes = list(initial_guess_discrete_from_data.VALID_MODES)
    accepted_modes = base_modes + [m + "-regularized" for m in base_modes]
    if mode not in accepted_modes:
        raise ValueError("mode can only be one of [{}]".format(", ".join(accepted_modes)))

    dtrajs = ensure_dtraj_list(dtrajs)
    dtrajs = compute_dtrajs_effective(dtrajs,
                                      lagtime=lagtime,
                                      n_states=n_hidden_states,
                                      stride=stride)
    # Counting uses lagtime 1 on the effective trajectories; presumably
    # compute_dtrajs_effective has already applied `lagtime` -- confirm there.
    counts = TransitionCountEstimator(1, 'sliding',
                                      sparse=False).fit(dtrajs).fetch_model()
    if states is not None:
        counts = counts.submodel(states)

    if '-regularized' in mode:
        import msmtools.estimation as msmest
        # Add the neighbor prior in place on the count model's matrix.
        counts.count_matrix[...] += msmest.prior_neighbor(
            counts.count_matrix, 0.001)
        nonempty = np.where(
            counts.count_matrix.sum(axis=0) +
            counts.count_matrix.sum(axis=1) > 0)[0]
        # Paired index arrays address the diagonal entries (i, i) of the
        # nonempty states; lift them to at least 0.001.
        counts.count_matrix[nonempty, nonempty] = np.maximum(
            counts.count_matrix[nonempty, nonempty], 0.001)

    # 'all' needs no submodel selection; the other modes restrict the counts.
    if 'largest' in mode:
        counts = counts.submodel_largest(
            directed=True,
            connectivity_threshold=connectivity_threshold,
            sort_by_population=False)
    if 'populous' in mode:
        counts = counts.submodel_largest(
            directed=True,
            connectivity_threshold=connectivity_threshold,
            sort_by_population=True)

    # The intermediate MSM is always estimated reversibly; the `reversible`
    # argument is forwarded only to the HMM initial guess below.
    msm = MaximumLikelihoodMSM(reversible=True,
                               allow_disconnected=True,
                               maxerr=1e-3,
                               maxiter=10000).fit(counts).fetch_model()
    return initial_guess_discrete_from_msm(msm, n_hidden_states, reversible,
                                           stationary, separate_symbols,
                                           regularize)