Ejemplo n.º 1
0
 def test_basic_properties(self):
     """Check constructor wiring: state counts, outlier flag, zero prior.

     Note: the original comment had hidden/observable swapped; the matrix
     below has 2 rows (hidden states) and 4 columns (observable states).
     """
     # two hidden states, four observable states
     output_probs = np.array([[0.5, 0.3, 0.2, 0.0], [0.5, 0.3, 0.2, 0.0]])
     m = DiscreteOutputModel(output_probs, ignore_outliers=False)
     np.testing.assert_equal(m.ignore_outliers, False)
     # the flag is a writable property
     m.ignore_outliers = True
     np.testing.assert_equal(m.ignore_outliers, True)
     # default prior is all zeros
     np.testing.assert_allclose(m.prior, 0.)
     np.testing.assert_equal(m.n_hidden_states, 2)
     np.testing.assert_equal(m.n_observable_states, 4)
     np.testing.assert_equal(m.output_probabilities, output_probs)
Ejemplo n.º 2
0
 def test_fit(self):
     """Fitting with Dirichlet(2, 3, 4) weights on uniform random symbols
     should drive every emission probability toward 1/3."""
     initial_probs = np.array([[0.8, 0.1, 0.1], [0.1, 0.9, 0.0]])
     model = DiscreteOutputModel(initial_probs)
     n_trajectories = 100
     # trajectories of ~10000 symbols each, drawn uniformly from {0, 1, 2}
     trajectories = []
     for _ in range(n_trajectories):
         length = 10000 + np.random.randint(-3, 3)
         trajectories.append(np.random.randint(0, 3, size=length))
     weights = [
         np.random.dirichlet([2, 3, 4], size=traj.size)
         for traj in trajectories
     ]
     model.fit(trajectories, weights)
     np.testing.assert_allclose(model.output_probabilities, 1. / 3, atol=.01)
Ejemplo n.º 3
0
    def test_output_probability_trajectory(self):
        """For each observed symbol, the state-probability entry must equal
        the corresponding column of the emission matrix."""
        emission = np.array([
            [0.1, 0.6, 0.1, 0.1, 0.1],
            [0.1, 0.3, 0.1, 0.3, 0.2],
            [0.1, 0.1, 0.1, 0.1, 0.6],
            [0.6, 0.1, 0.1, 0.1, 0.1],
        ])

        model = DiscreteOutputModel(emission)
        # symbols 0..4 repeated three times
        observations = np.tile(np.arange(5), 3)
        state_probs = model.to_state_probability_trajectory(observations)
        for symbol, probs in zip(observations, state_probs):
            np.testing.assert_equal(probs, emission[:, symbol])
Ejemplo n.º 4
0
 def test_sample(self):
     """Sampling updates only the emission entries that are populated by
     the per-state observation counts."""
     model = DiscreteOutputModel(np.array([[0.8, 0.1, 0.1], [0.1, 0.9, 0.0]]))
     samples_per_state = [
         np.repeat([0, 1], 50000),           # hidden state 0: 50k zeros, 50k ones
         np.repeat([1, 2], [30000, 70000])   # hidden state 1: 30k ones, 70k twos
     ]
     model.sample(samples_per_state)
     # the output probabilities of the unpopulated states are left as-is (can't sample), hence we compare against
     # [[.5, .5, .1], [.1, .3, .7]] instead of [[.5, .5, .0], [.0, .3, .7]]
     expected = np.array([[.5, .5, .1],
                          [.1, .3, .7]])
     np.testing.assert_array_almost_equal(model.output_probabilities,
                                          expected, decimal=2)
Ejemplo n.º 5
0
 def test_invalid_ctor_args(self):
     """Constructor validation: rows must be stochastic and the prior must
     match the shape of the output probability matrix."""
     non_stochastic = np.array([[0.5, 0.3, 0.2, 0.1], [0.5, 0.3, 0.2, 0.0]])
     with self.assertRaises(ValueError):
         # first row sums to 1.1 -> not row stochastic
         DiscreteOutputModel(non_stochastic)
     stochastic = np.array([[0., 1., 0., 0.], [1., 0., 0., 0.]])
     # prior with matching shape (2, 4) is accepted, no exception raised
     DiscreteOutputModel(
         stochastic,
         prior=np.random.normal(size=(2, 4)).astype(np.float64))
     with self.assertRaises(ValueError):
         # prior shape (2, 5) does not match the (2, 4) output matrix
         DiscreteOutputModel(
             stochastic,
             prior=np.random.normal(size=(2, 5)).astype(np.float64))
Ejemplo n.º 6
0
 def test_observation_trajectory(self):
     """Observations generated while staying in one hidden state must follow
     that state's emission distribution."""
     emission = np.array([
         [0.1, 0.6, 0.1, 0.1, 0.1],
         [0.1, 0.3, 0.1, 0.3, 0.2],
         [0.1, 0.1, 0.1, 0.1, 0.6],
         [0.6, 0.1, 0.1, 0.1, 0.1],
     ])
     model = DiscreteOutputModel(emission)
     np.testing.assert_equal(model.n_hidden_states, 4)
     np.testing.assert_equal(model.n_observable_states, 5)
     # stay in hidden state 1 for two million steps ...
     hidden_traj = np.ones(2000000, dtype=int)
     observed = model.generate_observation_trajectory(hidden_traj)
     # ... and compare empirical symbol frequencies against emission row 1
     frequencies = np.bincount(
         observed.astype(np.int32),
         minlength=model.n_observable_states).astype(np.float32)
     frequencies /= frequencies.sum()
     np.testing.assert_array_almost_equal(frequencies, emission[1],
                                          decimal=2)
Ejemplo n.º 7
0
 def __init__(
         self,
         transition_model,
         output_model: Union[np.ndarray, OutputModel],
         initial_distribution: Optional[np.ndarray] = None,
         likelihoods: Optional[np.ndarray] = None,
         state_probabilities: Optional[List[np.ndarray]] = None,
         initial_count: Optional[np.ndarray] = None,
         hidden_state_trajectories: Optional[Iterable[np.ndarray]] = None,
         stride: Union[int, str] = 1,
         observation_symbols: Optional[np.ndarray] = None,
         observation_symbols_full: Optional[np.ndarray] = None):
     super().__init__()
     # Promote plain arrays to their corresponding model objects.
     if isinstance(transition_model, np.ndarray):
         from deeptime.markov.msm import MarkovStateModel
         transition_model = MarkovStateModel(transition_model)
     if isinstance(output_model, np.ndarray):
         output_model = DiscreteOutputModel(output_model)
     # The transition model operates on the hidden states of the output model.
     if transition_model.n_states != output_model.n_hidden_states:
         raise ValueError("Transition model must describe hidden states")
     if initial_distribution is None:
         # default: uniform distribution over all hidden states
         n_hidden = transition_model.n_states
         initial_distribution = np.full(n_hidden, 1. / n_hidden)
     if initial_distribution.shape[0] != transition_model.n_states:
         raise ValueError(
             "Initial distribution over hidden states must be of length {}".
             format(transition_model.n_states))
     self._transition_model = transition_model
     self._output_model = output_model
     self._initial_distribution = initial_distribution
     self._likelihoods = likelihoods
     self._state_probabilities = state_probabilities
     self._initial_count = initial_count
     self._hidden_state_trajectories = hidden_state_trajectories
     # If no symbols were given, enumerate them from the output model and
     # treat that enumeration as the full symbol set as well.
     if observation_symbols is None and output_model.n_observable_states >= 0:
         observation_symbols = np.arange(output_model.n_observable_states)
         observation_symbols_full = observation_symbols
     self._observation_symbols = observation_symbols
     self._observation_symbols_full = observation_symbols_full
     stride_ok = isinstance(stride, Integral) or (isinstance(stride, str)
                                                  and stride == 'effective')
     if not stride_ok:
         raise ValueError(
             "Stride argument must either be an integer value or 'effective', "
             "but was: {}".format(stride))
     self._stride = stride
Ejemplo n.º 8
0
    def fit(self, data, n_burn_in: int = 0, n_thin: int = 1, **kwargs):
        r""" Sample from the posterior.

        Runs a Gibbs sampler seeded from :attr:`initial_hmm`: after an
        optional burn-in phase, :attr:`n_samples` HMM samples are collected,
        with ``n_thin`` update sweeps between consecutive samples.

        Parameters
        ----------
        data : array_like or list of array_like
            Input time series data.
        n_burn_in : int, optional, default=0
            The number of samples to discard to burn-in, following which :attr:`n_samples` samples will be generated.
        n_thin : int, optional, default=1
            The number of Gibbs sampling updates used to generate each returned sample.
        **kwargs
            Ignored kwargs for scikit-learn compatibility.

        Returns
        -------
        self : BayesianHMM
            Reference to self.
        """
        # coerce input to a list of discrete trajectories
        dtrajs = ensure_dtraj_list(data)

        # fetch priors
        tmat = self.initial_hmm.transition_model.transition_matrix
        transition_matrix_prior = self._transition_matrix_prior_np

        initial_distribution_prior = self._initial_distribution_prior_np

        model = BayesianHMMPosterior()
        # update HMM Model
        model.prior = self.initial_hmm.copy()

        prior = model.prior

        # check if we are strongly connected in the reversible case (plus prior)
        if self.reversible and not is_connected(tmat + transition_matrix_prior,
                                                directed=True):
            raise NotImplementedError(
                'Trying to sample disconnected HMM with option reversible:\n '
                f'{tmat}\n Use prior to connect, select connected subset, '
                f'or use reversible=False.')

        # EVALUATE STRIDE
        dtrajs_lagged_strided = compute_dtrajs_effective(
            dtrajs,
            lagtime=prior.lagtime,
            n_states=prior.n_hidden_states,
            stride=self.stride)
        # if stride is different to init_hmm, check if microstates in lagged-strided trajs are compatible
        if self.stride != self.initial_hmm.stride:
            symbols = np.unique(np.concatenate(dtrajs_lagged_strided))
            # every symbol observed here must also appear in the initial HMM
            if not len(
                    np.intersect1d(self.initial_hmm.observation_symbols,
                                   symbols)) == len(symbols):
                raise ValueError(
                    'Choice of stride has excluded a different set of microstates than in '
                    'init_hmm. Set of observed microstates in time-lagged strided trajectories '
                    'must match to the one used for init_hmm estimation.')

        # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
        # Bayesian HMM sampler. This is just an initialization.
        n_states_full = number_of_states(dtrajs_lagged_strided)

        if prior.n_observation_states < n_states_full:
            eps = 0.01 / n_states_full  # default output probability, in order to avoid zero columns
            # full state space output matrix. make sure there are no zero columns
            full_obs_probabilities = eps * np.ones(
                (prior.n_hidden_states, n_states_full), dtype=np.float64)
            # fill active states
            full_obs_probabilities[:, prior.observation_symbols] = np.maximum(
                eps, prior.output_probabilities)
            # renormalize B to make it row-stochastic
            full_obs_probabilities /= full_obs_probabilities.sum(axis=1)[:,
                                                                         None]
        else:
            full_obs_probabilities = prior.output_probabilities

        # length of the longest trajectory, used to size the scratch buffer
        maxT = max(len(o) for o in dtrajs_lagged_strided)

        # pre-construct hidden variables
        temp_alpha = np.zeros((maxT, prior.n_hidden_states))

        # whether the prior already covers the full set of observation symbols;
        # if not, the collected samples are restricted to the prior's symbols below
        has_all_obs_symbols = model.prior.n_observation_states == len(
            model.prior.observation_symbols_full)

        try:
            # sample model is basically copy of prior
            sample_model = BayesianHMM._SampleStorage(
                transition_matrix=prior.transition_model.transition_matrix.
                copy(),
                output_model=DiscreteOutputModel(
                    full_obs_probabilities.copy()),
                initial_distribution=prior.initial_distribution.copy(),
                stationary_distribution=prior.transition_model.
                stationary_distribution.copy(),
                counts=prior.count_model.count_matrix.copy(),
                hidden_trajs=[])

            # Run burn-in.
            for _ in range(n_burn_in):
                self._update(sample_model, dtrajs_lagged_strided, temp_alpha,
                             transition_matrix_prior,
                             initial_distribution_prior)

            # Collect data.
            models = []
            for _ in range(self.n_samples):
                # Run a number of Gibbs sampling updates to generate each sample.
                for _ in range(n_thin):
                    self._update(sample_model, dtrajs_lagged_strided,
                                 temp_alpha, transition_matrix_prior,
                                 initial_distribution_prior)
                    # re-normalize the output model after each Gibbs update
                    sample_model.output_model.normalize()
                self._append_sample(models, prior, sample_model)

            if not has_all_obs_symbols:
                # restrict each sample to the observation symbols of the prior
                models = [
                    m.submodel(states=None,
                               obs=model.prior.observation_symbols)
                    for m in models
                ]

            model.samples = models
        finally:
            # release the (maxT, n_hidden_states) scratch buffer even on failure
            del temp_alpha

        # set new model
        self._model = model

        return self