Ejemplo n.º 1
0
 def test_iterator(self):
     for c in self.cl:
         for itraj, chunk in c:
             assert types.is_int(itraj)
             assert types.is_int_matrix(chunk)
             assert chunk.shape[0] <= c.chunksize
             assert chunk.shape[1] == c.dimension()
    def propagate(self, p0, k):
        """ Propagates the initial distribution p0 k times

        Computes the product

        .. math::

            p_k = p_0^T P^k

        If the lag time of transition matrix :math:`P` is :math:`\tau`, this
        will provide the probability distribution at time :math:`k \tau`.

        :param p0: ndarray - initial distribution. Vector of size of the active set
        :param k: int - number of time steps
        :return: ndarray - distribution after k steps, vector of size of the active set
        """

        p0 = _types.ensure_ndarray(p0, ndim=1, kind='numeric')
        assert _types.is_int(k) and k >= 0, 'k must be a non-negative integer'

        if k == 0 or k == 1:
            return self.eval(0).propagate(p0, k).real
        else:
            pprop = self.eval(0).propagate(p0, 1).real
            for i in range(1, k):
                pprop = self.eval(i).propagate(pprop, 1).real
            return pprop
Ejemplo n.º 3
0
    def propagate(self, p0, k):
        r""" Propagates the initial distribution p0 k times

        Computes the product

        .. math::

            p_k = p_0^T P^k

        If the lag time of transition matrix :math:`P` is :math:`\tau`, this
        will provide the probability distribution at time :math:`k \tau`.

        Parameters
        ----------
        p0 : ndarray(n,)
            Initial distribution. Vector of size of the active set.

        k : int
            Number of time steps

        Returns
        ----------
        pk : ndarray(n,)
            Distribution after k steps. Vector of size of the active set.

        """
        p0 = _types.ensure_ndarray(p0,
                                   ndim=1,
                                   size=self.nstates,
                                   kind='numeric')
        assert _types.is_int(k) and k >= 0, 'k must be a non-negative integer'

        if k == 0:  # simply return p0 normalized
            return p0 / p0.sum()

        if self.is_sparse:  # sparse: we don't have a full eigenvalue set, so just propagate
            pk = _np.array(p0)
            for i in range(k):
                pk = _np.dot(pk.T, self.transition_matrix)
        else:  # dense: employ eigenvalue decomposition
            self._ensure_eigendecomposition(self.nstates)
            from pyemma.util.linalg import mdot
            pk = mdot(p0.T, self.eigenvectors_right(),
                      _np.diag(_np.power(self.eigenvalues(), k)),
                      self.eigenvectors_left()).real
        # normalize to 1.0 and return
        return pk / pk.sum()
Ejemplo n.º 4
0
    def fixed_seed(self, val):
        from pyemma.util import types
        if isinstance(val, bool) or val is None:
            if val:
                self._fixed_seed = 42
            else:
                self._fixed_seed = random.randint(0, 2**32-1)
        elif types.is_int(val):
            if val < 0 or val > 2**32-1:
                self.logger.warn("seed has to be positive (or smaller than 2**32-1)."
                                 " Seed will be chosen randomly.")
                self.fixed_seed = False
            else:
                self._fixed_seed = val
        else:
            raise ValueError("fixed seed has to be bool or integer")

        self.logger.debug("seed = %i", self._fixed_seed)
Ejemplo n.º 5
0
    def test_dimension(self):
        assert types.is_int(self.tica_obj.dimension())
        # Here:
        assert self.tica_obj.dimension() == 1
        # Test other variants
        tica = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=1.0)
        assert tica.dimension() == 2
        tica = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=0.9)
        assert tica.dimension() == 1
        with self.assertRaises(
                ValueError
        ):  # trying to set both dim and subspace_variance is forbidden
            api.tica(data=self.X, lag=self.lag, dim=1, var_cutoff=0.9)

        with self.assertRaises(ValueError):
            api.tica(lag=self.lag, var_cutoff=0)
        with self.assertRaises(ValueError):
            api.tica(lag=self.lag, var_cutoff=1.1)
Ejemplo n.º 6
0
    def __init__(self,
                 estimator,
                 lags=None,
                 nits=None,
                 n_jobs=1,
                 show_progress=True):
        r"""Implied timescales for a series of lag times.

        Parameters
        ----------
        estimator : Estimator
            Estimator to be used for estimating timescales at each lag time.

        lags : array-like with integers or None, optional
            integer lag times at which the implied timescales will be calculated. If set to None (default)
            as list of lagtimes will be automatically generated.

        nits : int, optional
            maximum number of implied timescales to be computed and stored. If less
            timescales are available, nits will be set to a smaller value during
            estimation. None means the number of timescales will be automatically
            determined.

        n_jobs: int, optional
            how many subprocesses to start to estimate the models for each lag time.

        """
        # initialize
        self.estimator = get_estimator(estimator)
        self.nits = nits
        self.n_jobs = n_jobs
        self.show_progress = show_progress

        # set lag times
        if _types.is_int(lags):  # got a single integer. We create a list
            self._lags = _generate_lags(lags, 1.5)
        else:  # got a list of ints or None - otherwise raise exception.
            self._lags = _types.ensure_int_vector_or_None(lags,
                                                          require_order=True)

        # estimated its. 2D-array with indexing: lagtime, its
        self._its = None
        # sampled its's. 3D-array with indexing: lagtime, its, sample
        self._its_samples = None
Ejemplo n.º 7
0
    def _progress_register(self, amount_of_work, description='', stage=0):
        """ Registers a progress which can be reported/displayed via a progress bar.

        Parameters
        ----------
        amount_of_work : int
            Amount of steps the underlying algorithm has to perform.
        description : str, optional
            This string will be displayed in the progress bar widget.
        stage : int, optional, default=0
            If the algorithm has multiple different stages (eg. calculate means
            in the first pass over the data, calculate covariances in the second),
            one needs to estimate different times of arrival.
        """
        if not self.show_progress:
            return

        if not is_int(amount_of_work):
            raise ValueError(
                "amount_of_work has to be of integer type. But is %s" %
                type(amount_of_work))

        # if we do not have enough work to do for the overhead of a progress bar,
        # we just define a dummy here
        if amount_of_work <= ProgressReporter._pg_threshold:

            class dummy(object):
                pass

            pg = dummy()
            pg.__str__ = lambda: description
            pg.__repr__ = pg.__str__
            pg._dummy = None
            pg.description = ''
        else:
            pg = _ProgressBar(amount_of_work, description=description)

        self._prog_rep_progressbars[stage] = pg
        # pg.description = description
        self._prog_rep_descriptions[stage] = description
Ejemplo n.º 8
0
    def __init__(self, test_model, test_estimator, mlags=None, conf=0.95, err_est=False,
                 n_jobs=None, show_progress=True):

        # set model and estimator
        # copy the test model, since the estimation of cktest modifies the model.
        from copy import deepcopy
        self.test_model = deepcopy(test_model)
        self.test_estimator = test_estimator

        # set mlags
        try:
            maxlength = np.max([len(dtraj) for dtraj in test_estimator.discrete_trajectories_full])
        except AttributeError:
            maxlength = np.max(test_estimator.trajectory_lengths())
        maxmlag = int(math.floor(maxlength / test_estimator.lag))
        if mlags is None:
            mlags = maxmlag
        if types.is_int(mlags):
            mlags = np.arange(mlags)
        mlags = types.ensure_ndarray(mlags, ndim=1, kind='i')
        if np.any(mlags > maxmlag):
            mlags = mlags[np.where(mlags <= maxmlag)]
            self.logger.warning('Changed mlags as some mlags exceeded maximum trajectory length.')
        if np.any(mlags < 0):
            mlags = mlags[np.where(mlags >= 0)]
            self.logger.warning('Changed mlags as some mlags were negative.')
        self.mlags = mlags

        # set conf and error handling
        self.conf = conf
        self.has_errors = issubclass(self.test_model.__class__, SampledModel)
        if self.has_errors:
            self.test_model.set_model_params(conf=conf)
        self.err_est = err_est
        if err_est and not self.has_errors:
            raise ValueError('Requested errors on the estimated models, '
                             'but the model is not able to calculate errors at all')
        self.n_jobs = n_jobs
        self.show_progress = show_progress
Ejemplo n.º 9
0
    def lags(self, lags):
        """Sets the lag times at which the models will be estimated.
        Remembers the last non None value, in order to extend the lag list. If the input data during estimation is unchanged, we will only
        need to re-estimate new lag times. If a lag time is removed, we clean up the underlying models and
        derived data (timescales).
        """
        if hasattr(self, '_lags') and self._lags is not None:
            self._last_lags = frozenset(self._lags)
            # remove obsolete models and computed data.
            if self._models and lags is not None:
                survivors = np.array([
                    i for i in self._successful_lag_indexes
                    if self._models[i].lag in lags
                ])
                if survivors.size == 0:
                    self._models = []
                    self._its = None
                    self._its_samples = None
                    self._successful_lag_indexes = None
                else:
                    self._models = np.array(self._models)[survivors].tolist()
                    self._successful_lag_indexes = np.arange(len(self._models))
                    self._its = self._its[survivors]
                    if self.samples_available:
                        self._its_samples = self._its_samples[survivors]
        else:
            self._last_lags = set()

        # set lag times
        if _types.is_int(lags):  # got a single integer. We create a list
            self._lags = _generate_lags(lags, 1.5)
        elif lags is None:  # obtain a series of meaningful lag times from the input trajectory length
            self._lags = None
        else:  # got a list of ints or None - otherwise raise exception.
            self._lags = _types.ensure_int_vector_or_None(lags,
                                                          require_order=True)
            self._lags.sort()
Ejemplo n.º 10
0
 def test_chunksize(self):
     assert types.is_int(self.pca_obj.chunksize)
Ejemplo n.º 11
0
 def test_lag(self):
     assert types.is_int(self.tica_obj.lag)
     # Here:
     assert self.tica_obj.lag == self.lag
Ejemplo n.º 12
0
 def test_iterator(self):
     for itraj, chunk in self.pca_obj:
         assert types.is_int(itraj)
         assert types.is_float_matrix(chunk)
         assert chunk.shape[1] == self.pca_obj.dimension()
Ejemplo n.º 13
0
 def test_dimension(self):
     c = self.ass
     assert types.is_int(c.dimension())
     assert c.dimension() == 1
Ejemplo n.º 14
0
 def test_lag(self):
     assert types.is_int(self.tica_obj.lag)
     # Here:
     assert self.tica_obj.lag == self.lag
Ejemplo n.º 15
0
 def test_dimension(self):
     for c in self.cl:
         assert types.is_int(c.dimension())
         assert c.dimension() == 1
Ejemplo n.º 16
0
    def __init__(self, model, estimator, mlags=None, conf=0.95, err_est=False,
                 n_jobs=1, show_progress=True):
        r"""
        Parameters
        ----------
        model : Model
            Model to be tested

        estimator : Estimator
            Parametrized Estimator that has produced the model

        mlags : int or int-array, default=10
            multiples of lag times for testing the Model, e.g. range(10).
            A single int will trigger a range, i.e. mlags=10 maps to
            mlags=range(10). The setting None will choose mlags automatically
            according to the longest available trajectory
            Note that you need to be able to do a model prediction for each
            of these lag time multiples, e.g. the value 0 only make sense
            if _predict_observables(0) will work.

        conf : float, default = 0.95
            confidence interval for errors

        err_est : bool, default=False
            if the Estimator is capable of error calculation, will compute
            errors for each tau estimate. This option can be computationally
            expensive.

        n_jobs : int, default=1
            how many jobs to use during calculation

        show_progress : bool, default=True
            Show progressbars for calculation?

        """
        # set model and estimator
        self.test_model = model
        self.test_estimator = estimator

        # set mlags
        maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
        maxmlag = int(math.floor(maxlength / estimator.lag))
        if mlags is None:
            mlags = maxmlag
        if types.is_int(mlags):
            mlags = np.arange(mlags)
        mlags = types.ensure_ndarray(mlags, ndim=1, kind='i')
        if np.any(mlags > maxmlag):
            mlags = mlags[np.where(mlags <= maxmlag)]
            self.logger.warn('Changed mlags as some mlags exceeded maximum trajectory length.')
        if np.any(mlags < 0):
            mlags = mlags[np.where(mlags >= 0)]
            self.logger.warn('Changed mlags as some mlags were negative.')
        self.mlags = mlags

        # set conf and error handling
        self.conf = conf
        self.has_errors = issubclass(self.test_model.__class__, SampledModel)
        if self.has_errors:
            self.test_model.set_model_params(conf=conf)
        self.err_est = err_est
        if err_est and not self.has_errors:
            raise ValueError('Requested errors on the estimated models, '
                             'but the model is not able to calculate errors at all')
        self.n_jobs = n_jobs
        self.show_progress = show_progress
Ejemplo n.º 17
0
 def test_chunksize(self):
     for c in self.cl:
         assert types.is_int(c.chunksize)
Ejemplo n.º 18
0
 def test_dimension(self):
     for c in self.cl:
         assert types.is_int(c.dimension())
         assert c.dimension() == 1
Ejemplo n.º 19
0
 def test_dimension(self):
     c = self.ass
     assert types.is_int(c.dimension())
     assert c.dimension() == 1
Ejemplo n.º 20
0
 def test_dimension(self):
     assert types.is_int(self.pca_obj.dimension())
     # Here:
     assert self.pca_obj.dimension() == 1
Ejemplo n.º 21
0
 def test_iterator(self):
     for itraj, chunk in self.inp:
         assert types.is_int(itraj)
         assert types.is_float_matrix(chunk)
         assert chunk.shape[0] == self.inp.chunksize
         assert chunk.shape[1] == self.inp.dimension()
Ejemplo n.º 22
0
 def test_iterator(self):
     for itraj, chunk in self.pca_obj:
         assert types.is_int(itraj)
         assert types.is_float_matrix(chunk)
         assert chunk.shape[0] <= self.pca_obj.chunksize + self.lag
         assert chunk.shape[1] == self.pca_obj.dimension()
Ejemplo n.º 23
0
    def _estimate(self, dtrajs):
        """

        Parameters
        ----------

        Return
        ------
        hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
            Estimated Hidden Markov state model

        """
        # ensure right format
        dtrajs = _types.ensure_dtraj_list(dtrajs)
        # if no initial MSM is given, estimate it now
        if self.msm_init is None:
            # estimate with sparse=False, because we need to do PCCA which is currently not implemented for sparse
            # estimate with store_data=True, because we need an EstimatedMSM
            msm_estimator = _MSMEstimator(lag=self.lag, reversible=self.reversible, sparse=False,
                                          connectivity=self.connectivity, dt_traj=self.timestep_traj)
            msm_init = msm_estimator.estimate(dtrajs)
        else:
            assert isinstance(self.msm_init, _EstimatedMSM), 'msm_init must be of type EstimatedMSM'
            msm_init = self.msm_init
            self.reversible = msm_init.is_reversible

        # print 'Connected set: ', msm_init.active_set

        # generate lagged observations
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_init.nstates > self.nstates:
                corrtime = int(max(1, msm_init.timescales()[self.nstates-1]))
                # use the smaller of these two pessimistic estimates
                self.stride = min(self.stride, 2*corrtime)
        # TODO: Here we always use the full observation state space for the estimation.
        dtrajs_lagged = _lag_observations(dtrajs, self.lag, stride=self.stride)

        # check input
        assert _types.is_int(self.nstates) and self.nstates > 1 and self.nstates <= msm_init.nstates, \
            'nstates must be an int in [2,msmobj.nstates]'
        # if hmm.nstates = msm.nstates there is no problem. Otherwise, check spectral gap
        if msm_init.nstates > self.nstates:
            timescale_ratios = msm_init.timescales()[:-1] / msm_init.timescales()[1:]
            if timescale_ratios[self.nstates-2] < 2.0:
                self.logger.warn('Requested coarse-grained model with ' + str(self.nstates) + ' metastable states at ' +
                                 'lag=' + str(self.lag) + '.' + 'The ratio of relaxation timescales between ' +
                                 str(self.nstates) + ' and ' + str(self.nstates+1) + ' states is only ' +
                                 str(timescale_ratios[self.nstates-2]) + ' while we recommend at least 2. ' +
                                 ' It is possible that the resulting HMM is inaccurate. Handle with caution.')

        # set things from MSM
        # TODO: dtrajs_obs is set here, but not used in estimation. Estimation is alwas done with
        # TODO: respect to full observation (see above). This is confusing. Define how we want to do this in gen.
        # TODO: observable set is also not used, it is just saved.
        nstates_obs_full = msm_init.nstates_full
        if self.observe_active:
            nstates_obs = msm_init.nstates
            observable_set = msm_init.active_set
            dtrajs_obs = msm_init.discrete_trajectories_active
        else:
            nstates_obs = msm_init.nstates_full
            observable_set = np.arange(nstates_obs_full)
            dtrajs_obs = msm_init.discrete_trajectories_full

        # TODO: this is redundant with BHMM code because that code is currently not easily accessible and
        # TODO: we don't want to re-estimate. Should be reengineered in bhmm.
        # ---------------------------------------------------------------------------------------
        # PCCA-based coarse-graining
        # ---------------------------------------------------------------------------------------
        # pcca- to number of metastable states
        pcca = msm_init.pcca(self.nstates)

        # HMM output matrix
        eps = 0.01 * (1.0/nstates_obs_full)  # default output probability, in order to avoid zero columns
        # Use PCCA distributions, but at least eps to avoid 100% assignment to any state (breaks convergence)
        B_conn = np.maximum(msm_init.metastable_distributions, eps)
        # full state space output matrix
        B = eps * np.ones((self.nstates, nstates_obs_full), dtype=np.float64)
        # expand B_conn to full state space
        # TODO: here we always select the active set, no matter if observe_active=True or False.
        B[:, msm_init.active_set] = B_conn[:, :]
        # TODO: at this point we will have zero observation probabilities for states that are not in the active
        # TODO: set. If these occur in the trajectory, that will mean zero columns in the output probabilities
        # TODO: and crash of forward-backward and sampling algorithms.
        # renormalize B to make it row-stochastic
        B /= B.sum(axis=1)[:, None]

        # coarse-grained transition matrix
        P_coarse = pcca.coarse_grained_transition_matrix
        # take care of unphysical values. First symmetrize
        X = np.dot(np.diag(pcca.coarse_grained_stationary_probability), P_coarse)
        X = 0.5*(X + X.T)
        # if there are values < 0, set to eps
        X = np.maximum(X, eps)
        # turn into coarse-grained transition matrix
        A = X / X.sum(axis=1)[:, None]

        # ---------------------------------------------------------------------------------------
        # Estimate discrete HMM
        # ---------------------------------------------------------------------------------------
        # lazy import bhmm here in order to avoid dependency loops
        import bhmm
        # initialize discrete HMM
        hmm_init = bhmm.discrete_hmm(A, B, stationary=True, reversible=self.reversible)
        # run EM
        hmm = bhmm.estimate_hmm(dtrajs_lagged, self.nstates, lag=1, initial_model=hmm_init,
                                accuracy=self.accuracy, maxit=self.maxit)
        self.hmm = bhmm.DiscreteHMM(hmm)

        # find observable set
        transition_matrix = self.hmm.transition_matrix
        observation_probabilities = self.hmm.output_probabilities
        # TODO: Cutting down... OK, this can be done
        if self.observe_active:  # cut down observation probabilities to active set
            observation_probabilities = observation_probabilities[:, msm_init.active_set]
            observation_probabilities /= observation_probabilities.sum(axis=1)[:,None]  # renormalize

        # parametrize self
        self._dtrajs_full = dtrajs
        self._dtrajs_lagged = dtrajs_lagged
        self._observable_set = observable_set
        self._dtrajs_obs = dtrajs_obs
        self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                              reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Ejemplo n.º 24
0
 def test_chunksize(self):
     assert types.is_int(self.inp.chunksize)
Ejemplo n.º 25
0
 def test_chunksize(self):
     assert types.is_int(self.pca_obj.chunksize)
Ejemplo n.º 26
0
 def test_dimension(self):
     assert types.is_int(self.inp.dimension())
Ejemplo n.º 27
0
 def test_chunksize(self):
     for c in self.cl:
         assert types.is_int(c.chunksize)
Ejemplo n.º 28
0
 def test_dimension(self):
     assert types.is_int(self.pca_obj.dimension())
     # Here:
     assert self.pca_obj.dimension() == 1
Ejemplo n.º 29
0
 def test_chunksize(self):
     assert types.is_int(self.ass.chunksize)