Esempio n. 1
0
 def test_singletraj(self):
     """Compare effective and plain counts on the single long trajectory.

     For lag times 1 and 100 the effective count matrix must have the same
     shape and the same nonzero pattern as the plain count matrix, and no
     entry may exceed the corresponding plain count.
     """
     for lag in (1, 100):
         plain = count_matrix(self.dtraj_long, lag)
         effective = effective_count_matrix(self.dtraj_long, lag)
         # identical dimensions
         assert np.array_equal(effective.shape, plain.shape)
         # identical sparsity pattern
         assert np.array_equal(plain.nonzero(), effective.nonzero())
         # effective counts never exceed the plain counts
         assert np.all(effective.toarray() <= plain.toarray())
Esempio n. 2
0
 def test_multitraj(self):
     """Compare effective and plain counts on a list of short trajectories.

     The input mixes a length-10, a length-1 and a length-4 trajectory.
     For lag times 1 and 2 the effective count matrix must share shape and
     nonzero pattern with the plain count matrix and be elementwise no
     larger than it.
     """
     dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]
     for lag in (1, 2):
         plain = count_matrix(dtrajs, lag)
         effective = effective_count_matrix(dtrajs, lag)
         # identical dimensions
         assert np.array_equal(effective.shape, plain.shape)
         # identical sparsity pattern
         assert np.array_equal(plain.nonzero(), effective.nonzero())
         # effective counts never exceed the plain counts
         assert np.all(effective.toarray() <= plain.toarray())
Esempio n. 3
0
    def count_lagged(self, lag, count_mode='sliding'):
        r""" Counts transitions at given lag time

        Stores the lag time, the full count matrix, the connected sets
        returned by ``msmest.connected_sets``, their sizes, the count
        submatrix of every connected set, and a mapping from full state
        indexes to indexes within the first connected set (-1 for states
        outside of it).

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories.
            Compared case-insensitively. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        Raises
        ------
        ValueError
            If `count_mode` is none of the modes listed above.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            self._C = msmest.effective_count_matrix(self._dtrajs, lag)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # Compute reversibly connected sets
        self._connected_sets = msmest.connected_sets(self._C)

        # set sizes and count matrices on reversibly connected sets
        n_sets = len(self._connected_sets)
        self._connected_set_sizes = np.zeros(n_sets)
        # Use the builtin ``object``: the ``np.object`` alias was removed in
        # NumPy 1.24 and raises AttributeError there.
        self._C_sub = np.empty(n_sets, dtype=object)
        for i, connected_set in enumerate(self._connected_sets):
            self._connected_set_sizes[i] = len(connected_set)
            self._C_sub[i] = submatrix(self._C, connected_set)

        # largest connected set (first entry of the connected sets)
        lcs = self._connected_sets[0]

        # mapping from full to lcs; states outside the lcs keep -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[lcs] = np.arange(len(lcs), dtype=int)

        # remember that this function was called
        self._counted_at_lag = True
    def test_multitraj_njobs(self):
        """Check effective counts on several trajectories for different ``n_jobs``.

        At lag 1 the effective count matrix is computed with ``n_jobs=1`` and
        ``n_jobs=2``; at lag 2 it is computed with the default ``n_jobs``.
        Every result must share shape and nonzero pattern with the plain
        count matrix and be elementwise no larger than it.
        """
        # NOTE(review): the module is not used below; presumably the import
        # only verifies that it is available -- confirm before removing.
        import _multiprocess

        def check_against(plain, effective):
            # same dimensions, same sparsity pattern, never more counts
            assert np.array_equal(effective.shape, plain.shape)
            assert np.array_equal(plain.nonzero(), effective.nonzero())
            assert np.all(effective.toarray() <= plain.toarray())

        dtrajs = [[1, 0, 1, 0, 1, 1, 0, 0, 0, 1], [2], [0, 1, 0, 1]]

        # lag 1, once per explicit number of jobs
        plain_lag1 = count_matrix(dtrajs, 1)
        for jobs in (1, 2):
            check_against(plain_lag1,
                          effective_count_matrix(dtrajs, 1, n_jobs=jobs))

        # lag 2, default number of jobs
        plain_lag2 = count_matrix(dtrajs, 2)
        check_against(plain_lag2, effective_count_matrix(dtrajs, 2))
    def effective_count_matrix(self):
        """Statistically uncorrelated transition counts within the active set of states

        The effective count matrix is computed from the full discrete
        trajectories at the estimator's lag time and then restricted to the
        active set.

        You can use this count matrix for Bayesian estimation or error perturbation.

        References
        ----------
        [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
            http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

        """
        self._check_is_estimated()
        # effective counts over all states
        full_counts = msmest.effective_count_matrix(self._dtrajs_full, self.lag)
        from pyemma.util.linalg import submatrix
        # keep only rows/columns of the active set
        return submatrix(full_counts, self.active_set)
Esempio n. 6
0
    def fit(self, dtrajs):
        """Fit the estimator to discrete trajectories and store the resulting model.

        The steps are: count transitions, bootstrap (either the effective
        count matrix or the trajectories themselves, depending on
        ``self.rank_Ct``), compute two-step count matrices, decide the rank,
        estimate the OOM components and derive the transition matrix.
        The result is stored in ``self._model``.

        Parameters
        ----------
        dtrajs : list of array_like
            Discrete trajectories; each one is sliced with
            ``traj[:-self.lagtime]``.

        Returns
        -------
        self
        """
        lag = self.lagtime

        # trajectories with the last `lag` steps removed
        truncated = [traj[:-lag] for traj in dtrajs]

        # transition counts on the untruncated trajectories
        counter = TransitionCountEstimator(lagtime=lag,
                                           mincount_connectivity=self.connectivity_threshold,
                                           count_mode=self.count_mode)
        count_model = counter.fit(dtrajs).fetch_model()

        # Re-sampling, either of the trajectories or of the effective counts
        if self.rank_Ct != 'bootstrap_counts':
            smean, sdev = bootstrapping_dtrajs(truncated, lag, count_model.n_states,
                                               nbs=self.nbs,
                                               active_set=count_model.active_set)
        else:
            effective_full = effective_count_matrix(truncated, lag)
            effective_active = submatrix(effective_full, count_model.active_set)
            smean, sdev = bootstrapping_count_matrix(effective_active, nbs=self.nbs)

        # two-step count matrices
        C2t = twostep_count_matrix(dtrajs, lag, count_model.n_states)

        # rank decision
        rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)

        # OOM components
        dense_counts = count_model.count_matrix.toarray()
        Xi, omega, sigma, oom_eigenvalues = oom_components(
            dense_counts, C2t, rank_ind=rank_ind, lcc=count_model.active_set)

        # transition matrix
        P, lcc_new = equilibrium_transition_matrix(Xi, omega, sigma,
                                                   reversible=self.reversible)

        # Shrink the active set if the re-estimation dropped states.
        # todo: dont re-initialize, this is only due to active set (see bhmm impl)
        if lcc_new.size < count_model.n_states:
            assert isinstance(count_model, TransitionCountModel)
            reduced_active_set = count_model.active_set[lcc_new]
            count_model.__init__(lag,
                                 active_set=reduced_active_set,
                                 physical_time=count_model.physical_time,
                                 connected_sets=count_model.connected_sets,
                                 count_matrix=count_model.count_matrix)
            warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

        # attach the two-step counts after the (possible) re-initialization
        count_model.C2t = C2t

        self._model = KoopmanReweightedMSM(transition_matrix=P,
                                           eigenvalues_OOM=oom_eigenvalues,
                                           sigma=sigma,
                                           omega=omega,
                                           count_model=count_model,
                                           oom_components=Xi)
        return self
Esempio n. 7
0
    def test_compare_with_old_impl(self):
        """Compare the effective count matrix against stored reference values.

        The reference discrete trajectory is read from the packaged
        ``dwell.npz`` test file and the effective count matrix is computed
        at lag 10 with ``average='row'`` and ``mact=1.0``.
        """
        # generated with v1.1@ from
        # pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10_n6good
        reference = np.array([
            [2.21353316e+04, 2.13659736e+03, 4.63558176e+02, 1.56043628e+02, 3.88680098e+01, 1.14317676e+01],
            [1.84456322e+03, 3.74107190e+02, 1.79811199e+02, 9.29024530e+01, 5.59412620e+01, 2.59727288e+01],
            [3.45678646e+02, 1.42148228e+02, 8.19775293e+01, 7.75353971e+01, 5.73438875e+01, 8.19775293e+01],
            [9.08206988e+01, 6.53466003e+01, 7.82682445e+01, 7.71606750e+01, 8.38060919e+01, 2.84276171e+02],
            [3.56219388e+01, 3.43186971e+01, 7.64568442e+01, 1.13816439e+02, 2.51960055e+02, 1.33451946e+03],
            [1.57044024e+01, 3.26168358e+01, 1.12346879e+02, 4.34287128e+02, 1.88573632e+03, 2.35837843e+04],
        ])

        import pkg_resources
        # locate the packaged test data
        path = pkg_resources.resource_filename('msmtools.estimation',
                                               'tests/testfiles/dwell.npz')
        archive = np.load(path)
        dtraj = archive['dtraj_T100K_dt10_n6good'].astype('int32')

        sparse_result = effective_count_matrix(dtraj, lag=10, average='row', mact=1.0)
        computed = sparse_result.toarray()

        np.testing.assert_allclose(computed, reference, atol=1e-15, rtol=1e-8)
Esempio n. 8
0
    def fit(self, data, *args, **kw):
        r""" Counts transitions at given lag time according to configuration of the estimator.

        The counting mode is taken from ``self.count_mode`` and the lag time
        from ``self.lagtime``. The resulting count matrix and the state
        histogram are stored in a new ``TransitionCountModel`` assigned to
        ``self._model``.

        Parameters
        ----------
        data : array_like or list of array_like
            discretized trajectories
        *args, **kw
            Accepted but not used by this method.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``self.count_mode`` is not a known counting mode.
        """
        trajectories = ensure_dtraj_list(data)

        # basic count statistics
        state_counts = count_states(trajectories, ignore_negative=True)

        mode = self.count_mode
        lag = self.lagtime

        # Compute count matrix
        if mode == 'effective':
            counts = msmest.effective_count_matrix(trajectories, lag)
            # densify if a dense result was requested
            if issparse(counts) and not self.sparse:
                counts = counts.toarray()
        elif mode == 'sample':
            counts = msmest.count_matrix(trajectories, lag, sliding=False, sparse_return=self.sparse)
        elif mode in ('sliding', 'sliding-effective'):
            counts = msmest.count_matrix(trajectories, lag, sliding=True, sparse_return=self.sparse)
            if mode != 'sliding':
                # 'sliding-effective': scale the sliding counts by the lag time
                counts /= lag
        else:
            raise ValueError('Count mode {} is unknown.'.format(mode))

        # initially state symbols, full count matrix, and full histogram can be left None because they coincide
        # with the input arguments
        self._model = TransitionCountModel(count_matrix=counts,
                                           counting_mode=mode,
                                           lagtime=lag,
                                           state_histogram=state_counts,
                                           physical_time=self.physical_time)
        return self
Esempio n. 9
0
    def _estimate(self, dtrajs):
        """ Estimate MSM

        Computes count statistics on the trajectories with the last
        ``self.lag`` steps removed, selects the largest connected set as the
        active set, estimates OOM components and derives the transition
        matrix from them. Results are stored as attributes of ``self`` and
        the transition matrix is handed to ``set_model_params``.

        Parameters
        ----------
        dtrajs : list of array_like
            Discrete trajectories; each one is sliced with
            ``traj[:-self.lag]``.

        Returns
        -------
        self

        Raises
        ------
        NotImplementedError
            If ``self.core_set`` is not None, or if ``self.connectivity`` is
            not ``'largest'``.
        RuntimeError
            If the active set is empty.
        """

        if self.core_set is not None:
            raise NotImplementedError(
                'Core set MSMs currently not compatible with {}.'.format(
                    self.__class__.__name__))

        # remove last lag steps from dtrajs:
        dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

        # get trajectory counts. This sets _C_full and _nstates_full
        dtrajstats = self._get_dtraj_stats(dtrajs_lag)
        self._C_full = dtrajstats.count_matrix()  # full count matrix
        self._nstates_full = self._C_full.shape[0]  # number of states

        # set active set. This is at the same time a mapping from active to full
        if self.connectivity == 'largest':
            self.active_set = dtrajstats.largest_connected_set
        else:
            raise NotImplementedError(
                'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
            )

        # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
        # is estimated
        self._is_estimated = True

        # if active set is empty, we can't do anything.
        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # active count matrix and number of states
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # computed derived quantities
        # back-mapping from full to lcs
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Estimate transition matrix. connectivity == 'largest' is guaranteed
        # at this point: every other value already raised above, so no
        # second check is needed.
        # Re-sampling:
        if self.rank_Ct == 'bootstrap_counts':
            Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag)
            # NOTE(review): package name 'pyerna' kept exactly as found --
            # confirm it is the intended package.
            from pyerna.util.linalg import submatrix
            Ceff = submatrix(Ceff_full, self.active_set)
            smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
        else:
            smean, sdev = bootstrapping_dtrajs(dtrajs_lag,
                                               self.lag,
                                               self._nstates_full,
                                               nbs=self.nbs,
                                               active_set=self._active_set)
        # Estimate two step count matrices:
        C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
        # Rank decision:
        rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
        # Estimate OOM components:
        Xi, omega, sigma, eigenvalues_oom = oom_components(
            self._C_full.toarray(),
            C2t,
            rank_ind=rank_ind,
            lcc=self.active_set)
        # Compute transition matrix:
        P, lcc_new = equilibrium_transition_matrix(
            Xi, omega, sigma, reversible=self.reversible)

        # Update active set and derived quantities:
        if lcc_new.size < self._nstates:
            self._active_set = self._active_set[lcc_new]
            self._C_active = dtrajstats.count_matrix(subset=self.active_set)
            self._nstates = self._C_active.shape[0]
            self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
            self._full2active[self.active_set] = _np.arange(
                len(self.active_set))
            warnings.warn(
                "Caution: Re-estimation of count matrix resulted in reduction of the active set."
            )

        # continue sparse or dense?
        if not self.sparse:
            # converting count matrices to arrays. As a result the
            # transition matrix and all subsequent properties will be
            # computed using dense arrays and dense matrix algebra.
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()

        # Done. We set our own model parameters, so this estimator is
        # equal to the estimated model.
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self._Xi = Xi
        self._omega = omega
        self._sigma = sigma
        self._eigenvalues_OOM = eigenvalues_oom
        self._rank_ind = rank_ind
        self._oom_rank = self._sigma.size
        self._C2t = C2t
        self.set_model_params(P=P,
                              pi=None,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Esempio n. 10
0
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets and their
        sizes, the largest connected set (emptied if it holds no counts) and
        a mapping from full state indexes to indexes within that set (-1 for
        states outside of it).

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories.
            Compared case-insensitively. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        mincount_connectivity : float or '1/n', optional, default='1/n'
            Value handed to ``self._compute_connected_sets`` when it is
            positive; otherwise ``msmest.connected_sets`` is used. The
            string '1/n' is replaced by ``1.0 / n``, with n the number of
            rows of the count matrix.

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        Raises
        ------
        ValueError
            If `count_mode` is none of the modes listed above.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            from pyemma.util.reflection import getargspec_no_self
            argspec = getargspec_no_self(msmest.effective_count_matrix)
            kw = {}
            # progress reporting needs an effective_count_matrix that accepts
            # a 'callback' argument
            if show_progress and 'callback' in argspec.args:
                from pyemma._base.progress import ProgressReporter
                from pyemma._base.parallel import get_n_jobs

                pg = ProgressReporter()
                # this is a fast operation; it is only needed to know the
                # number of nonzero entries, i.e. the number of progress steps
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, 'compute statistical inefficiencies')
                del C_temp
                kw['callback'] = lambda: pg.update(1)
                kw['n_jobs'] = get_n_jobs()

            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        # Compute reversibly connected sets
        if self._mincount_connectivity > 0:
            self._connected_sets = \
                self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
        else:
            self._connected_sets = msmest.connected_sets(self._C)

        # sizes of the reversibly connected sets (kept as floats, as before)
        self._connected_set_sizes = np.array(
            [len(connected_set) for connected_set in self._connected_sets],
            dtype=float)
        # Placeholder array; the per-set count submatrices are not filled in
        # here. Builtin ``object`` is used because the ``np.object`` alias was
        # removed in NumPy 1.24 and raises AttributeError there.
        self._C_sub = np.empty(len(self._connected_sets), dtype=object)

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs; states outside the lcs keep -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True
Esempio n. 11
0
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Stores the lag time, the count matrix, the connected sets and their
        sizes, the largest connected set (emptied if it holds no counts) and
        a mapping from full state indexes to indexes within that set (-1 for
        states outside of it).

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories.
            Compared case-insensitively. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

        mincount_connectivity : float or '1/n', optional, default='1/n'
            Value handed to ``self._compute_connected_sets`` when it is
            positive; otherwise ``msmest.connected_sets`` is used. The
            string '1/n' is replaced by ``1.0 / n``, with n the number of
            rows of the count matrix.

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        n_jobs: int or None
            Number of jobs passed to ``msmest.effective_count_matrix`` when
            that function accepts a 'callback' argument. ``None`` uses the
            value returned by ``get_n_jobs()``.

        name : str, optional, default=''
            Prefix of the progress description shown in 'effective' mode.

        core_set : object or None, optional, default=None
            If not None, milestoning is applied before 'sliding' or 'sample'
            counting. Only tested against None here; its contents are not
            read by this method.

        milestoning_method : str, optional, default='last_core'
            Milestoning method used when `core_set` is given. Only
            'last_core' is implemented: frames equal to -1 are overwritten,
            in place in the stored trajectories, with the preceding frame's
            value.

        Raises
        ------
        NotImplementedError
            If `core_set` is given with 'sliding' or 'sample' counting and
            `milestoning_method` is not 'last_core'.
        RuntimeError
            If `core_set` is given together with 'effective' counting.
        ValueError
            If `count_mode` is none of the modes listed above.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode in ('sliding', 'sample'):
            if core_set is not None:
                if milestoning_method != 'last_core':
                    raise NotImplementedError(
                        'Milestoning method {} not implemented.'.format(
                            milestoning_method))

                # assign -1 frames to last visited core
                for d in self._dtrajs:
                    # a leading -1 has no previously visited core
                    assert d[0] != -1
                    # each pass copies the predecessor's value into every -1
                    # frame, so a run of k consecutive -1 frames takes k passes
                    while -1 in d:
                        mask = (d == -1)
                        d[mask] = d[np.roll(mask, -1)]

            self._C = msmest.count_matrix(self._dtrajs,
                                          lag,
                                          sliding=(count_mode == 'sliding'))
        elif count_mode == 'effective':
            if core_set is not None:
                raise RuntimeError(
                    'Cannot estimate core set MSM with effective counting.')
            from pyerna.util.reflection import getargspec_no_self
            argspec = getargspec_no_self(msmest.effective_count_matrix)
            kw = {}
            from pyerna.util.contexts import nullcontext
            ctx = nullcontext()
            if 'callback' in argspec.args:  # msmtools effective cmatrix ready for multiprocessing?
                from pyerna._base.progress import ProgressReporter
                from pyerna._base.parallel import get_n_jobs

                kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

                if show_progress:
                    pg = ProgressReporter()
                    # this is a fast operation; it is only needed to know the
                    # number of nonzero entries, i.e. the number of progress
                    # steps
                    C_temp = msmest.count_matrix(self._dtrajs,
                                                 lag,
                                                 sliding=True)
                    pg.register(
                        C_temp.nnz,
                        '{}: compute stat. inefficiencies'.format(name),
                        stage=0)
                    del C_temp
                    kw['callback'] = pg.update
                    ctx = pg.context(stage=0)
            with ctx:
                self._C = msmest.effective_count_matrix(
                    self._dtrajs, lag, **kw)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        # Compute reversibly connected sets
        if self._mincount_connectivity > 0:
            self._connected_sets = \
                self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
        else:
            self._connected_sets = msmest.connected_sets(self._C)

        # sizes of the reversibly connected sets (kept as floats, as before)
        self._connected_set_sizes = np.array(
            [len(connected_set) for connected_set in self._connected_sets],
            dtype=float)
        # Placeholder array; the per-set count submatrices are not filled in
        # here. Builtin ``object`` is used because the ``np.object`` alias was
        # removed in NumPy 1.24 and raises AttributeError there.
        self._C_sub = np.empty(len(self._connected_sets), dtype=object)

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs; states outside the lcs keep -1
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True