Esempio n. 1
0
    def effective_count_matrix(self):
        """Statistically uncorrelated transition counts within the active set of states

        You can use this count matrix for Bayesian estimation or error perturbation.

        References
        ----------
        [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
            http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

        """
        self._check_is_estimated()
        Ceff_full = effective_count_matrix(self._dtrajs_full, self.lag)
        from pyemma.util.linalg import submatrix
        Ceff = submatrix(Ceff_full, self.active_set)
        return Ceff
Esempio n. 2
0
    def count_matrix(self, connected_set=None, subset=None, effective=False):
        """The count matrix

        Parameters
        ----------
        connected_set : int or None, optional, default=None
            connected set index. See :func:`connected_sets` to get a sorted list of connected sets.
            This parameter is exclusive with subset.
        subset : array-like of int or None, optional, default=None
            subset of states to compute the count matrix on. This parameter is exclusive with subset.
        effective : bool, optional, default=False
            Statistically uncorrelated transition counts within the active set of states.

            You can use this count matrix for any kind of estimation, in particular it is meant to give reasonable
            error bars in uncertainty measurements (error perturbation or Gibbs sampling of the posterior).

            The effective count matrix is obtained by dividing the sliding-window count matrix by the lag time. This
            can be shown to provide a likelihood that is the geometrical average over shifted subsamples of the trajectory,
            :math:`(s_1,\:s_{tau+1},\:...),\:(s_2,\:t_{tau+2},\:...),` etc. This geometrical average converges to the
            correct likelihood in the statistical limit [1]_.

        References
        ----------

        ..[1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
            Reversible Markov models of molecular kinetics: Estimation and uncertainty.
            in preparation.
        """
        self._assert_counted_at_lag()
        if subset is not None and connected_set is not None:
            raise ValueError('Can\'t set both connected_set and subset.')
        if subset is not None:
            self._assert_subset(subset)
            C = submatrix(self._C, subset)
        elif connected_set is not None:
            C = self._C_sub[connected_set]
        else:  # full matrix wanted
            C = self._C

        # effective count matrix wanted?
        if effective:
            C /= float(self._lag)

        return C
Esempio n. 3
0
def bootstrapping_dtrajs(dtrajs, lag, N_full, nbs=10000, active_set=None):
    """
    Perform trajectory based re-sampling.

    Parameters
    ----------
    dtrajs : list of discrete trajectories

    lag : int
        lag time

    N_full : int
        Number of states in discrete trajectories.
    nbs : int, optional
        Number of bootstrapping samples
    active_set : ndarray
        Indices of active set, all count matrices will be restricted
        to active set.

    Returns
    -------
    smean : ndarray(N,)
        mean values of singular values
    sdev : ndarray(N,)
        standard deviations of singular values
    """

    # Get the number of simulations:
    Q = len(dtrajs)
    # Get the number of states in the active set:
    if active_set is not None:
        N = active_set.size
    else:
        N = N_full
    # Build up a matrix of count matrices for each simulation. Size is Q*N^2:
    traj_ind = []
    state1 = []
    state2 = []
    q = 0
    for traj in dtrajs:
        traj_ind.append(q * np.ones(traj[:-lag].size))
        state1.append(traj[:-lag])
        state2.append(traj[lag:])
        q += 1
    traj_inds = np.concatenate(traj_ind)
    pairs = N_full * np.concatenate(state1) + np.concatenate(state2)
    data = np.ones(pairs.size)
    Ct_traj = scipy.sparse.coo_matrix((data, (traj_inds, pairs)),
                                      shape=(Q, N_full * N_full))
    Ct_traj = Ct_traj.tocsr()

    # Perform re-sampling:
    svals = np.zeros((nbs, N))
    for s in range(nbs):
        # Choose selection:
        sel = np.random.choice(Q, Q, replace=True)
        # Compute count matrix for selection:
        Ct_sel = Ct_traj[sel, :].sum(axis=0)
        Ct_sel = np.asarray(Ct_sel).reshape((N_full, N_full))
        if active_set is not None:
            from pyemma.util.linalg import submatrix
            Ct_sel = submatrix(Ct_sel, active_set)
        svals[s, :] = scl.svdvals(Ct_sel)
    # Compute mean and uncertainties:
    smean = np.mean(svals, axis=0)
    sdev = np.std(svals, axis=0)

    return smean, sdev
Esempio n. 4
0
    def count_lagged(self, lag, count_mode='sliding'):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (1 \rightarray \tau+1), ..., (T-\tau-1 \rightarray T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (\tau \rightarray 2 \tau), ..., (((T/tau)-1) \tau \rightarray T)


        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            self._C = msmest.effective_count_matrix(self._dtrajs, lag)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # Compute reversibly connected sets
        self._connected_sets = msmest.connected_sets(self._C)

        # set sizes and count matrices on reversibly connected sets
        self._connected_set_sizes = np.zeros((len(self._connected_sets)))
        self._C_sub = np.empty((len(self._connected_sets)), dtype=np.object)
        for i in range(len(self._connected_sets)):
            # set size
            self._connected_set_sizes[i] = len(self._connected_sets[i])
            # submatrix
            self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.array(list(range(len(self._lcs))),
                                             dtype=int)

        # remember that this function was called
        self._counted_at_lag = True
Esempio n. 5
0
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (1 \rightarray \tau+1), ..., (T-\tau-1 \rightarray T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (\tau \rightarray 2 \tau), ..., (((T/tau)-1) \tau \rightarray T)

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            from pyemma.util.reflection import getargspec_no_self
            argspec = getargspec_no_self(msmest.effective_count_matrix)
            kw = {}
            if show_progress and 'callback' in argspec.args:
                from pyemma._base.progress import ProgressReporter
                from pyemma._base.parallel import get_n_jobs

                pg = ProgressReporter()
                # this is a fast operation
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, 'compute statistical inefficiencies')
                del C_temp
                callback = lambda: pg.update(1)
                kw['callback'] = callback
                kw['n_jobs'] = get_n_jobs()

            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        # Compute reversibly connected sets
        if self._mincount_connectivity > 0:
            self._connected_sets = \
                self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
        else:
            self._connected_sets = msmest.connected_sets(self._C)

        # set sizes and count matrices on reversibly connected sets
        self._connected_set_sizes = np.zeros((len(self._connected_sets)))
        self._C_sub = np.empty((len(self._connected_sets)), dtype=np.object)
        for i in range(len(self._connected_sets)):
            # set size
            self._connected_set_sizes[i] = len(self._connected_sets[i])
            # submatrix
            # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True
Esempio n. 6
0
    def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                     show_progress=True, n_jobs=None, name='', core_set=None, milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (1 \rightarray \tau+1), ..., (T-\tau-1 \rightarray T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (\tau \rightarray 2 \tau), ..., (((T/tau)-1) \tau \rightarray T)

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        n_jobs: int or None

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                for d in self._dtrajs:
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        d[mask] = d[np.roll(mask, -1)]
                self._C = msmest.count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError('Milestoning method {} not implemented.'.format(milestoning_method))


        elif count_mode == 'sliding':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
        elif count_mode == 'sample':
            self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
        elif count_mode == 'effective':
            if core_set is not None:
                raise RuntimeError('Cannot estimate core set MSM with effective counting.')
            from pyemma.util.reflection import getargspec_no_self
            argspec = getargspec_no_self(msmest.effective_count_matrix)
            kw = {}
            from pyemma.util.contexts import nullcontext
            ctx = nullcontext()
            if 'callback' in argspec.args:  # msmtools effective cmatrix ready for multiprocessing?
                from pyemma._base.progress import ProgressReporter
                from pyemma._base.parallel import get_n_jobs

                kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

                if show_progress:
                    pg = ProgressReporter()
                    # this is a fast operation
                    C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                    pg.register(C_temp.nnz, '{}: compute stat. inefficiencies'.format(name), stage=0)
                    del C_temp
                    kw['callback'] = pg.update
                    ctx = pg.context(stage=0)
            with ctx:
                self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
        else:
            raise ValueError('Count mode ' + count_mode + ' is unknown.')

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        # Compute reversibly connected sets
        if self._mincount_connectivity > 0:
            self._connected_sets = \
                self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
        else:
            self._connected_sets = msmest.connected_sets(self._C)

        # set sizes and count matrices on reversibly connected sets
        self._connected_set_sizes = np.zeros((len(self._connected_sets)))
        self._C_sub = np.empty((len(self._connected_sets)), dtype=np.object)
        for i in range(len(self._connected_sets)):
            # set size
            self._connected_set_sizes[i] = len(self._connected_sets[i])
            # submatrix
            # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True
Esempio n. 7
0
    def _estimate(self, dtrajs):
        """ Estimate MSM """

        if self.core_set is not None:
            raise NotImplementedError(
                'Core set MSMs currently not compatible with {}.'.format(
                    self.__class__.__name__))

        # remove last lag steps from dtrajs:
        dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

        # get trajectory counts. This sets _C_full and _nstates_full
        dtrajstats = self._get_dtraj_stats(dtrajs_lag)
        self._C_full = dtrajstats.count_matrix()  # full count matrix
        self._nstates_full = self._C_full.shape[0]  # number of states

        # set active set. This is at the same time a mapping from active to full
        if self.connectivity == 'largest':
            self.active_set = dtrajstats.largest_connected_set
        else:
            raise NotImplementedError(
                'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
            )

        # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
        # is estimated
        self._is_estimated = True

        # if active set is empty, we can't do anything.
        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate MSM.')

        # active count matrix and number of states
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # computed derived quantities
        # back-mapping from full to lcs
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # Estimate transition matrix
        if self.connectivity == 'largest':
            # Re-sampling:
            if self.rank_Ct == 'bootstrap_counts':
                Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag)
                from pyemma.util.linalg import submatrix
                Ceff = submatrix(Ceff_full, self.active_set)
                smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
            else:
                smean, sdev = bootstrapping_dtrajs(dtrajs_lag,
                                                   self.lag,
                                                   self._nstates_full,
                                                   nbs=self.nbs,
                                                   active_set=self._active_set)
            # Estimate two step count matrices:
            C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
            # Rank decision:
            rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
            # Estimate OOM components:
            Xi, omega, sigma, l = oom_components(self._C_full.toarray(),
                                                 C2t,
                                                 rank_ind=rank_ind,
                                                 lcc=self.active_set)
            # Compute transition matrix:
            P, lcc_new = equilibrium_transition_matrix(
                Xi, omega, sigma, reversible=self.reversible)
        else:
            raise NotImplementedError(
                'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
            )

        # Update active set and derived quantities:
        if lcc_new.size < self._nstates:
            self._active_set = self._active_set[lcc_new]
            self._C_active = dtrajstats.count_matrix(subset=self.active_set)
            self._nstates = self._C_active.shape[0]
            self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
            self._full2active[self.active_set] = _np.arange(
                len(self.active_set))
            warnings.warn(
                "Caution: Re-estimation of count matrix resulted in reduction of the active set."
            )

        # continue sparse or dense?
        if not self.sparse:
            # converting count matrices to arrays. As a result the
            # transition matrix and all subsequent properties will be
            # computed using dense arrays and dense matrix algebra.
            self._C_full = self._C_full.toarray()
            self._C_active = self._C_active.toarray()

        # Done. We set our own model parameters, so this estimator is
        # equal to the estimated model.
        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self._Xi = Xi
        self._omega = omega
        self._sigma = sigma
        self._eigenvalues_OOM = l
        self._rank_ind = rank_ind
        self._oom_rank = self._sigma.size
        self._C2t = C2t
        self.set_model_params(P=P,
                              pi=None,
                              reversible=self.reversible,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Esempio n. 8
0
    def count_lagged(self,
                     lag,
                     count_mode='sliding',
                     mincount_connectivity='1/n',
                     show_progress=True,
                     n_jobs=None,
                     name='',
                     core_set=None,
                     milestoning_method='last_core'):
        r""" Counts transitions at given lag time

        Parameters
        ----------
        lag : int
            lagtime in trajectory steps

        count_mode : str, optional, default='sliding'
            mode to obtain count matrices from discrete trajectories. Should be one of:

            * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (1 \rightarray \tau+1), ..., (T-\tau-1 \rightarray T-1)

            * 'effective' : Uses an estimate of the transition counts that are
              statistically uncorrelated. Recommended when used with a
              Bayesian MSM.

            * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
              at time indexes
              .. math:: (0 \rightarray \tau), (\tau \rightarray 2 \tau), ..., (((T/tau)-1) \tau \rightarray T)

        show_progress: bool, default=True
            show the progress for the expensive effective count mode computation.

        n_jobs: int or None

        """
        # store lag time
        self._lag = lag

        # Compute count matrix
        count_mode = count_mode.lower()
        if core_set is not None and count_mode in ('sliding', 'sample'):
            if milestoning_method == 'last_core':

                # assign -1 frames to last visited core
                for d in self._dtrajs:
                    assert d[0] != -1
                    while -1 in d:
                        mask = (d == -1)
                        d[mask] = d[np.roll(mask, -1)]
                self._C = count_matrix(self._dtrajs,
                                       lag,
                                       sliding=count_mode == 'sliding')

            else:
                raise NotImplementedError(
                    'Milestoning method {} not implemented.'.format(
                        milestoning_method))
        else:
            cm = TransitionCountEstimator(lag,
                                          count_mode=count_mode,
                                          sparse=True).fit(
                                              self._dtrajs).fetch_model()
            self._C = cm.count_matrix

        # store mincount_connectivity
        if mincount_connectivity == '1/n':
            mincount_connectivity = 1.0 / np.shape(self._C)[0]
        self._mincount_connectivity = mincount_connectivity

        self._count_model_full = TransitionCountModel(self._C)
        self._connected_sets = self._count_model_full.connected_sets(
            connectivity_threshold=self._mincount_connectivity)
        self._count_model = self._count_model_full.submodel_largest(
            connectivity_threshold=self._mincount_connectivity)

        # set sizes and count matrices on reversibly connected sets
        self._connected_set_sizes = np.array(
            (len(cs) for cs in self._connected_sets))
        # largest connected set
        self._lcs = self._connected_sets[0]

        # if lcs has no counts, make lcs empty
        if submatrix(self._C, self._lcs).sum() == 0:
            self._lcs = np.array([], dtype=int)

        # mapping from full to lcs
        self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
        self._full2lcs[self._lcs] = np.arange(len(self._lcs))

        # remember that this function was called
        self._counted_at_lag = True