def effective_count_matrix(self):
    """Effective (statistically uncorrelated) counts on the active set.

    The effective count matrix is computed from the full discrete
    trajectories stored on the estimator at the estimator's lag time and is
    then restricted to the rows/columns of the active set. It is intended
    for Bayesian estimation or error perturbation.

    Returns
    -------
    Ceff : matrix
        Effective count matrix restricted to ``self.active_set``.

    Notes
    -----
    ``self._check_is_estimated()`` is called first, so the estimator must
    have been estimated before this method is used.

    NOTE(review): the call to ``effective_count_matrix`` in the body is
    presumably resolved to a module-level function of the same name (method
    bodies do not see sibling method names) - confirm against the imports of
    the enclosing module.

    References
    ----------
    [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
        http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

    """
    self._check_is_estimated()
    # Counts over all states first; the active-set restriction comes last.
    full_effective_counts = effective_count_matrix(self._dtrajs_full, self.lag)
    from pyemma.util.linalg import submatrix
    return submatrix(full_effective_counts, self.active_set)
def count_matrix(self, connected_set=None, subset=None, effective=False):
    r"""The count matrix

    Parameters
    ----------
    connected_set : int or None, optional, default=None
        connected set index. See :func:`connected_sets` to get a sorted list
        of connected sets. This parameter is exclusive with subset.
    subset : array-like of int or None, optional, default=None
        subset of states to compute the count matrix on. This parameter is
        exclusive with connected_set.
    effective : bool, optional, default=False
        Statistically uncorrelated transition counts within the active set of
        states. You can use this count matrix for any kind of estimation, in
        particular it is meant to give reasonable error bars in uncertainty
        measurements (error perturbation or Gibbs sampling of the posterior).

        The effective count matrix is obtained by dividing the sliding-window
        count matrix by the lag time. This can be shown to provide a
        likelihood that is the geometrical average over shifted subsamples of
        the trajectory,
        :math:`(s_1,\:s_{\tau+1},\:...),\:(s_2,\:s_{\tau+2},\:...),` etc.
        This geometrical average converges to the correct likelihood in the
        statistical limit [1]_.

    Returns
    -------
    C : matrix
        The requested count matrix. Without ``subset`` and with
        ``effective=False`` this is the stored object itself (not a copy).
        With ``effective=True`` a new, rescaled matrix is returned and the
        stored counts are left untouched.

    Raises
    ------
    ValueError
        If both ``connected_set`` and ``subset`` are given.

    References
    ----------
    .. [1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
        Reversible Markov models of molecular kinetics: Estimation and
        uncertainty. in preparation.

    """
    self._assert_counted_at_lag()
    if subset is not None and connected_set is not None:
        raise ValueError('Can\'t set both connected_set and subset.')

    if subset is not None:
        self._assert_subset(subset)
        C = submatrix(self._C, subset)
    elif connected_set is not None:
        C = self._C_sub[connected_set]
    else:
        # full matrix wanted
        C = self._C

    # effective count matrix wanted?
    if effective:
        # Out-of-place division: ``C`` may be the stored matrix itself, and
        # an in-place ``/=`` would rescale it on every call (and cannot be
        # applied to integer-typed counts at all).
        C = C / float(self._lag)
    return C
def bootstrapping_dtrajs(dtrajs, lag, N_full, nbs=10000, active_set=None):
    """
    Perform trajectory based re-sampling.

    Every trajectory contributes one row of lagged pair counts. Each
    bootstrap sample draws ``len(dtrajs)`` trajectories with replacement,
    sums their counts into a count matrix and records its singular values.

    Parameters
    ----------
    dtrajs : list of discrete trajectories
        Integer state sequences (arrays or array-likes).
    lag : int
        lag time
    N_full : int
        Number of states in discrete trajectories.
    nbs : int, optional
        Number of bootstrapping samples
    active_set : ndarray
        Indices of active set, all count matrices will be restricted
        to active set.

    Returns
    -------
    smean : ndarray(N,)
        mean values of singular values
    sdev : ndarray(N,)
        standard deviations of singular values

    Notes
    -----
    Trajectories are drawn with ``np.random.choice``, i.e. from NumPy's
    global random state.

    """
    # Get the number of simulations:
    Q = len(dtrajs)
    # Get the number of states in the active set:
    if active_set is not None:
        # Imported once up front rather than on every bootstrap iteration.
        from pyemma.util.linalg import submatrix
        N = active_set.size
    else:
        N = N_full

    # Build up a matrix of count matrices for each simulation. Size is Q*N^2:
    row_blocks = []
    pair_blocks = []
    for q, traj in enumerate(dtrajs):
        # int64 keeps the flattened pair index N_full * i + j from
        # overflowing when trajectories use a narrow integer dtype.
        traj = np.asarray(traj, dtype=np.int64)
        origin = traj[:-lag]
        target = traj[lag:]
        row_blocks.append(np.full(origin.size, q, dtype=np.int64))
        pair_blocks.append(N_full * origin + target)
    traj_inds = np.concatenate(row_blocks)
    pairs = np.concatenate(pair_blocks)
    data = np.ones(pairs.size)
    # Duplicate (trajectory, pair) entries are summed on conversion to CSR.
    Ct_traj = scipy.sparse.coo_matrix((data, (traj_inds, pairs)),
                                      shape=(Q, N_full * N_full))
    Ct_traj = Ct_traj.tocsr()

    # Perform re-sampling:
    svals = np.zeros((nbs, N))
    for s in range(nbs):
        # Choose selection:
        sel = np.random.choice(Q, Q, replace=True)
        # Compute count matrix for selection:
        Ct_sel = Ct_traj[sel, :].sum(axis=0)
        Ct_sel = np.asarray(Ct_sel).reshape((N_full, N_full))
        if active_set is not None:
            Ct_sel = submatrix(Ct_sel, active_set)
        svals[s, :] = scl.svdvals(Ct_sel)
    # Compute mean and uncertainties:
    smean = np.mean(svals, axis=0)
    sdev = np.std(svals, axis=0)

    return smean, sdev
def count_lagged(self, lag, count_mode='sliding'):
    r""" Counts transitions at given lag time

    Stores the lag time, the count matrix, the reversibly connected sets
    with their sizes and count submatrices, the largest connected set and
    the mapping from full state indices to indices within that set.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (case-insensitive). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    Raises
    ------
    ValueError
        If ``count_mode`` is not one of the modes listed above.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        self._C = msmest.effective_count_matrix(self._dtrajs, lag)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # Compute reversibly connected sets
    self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    n_sets = len(self._connected_sets)
    self._connected_set_sizes = np.zeros(n_sets)
    # Builtin ``object`` instead of the ``np.object`` alias, which no longer
    # exists in current NumPy releases.
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        # set size
        self._connected_set_sizes[i] = len(cset)
        # submatrix
        self._C_sub[i] = submatrix(self._C, cset)

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs; states outside the lcs map to -1
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n', show_progress=True):
    r""" Counts transitions at given lag time

    Stores the lag time, the count matrix, the connectivity threshold, the
    reversibly connected sets with their sizes, the largest connected set
    and the mapping from full state indices to indices within that set.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (case-insensitive). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        Threshold used when computing connected sets. The string '1/n' is
        replaced by ``1.0 / n`` with n the number of rows of the count
        matrix. A positive value is passed to
        ``self._compute_connected_sets``; otherwise
        ``msmest.connected_sets`` is used.

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    Raises
    ------
    ValueError
        If ``count_mode`` is not one of the modes listed above.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        from pyemma.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        # Progress reporting and n_jobs are only passed along when the
        # installed effective_count_matrix accepts a ``callback`` argument.
        if show_progress and 'callback' in argspec.args:
            from pyemma._base.progress import ProgressReporter
            from pyemma._base.parallel import get_n_jobs

            pg = ProgressReporter()
            # this is a fast operation
            C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
            pg.register(C_temp.nnz, 'compute statistical inefficiencies')
            del C_temp
            callback = lambda: pg.update(1)
            kw['callback'] = callback
            kw['n_jobs'] = get_n_jobs()

        self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = \
            self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    n_sets = len(self._connected_sets)
    self._connected_set_sizes = np.zeros(n_sets)
    # Builtin ``object`` instead of the ``np.object`` alias, which no longer
    # exists in current NumPy releases.
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        # set size
        self._connected_set_sizes[i] = len(cset)
        # submatrix
        # NOTE(review): the assignment below is disabled, so every entry of
        # self._C_sub stays None after this method.
        # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs; states outside the lcs map to -1
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None, milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Stores the lag time, the count matrix, the connectivity threshold, the
    reversibly connected sets with their sizes, the largest connected set
    and the mapping from full state indices to indices within that set.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (case-insensitive). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        Threshold used when computing connected sets. The string '1/n' is
        replaced by ``1.0 / n`` with n the number of rows of the count
        matrix. A positive value is passed to
        ``self._compute_connected_sets``; otherwise
        ``msmest.connected_sets`` is used.

    show_progress: bool, default=True
        show the progress for the expensive effective count mode computation.

    n_jobs: int or None
        Number of jobs handed to the effective count computation when it
        supports it; ``None`` uses ``get_n_jobs()``.

    name : str, optional, default=''
        Prefix of the progress description in effective count mode.

    core_set : optional, default=None
        If not None, 'sliding' and 'sample' counting use milestoning; only
        the ``is not None`` test is performed on this argument here.

    milestoning_method : str, optional, default='last_core'
        Only 'last_core' is implemented: frames labelled -1 are replaced
        **in place** in ``self._dtrajs`` by the preceding frame's state.

    Raises
    ------
    ValueError
        If ``count_mode`` is unknown, or if a trajectory starts with a -1
        frame while milestoning is requested.
    NotImplementedError
        If ``milestoning_method`` is not 'last_core'.
    RuntimeError
        If ``core_set`` is combined with 'effective' counting.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method != 'last_core':
            raise NotImplementedError('Milestoning method {} not implemented.'.format(milestoning_method))

        # assign -1 frames to last visited core
        for d in self._dtrajs:
            # Explicit check instead of ``assert``: assertions are stripped
            # under -O, and a leading -1 has no preceding core to copy from
            # (an all -1 trajectory would make the loop below spin forever).
            if d[0] == -1:
                raise ValueError('Discrete trajectory starts with an unassigned frame (-1); '
                                 'there is no last visited core to assign it to.')
            while -1 in d:
                mask = (d == -1)
                # np.roll(mask, -1) selects the frame just before each -1
                # frame, so every pass copies predecessors one step forward.
                d[mask] = d[np.roll(mask, -1)]
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')

    elif count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        if core_set is not None:
            raise RuntimeError('Cannot estimate core set MSM with effective counting.')
        from pyemma.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        from pyemma.util.contexts import nullcontext
        ctx = nullcontext()
        if 'callback' in argspec.args:  # msmtools effective cmatrix ready for multiprocessing?
            from pyemma._base.progress import ProgressReporter
            from pyemma._base.parallel import get_n_jobs

            kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

            if show_progress:
                pg = ProgressReporter()
                # this is a fast operation
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(C_temp.nnz, '{}: compute stat. inefficiencies'.format(name), stage=0)
                del C_temp
                kw['callback'] = pg.update
                ctx = pg.context(stage=0)
        with ctx:
            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = \
            self._compute_connected_sets(self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    n_sets = len(self._connected_sets)
    self._connected_set_sizes = np.zeros(n_sets)
    # Builtin ``object`` instead of the ``np.object`` alias, which no longer
    # exists in current NumPy releases.
    self._C_sub = np.empty(n_sets, dtype=object)
    for i, cset in enumerate(self._connected_sets):
        # set size
        self._connected_set_sizes[i] = len(cset)
        # submatrix
        # NOTE(review): the assignment below is disabled, so every entry of
        # self._C_sub stays None after this method.
        # self._C_sub[i] = submatrix(self._C, self._connected_sets[i])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs; states outside the lcs map to -1
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True
def _estimate(self, dtrajs): """ Estimate MSM """ if self.core_set is not None: raise NotImplementedError( 'Core set MSMs currently not compatible with {}.'.format( self.__class__.__name__)) # remove last lag steps from dtrajs: dtrajs_lag = [traj[:-self.lag] for traj in dtrajs] # get trajectory counts. This sets _C_full and _nstates_full dtrajstats = self._get_dtraj_stats(dtrajs_lag) self._C_full = dtrajstats.count_matrix() # full count matrix self._nstates_full = self._C_full.shape[0] # number of states # set active set. This is at the same time a mapping from active to full if self.connectivity == 'largest': self.active_set = dtrajstats.largest_connected_set else: raise NotImplementedError( 'OOM based MSM estimation is only implemented for connectivity=\'largest\'.' ) # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean! # is estimated self._is_estimated = True # if active set is empty, we can't do anything. if _np.size(self.active_set) == 0: raise RuntimeError('Active set is empty. 
Cannot estimate MSM.') # active count matrix and number of states self._C_active = dtrajstats.count_matrix(subset=self.active_set) self._nstates = self._C_active.shape[0] # computed derived quantities # back-mapping from full to lcs self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int) self._full2active[self.active_set] = _np.arange(len(self.active_set)) # Estimate transition matrix if self.connectivity == 'largest': # Re-sampling: if self.rank_Ct == 'bootstrap_counts': Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag) from pyemma.util.linalg import submatrix Ceff = submatrix(Ceff_full, self.active_set) smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs) else: smean, sdev = bootstrapping_dtrajs(dtrajs_lag, self.lag, self._nstates_full, nbs=self.nbs, active_set=self._active_set) # Estimate two step count matrices: C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full) # Rank decision: rank_ind = rank_decision(smean, sdev, tol=self.tol_rank) # Estimate OOM components: Xi, omega, sigma, l = oom_components(self._C_full.toarray(), C2t, rank_ind=rank_ind, lcc=self.active_set) # Compute transition matrix: P, lcc_new = equilibrium_transition_matrix( Xi, omega, sigma, reversible=self.reversible) else: raise NotImplementedError( 'OOM based MSM estimation is only implemented for connectivity=\'largest\'.' ) # Update active set and derived quantities: if lcc_new.size < self._nstates: self._active_set = self._active_set[lcc_new] self._C_active = dtrajstats.count_matrix(subset=self.active_set) self._nstates = self._C_active.shape[0] self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int) self._full2active[self.active_set] = _np.arange( len(self.active_set)) warnings.warn( "Caution: Re-estimation of count matrix resulted in reduction of the active set." ) # continue sparse or dense? if not self.sparse: # converting count matrices to arrays. 
As a result the # transition matrix and all subsequent properties will be # computed using dense arrays and dense matrix algebra. self._C_full = self._C_full.toarray() self._C_active = self._C_active.toarray() # Done. We set our own model parameters, so this estimator is # equal to the estimated model. self._dtrajs_full = dtrajs self._connected_sets = msmest.connected_sets(self._C_full) self._Xi = Xi self._omega = omega self._sigma = sigma self._eigenvalues_OOM = l self._rank_ind = rank_ind self._oom_rank = self._sigma.size self._C2t = C2t self.set_model_params(P=P, pi=None, reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag)) return self
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None, milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Stores the lag time, the count matrix, the connectivity threshold, the
    full and largest-connected count models, the connected sets with their
    sizes, the largest connected set and the mapping from full state
    indices to indices within that set.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps

    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (case-insensitive). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.

        * 'sample' : A trajectory of length T will have :math:`T / \tau` counts
          at time indexes

          .. math:: (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        Connectivity threshold for the connected sets and the largest
        submodel. The string '1/n' is replaced by ``1.0 / n`` with n the
        number of rows of the count matrix.

    show_progress: bool, default=True
        Accepted for interface compatibility; not used in this body.

    n_jobs: int or None
        Accepted for interface compatibility; not used in this body.

    name : str, optional, default=''
        Accepted for interface compatibility; not used in this body.

    core_set : optional, default=None
        If not None, 'sliding' and 'sample' counting use milestoning; only
        the ``is not None`` test is performed on this argument here.

    milestoning_method : str, optional, default='last_core'
        Only 'last_core' is implemented: frames labelled -1 are replaced
        **in place** in ``self._dtrajs`` by the preceding frame's state.

    Raises
    ------
    ValueError
        If a trajectory starts with a -1 frame while milestoning is
        requested.
    NotImplementedError
        If ``milestoning_method`` is not 'last_core'.

    """
    # store lag time
    self._lag = lag

    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method != 'last_core':
            raise NotImplementedError(
                'Milestoning method {} not implemented.'.format(milestoning_method))

        # assign -1 frames to last visited core
        for d in self._dtrajs:
            # Explicit check instead of ``assert``: assertions are stripped
            # under -O, and a leading -1 has no preceding core to copy from
            # (an all -1 trajectory would make the loop below spin forever).
            if d[0] == -1:
                raise ValueError('Discrete trajectory starts with an unassigned frame (-1); '
                                 'there is no last visited core to assign it to.')
            while -1 in d:
                mask = (d == -1)
                # np.roll(mask, -1) selects the frame just before each -1
                # frame, so every pass copies predecessors one step forward.
                d[mask] = d[np.roll(mask, -1)]
        self._C = count_matrix(self._dtrajs, lag, sliding=count_mode == 'sliding')
    else:
        cm = TransitionCountEstimator(lag, count_mode=count_mode, sparse=True).fit(
            self._dtrajs).fetch_model()
        self._C = cm.count_matrix

    # store mincount_connectivity
    if mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    self._count_model_full = TransitionCountModel(self._C)
    self._connected_sets = self._count_model_full.connected_sets(
        connectivity_threshold=self._mincount_connectivity)
    self._count_model = self._count_model_full.submodel_largest(
        connectivity_threshold=self._mincount_connectivity)

    # Sizes of the connected sets. A list is required here: handing a bare
    # generator to np.array yields a 0-d object array wrapping the
    # generator instead of the sizes.
    self._connected_set_sizes = np.array([len(cs) for cs in self._connected_sets])

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs; states outside the lcs map to -1
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True