def count_matrix(self, connected_set=None, subset=None):
    r"""Return the count matrix, optionally restricted to a set of states.

    Parameters
    ----------
    connected_set : int or None, optional, default=None
        Index of a connected set. See :func:`connected_sets` to get a
        sorted list of connected sets. Mutually exclusive with ``subset``.
    subset : array-like of int or None, optional, default=None
        States on which the count matrix is evaluated. Mutually exclusive
        with ``connected_set``.

    Returns
    -------
    C : matrix
        ``self._C`` itself when neither argument is given, otherwise the
        submatrix of ``self._C`` on the requested states.

    Raises
    ------
    ValueError
        If both ``connected_set`` and ``subset`` are given.

    References
    ----------
    .. [1] Trendelkamp-Schroer B, H Wu, F Paul and F Noe. 2015:
        Reversible Markov models of molecular kinetics: Estimation and
        uncertainty. in preparation.

    """
    self._assert_counted_at_lag()

    # The two selectors are alternatives; reject the ambiguous call early.
    if not (subset is None or connected_set is None):
        raise ValueError("Can't set both connected_set and subset.")

    if subset is not None:
        # Explicit subset: validate it before slicing.
        self._assert_subset(subset)
        return submatrix(self._C, subset)

    if connected_set is not None:
        # Connected set addressed by its index in the stored list.
        return submatrix(self._C, self._connected_sets[connected_set])

    # Neither selector given: the full matrix is wanted.
    return self._C
def effective_count_matrix(self):
    """Statistically uncorrelated transition counts within the active set.

    The effective count matrix is computed from the stored full discrete
    trajectories at the estimator's lag time and then restricted to the
    active set. It can be used for Bayesian estimation or error
    perturbation.

    Returns
    -------
    Ceff : matrix
        Effective count matrix restricted to ``self.active_set``.

    References
    ----------
    [1] Noe, F. (2015) Statistical inefficiency of Markov model count matrices
        http://publications.mi.fu-berlin.de/1699/1/autocorrelation_counts.pdf

    """
    self._check_is_estimated()
    # Effective counts on the full state space first ...
    full_counts = msmest.effective_count_matrix(self._dtrajs_full, self.lag)
    # ... then keep only the rows/columns of the active set.
    from pyerna.util.linalg import submatrix
    return submatrix(full_counts, self.active_set)
def _estimate(self, dtrajs):
    """Estimate the MSM from discrete trajectories via OOM components.

    Parameters
    ----------
    dtrajs : list of discrete trajectories
        Each trajectory is sliced as ``traj[:-self.lag]`` for counting and
        bootstrapping; the unsliced trajectories are used for the two-step
        count matrix and are stored on the estimator.

    Returns
    -------
    self
        This estimator, after ``set_model_params`` has been called with
        the estimated transition matrix.

    Raises
    ------
    NotImplementedError
        If ``self.core_set`` is not None, or if ``self.connectivity`` is
        not ``'largest'``.
    RuntimeError
        If the active set is empty.

    Notes
    -----
    ``self._is_estimated`` is set to True before the empty-active-set
    check, so it stays True when that check (or any later step) raises.

    """
    if self.core_set is not None:
        raise NotImplementedError(
            'Core set MSMs currently not compatible with {}.'.format(
                self.__class__.__name__))

    # remove last lag steps from dtrajs:
    dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

    # get trajectory counts. This sets _C_full and _nstates_full
    dtrajstats = self._get_dtraj_stats(dtrajs_lag)
    self._C_full = dtrajstats.count_matrix()  # full count matrix
    self._nstates_full = self._C_full.shape[0]  # number of states

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        self.active_set = dtrajstats.largest_connected_set
    else:
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
        )

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    # is estimated
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # computed derived quantities
    # back-mapping from full to lcs: -1 marks states outside the active set,
    # active states get their position within the active set.
    self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # Estimate transition matrix
    # NOTE(review): the else branch below looks unreachable, because the
    # identical connectivity check above already raises for other values.
    if self.connectivity == 'largest':
        # Re-sampling: obtain mean and standard deviation of the singular
        # values, either from the effective count matrix or by re-sampling
        # whole trajectories.
        if self.rank_Ct == 'bootstrap_counts':
            Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag)
            from pyerna.util.linalg import submatrix
            Ceff = submatrix(Ceff_full, self.active_set)
            smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
        else:
            smean, sdev = bootstrapping_dtrajs(dtrajs_lag,
                                               self.lag,
                                               self._nstates_full,
                                               nbs=self.nbs,
                                               active_set=self._active_set)
        # Estimate two step count matrices (uses the untruncated dtrajs):
        C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
        # Rank decision:
        rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
        # Estimate OOM components (the full count matrix is passed dense):
        Xi, omega, sigma, l = oom_components(self._C_full.toarray(),
                                             C2t,
                                             rank_ind=rank_ind,
                                             lcc=self.active_set)
        # Compute transition matrix:
        P, lcc_new = equilibrium_transition_matrix(
            Xi, omega, sigma, reversible=self.reversible)
    else:
        raise NotImplementedError(
            'OOM based MSM estimation is only implemented for connectivity=\'largest\'.'
        )

    # Update active set and derived quantities: lcc_new indexes into the
    # current active set, so a smaller lcc_new shrinks the active set.
    if lcc_new.size < self._nstates:
        self._active_set = self._active_set[lcc_new]
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(
            len(self.active_set))
        warnings.warn(
            "Caution: Re-estimation of count matrix resulted in reduction of the active set."
        )

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    # Keep the OOM quantities on the estimator.
    self._Xi = Xi
    self._omega = omega
    self._sigma = sigma
    self._eigenvalues_OOM = l
    self._rank_ind = rank_ind
    self._oom_rank = self._sigma.size
    self._C2t = C2t
    self.set_model_params(P=P,
                          pi=None,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))
    return self
def bootstrapping_dtrajs(dtrajs, lag, N_full, nbs=10000, active_set=None):
    """Perform trajectory based re-sampling of count-matrix singular values.

    For each of ``nbs`` bootstrap samples, ``len(dtrajs)`` trajectories are
    drawn with replacement, their per-trajectory count matrices at the
    given lag are summed, and the singular values of the (optionally
    restricted) sum are computed.

    Parameters
    ----------
    dtrajs : list of discrete trajectories
    lag : int
        lag time, must be at least 1
    N_full : int
        Number of states in discrete trajectories.
    nbs : int, optional
        Number of bootstrapping samples
    active_set : array-like of int or None, optional
        Indices of active set, all count matrices will be restricted
        to active set.

    Returns
    -------
    smean : ndarray(N,)
        mean values of singular values
    sdev : ndarray(N,)
        standard deviations of singular values

    Raises
    ------
    ValueError
        If ``lag`` is smaller than 1 or ``dtrajs`` is empty.

    """
    # A non-positive lag would make traj[:-lag] / traj[lag:] pair up
    # unrelated frames (or frames of different length) instead of failing.
    if lag < 1:
        raise ValueError(
            'lag must be a positive integer, got {}.'.format(lag))
    # Get the number of simulations:
    Q = len(dtrajs)
    if Q == 0:
        raise ValueError('dtrajs must contain at least one trajectory.')
    # Get the number of states in the active set:
    if active_set is not None:
        active_set = np.asarray(active_set)
        N = active_set.size
        # Imported once here rather than on every bootstrap iteration.
        from pyerna.util.linalg import submatrix
    else:
        N = N_full

    # Build up a matrix of count matrices for each simulation. Size is Q*N^2.
    # Row q holds the flattened count matrix of trajectory q; the column
    # index of a transition i -> j is N_full * i + j.
    traj_ind = []
    state1 = []
    state2 = []
    for q, traj in enumerate(dtrajs):
        traj = np.asarray(traj)
        start = traj[:-lag]
        # Integer row indices: sparse index arrays should not be floats.
        traj_ind.append(np.full(start.size, q, dtype=int))
        state1.append(start)
        state2.append(traj[lag:])
    traj_inds = np.concatenate(traj_ind)
    pairs = N_full * np.concatenate(state1) + np.concatenate(state2)
    data = np.ones(pairs.size)
    # Duplicate (row, column) entries are summed on conversion to CSR,
    # which turns the list of transitions into counts.
    Ct_traj = scipy.sparse.coo_matrix((data, (traj_inds, pairs)),
                                      shape=(Q, N_full * N_full))
    Ct_traj = Ct_traj.tocsr()

    # Perform re-sampling:
    svals = np.zeros((nbs, N))
    for s in range(nbs):
        # Choose selection:
        sel = np.random.choice(Q, Q, replace=True)
        # Compute count matrix for selection:
        Ct_sel = Ct_traj[sel, :].sum(axis=0)
        Ct_sel = np.asarray(Ct_sel).reshape((N_full, N_full))
        if active_set is not None:
            Ct_sel = submatrix(Ct_sel, active_set)
        svals[s, :] = scl.svdvals(Ct_sel)

    # Compute mean and uncertainties:
    smean = np.mean(svals, axis=0)
    sdev = np.std(svals, axis=0)
    return smean, sdev
def count_lagged(self, lag, count_mode='sliding', mincount_connectivity='1/n',
                 show_progress=True, n_jobs=None, name='', core_set=None,
                 milestoning_method='last_core'):
    r""" Counts transitions at given lag time

    Stores the count matrix in ``self._C`` and derives the connected sets,
    their sizes, the largest connected set and the full-to-lcs mapping.

    Parameters
    ----------
    lag : int
        lagtime in trajectory steps
    count_mode : str, optional, default='sliding'
        mode to obtain count matrices from discrete trajectories
        (case-insensitive). Should be one of:

        * 'sliding' : A trajectory of length T will have :math:`T-\tau`
          counts at time indexes

          .. math::
              (0 \rightarrow \tau), (1 \rightarrow \tau+1), ...,
              (T-\tau-1 \rightarrow T-1)

        * 'effective' : Uses an estimate of the transition counts that are
          statistically uncorrelated. Recommended when used with a
          Bayesian MSM.
        * 'sample' : A trajectory of length T will have :math:`T / \tau`
          counts at time indexes

          .. math::
              (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ...,
              (((T/\tau)-1) \tau \rightarrow T)

    mincount_connectivity : float or '1/n', optional, default='1/n'
        The string '1/n' is replaced by one over the number of rows of the
        count matrix. If the resulting value is positive, connected sets
        are computed with ``self._compute_connected_sets`` using this
        value; otherwise ``msmest.connected_sets`` is used.
    show_progress : bool, default=True
        show the progress for the expensive effective count mode
        computation.
    n_jobs : int or None
        Only used in 'effective' mode when the effective count matrix
        routine accepts a callback; ``None`` selects the value returned by
        ``get_n_jobs()``.
    name : str, optional, default=''
        Prefix of the progress description in 'effective' mode.
    core_set : object or None, optional, default=None
        Only tested against ``None`` here: a non-None value enables
        milestoning for 'sliding' and 'sample' counting and is rejected
        for 'effective' counting.
    milestoning_method : str, optional, default='last_core'
        Only 'last_core' is implemented: frames equal to -1 are assigned
        to the last visited core. This modifies ``self._dtrajs`` in place.

    Raises
    ------
    NotImplementedError
        If milestoning is requested with an unknown ``milestoning_method``.
    RuntimeError
        If ``core_set`` is given together with 'effective' counting.
    ValueError
        If ``count_mode`` is unknown.

    """
    # store lag time
    self._lag = lag
    # Compute count matrix
    count_mode = count_mode.lower()
    if core_set is not None and count_mode in ('sliding', 'sample'):
        if milestoning_method == 'last_core':
            # assign -1 frames to last visited core
            for d in self._dtrajs:
                # np.roll wraps around, so a leading -1 has no
                # predecessor frame to copy from.
                assert d[0] != -1
                # Each pass copies the preceding frame into every -1
                # frame; runs of -1 shrink by one per pass.
                while -1 in d:
                    mask = (d == -1)
                    d[mask] = d[np.roll(mask, -1)]
            self._C = msmest.count_matrix(self._dtrajs, lag,
                                          sliding=count_mode == 'sliding')
        else:
            raise NotImplementedError(
                'Milestoning method {} not implemented.'.format(
                    milestoning_method))
    elif count_mode == 'sliding':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=True)
    elif count_mode == 'sample':
        self._C = msmest.count_matrix(self._dtrajs, lag, sliding=False)
    elif count_mode == 'effective':
        if core_set is not None:
            raise RuntimeError(
                'Cannot estimate core set MSM with effective counting.')
        from pyerna.util.reflection import getargspec_no_self
        argspec = getargspec_no_self(msmest.effective_count_matrix)
        kw = {}
        from pyerna.util.contexts import nullcontext
        ctx = nullcontext()
        # msmtools effective cmatrix ready for multiprocessing?
        if 'callback' in argspec.args:
            from pyerna._base.progress import ProgressReporter
            from pyerna._base.parallel import get_n_jobs

            kw['n_jobs'] = get_n_jobs() if n_jobs is None else n_jobs

            if show_progress:
                pg = ProgressReporter()
                # this is a fast operation; it is only needed to size the
                # progress bar by the number of non-zero counts
                C_temp = msmest.count_matrix(self._dtrajs, lag, sliding=True)
                pg.register(
                    C_temp.nnz,
                    '{}: compute stat. inefficiencies'.format(name),
                    stage=0)
                del C_temp
                kw['callback'] = pg.update
                ctx = pg.context(stage=0)
        with ctx:
            self._C = msmest.effective_count_matrix(self._dtrajs, lag, **kw)
    else:
        raise ValueError('Count mode ' + count_mode + ' is unknown.')

    # store mincount_connectivity
    # (isinstance guard: comparing an array-like value against a string
    # must not be attempted element-wise)
    if isinstance(mincount_connectivity, str) and mincount_connectivity == '1/n':
        mincount_connectivity = 1.0 / np.shape(self._C)[0]
    self._mincount_connectivity = mincount_connectivity

    # Compute reversibly connected sets
    if self._mincount_connectivity > 0:
        self._connected_sets = self._compute_connected_sets(
            self._C, mincount_connectivity=self._mincount_connectivity)
    else:
        self._connected_sets = msmest.connected_sets(self._C)

    # set sizes and count matrices on reversibly connected sets
    self._connected_set_sizes = np.array(
        [len(cset) for cset in self._connected_sets], dtype=float)
    # Plain ``object`` dtype: the ``np.object`` alias no longer exists in
    # current NumPy. The entries are allocated but not filled here.
    self._C_sub = np.empty(len(self._connected_sets), dtype=object)

    # largest connected set
    self._lcs = self._connected_sets[0]

    # if lcs has no counts, make lcs empty
    if submatrix(self._C, self._lcs).sum() == 0:
        self._lcs = np.array([], dtype=int)

    # mapping from full to lcs: -1 for states outside the lcs
    self._full2lcs = -1 * np.ones((self._nstates), dtype=int)
    self._full2lcs[self._lcs] = np.arange(len(self._lcs))

    # remember that this function was called
    self._counted_at_lag = True