def _get_dtraj_stats(self, dtrajs):
    """Return lag-counted statistics for the given discrete trajectories.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`DiscreteTrajectoryStats <pyemma.msm.estimators._dtraj_stats.DiscreteTrajectoryStats>`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory. A statistics object
        that is passed in directly is reused instead of being rebuilt.

    Returns
    -------
    dtrajstats
        The statistics object after ``count_lagged`` has been called on it
        with this estimator's lag and counting settings.
    """
    reuse_given = isinstance(dtrajs, _DiscreteTrajectoryStats)
    stats = dtrajs if reuse_given else _DiscreteTrajectoryStats(dtrajs)

    # The dense-size warning is only issued for statistics built here.
    if not reuse_given and stats.nstates > 4000 and not self.sparse:
        self.logger.warning(
            'Building a dense MSM with {nstates} states. This can be '
            'inefficient or unfeasible in terms of both runtime and memory consumption. '
            'Consider using sparse=True.'.format(nstates=stats.nstates)
        )

    # n_jobs / show_progress are optional on the estimator, hence getattr
    # with a fallback value.
    stats.count_lagged(
        self.lag,
        count_mode=self.count_mode,
        mincount_connectivity=self.mincount_connectivity,
        n_jobs=getattr(self, 'n_jobs', None),
        show_progress=getattr(self, 'show_progress', False),
        name=self.name,
    )

    return stats
def _get_dtraj_stats(self, dtrajs):
    """Compute raw trajectory counts, rewriting to core sets when needed.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`DiscreteTrajectoryStats <pyemma.msm.estimators._dtraj_stats.DiscreteTrajectoryStats>`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory. A statistics object
        that is passed in directly is reused; none of the core-set handling
        below is applied to it.

    Returns
    -------
    dtrajstats
        The statistics object after ``count_lagged`` has been called on it.

    Notes
    -----
    For raw trajectories this method writes to the estimator:
    ``self._dtrajs_full`` always; ``self._dtrajs_original``,
    ``self._dtrajs_milestone_counting_offsets`` and ``self.n_cores`` when a
    core set is in effect; and ``self.core_set`` itself when it was ``None``
    and the trajectories contain the unassigned marker ``-1``.
    """
    # harvest discrete statistics
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        dtrajstats = dtrajs
    else:
        if any(-1 in d for d in dtrajs):
            # _np.unique already returns sorted values, so no extra sort is
            # needed. The unassigned marker is removed by value rather than
            # by position, so only -1 is ever excluded.
            observed_states = _np.unique(_np.concatenate(dtrajs))
            observed_states = observed_states[observed_states != -1]

            if self.core_set is None:
                self.core_set = observed_states
                self.logger.warning('Empty core set while unassigned states (-1) in discrete trajectory. '
                                    'Defining core set automatically; check correctness by calling self.core_set.')
            elif set(observed_states) != set(self.core_set):
                self.logger.warning('dtraj contains states that are not in core set definition. '
                                    'These states will be treated as unassigned.')

        if self.core_set is not None:
            # keep the caller's trajectories; the rewrite is done out of place
            self._dtrajs_original = dtrajs
            from pyemma.util.discrete_trajectories import rewrite_dtrajs_to_core_sets
            self._dtrajs_full, self._dtrajs_milestone_counting_offsets, self.n_cores = \
                rewrite_dtrajs_to_core_sets(dtrajs, core_set=self.core_set, in_place=False)
        else:
            self._dtrajs_full = dtrajs

        # compute and store discrete trajectory statistics
        dtrajstats = _DiscreteTrajectoryStats(self._dtrajs_full)

        # check if this MSM seems too large to be dense
        if dtrajstats.nstates > 4000 and not self.sparse:
            self.logger.warning('Building a dense MSM with {nstates} states. This can be '
                                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                                'Consider using sparse=True.'.format(nstates=dtrajstats.nstates))

    # count lagged
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode,
                            mincount_connectivity=self.mincount_connectivity,
                            n_jobs=getattr(self, 'n_jobs', None),
                            show_progress=getattr(self, 'show_progress', False),
                            name=self.name,
                            core_set=self.core_set,
                            milestoning_method=self.milestoning_method)

    # for other statistics
    return dtrajstats
def _estimate(self, dtrajs):
    """Estimate the transition matrix from discrete trajectories.

    The method counts lagged transitions, selects the active set according
    to ``self.connectivity`` (and ``self.statdist_constraint`` if given),
    estimates the transition matrix on that set and stores the result via
    ``set_model_params``.

    Parameters
    ----------
    dtrajs
        Discrete trajectories; passed through ``ensure_dtraj_list`` first.

    Returns
    -------
    self

    Raises
    ------
    RuntimeError
        If the selected active set is empty.
    ValueError
        If ``connectivity == 'none'`` with ``reversible`` set and the active
        count matrix is not connected.
    NotImplementedError
        For any connectivity mode other than ``'largest'`` or ``'none'``.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    # Reuse ready-made statistics, otherwise build them and warn if the
    # resulting dense model would be very large.
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        stats = dtrajs
    else:
        stats = _DiscreteTrajectoryStats(dtrajs)
        if stats.nstates > 4000 and not self.sparse:
            self.logger.warning('Building a dense MSM with ' + str(stats.nstates) + ' states. This can be '
                                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                                'Consider using sparse=True.')

    stats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = stats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # Active set; this doubles as the mapping from active to full indices.
    if self.connectivity != 'largest':
        # for 'None' and 'all' all visited states are active
        self.active_set = stats.visited_set
    elif self.statdist_constraint is None:
        # statdist not given - full connectivity on all states
        self.active_set = stats.largest_connected_set
    else:
        self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

    # FIXME: setting is_estimated before so that we can start using the
    # parameters just set, but this is not clean!
    self._is_estimated = True

    # Nothing can be estimated without active states.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = stats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # Back-mapping from full to active indices; -1 marks inactive states.
    self._full2active = -_np.ones(stats.nstates, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # Restrict the stationary distribution to the active set and renormalize.
    statdist_active = None
    if self.statdist_constraint is not None:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()

    # Validate the connectivity mode, then estimate; both supported modes
    # use the same estimator call.
    if self.connectivity not in ('largest', 'none'):
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)
    if self.connectivity == 'none' and self.reversible and not msmest.is_connected(self._C_active):
        # reversible estimation needs a connected active set
        raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                         'because the set of all visited states is not reversibly connected')
    P = msmest.transition_matrix(
        self._C_active,
        reversible=self.reversible,
        mu=statdist_active,
        maxiter=self.maxiter,
        maxerr=self.maxerr,
    )

    # In dense mode the count matrices and P are converted to arrays, so
    # everything downstream runs on dense matrix algebra.
    if not self.sparse:
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. This estimator carries the estimated model parameters itself.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(
        P=P,
        pi=statdist_active,
        reversible=self.reversible,
        dt_model=self.timestep_traj.get_scaled(self.lag),
    )
    return self
def _estimate(self, dtrajs):
    """Run the OOM-based MSM estimation on discrete trajectories.

    Count statistics are taken from the trajectories with their last
    ``self.lag`` steps removed; the two-step count matrix uses the full
    trajectories. The resulting transition matrix and the OOM components
    are stored on this estimator.

    Parameters
    ----------
    dtrajs
        Discrete trajectories; passed through ``ensure_dtraj_list`` first.

    Returns
    -------
    self

    Raises
    ------
    NotImplementedError
        If ``self.connectivity`` is not ``'largest'``.
    RuntimeError
        If the largest connected set is empty.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    # Drop the last `lag` steps of every trajectory before counting.
    truncated = [traj[:-self.lag] for traj in dtrajs]
    stats = _DiscreteTrajectoryStats(truncated)

    if stats.nstates > 4000 and not self.sparse:
        self.logger.warning('Building a dense MSM with ' + str(stats.nstates) + ' states. This can be '
                            'inefficient or unfeasible in terms of both runtime and memory consumption. '
                            'Consider using sparse=True.')

    stats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = stats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # Active set; this doubles as the mapping from active to full indices.
    if self.connectivity != 'largest':
        raise NotImplementedError('OOM based MSM estimation is only implemented for connectivity=\'largest\'.')
    self.active_set = stats.largest_connected_set

    # FIXME: setting is_estimated before so that we can start using the
    # parameters just set, but this is not clean!
    self._is_estimated = True

    # Nothing can be estimated without active states.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = stats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # Back-mapping from full to active indices; -1 marks inactive states.
    self._full2active = -_np.ones(stats.nstates, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # Same connectivity check as above, kept at the point where the
    # original flow performs it.
    if self.connectivity != 'largest':
        raise NotImplementedError('OOM based MSM estimation is only implemented for connectivity=\'largest\'.')

    # Re-sampling: mean and deviation used for the rank decision below.
    if self.rank_Ct == 'bootstrap_counts':
        effective_full = msmest.effective_count_matrix(truncated, self.lag)
        from pyemma.util.linalg import submatrix
        effective_active = submatrix(effective_full, self.active_set)
        smean, sdev = bootstrapping_count_matrix(effective_active, nbs=self.nbs)
    else:
        smean, sdev = bootstrapping_dtrajs(truncated, self.lag, self._nstates_full,
                                           nbs=self.nbs, active_set=self._active_set)

    # Two-step count matrices come from the untruncated trajectories.
    C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)

    # Rank decision
    rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)

    # OOM components
    Xi, omega, sigma, oom_eigenvalues = oom_components(
        self._C_full.toarray(), C2t, rank_ind=rank_ind, lcc=self.active_set)

    # Transition matrix, plus the indices it is defined on
    P, lcc_new = equilibrium_transition_matrix(Xi, omega, sigma, reversible=self.reversible)

    # If the estimate covers fewer states than the active set, shrink the
    # active set and recompute everything derived from it.
    if lcc_new.size < self._nstates:
        self._active_set = self._active_set[lcc_new]
        self._C_active = stats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]
        self._full2active = -_np.ones(stats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))
        warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

    # In dense mode the count matrices are converted to arrays, so
    # everything downstream runs on dense matrix algebra.
    if not self.sparse:
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    # Done. This estimator carries the estimated model parameters itself.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self._Xi = Xi
    self._omega = omega
    self._sigma = sigma
    self._eigenvalues_OOM = oom_eigenvalues
    self._rank_ind = rank_ind
    self._oom_rank = self._sigma.size
    self._C2t = C2t
    self.set_model_params(
        P=P,
        pi=None,
        reversible=self.reversible,
        dt_model=self.timestep_traj.get_scaled(self.lag),
    )
    return self
def _estimate(self, dtrajs):
    """Estimate the transition matrix from discrete trajectories.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory.

    Returns
    -------
    self
        This estimator, with the model parameters set through
        ``set_model_params``.

    Raises
    ------
    ValueError
        If ``connectivity == 'none'`` with ``reversible`` set and the active
        count matrix is not connected.
    NotImplementedError
        For any connectivity mode other than ``'largest'`` or ``'none'``.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    # harvest discrete statistics
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        dtrajstats = dtrajs
    else:
        # compute and store discrete trajectory statistics
        dtrajstats = _DiscreteTrajectoryStats(dtrajs)
        # check if this MSM seems too large to be dense
        if dtrajstats.nstates > 4000 and not self.sparse:
            # Logger.warn is a deprecated alias; use Logger.warning.
            self.logger.warning(
                'Building a dense MSM with ' + str(dtrajstats.nstates) + ' states. This can be '
                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                'Consider using sparse=True.')

    # count lagged
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        if self.statdist_constraint is None:
            # statdist not given - full connectivity on all states
            self.active_set = dtrajstats.largest_connected_set
        else:
            # statdist given - simple connectivity on all nonzero probability states
            nz = _np.nonzero(self.statdist_constraint)[0]
            Cnz = dtrajstats.count_matrix(subset=nz)
            self.active_set = nz[msmest.largest_connected_set(Cnz, directed=False)]
    else:
        # for 'None' and 'all' all visited states are active
        self.active_set = dtrajstats.visited_set

    # FIXME: setting is_estimated before so that we can start using the
    # parameters just set, but this is not clean!
    self._is_estimated = True

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # back-mapping from full to active indices; -1 marks inactive states
    self._full2active = -1 * _np.ones((dtrajstats.nstates), dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set), dtype=int)

    # restrict stationary distribution to active set
    if self.statdist_constraint is None:
        statdist_active = None
    else:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # Estimate transition matrix
    if self.connectivity == 'largest':
        P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                     mu=statdist_active, maxiter=self.maxiter,
                                     maxerr=self.maxerr)
    elif self.connectivity == 'none':
        # reversible mode only possible if active set is connected
        # - in this case all visited states are connected and thus
        # this mode is identical to 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode \'none\', ' +
                'because the set of all visited states is not reversibly connected')
        P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                     mu=statdist_active, maxiter=self.maxiter,
                                     maxerr=self.maxerr)
    else:
        # Report the actual mode; the message previously contained the
        # literal text 'self.connectivity'.
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))
    return self