def oom(dtrajs, tau, order):
    """Estimate observable-operator-model components from discrete trajectories.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``ensure_dtraj_list``.
    tau : int
        Lag time passed on to ``cmatrix`` and ``two_step_cmatrix``.
    order : int
        Rank argument forwarded to ``pinv_cholcov``. The order that is
        actually used afterwards is the number of rows of the matrix
        returned by ``pinv_cholcov``.

    Returns
    -------
    dict
        ``'sigma'`` : ndarray(order, 1), ``'omega'`` : ndarray(1, order) and
        ``'Xi_set'`` : ndarray(nstates, order, order).
    """
    trajs = ensure_dtraj_list(dtrajs)

    # Normalised state histogram; the floor keeps 1/sqrt(.) below finite
    # for states that were never counted.
    weights = np.maximum(count_states(trajs), 1e-20).reshape(-1)
    weights /= weights.sum()

    # One-step counts, symmetrised and normalised to sum to one.
    corr = cmatrix(trajs, tau, sliding=True).toarray() + 0.0
    corr = corr + corr.T
    corr /= corr.sum()

    # Two-step counts: symmetrise every middle-state slice, then normalise
    # the whole stack jointly.
    corr_mem = two_step_cmatrix(trajs, tau) + 0.0
    corr_mem = corr_mem + corr_mem.transpose(0, 2, 1)
    corr_mem /= corr_mem.sum()

    nstates = weights.shape[0]
    scaling = np.diag(1 / np.sqrt(weights))
    pinv_R = pinv_cholcov(scaling.dot(corr).dot(scaling), order)
    order = pinv_R.shape[0]

    # pinv_R.dot(scaling) does not depend on the slice, compute it once.
    left = pinv_R.dot(scaling)
    Xi_set = np.empty((nstates, order, order))
    for idx in range(corr_mem.shape[0]):
        Xi_set[idx] = left.dot(corr_mem[idx]).dot(scaling).dot(pinv_R.T)

    omega = weights.reshape(1, -1).dot(scaling).dot(pinv_R.T)
    sigma = omega.reshape(-1, 1)
    return {'sigma': sigma, 'omega': omega, 'Xi_set': Xi_set}
def two_step_cmatrix(dtrajs, tau):
    """Count two-step transitions, grouped by the intermediate state.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``ensure_dtraj_list``.
    tau : int
        Lag time between consecutive frames of a counted triple.

    Returns
    -------
    ndarray(nstates, nstates, nstates)
        ``C[j, i, k]`` is the number of times the triple
        ``(traj[t], traj[t + tau], traj[t + 2 * tau]) == (i, j, k)``
        was observed, i.e. the first axis indexes the middle state.
    """
    n = number_of_states(dtrajs)
    counts = np.zeros((n, n, n))
    for traj in ensure_dtraj_list(dtrajs):
        # Every start frame that still has a frame 2*tau steps ahead.
        for start in range(traj.shape[0] - 2 * tau):
            first = traj[start]
            middle = traj[start + tau]
            last = traj[start + 2 * tau]
            counts[middle, first, last] += 1
    return counts
def _estimate(self, dtrajs):
    """Estimate one model per lag time and post-process the results.

    Lag times that were already estimated on the same data are skipped.
    If the data differs from the previous call, all earlier results are
    treated as stale and every lag time is estimated again.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``_types.ensure_dtraj_list``.

    Returns
    -------
    self or None
        ``self`` when every requested lag time had already been estimated.
        Otherwise the method ends after ``_postprocess_results`` without an
        explicit return value.
        NOTE(review): the two exit paths return different values -- confirm
        that callers do not depend on the return value.
    """
    ### PREPARE AND CHECK DATA
    # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # Detect whether the input changed since the previous estimation.
    current_hash = _hash_dtrajs(dtrajs)
    if self._estimated:
        assert hasattr(self, '_last_dtrajs_input_hash')
        if self._last_dtrajs_input_hash != current_hash:
            self.logger.warning("estimating from new data, discard all previously computed models.")
            self._estimated = False
    # Always remember the hash of the data we are about to use. Without this
    # a stale hash survives a data change and every later call with the same
    # (new) data would discard its models again.
    self._last_dtrajs_input_hash = current_hash

    self._trajlengths = np.fromiter((len(traj) for traj in dtrajs), dtype=int, count=len(dtrajs))
    maxlength = np.max(self._trajlengths)

    # set lag times by data if not yet set
    if self._lags is None:
        maxlag = 0.5 * np.sum(self._trajlengths) / float(len(self._trajlengths))
        self._lags = _generate_lags(maxlag, 1.5)

    # check if some lag times are forbidden.
    if np.max(self._lags) >= maxlength:
        Ifit = np.where(self._lags < maxlength)[0]
        Inofit = np.where(self._lags >= maxlength)[0]
        self.logger.warning('Ignoring lag times that exceed the longest trajectory: %s', self._lags[Inofit])
        self._lags = self._lags[Ifit]

    ### RUN ESTIMATION
    if self._estimated:
        # we already had run an estimation, determine which lag times we need to compute
        # TODO: this will re-evaluate problematic lag times, wont it?
        lags = sorted(set(self._lags).difference(self._last_lags))
        if not lags:
            self.logger.info("All lag times already estimated.")
            return self
        self.logger.info("Running estimating for not yet estimated lags times: %s", lags)
    else:
        lags = self._lags

    # construct all parameter sets for the estimator
    param_sets = tuple(param_grid({'lag': lags}))

    # the scan reports progress through this object, so silence the
    # per-model progress output of sampled estimators
    if isinstance(self.estimator, SampledModel):
        self.estimator.show_progress = False

    # run estimation on all lag times
    models, estimators = estimate_param_scan(self.estimator, dtrajs, param_sets, failfast=False,
                                             return_estimators=True, n_jobs=self.n_jobs,
                                             progress_reporter=self)
    self._estimators = estimators

    self._postprocess_results(models)
def _estimate(self, dtrajs):
    """Fit a maximum-likelihood discrete HMM and store the result on ``self``.

    Steps: validate the lag time, resolve ``stride='effective'`` into a
    number, build an initial HMM according to ``self.msm_init``, run the
    deeptime ``MaximumLikelihoodHMM`` estimator and finally restrict the
    model through ``self.submodel``.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``_types.ensure_dtraj_list``.

    Returns
    -------
    The return value of ``self.submodel(..., inplace=True)``.

    Raises
    ------
    ValueError
        If the lag time is not shorter than the longest trajectory, or if
        ``self.msm_init`` is not a recognised option.
    """
    # bring the input into list form
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # ---- lag time checks ----
    trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
    if self.lag >= _np.max(trajlengths):
        raise ValueError('Illegal lag time ' + str(self.lag) + ' exceeds longest trajectory length')
    mean_length = _np.mean(trajlengths)
    if self.lag > mean_length:
        self.logger.warning('Lag time ' + str(self.lag) + ' is on the order of mean trajectory length '
                            + str(mean_length) + '. It is recommended to fit four lag times in each '
                            + 'trajectory. HMM might be inaccurate.')

    # ---- resolve the stride ----
    if self.stride == 'effective':
        # Start from stride = lag (lag sampling); there is currently no better
        # theory for how many uncorrelated counts can be made.
        self.stride = self.lag
        # quick estimate from the spectrum of a non-reversible MSM
        from pyemma.msm import estimate_markov_model
        msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                       connectivity='largest', dt_traj=self.timestep_traj)
        # With more MSM states than hidden states, the first neglected
        # timescale serves as an estimate of the decorrelation time.
        if msm_nr.nstates > self.nstates:
            # the MSM is non-reversible, so silence the ImaginaryEigenValueWarning
            import warnings
            with warnings.catch_warnings():
                warnings.filterwarnings('ignore', category=ImaginaryEigenValueWarning,
                                        module='deeptime.markov.tools.analysis.dense.decomposition')
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
            # take the smaller of the two pessimistic estimates
            self.stride = int(min(self.lag, 2 * corrtime))

    # ---- lagged and strided data ----
    from deeptime.markov import compute_dtrajs_effective
    dtrajs_lagged_strided = compute_dtrajs_effective(dtrajs, self.lag, n_states=-1, stride=self.stride)

    # ---- observation set ----
    observe_subset = 'nonempty' if self.observe_nonempty else None

    # ---- initial HMM ----
    from deeptime.markov.hmm import init
    from pyemma.msm.estimators import MaximumLikelihoodMSM
    from pyemma.msm.estimators import OOMReweightedMSM
    # keyword arguments shared by all initialisation routes
    common_init = dict(n_hidden_states=self.nstates, reversible=self.reversible,
                       stationary=True, separate_symbols=self.separate)
    if self.msm_init == 'largest-strong':
        hmm_init = init.discrete.metastable_from_data(dtrajs, lagtime=self.lag, stride=self.stride,
                                                      mode='largest-regularized', **common_init)
    elif self.msm_init == 'all':
        hmm_init = init.discrete.metastable_from_data(dtrajs, lagtime=self.lag, stride=self.stride,
                                                      mode='all-regularized', **common_init)
    elif isinstance(self.msm_init, (MaximumLikelihoodMSM, OOMReweightedMSM)):
        # an estimated MSM was supplied as the starting point
        seed_counts = TransitionCountModel(self.msm_init.count_matrix_active)
        seed_msm = MarkovStateModel(transition_matrix=self.msm_init.P, count_model=seed_counts)
        hmm_init = init.discrete.metastable_from_msm(seed_msm, **common_init)
        observe_subset = self.msm_init.active_set  # override observe_subset.
    else:
        raise ValueError('Unknown MSM initialization option: ' + str(self.msm_init))

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------
    from deeptime.markov.hmm import MaximumLikelihoodHMM
    hmm_est = MaximumLikelihoodHMM(hmm_init, lagtime=self.lag, stride=self.stride,
                                   reversible=self.reversible, stationary=self.stationary,
                                   accuracy=self.accuracy, maxit=self.maxit)
    # run EM on the unlagged input
    hmm_est.fit(dtrajs)
    self.hmm = hmm_est.fetch_model()

    # model parameters
    self.initial_distribution = self.hmm.initial_distribution
    transition_matrix = self.hmm.transition_model.transition_matrix
    observation_probabilities = self.hmm.output_probabilities

    # estimation by-products
    self.likelihoods = self.hmm.likelihoods  # Likelihood history
    self.likelihood = self.likelihoods[-1]
    self.hidden_state_probabilities = self.hmm.state_probabilities  # gamma variables
    self.hidden_state_trajectories = self.hmm.hidden_state_trajectories  # Viterbi path
    self.count_matrix = self.hmm.count_model.count_matrix  # hidden count matrix
    self.initial_count = self.hmm.initial_count  # hidden init count
    self._active_set = _np.arange(self.nstates)

    # TODO: it can happen that we loose states due to striding. Should we lift the output probabilities afterwards?
    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged_strided
    self._nstates_obs_full = number_of_states(dtrajs)
    self._nstates_obs = number_of_states(dtrajs_lagged_strided)
    self._observable_set = _np.arange(self._nstates_obs)
    self._dtrajs_obs = dtrajs
    self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    # TODO: perhaps remove connectivity and just rely on .submodel()?
    # translate the connectivity option into a state subset
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'
    else:
        states_subset = None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity,
                         inplace=True)
def timescales_hmsm(dtrajs, nstates, lags=None, nits=None, reversible=True, connected=True, errors=None,
                    nsamples=100, n_jobs=1, show_progress=True):
    r""" Calculate implied timescales from Hidden Markov state models estimated at a series of lag times.

    Warning: this can be slow!

    Parameters
    ----------
    dtrajs : array-like or list of array-likes
        discrete trajectories

    nstates : int
        number of hidden states

    lags : array-like of integers (optional)
        integer lag times at which the implied timescales will be calculated

    nits : int (optional)
        number of implied timescales to be computed. Will compute less if the number of states are smaller. None means
        the number of timescales will be determined automatically.

    reversible : boolean (optional)
        Estimate transition matrix reversibly (True) or nonreversibly (False)

    connected : boolean (optional)
        If true compute the connected set before transition matrix estimation at each lag separately

    errors : None | 'bayes'
        Specifies whether to compute statistical uncertainties (by default not), and which algorithm to use if yes.
        The only option is currently 'bayes'. This algorithm is much faster than MSM-based error calculation because
        the involved matrices are much smaller.

    nsamples : int
        Number of approximately independent HMSM samples generated for each lag time for uncertainty quantification.
        Only used if errors is not None.

    n_jobs : int, default=1
        how many subprocesses to start to estimate the models for each lag time.

    show_progress : bool, default=True
        Show progressbars for calculation?

    Returns
    -------
    itsobj : :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object

    Raises
    ------
    NotImplementedError
        If ``errors`` is neither None nor 'bayes'. Raised before any data is processed.

    See also
    --------
    ImpliedTimescales
        The object returned by this function.
    pyemma.plots.plot_implied_timescales
        Plotting function for the :class:`ImpliedTimescales <pyemma.msm.ImpliedTimescales>` object

    Example
    -------
    >>> from pyemma import msm
    >>> import numpy as np
    >>> np.set_printoptions(precision=3)
    >>> dtraj = [0,1,1,2,2,2,1,2,2,2,1,0,0,1,1,1,2,2,1,1,2,1,1,0,0,0,1,1,2,2,1]   # mini-trajectory
    >>> ts = msm.timescales_hmsm(dtraj, 2, [1,2,3,4,5])
    >>> print(ts.timescales) # doctest: +ELLIPSIS
    [[  1.691]
     [  7.537]
     [  1.919]
     [ 40.962]
     [ 11.527]]

    .. autoclass:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
            :attributes:

    References
    ----------
    Implied timescales as a lagtime-selection and MSM-validation approach were
    suggested in [1]_. Hidden Markov state model estimation is done here as
    described in [2]_. For uncertainty quantification we employ the Bayesian
    sampling algorithm described in [3]_.

    .. [1] Swope, W. C. and J. W. Pitera and F. Suits: Describing protein
        folding kinetics by molecular dynamics simulations: 1. Theory.
        J. Phys. Chem. B 108: 6571-6581 (2004)

    .. [2] F. Noe, H. Wu, J.-H. Prinz and N. Plattner: Projected and hidden
        Markov models for calculating kinetics and metastable states of
        complex molecules. J. Chem. Phys. 139, 184114 (2013)

    .. [3] J. D. Chodera et al: Bayesian hidden Markov model analysis of
        single-molecule force spectroscopy: Characterizing kinetics under
        measurement uncertainty. arXiv:1108.1430 (2011)

    """
    # Reject unsupported error estimators up front, before touching the data.
    if errors is not None and errors != 'bayes':
        raise NotImplementedError('Error estimation method ' + str(errors) + ' currently not implemented')

    # format data
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    connectivity = 'largest' if connected else 'none'

    # MLE or error estimation?
    if errors is None:
        estimator = _ML_HMSM(nstates=nstates, reversible=reversible, connectivity=connectivity)
    else:
        estimator = _Bayes_HMSM(nstates=nstates, reversible=reversible, connectivity=connectivity,
                                show_progress=show_progress, nsamples=nsamples)

    # go
    itsobj = _ImpliedTimescales(estimator, lags=lags, nits=nits, n_jobs=n_jobs,
                                show_progress=show_progress)
    itsobj.estimate(dtrajs)
    return itsobj
def _estimate(self, dtrajs):
    """Estimate an OOM-reweighted MSM from discrete trajectories.

    Counts lagged transitions, determines the active set, decides the OOM
    rank from bootstrapped singular values, computes the OOM components and
    derives the transition matrix from them. Results are stored on ``self``.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``ensure_dtraj_list``.

    Returns
    -------
    self

    Raises
    ------
    NotImplementedError
        If ``self.connectivity`` is not ``'largest'``.
    RuntimeError
        If the active set is empty.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)
    # every trajectory without its last `lag` frames
    dtrajs_lag = [traj[:-self.lag] for traj in dtrajs]

    # discrete trajectory statistics on the truncated data
    dtrajstats = _DiscreteTrajectoryStats(dtrajs_lag)
    # check if this MSM seems too large to be dense
    if dtrajstats.nstates > 4000 and not self.sparse:
        self.logger.warning('Building a dense MSM with ' + str(dtrajstats.nstates) + ' states. This can be '
                            'inefficient or unfeasible in terms of both runtime and memory consumption. '
                            'Consider using sparse=True.')

    # count lagged
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # Only the largest connected set is supported; everything below relies on it.
    if self.connectivity != 'largest':
        raise NotImplementedError('OOM based MSM estimation is only implemented for connectivity=\'largest\'.')
    # The active set doubles as the mapping from active to full indices.
    self.active_set = dtrajstats.largest_connected_set

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    def _sync_active_quantities():
        # active count matrix, number of active states and the
        # back-mapping from full to active indices (-1 = not active)
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]
        self._full2active = -1 * _np.ones((dtrajstats.nstates), dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

    _sync_active_quantities()

    # Re-sampling of singular values:
    if self.rank_Ct == 'bootstrap_counts':
        Ceff_full = msmest.effective_count_matrix(dtrajs_lag, self.lag)
        from pyemma.util.linalg import submatrix
        Ceff = submatrix(Ceff_full, self.active_set)
        smean, sdev = bootstrapping_count_matrix(Ceff, nbs=self.nbs)
    else:
        smean, sdev = bootstrapping_dtrajs(dtrajs_lag, self.lag, self._nstates_full, nbs=self.nbs,
                                           active_set=self._active_set)
    # Two-step count matrices are estimated from the untruncated data:
    C2t = twostep_count_matrix(dtrajs, self.lag, self._nstates_full)
    # Rank decision:
    rank_ind = rank_decision(smean, sdev, tol=self.tol_rank)
    # OOM components:
    Xi, omega, sigma, l = oom_components(self._C_full.toarray(), C2t, rank_ind=rank_ind,
                                         lcc=self.active_set)
    # Transition matrix:
    P, lcc_new = equilibrium_transition_matrix(Xi, omega, sigma, reversible=self.reversible)

    # The transition matrix may live on a smaller set; shrink the active set
    # and recompute everything derived from it.
    if lcc_new.size < self._nstates:
        self._active_set = self._active_set[lcc_new]
        _sync_active_quantities()
        warnings.warn("Caution: Re-estimation of count matrix resulted in reduction of the active set.")

    # continue sparse or dense?
    if not self.sparse:
        # Dense mode: from here on the count matrices, the transition matrix
        # and all derived properties use dense arrays and dense algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self._Xi = Xi
    self._omega = omega
    self._sigma = sigma
    self._eigenvalues_OOM = l
    self._rank_ind = rank_ind
    self._oom_rank = self._sigma.size
    self._C2t = C2t
    self.set_model_params(P=P, pi=None, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
def wham(ttrajs, dtrajs, bias, maxiter=100000, maxerr=1.0E-15, save_convergence_info=0, dt_traj='1 step'):
    r"""
    Weighted histogram analysis method

    Parameters
    ----------
    ttrajs : ndarray(T) of int, or list of ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,K-1 enumerating the thermodynamic states the trajectory is in at any time.
    dtrajs : ndarray(T) of int, or list of ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,n-1 enumerating the n Markov states or the bins the trajectory is in at any time.
    bias : ndarray(K, n)
        bias[j,i] is the bias energy for each discrete state i at thermodynamic state j.
    maxiter : int, optional, default=100000
        The maximum number of self-consistent iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    Returns
    -------
    sm : StationaryModel
        A stationary model which consists of thermodynamic quantities at all
        temperatures/thermodynamic states.

    Example
    -------
    **Example: Umbrella sampling**. Suppose we simulate in K umbrellas, centered at
    positions :math:`y_1,...,y_K` with bias energies

    .. math::
        b_k(x) = 0.5 * c_k * (x - y_k)^2 / kT

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 1 to K.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

        [ (1, 2, 2, 3, 2, ...),  (2, 4, 5, 4, 4, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

        [ (0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth. bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at
    all centers::

        [[b_0(y_0), b_0(y_1), ..., b_0(y_n)],
         [b_1(y_0), b_1(y_1), ..., b_1(y_n)],
         ...
         [b_K(y_0), b_K(y_1), ..., b_K(y_n)]]

    """
    # bring both inputs into list form; they must pair up one to one
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs)

    # one (T_i, 2) array per trajectory: column 0 = thermodynamic state,
    # column 1 = configuration state
    X = []
    for idx, ttraj in enumerate(ttrajs):
        dtraj = dtrajs[idx]
        assert len(ttraj) == len(dtraj)
        paired = _np.array([ttraj, dtraj]).T
        X.append(_np.ascontiguousarray(paired))

    # build WHAM
    from pyemma.thermo import WHAM
    wham_estimator = WHAM(bias, maxiter=maxiter, maxerr=maxerr,
                          save_convergence_info=save_convergence_info, dt_traj=dt_traj)
    # run estimation
    return wham_estimator.estimate(X)
def _estimate(self, dtrajs):
    """Sample hidden Markov state models with the Bayesian HMM sampler.

    The sampler is initialised either from a maximum-likelihood estimate
    computed here (``self.init_hmsm is None``) or from the attributes of a
    previously estimated ``self.init_hmsm``.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; brought into list form with ``ensure_dtraj_list``.

    Returns
    -------
    The return value of ``self.submodel(...)``.

    Raises
    ------
    UserWarning
        If ``init_hmsm`` has a different lag or number of states, or if the
        chosen stride observes a different set of microstates than
        ``init_hmsm`` did.
    NotImplementedError
        If ``init_hmsm`` was estimated on different data, or if a reversible
        sampler meets a disconnected count matrix.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
        # memorize the observation state for bhmm and reset
        # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
        default_connectivity = self.connectivity
        default_mincount_connectivity = self.mincount_connectivity
        default_observe_nonempty = self.observe_nonempty
        self.connectivity = None
        self.observe_nonempty = False
        self.mincount_connectivity = 0
        # NOTE(review): accuracy is overwritten permanently (it was never
        # restored in the original either) -- confirm this is intended.
        self.accuracy = 1e-2  # this is sufficient for an initial guess
        try:
            super(BayesianHMSM, self)._estimate(dtrajs)
        finally:
            # restore the user's settings even if the ML estimation failed
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
    else:  # if given another initialization, must copy its attributes
        copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                           'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                           'initial_count', 'initial_distribution', '_active_set']
        check_user_choices = ['lag', '_nstates']

        # check if nstates and lag are compatible
        for attr in check_user_choices:
            if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                  'incompatible lag or nstates.')

        if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
            raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                      'if applied to the same data.')

        # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the nonreversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                           connectivity='largest', dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
        if self.stride != self.init_hmsm.stride:
            dtrajs_lagged_strided = _lag_observations(dtrajs, self.lag, stride=self.stride)
            _nstates_obs = _number_of_states(dtrajs_lagged_strided, only_used=True)
            _nstates_obs_full = _number_of_states(dtrajs)

            if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                            _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                raise UserWarning('Choice of stride has excluded a different set of microstates than in ' +
                                  'init_hmsm. Set of observed microstates in time-lagged strided trajectories ' +
                                  'must match to the one used for init_hmsm estimation.')

            self._dtrajs_full = dtrajs
            self._dtrajs_lagged = dtrajs_lagged_strided
            self._nstates_obs_full = _nstates_obs_full
            self._nstates_obs = _nstates_obs
            self._observable_set = _np.arange(self._nstates_obs)
            self._dtrajs_obs = dtrajs
        else:
            # same stride: the data-derived attributes can be taken over as well
            copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full',
                                '_nstates_obs', '_observable_set', '_dtrajs_obs']

        # update self with estimates from init_hmsm
        self.__dict__.update(
            {k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

        # as mentioned in the docstring, take init_hmsm observed set observation probabilities
        self.observe_nonempty = False

        # update HMM Model
        self.update_model_params(P=self.init_hmsm.transition_matrix,
                                 pobs=self.init_hmsm.observation_probabilities,
                                 dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

    # check if we have a valid initial model
    import msmtools.estimation as msmest
    if self.reversible and not msmest.is_connected(self.count_matrix):
        raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                  + '\nwith reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                  + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                  + 'or shorter stride (to use more of the data), '
                                  + 'or using a lower value for mincount_connectivity.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    nstates_full = msmest.number_of_states(dtrajs)
    if self.nstates_obs < nstates_full:
        eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
        # fill active states
        B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = self.observation_probabilities

    # HMM sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    from bhmm import discrete_hmm, bayesian_hmm

    if self.init_hmsm is not None:
        hmm_mle = self.init_hmsm.hmm
    else:
        hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

    sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                               reversible=self.reversible, stationary=self.stationary,
                               p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                               store_hidden=self.store_hidden, call_back=call_back)

    if self.show_progress:
        self._progress_force_finish(stage=0)

    # Samples
    sample_inp = [(m.transition_matrix, m.stationary_distribution, m.output_probabilities)
                  for m in sampled_hmm.sampled_hmms]

    samples = []
    for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
        Bobs = pobs[:, self.observable_set]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

    # store results
    self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
    self.update_model_params(samples=samples)

    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # OBSERVATION SET
    if self.observe_nonempty:
        observe_subset = 'nonempty'
    else:
        observe_subset = None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity)
def _estimate(self, dtrajs):
    """Run the maximum-likelihood estimation, then sample transition matrices.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int)
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory.

    Returns
    -------
    self
        This estimator, with the sampled models stored through
        ``update_model_params(samples=...)``.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)
    # the superclass MLE provides the count matrix and the starting point
    _MLMSM._estimate(self, dtrajs)

    # transition matrix sampler
    from msmtools.estimation import tmatrix_sampler
    from math import sqrt
    if self.nsteps is None:
        # heuristic for number of steps to decorrelate
        self.nsteps = int(sqrt(self.nstates))

    # Sample from the same count matrix as the MLE; this is why 'effective'
    # is the default count mode.
    if self.statdist_constraint is None:
        start = {'T0': self.transition_matrix}
    else:
        # With a fixed stationary distribution the MLE cannot serve as T0;
        # pass the distribution on the active set and let the reversible-pi
        # sampler initialise itself.
        start = {'mu': self.pi}
    tsampler = tmatrix_sampler(self.count_matrix_active, reversible=self.reversible,
                               nsteps=self.nsteps, **start)

    self._progress_register(self.nsamples, description="Sampling MSMs", stage=0)

    if self.show_progress:
        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    sample_Ps, sample_mus = tsampler.sample(nsamples=self.nsamples, return_statdist=True,
                                            call_back=call_back)
    self._progress_force_finish(0)

    # wrap every sampled (P, pi) pair in a model object
    samples = [_MSM(sample_Ps[i], pi=sample_mus[i], reversible=self.reversible, dt_model=self.dt_model)
               for i in range(self.nsamples)]

    # update self model
    self.update_model_params(samples=samples)

    # done
    return self
def wham(ttrajs, dtrajs, bias, maxiter=100000, maxerr=1.0E-15, save_convergence_info=0, dt_traj='1 step'):
    r"""
    Weighted histogram analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory is
        in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : numpy.ndarray(shape=(num_therm_states, num_conf_states)) object
        bias_energies_full[j, i] is the bias energy in units of kT for each discrete state i
        at thermodynamic state j.
    maxiter : int, optional, default=100000
        The maximum number of self-consistent iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit.  Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'

    Returns
    -------
    sm : StationaryModel
        A stationary model which consists of thermodynamic quantities at all
        temperatures/thermodynamic states.

    Raises
    ------
    AssertionError
        If ttrajs and dtrajs differ in number, or if a pair of corresponding
        trajectories differs in length.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

    [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

    [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth.

    bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at all centers:

    .. math::
        \left(\begin{array}{cccc}
            b_0(y_0) &  b_0(y_1) &  ... &  b_0(y_{n-1}) \\
            b_1(y_0) &  b_1(y_1) &  ... &  b_1(y_{n-1}) \\
            ... \\
            b_{K-1}(y_0) &  b_{K-1}(y_1) &  ... &  b_{K-1}(y_{n-1})
        \end{array}\right)

    Let us try the above example:

    >>> from pyemma.thermo import wham
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1,0,0,0]), np.array([0,1,0,1,0,1,1,0,0,1])]
    >>> bias = np.array([[0.0, 0.0], [0.5, 1.0]])
    >>> wham_obj = wham(ttrajs, dtrajs, bias)
    >>> wham_obj.log_likelihood() # doctest: +ELLIPSIS
    -6.6...
    >>> wham_obj.state_counts # doctest: +SKIP
    array([[7, 3],
           [5, 5]])
    >>> wham_obj.stationary_distribution # doctest: +ELLIPSIS +REPORT_NDIFF
    array([ 0.5...,  0.4...])

    References
    ----------

    .. [1] Ferrenberg, A.M. and Swensen, R.H. 1988.
        New Monte Carlo Technique for Studying Phase Transitions.
        Phys. Rev. Lett. 23, 2635--2638

    .. [2] Kumar, S. et al 1992.
        The Weighted Histogram Analysis Method for Free-Energy Calculations on Biomolecules. I. The Method.
        J. Comp. Chem. 13, 1011--1021

    """
    # check trajectories
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs), 'ttrajs and dtrajs must contain the same number of trajectories'
    # Each thermodynamic trajectory must be as long as its configuration
    # trajectory. (The previous check compared the list lengths a second
    # time and could never catch a mismatch here.)
    for ttraj, dtraj in zip(ttrajs, dtrajs):
        assert len(ttraj) == len(dtraj), 'corresponding ttraj and dtraj must have the same length'
    # build WHAM
    from pyemma.thermo import WHAM
    wham_estimator = WHAM(bias, maxiter=maxiter, maxerr=maxerr,
                          save_convergence_info=save_convergence_info, dt_traj=dt_traj)
    # run estimation
    return wham_estimator.estimate((ttrajs, dtrajs))
def dtram(
        ttrajs, dtrajs, bias, lag, unbiased_state=None,
        count_mode='sliding', connectivity='largest',
        maxiter=10000, maxerr=1.0E-15, save_convergence_info=0, dt_traj='1 step',
        init=None, init_maxiter=10000, init_maxerr=1.0E-8):
    r"""
    Discrete transition-based reweighting analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory
        is in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : numpy.ndarray(shape=(num_therm_states, num_conf_states)) object
        bias[j, i] is the bias energy in units of kT for each discrete state i
        at thermodynamic state j.
    lag : int or list of int
        Integer lag time at which transitions are counted. Providing a list of lag times will
        trigger one estimation per lag time.
    unbiased_state : optional, default=None
        Forwarded, together with the estimated models, to ``_assign_unbiased_state_label``.
        Presumably the index of the unbiased thermodynamic state (None if there is none).
    count_mode : str, optional, default='sliding'
        Mode to obtain count matrices from discrete trajectories. Should be one of:

        * 'sliding' : a trajectory of length T will have :math:`T-\tau` counts at time indexes

          .. math::
              (0 \rightarrow \tau), (1 \rightarrow \tau+1), ..., (T-\tau-1 \rightarrow T-1)

        * 'sample' : a trajectory of length T will have :math:`T/\tau` counts at time indexes

          .. math::
              (0 \rightarrow \tau), (\tau \rightarrow 2 \tau), ..., ((T/\tau-1) \tau \rightarrow T)

        Currently only 'sliding' is supported.
    connectivity : str, optional, default='largest'
        Defines what should be considered a connected set in the joint space of conformations and
        thermodynamic ensembles. Currently only 'largest' is supported.
    maxiter : int, optional, default=10000
        The maximum number of dTRAM iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit. Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'
    init : str, optional, default=None
        Use a specific initialization for self-consistent iteration:

        | None:    use a hard-coded guess for free energies and Lagrangian multipliers
        | 'wham':  perform a short WHAM estimate to initialize the free energies
    init_maxiter : int, optional, default=10000
        The maximum number of self-consistent iterations during the initialization.
    init_maxerr : float, optional, default=1.0E-8
        Convergence criterion for the initialization.

    Returns
    -------
    memm : MEMM or list of MEMMs
        A multi-ensemble Markov state model (for each given lag time) which consists of stationary
        and kinetic quantities at all temperatures/thermodynamic states. A single object is
        returned when exactly one lag time was given, otherwise a list.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

    [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

    [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth. bias is a :math:`K \times n` matrix with all reduced bias energies evaluated at
    all centers:

    .. math::
        \left(\begin{array}{cccc}
            b_0(y_0) &  b_0(y_1) &  ... &  b_0(y_{n-1}) \\
            b_1(y_0) &  b_1(y_1) &  ... &  b_1(y_{n-1}) \\
            ... \\
            b_{K-1}(y_0) &  b_{K-1}(y_1) &  ... &  b_{K-1}(y_{n-1})
        \end{array}\right)

    Let us try the above example:

    >>> from pyemma.thermo import dtram
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1,0,0,0]), np.array([0,1,0,1,0,1,1,0,0,1])]
    >>> bias = np.array([[0.0, 0.0], [0.5, 1.0]])
    >>> dtram_obj = dtram(ttrajs, dtrajs, bias, 1)
    >>> dtram_obj.log_likelihood() # doctest: +ELLIPSIS
    -9.805...
    >>> dtram_obj.count_matrices # doctest: +SKIP
    array([[[5, 1],
            [1, 2]],
           [[1, 4],
            [3, 1]]], dtype=int32)
    >>> dtram_obj.stationary_distribution # doctest: +ELLIPSIS
    array([ 0.38...,  0.61...])

    References
    ----------

    .. [1] Wu, H. et al 2014
        Statistically optimal analysis of state-discretized trajectory data from multiple
        thermodynamic states
        J. Chem. Phys. 141, 214106

    """
    # Normalise both inputs to lists of arrays and make sure they are frame-aligned.
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    assert len(ttrajs) == len(dtrajs)
    for therm_traj, conf_traj in zip(ttrajs, dtrajs):
        assert len(therm_traj) == len(conf_traj)
    # A scalar lag becomes a one-element list, so a single loop covers both cases.
    lag_values = _np.asarray(lag, dtype=_np.intc).reshape((-1, )).tolist()
    # Imported lazily, exactly where the estimator class is needed.
    from pyemma.thermo import DTRAM
    estimator_options = dict(
        count_mode=count_mode,
        connectivity=connectivity,
        maxiter=maxiter,
        maxerr=maxerr,
        save_convergence_info=save_convergence_info,
        dt_traj=dt_traj,
        init=init,
        init_maxiter=init_maxiter,
        init_maxerr=init_maxerr)
    # One fresh estimator per lag time, estimated in the order the lags were given.
    fitted = []
    for current_lag in lag_values:
        estimator = DTRAM(bias, current_lag, **estimator_options)
        fitted.append(estimator.estimate((ttrajs, dtrajs)))
    _assign_unbiased_state_label(fitted, unbiased_state)
    # Unwrap the list when only a single lag time was requested.
    return fitted[0] if len(fitted) == 1 else fitted
def tram(
        ttrajs, dtrajs, bias, lag, unbiased_state=None,
        count_mode='sliding', connectivity='summed_count_matrix',
        maxiter=10000, maxerr=1.0E-15, save_convergence_info=0, dt_traj='1 step',
        connectivity_factor=1.0, nn=None, direct_space=False, N_dtram_accelerations=0, callback=None,
        init='mbar', init_maxiter=10000, init_maxerr=1e-8):
    r"""
    Transition-based reweighting analysis method

    Parameters
    ----------
    ttrajs : numpy.ndarray(T), or list of numpy.ndarray(T_i)
        A single discrete trajectory or a list of discrete trajectories. The integers are
        indexes in 0,...,num_therm_states-1 enumerating the thermodynamic states the trajectory
        is in at any time.
    dtrajs : numpy.ndarray(T) of int, or list of numpy.ndarray(T_i) of int
        A single discrete trajectory or a list of discrete trajectories. The integers are indexes
        in 0,...,num_conf_states-1 enumerating the num_conf_states Markov states or the bins the
        trajectory is in at any time.
    bias : list of numpy.ndarray(T_i, num_therm_states)
        Reduced bias energy trajectories, one per trajectory in ttrajs/dtrajs. For every
        simulation frame seen in trajectory i and time step t, bias[i][t, k] is the reduced bias
        energy of that frame evaluated in the k'th thermodynamic state (i.e. at the k'th
        umbrella/Hamiltonian/temperature). ``len(bias)`` must equal ``len(ttrajs)`` and every
        array must have as many rows as the matching trajectory has frames.
    lag : int or list of int
        Integer lag time at which transitions are counted. Providing a list of lag times will
        trigger one estimation per lag time.
    unbiased_state : optional, default=None
        Forwarded, together with the estimated models, to ``_assign_unbiased_state_label``.
        Presumably the index of the unbiased thermodynamic state (None if there is none).
    count_mode : str, optional, default='sliding'
        Mode to obtain count matrices from discrete trajectories; forwarded to the TRAM
        estimator.
    connectivity : str, optional, default='summed_count_matrix'
        One of 'summed_count_matrix', 'strong_in_every_ensemble', 'neighbors',
        'post_hoc_RE' or 'BAR_variance'.
        Defines what should be considered a connected set in the joint space
        of conformations and thermodynamic ensembles.
        For details see thermotools.cset.compute_csets_TRAM.
    maxiter : int, optional, default=10000
        The maximum number of TRAM iterations before the estimator exits unsuccessfully.
    maxerr : float, optional, default=1e-15
        Convergence criterion based on the maximal free energy change in a self-consistent
        iteration step.
    save_convergence_info : int, optional, default=0
        Every save_convergence_info iteration steps, store the actual increment
        and the actual loglikelihood; 0 means no storage.
    dt_traj : str, optional, default='1 step'
        Description of the physical time corresponding to the lag. May be used by analysis
        algorithms such as plotting tools to pretty-print the axes. By default '1 step', i.e.
        there is no physical time unit. Specify by a number, whitespace and unit. Permitted
        units are (* is an arbitrary string):

        |  'fs',   'femtosecond*'
        |  'ps',   'picosecond*'
        |  'ns',   'nanosecond*'
        |  'us',   'microsecond*'
        |  'ms',   'millisecond*'
        |  's',    'second*'
    connectivity_factor : float, optional, default=1.0
        Only needed if connectivity='post_hoc_RE' or 'BAR_variance'. Weakens the connectivity
        requirement, see thermotools.cset.compute_csets_TRAM.
    nn : int, optional, default=None
        Only needed if connectivity='neighbors'
        See thermotools.cset.compute_csets_TRAM.
    direct_space : bool, optional, default=False
        Whether to perform the self-consistent iteration with Boltzmann factors
        (direct space) or free energies (log-space). When analyzing data from
        multi-temperature simulations, direct-space is not recommended.
    N_dtram_accelerations : int, optional, default=0
        Convergence of TRAM can be speeded up by interleaving the updates
        in the self-consistent iteration with a dTRAM-like update step.
        N_dtram_accelerations says how many times the dTRAM-like update
        step should be applied in every iteration of the TRAM equations.
        Currently this is only effective if direct_space=True.
    callback : optional, default=None
        Forwarded unchanged to the TRAM estimator.
    init : str, optional, default='mbar'
        Initialization used for the self-consistent iteration; forwarded unchanged to the
        TRAM estimator (see its documentation for the supported values).
    init_maxiter : int, optional, default=10000
        The maximum number of self-consistent iterations during the initialization.
    init_maxerr : float, optional, default=1.0E-8
        Convergence criterion for the initialization.

    Returns
    -------
    memm : MEMM or list of MEMMs
        A multi-ensemble Markov state model (for each given lag time) which consists of stationary
        and kinetic quantities at all temperatures/thermodynamic states. A single object is
        returned when exactly one lag time was given, otherwise a list.

    Example
    -------
    **Umbrella sampling**: Suppose we simulate in K umbrellas, centered at
    positions :math:`y_0,...,y_{K-1}` with bias energies

    .. math::
        b_k(x) = \frac{c_k}{2 \textrm{kT}} \cdot (x - y_k)^2

    Suppose we have one simulation of length T in each umbrella, and they are ordered from 0 to K-1.
    We have discretized the x-coordinate into 100 bins.
    Then dtrajs and ttrajs should each be a list of :math:`K` arrays.
    dtrajs would look for example like this::

    [ (0, 0, 0, 0, 1, 1, 1, 0, 0, 0, ...),  (0, 1, 0, 1, 0, 1, 1, 0, 0, 1, ...), ... ]

    where each array has length T, and is the sequence of bins (in the range 0 to 99) visited along
    the trajectory. ttrajs would look like this::

    [ (0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...),  (1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...), ... ]

    Because trajectory 1 stays in umbrella 1 (index 0), trajectory 2 stays in umbrella 2 (index 1),
    and so forth.

    The bias would be a list of :math:`T \times K` arrays which specify each frame's bias energy in
    all thermodynamic states::

    [ ((0, 1.7, 2.3, 6.1, ...), ...), ((0, 2.4, 3.1, 9.5, ...), ...), ... ]

    Let us try the above example:

    >>> from pyemma.thermo import tram
    >>> import numpy as np
    >>> ttrajs = [np.array([0,0,0,0,0,0,0]), np.array([1,1,1,1,1,1,1])]
    >>> dtrajs = [np.array([0,0,0,0,1,1,1]), np.array([0,1,0,1,0,1,1])]
    >>> bias = [np.array([[1,0],[1,0],[0,0],[0,0],[0,0],[0,0],[0,0]],dtype=np.float64), np.array([[1,0],[0,0],[0,0],[1,0],[0,0],[1,0],[1,0]],dtype=np.float64)]
    >>> tram_obj = tram(ttrajs, dtrajs, bias, 1)
    >>> tram_obj.log_likelihood() # doctest: +ELLIPSIS
    -29.111...
    >>> tram_obj.count_matrices # doctest: +SKIP
    array([[[1 1]
            [0 4]]
           [[0 3]
            [2 1]]], dtype=int32)
    >>> tram_obj.stationary_distribution # doctest: +ELLIPSIS
    array([ 0.38...  0.61...])

    References
    ----------

    .. [1] Wu, H. et al 2016
        in press

    """
    # prepare trajectories
    ttrajs = _types.ensure_dtraj_list(ttrajs)
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    # all three inputs must describe the same trajectories, frame by frame
    assert len(ttrajs) == len(dtrajs)
    assert len(ttrajs) == len(bias)
    for ttraj, dtraj, btraj in zip(ttrajs, dtrajs, bias):
        assert len(ttraj) == len(dtraj)
        assert len(ttraj) == btraj.shape[0]
    # check lag time(s): a scalar becomes a one-element list
    lags = _np.asarray(lag, dtype=_np.intc).reshape((-1, )).tolist()
    # build TRAM and run estimation
    from pyemma.thermo import TRAM as _TRAM
    tram_estimators = [
        _TRAM(
            _lag, count_mode=count_mode, connectivity=connectivity,
            maxiter=maxiter, maxerr=maxerr, save_convergence_info=save_convergence_info,
            dt_traj=dt_traj, connectivity_factor=connectivity_factor, nn=nn,
            direct_space=direct_space, N_dtram_accelerations=N_dtram_accelerations,
            callback=callback,
            # honour the caller's choice; this used to be hard-coded to 'mbar',
            # which silently ignored the `init` argument
            init=init, init_maxiter=init_maxiter,
            init_maxerr=init_maxerr).estimate((ttrajs, dtrajs, bias)) for _lag in lags]
    _assign_unbiased_state_label(tram_estimators, unbiased_state)
    # return a bare model for a single lag time, a list otherwise
    if len(tram_estimators) == 1:
        return tram_estimators[0]
    return tram_estimators
def _estimate(self, dtrajs):
    """Run the wrapped estimator at every lag time that still needs estimating.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; normalised with ``_types.ensure_dtraj_list``.

    Returns
    -------
    self
        The resulting models are handed to ``self._postprocess_results``.
    """
    ### PREPARE AND CHECK DATA
    # TODO: Currently only discrete trajectories are implemented. For a general class this needs to be changed.
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # check trajectory lengths
    if self._estimated:
        # if dtrajs has now changed, unset the _estimated flag to re-set every derived quantity.
        assert hasattr(self, '_last_dtrajs_input_hash')
        # hash once and reuse it for both the comparison and the update below
        current_hash = _hash_dtrajs(dtrajs)
        if self._last_dtrajs_input_hash != current_hash:
            self.logger.warning(
                "estimating from new data, discard all previously computed models."
            )
            self._estimated = False
            self._last_dtrajs_input_hash = current_hash
    else:
        # first estimation: remember the data fingerprint for later calls
        self._last_dtrajs_input_hash = _hash_dtrajs(dtrajs)

    self._trajlengths = np.fromiter((len(traj) for traj in dtrajs),
                                    dtype=int,
                                    count=len(dtrajs))
    maxlength = np.max(self._trajlengths)

    # set lag times by data if not yet set
    if self._lags is None:
        # upper bound for generated lags: half the mean trajectory length
        maxlag = 0.5 * np.sum(self._trajlengths) / float(
            len(self._trajlengths))
        self._lags = _generate_lags(maxlag, 1.5)

    # check if some lag times are forbidden.
    if np.max(self._lags) >= maxlength:
        Ifit = np.where(self._lags < maxlength)[0]
        Inofit = np.where(self._lags >= maxlength)[0]
        self.logger.warning(
            'Ignoring lag times that exceed the longest trajectory: %s',
            self._lags[Inofit])
        self._lags = self._lags[Ifit]

    ### RUN ESTIMATION
    if self._estimated:
        # we already had run an estimation, determine which lag times we need to compute
        # TODO: this will re-evaluate problematic lag times, wont it?
        lags = sorted(list(set(self._lags).difference(self._last_lags)))
        if len(lags) == 0:
            self.logger.info("All lag times already estimated.")
            return self
        assert lags
        self.logger.info(
            "Running estimating for not yet estimated lags times: %s", lags)
    else:
        lags = self._lags

    # construct all parameter sets for the estimator
    param_sets = tuple(param_grid({'lag': lags}))

    # run estimation on all lag times
    # silence the wrapped estimator's own progress output, if it has such a switch
    if hasattr(self.estimator, 'show_progress'):
        self.estimator.show_progress = False
    if self.show_progress:
        pg = ProgressReporter()
        ctx = pg.context()
    else:
        pg = None
        # TODO: replace with nullcontext from util once merged.
        from contextlib import contextmanager

        # do-nothing context manager so the `with` below works in both cases
        @contextmanager
        def dummy(): yield

        ctx = dummy()
    with ctx:
        if not self.only_timescales:
            # keep the full models and the estimators that produced them
            models, estimators = estimate_param_scan(
                self.estimator,
                dtrajs,
                param_sets,
                failfast=False,
                return_estimators=True,
                n_jobs=self.n_jobs,
                progress_reporter=pg,
                return_exceptions=True)
            self._estimators = estimators
        else:
            # only evaluate the timescales (and their samples, if the estimator
            # produces samples) instead of keeping whole models
            evaluate = ['timescales']
            evaluate_args = [[self.nits]]
            if self._estimator_produces_samples():
                evaluate.append('sample_f')
                evaluate_args.append('timescales')
            results = estimate_param_scan(
                self.estimator,
                dtrajs,
                param_sets,
                failfast=False,
                return_estimators=False,
                n_jobs=self.n_jobs,
                evaluate=evaluate,
                evaluate_args=evaluate_args,
                progress_reporter=pg,
                return_exceptions=True,
            )

            # wrap the raw results in lightweight stand-in models, one per lag
            if self._estimator_produces_samples():
                models = [
                    _DummyModel(lag, ts, ts_sample)
                    for lag, (ts, ts_sample) in zip(lags, results)
                ]
            else:
                models = [
                    _DummyModel(
                        lag,
                        ts,
                        None,
                    ) for lag, ts in zip(lags, results)
                ]
    self._postprocess_results(models)
    return self
def _estimate(self, dtrajs):
    """Estimate a hidden Markov state model from discrete trajectories.

    An initial MSM (either ``self.msm_init`` or one estimated here) is
    coarse-grained with PCCA to obtain a starting transition matrix and output
    matrix, which are then refined by EM on the lagged observations.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; normalised with ``_types.ensure_dtraj_list``.

    Return
    ------
    hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
        Estimated Hidden Markov state model (this object, with the model
        parameters set via ``set_model_params``).
    """
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    # if no initial MSM is given, estimate it now
    if self.msm_init is None:
        # estimate with sparse=False, because we need to do PCCA which is currently not implemented for sparse
        # estimate with store_data=True, because we need an EstimatedMSM
        msm_estimator = _MSMEstimator(lag=self.lag, reversible=self.reversible, sparse=False,
                                      connectivity=self.connectivity, dt_traj=self.timestep_traj)
        msm_init = msm_estimator.estimate(dtrajs)
    else:
        assert isinstance(self.msm_init, _EstimatedMSM), 'msm_init must be of type EstimatedMSM'
        msm_init = self.msm_init
        self.reversible = msm_init.is_reversible

    # generate lagged observations
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
        # how many uncorrelated counts we can make
        self.stride = self.lag
        # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
        # estimate of the decorrelation time
        if msm_init.nstates > self.nstates:
            corrtime = int(max(1, msm_init.timescales()[self.nstates-1]))
            # use the smaller of these two pessimistic estimates
            self.stride = min(self.stride, 2*corrtime)

    # TODO: Here we always use the full observation state space for the estimation.
    dtrajs_lagged = _lag_observations(dtrajs, self.lag, stride=self.stride)

    # check input
    assert _types.is_int(self.nstates) and self.nstates > 1 and self.nstates <= msm_init.nstates, \
        'nstates must be an int in [2,msmobj.nstates]'
    # if hmm.nstates = msm.nstates there is no problem. Otherwise, check spectral gap
    if msm_init.nstates > self.nstates:
        timescale_ratios = msm_init.timescales()[:-1] / msm_init.timescales()[1:]
        if timescale_ratios[self.nstates-2] < 2.0:
            # Logger.warn is a deprecated alias; use Logger.warning
            self.logger.warning('Requested coarse-grained model with ' + str(self.nstates) + ' metastable states at ' +
                                'lag=' + str(self.lag) + '.' + 'The ratio of relaxation timescales between ' +
                                str(self.nstates) + ' and ' + str(self.nstates+1) + ' states is only ' +
                                str(timescale_ratios[self.nstates-2]) + ' while we recommend at least 2. ' +
                                ' It is possible that the resulting HMM is inaccurate. Handle with caution.')

    # set things from MSM
    # TODO: dtrajs_obs is set here, but not used in estimation. Estimation is alwas done with
    # TODO: respect to full observation (see above). This is confusing. Define how we want to do this in gen.
    # TODO: observable set is also not used, it is just saved.
    nstates_obs_full = msm_init.nstates_full
    if self.observe_active:
        observable_set = msm_init.active_set
        dtrajs_obs = msm_init.discrete_trajectories_active
    else:
        observable_set = np.arange(nstates_obs_full)
        dtrajs_obs = msm_init.discrete_trajectories_full

    # TODO: this is redundant with BHMM code because that code is currently not easily accessible and
    # TODO: we don't want to re-estimate. Should be reengineered in bhmm.
    # ---------------------------------------------------------------------------------------
    # PCCA-based coarse-graining
    # ---------------------------------------------------------------------------------------
    # pcca- to number of metastable states
    pcca = msm_init.pcca(self.nstates)

    # HMM output matrix
    eps = 0.01 * (1.0/nstates_obs_full)  # default output probability, in order to avoid zero columns
    # Use PCCA distributions, but at least eps to avoid 100% assignment to any state (breaks convergence)
    B_conn = np.maximum(msm_init.metastable_distributions, eps)
    # full state space output matrix
    B = eps * np.ones((self.nstates, nstates_obs_full), dtype=np.float64)
    # expand B_conn to full state space
    # TODO: here we always select the active set, no matter if observe_active=True or False.
    B[:, msm_init.active_set] = B_conn[:, :]
    # TODO: at this point we will have zero observation probabilities for states that are not in the active
    # TODO: set. If these occur in the trajectory, that will mean zero columns in the output probabilities
    # TODO: and crash of forward-backward and sampling algorithms.
    # renormalize B to make it row-stochastic
    B /= B.sum(axis=1)[:, None]

    # coarse-grained transition matrix
    P_coarse = pcca.coarse_grained_transition_matrix
    # take care of unphysical values. First symmetrize
    X = np.dot(np.diag(pcca.coarse_grained_stationary_probability), P_coarse)
    X = 0.5*(X + X.T)
    # if there are values < 0, set to eps
    X = np.maximum(X, eps)
    # turn into coarse-grained transition matrix
    A = X / X.sum(axis=1)[:, None]

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------
    # lazy import bhmm here in order to avoid dependency loops
    import bhmm
    # initialize discrete HMM
    hmm_init = bhmm.discrete_hmm(A, B, stationary=True, reversible=self.reversible)
    # run EM; lag=1 because the observations were already lagged above
    hmm = bhmm.estimate_hmm(dtrajs_lagged, self.nstates, lag=1, initial_model=hmm_init,
                            accuracy=self.accuracy, maxit=self.maxit)
    self.hmm = bhmm.DiscreteHMM(hmm)

    # find observable set
    transition_matrix = self.hmm.transition_matrix
    observation_probabilities = self.hmm.output_probabilities
    # TODO: Cutting down... OK, this can be done
    if self.observe_active:  # cut down observation probabilities to active set
        observation_probabilities = observation_probabilities[:, msm_init.active_set]
        observation_probabilities /= observation_probabilities.sum(axis=1)[:, None]  # renormalize

    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged
    self._observable_set = observable_set
    self._dtrajs_obs = dtrajs_obs
    self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                          reversible=self.reversible, dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
def score(self, dtrajs, score_method=None, score_k=None):
    """ Scores the MSM using the dtrajs using the variational approach for Markov processes [1]_ [2]_

    Currently only implemented using dense matrices - will be slow for large state spaces.

    Parameters
    ----------
    dtrajs : list of arrays
        test data (discrete trajectories).
    score_method : str, optional, default='VAMP2'
        Overwrite scoring method to be used if desired. If `None`, the estimators scoring
        method will be used. A value given here is stored on the estimator.
        Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

        *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the sum of transition
                    matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
        *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the kinetic variance [4]_ .

    score_k : int or None
        The maximum number of eigenvalues or singular values used in the
        score. If set to None, all available eigenvalues will be used. A value given
        here is stored on the estimator; ranks above the number of MSM states are
        reduced to that number with a warning.

    References
    ----------
    .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
        in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
    .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
        from time series data (in preparation)
    .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
        dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015)
    .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
        dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015)

    """
    dtrajs = ensure_dtraj_list(dtrajs)  # ensure format

    # explicit arguments override (and replace) the estimator's stored settings
    if score_method is not None:
        self.score_method = score_method
    if score_k is not None:
        self.score_k = score_k

    # resolve the effective rank: default to, and cap at, the number of states
    if self.score_k is None:
        self.score_k = self.nstates
    if self.score_k > self.nstates:
        self.logger.warning('Requested scoring rank {rank} exceeds number of MSM states. '
                            'Reduced to score_k = {nstates}'.format(rank=self.score_k, nstates=self.nstates))
        self.score_k = self.nstates  # limit to nstates

    # --- training side: fitted model and the counts it was estimated from ---
    model_matrix = self.transition_matrix
    train_counts = self.count_matrix_active
    from scipy.sparse import issparse
    # the scoring below cannot deal with sparse matrices, so densify where needed
    if issparse(model_matrix):
        model_matrix = model_matrix.toarray()
    if issparse(train_counts):
        train_counts = train_counts.toarray()
    train_cov_00 = _np.diag(train_counts.sum(axis=1))  # empirical cov
    train_cov_tt = _np.diag(train_counts.sum(axis=0))  # empirical cov

    # --- test side: raw counts, then restricted to the present active set ---
    raw_test_counts = count_matrix(dtrajs, self.lag, sparse_return=False)
    n_test_states = raw_test_counts.shape[0]
    # only active states that actually exist in the test count matrix can be mapped
    source_states = self.active_set[_np.where(self.active_set < n_test_states)[0]]
    target_states = _np.arange(len(source_states))
    test_counts = _np.zeros((self.nstates, self.nstates))
    test_counts[_np.ix_(target_states, target_states)] = \
        raw_test_counts[_np.ix_(source_states, source_states)]
    test_cov_00 = _np.diag(test_counts.sum(axis=1))
    test_cov_tt = _np.diag(test_counts.sum(axis=0))

    # score
    from pyemma.util.metrics import vamp_score
    return vamp_score(model_matrix,
                      train_cov_00, train_counts, train_cov_tt,
                      test_cov_00, test_counts, test_cov_tt,
                      k=self.score_k, score=self.score_method)
def score_cv(self, dtrajs, n=10, score_method=None, score_k=None):
    """ Scores the MSM using the variational approach for Markov processes [1]_ [2]_ and crossvalidation [3]_ .

    Divides the data into training and test data, fits a MSM using the training
    data using the parameters of this estimator, and scores is using the test
    data.
    Currently only one way of splitting is implemented, where for each n,
    the data is randomly divided into two approximately equally large sets of
    discrete trajectory fragments with lengths of at least the lagtime.

    Currently only implemented using dense matrices - will be slow for large state spaces.

    Parameters
    ----------
    dtrajs : list of arrays
        Test data (discrete trajectories).
    n : number of samples
        Number of repetitions of the cross-validation. Use large n to get solid
        means of the score.
    score_method : str, optional, default='VAMP2'
        Overwrite scoring method to be used if desired. If `None`, the estimators scoring
        method will be used.
        Available scores are based on the variational approach for Markov processes [1]_ [2]_ :

        *  'VAMP1'  Sum of singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the sum of transition
                    matrix eigenvalues, also called Rayleigh quotient [1]_ [3]_ .
        *  'VAMP2'  Sum of squared singular values of the symmetrized transition matrix [2]_ .
                    If the MSM is reversible, this is equal to the kinetic variance [4]_ .

    score_k : int or None
        The maximum number of eigenvalues or singular values used in the
        score. If set to None, all available eigenvalues will be used.

    Returns
    -------
    scores : numpy.ndarray
        One score per cross-validation repetition.

    Raises
    ------
    ValueError
        If ``self.count_mode`` is neither 'sliding' nor 'sample'.

    References
    ----------
    .. [1] Noe, F. and F. Nueske: A variational approach to modeling slow processes
        in stochastic dynamical systems. SIAM Multiscale Model. Simul. 11, 635-655 (2013).
    .. [2] Wu, H and F. Noe: Variational approach for learning Markov processes
        from time series data (in preparation).
    .. [3] McGibbon, R and V. S. Pande: Variational cross-validation of slow
        dynamical modes in molecular kinetics, J. Chem. Phys. 142, 124105 (2015).
    .. [4] Noe, F. and C. Clementi: Kinetic distance and kinetic maps from molecular
        dynamics simulation. J. Chem. Theory Comput. 11, 5002-5011 (2015).

    """
    from deeptime.decomposition import cvsplit_trajs

    dtrajs = ensure_dtraj_list(dtrajs)  # ensure format
    supported_modes = ('sliding', 'sample')
    if self.count_mode not in supported_modes:
        raise ValueError('score_cv currently only supports count modes "sliding" and "sample"')
    use_sliding = (self.count_mode == 'sliding')

    # work on a clone so that refitting does not touch this estimator
    from pyemma._ext.sklearn.base import clone
    cv_estimator = clone(self)

    cv_scores = []
    for _ in range(n):
        # new block split and train/test partition for every repetition
        fragments = self._blocksplit_dtrajs(dtrajs, use_sliding)
        train_part, test_part = cvsplit_trajs(fragments)
        cv_estimator.fit(train_part)
        cv_scores.append(
            cv_estimator.score(test_part, score_method=score_method, score_k=score_k))
    return _np.array(cv_scores)
def _estimate(self, dtrajs):
    """Sample hidden Markov state models around a maximum-likelihood estimate.

    If ``self.init_hmsm`` is None, the maximum-likelihood superclass is run
    first to obtain the initial model; otherwise the attributes of
    ``self.init_hmsm`` are copied onto this object. The initial model is then
    passed to the bhmm Bayesian sampler and ``self.nsamples`` models are drawn.

    Parameters
    ----------
    dtrajs : discrete trajectory or list of discrete trajectories
        Input data; normalised with ``ensure_dtraj_list``.

    Returns
    -------
    The result of ``self.submodel(...)`` for the configured connectivity and
    observation settings.

    Raises
    ------
    NotImplementedError
        If the model is reversible and the hidden count matrix is disconnected.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
        # memorize the observation state for bhmm and reset
        # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
        default_connectivity = self.connectivity
        default_mincount_connectivity = self.mincount_connectivity
        default_observe_nonempty = self.observe_nonempty
        self.connectivity = None
        self.observe_nonempty = False
        self.mincount_connectivity = 0
        # NOTE(review): accuracy is overwritten permanently (never restored);
        # kept as-is to preserve existing behaviour.
        self.accuracy = 1e-2  # this is sufficient for an initial guess
        try:
            super(BayesianHMSM, self)._estimate(dtrajs)
        finally:
            # restore the user's settings even if the initial estimation fails,
            # so a failed call does not leave the estimator reconfigured
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
    else:  # if given another initialization, must copy its attributes
        # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
        self.nstates = self.init_hmsm.nstates
        self.reversible = self.init_hmsm.is_reversible
        self.stationary = self.init_hmsm.stationary
        # trajectories
        self._dtrajs_full = self.init_hmsm._dtrajs_full
        self._dtrajs_lagged = self.init_hmsm._dtrajs_lagged
        self._observable_set = self.init_hmsm._observable_set
        self._dtrajs_obs = self.init_hmsm._dtrajs_obs
        # MLE estimation results
        self.likelihoods = self.init_hmsm.likelihoods  # Likelihood history
        self.likelihood = self.init_hmsm.likelihood
        self.hidden_state_probabilities = self.init_hmsm.hidden_state_probabilities  # gamma variables
        self.hidden_state_trajectories = self.init_hmsm.hidden_state_trajectories  # Viterbi path
        self.count_matrix = self.init_hmsm.count_matrix  # hidden count matrix
        self.initial_count = self.init_hmsm.initial_count  # hidden init count
        self.initial_distribution = self.init_hmsm.initial_distribution
        self._active_set = self.init_hmsm._active_set
        # update HMM Model
        self.update_model_params(P=self.init_hmsm.transition_matrix,
                                 pobs=self.init_hmsm.observation_probabilities,
                                 dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

    # check if we have a valid initial model
    import msmtools.estimation as msmest
    if self.reversible and not msmest.is_connected(self.count_matrix):
        raise NotImplementedError(
            'Encountered disconnected count matrix:\n ' + str(self.count_matrix) +
            'with reversible Bayesian HMM sampler using lag=' + str(self.lag) +
            ' and stride=' + str(self.stride) + '. Consider using shorter lag, ' +
            'or shorter stride (to use more of the data), ' +
            'or using a lower value for mincount_connectivity.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    nstates_full = msmest.number_of_states(dtrajs)
    if self.nstates_obs < nstates_full:
        eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
        # fill active states
        B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = self.observation_probabilities

    # HMM sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    from bhmm import discrete_hmm, bayesian_hmm
    hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

    sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                               reversible=self.reversible, stationary=self.stationary,
                               p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                               store_hidden=self.store_hidden, call_back=call_back)

    if self.show_progress:
        self._progress_force_finish(stage=0)

    # Samples: take exactly nsamples sampled models, in order
    sampled_models = [sampled_hmm.sampled_hmms[i] for i in range(self.nsamples)]

    samples = []
    for model in sampled_models:
        # restrict to observable set if necessary
        Bobs = model.output_model.output_probabilities[:, self.observable_set]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(_HMSM(model.transition_matrix, pobs,
                             pi=model.stationary_distribution, dt_model=self.dt_model))

    # store results
    self.sampled_trajs = [model.hidden_state_trajectories for model in sampled_models]
    self.update_model_params(samples=samples)

    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # OBSERVATION SET
    if self.observe_nonempty:
        observe_subset = 'nonempty'
    else:
        observe_subset = None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity)
def _estimate(self, dtrajs):
    """Draw a sample of hidden Markov state models around an initial HMSM.

    The initial model is ``self.init_hmsm`` when one was supplied; otherwise
    a maximum-likelihood HMSM is estimated from ``dtrajs`` first.

    Parameters
    ----------
    dtrajs : discrete trajectories
        Converted with ``ensure_dtraj_list`` before use.

    Returns
    -------
    hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
        Estimated Hidden Markov state model (this estimator itself, ``self``).

    Raises
    ------
    ValueError
        If ``self.prior`` is not one of 'sparse', 'uniform' or 'mixed'.
    """
    # bring the input into the expected list form
    dtrajs = ensure_dtraj_list(dtrajs)

    # pick the initial model: the user-supplied one, or a fresh ML estimate
    if self.init_hmsm is not None:
        # check input
        assert isinstance(self.init_hmsm, _EstimatedHMSM), \
            'hmsm must be of type EstimatedHMSM'
        init_hmsm = self.init_hmsm
        self.nstates = init_hmsm.nstates
        self.reversible = init_hmsm.is_reversible
    else:
        # no initial model given: estimate one on the lagged trajectories
        ml_estimator = _MaximumLikelihoodHMSM(
            lag=self.lag,
            stride=self.stride,
            nstates=self.nstates,
            reversible=self.reversible,
            connectivity=self.connectivity,
            observe_active=self.observe_active,
            dt_traj=self.dt_traj,
        )
        init_hmsm = ml_estimator.estimate(dtrajs)

    # The sampler works on dtrajs, so the output matrix is expanded to the
    # FULL observable state space when only the active states were observed.
    if not self.observe_active:
        pobs = init_hmsm.observation_probabilities
    else:
        import msmtools.estimation as msmest
        nstates_full = msmest.number_of_states(dtrajs)
        # default output probability, in order to avoid zero columns
        eps = 0.01 / nstates_full
        pobs = _np.full((self.nstates, nstates_full), eps, dtype=_np.float64)
        # fill active states
        pobs[:, init_hmsm.observable_set] = _np.maximum(
            eps, init_hmsm.observation_probabilities)
        # renormalize so that every row sums to one
        row_sums = pobs.sum(axis=1)
        pobs /= row_sums[:, None]

    # progress reporting hook handed to the sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs',
                                stage=0)

        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    from bhmm import discrete_hmm, bayesian_hmm
    hmm_mle = discrete_hmm(init_hmsm.transition_matrix, pobs,
                           stationary=True, reversible=self.reversible)

    # transition-count prior
    prior_mode = self.prior
    if prior_mode == 'sparse':
        self.prior_count_matrix = _np.zeros((self.nstates, self.nstates),
                                            dtype=_np.float64)
    elif prior_mode == 'uniform':
        self.prior_count_matrix = _np.ones((self.nstates, self.nstates),
                                           dtype=_np.float64)
    elif prior_mode == 'mixed':
        # scale each row of P so that its off-diagonal part sums to one
        P0 = init_hmsm.transition_matrix
        offdiag = P0 - _np.diag(_np.diag(P0))
        row_scale = 1.0 / _np.sum(offdiag, axis=1)
        self.prior_count_matrix = P0 * row_scale[:, None]
    else:
        raise ValueError('Unknown prior mode: ' + self.prior)

    sampled_hmm = bayesian_hmm(
        init_hmsm.discrete_trajectories_lagged,
        hmm_mle,
        nsample=self.nsamples,
        transition_matrix_prior=self.prior_count_matrix,
        call_back=call_back,
    )
    if self.show_progress:
        self._progress_force_finish(stage=0)

    # collect the sampled quantities
    drawn = [sampled_hmm.sampled_hmms[i] for i in range(self.nsamples)]
    sample_Ps = [h.transition_matrix for h in drawn]
    sample_pis = [h.stationary_distribution for h in drawn]
    sample_pobs = [h.output_model.output_probabilities for h in drawn]

    samples = []
    for P_i, pi_i, B_i in zip(sample_Ps, sample_pis, sample_pobs):
        # restrict to the observable set and renormalize the rows
        B_obs = B_i[:, init_hmsm.observable_set]
        B_obs = B_obs / B_obs.sum(axis=1)[:, None]
        samples.append(
            _HMSM(P_i, B_obs, pi=pi_i, dt_model=init_hmsm.dt_model))

    # parametrize self
    self._dtrajs_full = dtrajs
    self._observable_set = init_hmsm._observable_set
    self._dtrajs_obs = init_hmsm._dtrajs_obs
    self.set_model_params(
        samples=samples,
        P=init_hmsm.transition_matrix,
        pobs=init_hmsm.observation_probabilities,
        dt_model=init_hmsm.dt_model,
    )
    return self
def _estimate(self, dtrajs):
    """Estimate a Markov state model from discrete trajectories.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory.

    Returns
    -------
    MSM : :class:`pyemma.msm.EstimatedMSM` or :class:`pyemma.msm.MSM`
        This estimator (``self``), parametrized with the estimated model.

    Raises
    ------
    RuntimeError
        If the active set is empty.
    ValueError
        If a reversible estimate is requested with connectivity 'none' and
        the active count matrix is not connected.
    NotImplementedError
        If ``self.connectivity`` is neither 'largest' nor 'none'.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    # harvest discrete statistics
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        dtrajstats = dtrajs
    else:
        # compute and store discrete trajectory statistics
        dtrajstats = _DiscreteTrajectoryStats(dtrajs)
        # warn when a dense model of this size is likely to be expensive
        too_large_for_dense = dtrajstats.nstates > 4000 and not self.sparse
        if too_large_for_dense:
            self.logger.warning(
                'Building a dense MSM with ' + str(dtrajstats.nstates)
                + ' states. This can be '
                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                'Consider using sparse=True.')

    # count lagged transitions
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # The active set doubles as the mapping from active to full indices.
    if self.connectivity != 'largest':
        # for 'None' and 'all' all visited states are active
        self.active_set = dtrajstats.visited_set
    elif self.statdist_constraint is None:
        # statdist not given - full connectivity on all states
        self.active_set = dtrajstats.largest_connected_set
    else:
        self.active_set = self._prepare_input_revpi(
            self._C_full, self.statdist_constraint)

    # FIXME: setting is_estimated before so that we can start using the
    # parameters just set, but this is not clean!
    self._is_estimated = True

    # nothing can be estimated on an empty active set
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # back-mapping from full to active indices; -1 marks inactive states
    full2active = _np.full(dtrajstats.nstates, -1, dtype=int)
    full2active[self.active_set] = _np.arange(len(self.active_set))
    self._full2active = full2active

    # restrict stationary distribution to active set
    statdist_active = None
    if self.statdist_constraint is not None:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # Estimate transition matrix
    mode = self.connectivity
    if mode not in ('largest', 'none'):
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.'
            % self.connectivity)
    if mode == 'none':
        # Reversible mode is only possible if the active set is connected;
        # in that case this mode is identical to 'largest'.
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode "none", '
                'because the set of all visited states is not reversibly connected'
            )
    P = msmest.transition_matrix(
        self._C_active,
        reversible=self.reversible,
        mu=statdist_active,
        maxiter=self.maxiter,
        maxerr=self.maxerr,
    )

    # In dense mode convert the count matrices and P to arrays, so that all
    # subsequent properties are computed with dense matrix algebra.
    if not self.sparse:
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. The estimator holds its own model parameters, so it is equal to
    # the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(
        P=P,
        pi=statdist_active,
        reversible=self.reversible,
        dt_model=self.timestep_traj.get_scaled(self.lag),
    )
    return self
def _estimate(self, dtrajs):
    """Estimate a maximum-likelihood hidden Markov state model.

    Parameters
    ----------
    dtrajs : discrete trajectories
        Converted with ``_types.ensure_dtraj_list`` before use.

    Returns
    -------
    hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
        Estimated Hidden Markov state model, as returned by
        ``self.submodel(...)``.

    Raises
    ------
    ValueError
        If ``self.lag`` is not shorter than the longest trajectory, or if
        ``self.msm_init`` is not a recognized initialization option.
    """
    import bhmm
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    # CHECK LAG
    trajlengths = [_np.size(dtraj) for dtraj in dtrajs]
    if self.lag >= _np.max(trajlengths):
        raise ValueError('Illegal lag time ' + str(self.lag)
                         + ' exceeds longest trajectory length')
    mean_length = _np.mean(trajlengths)
    if self.lag > mean_length:
        # The mean is a float and must be converted explicitly; concatenating
        # it directly to a str raises TypeError.
        self.logger.warning(
            'Lag time ' + str(self.lag)
            + ' is on the order of mean trajectory length '
            + str(mean_length)
            + '. It is recommended to fit four lag times in each '
            + 'trajectory. HMM might be inaccurate.')

    # EVALUATE STRIDE
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently
        # have no better theory for deciding how many uncorrelated counts we
        # can make
        self.stride = self.lag
        # get a quick estimate from the spectral radius of the nonreversible
        from pyemma.msm import estimate_markov_model
        msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False,
                                       sparse=False, connectivity='largest',
                                       dt_traj=self.timestep_traj)
        # if we have more than nstates timescales in our MSM, we use the next
        # (neglected) timescale as an estimate of the decorrelation time
        if msm_nr.nstates > self.nstates:
            corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
            # use the smaller of these two pessimistic estimates
            self.stride = int(min(self.lag, 2 * corrtime))

    # LAG AND STRIDE DATA
    dtrajs_lagged_strided = bhmm.lag_observations(dtrajs, self.lag,
                                                  stride=self.stride)

    # OBSERVATION SET
    observe_subset = 'nonempty' if self.observe_nonempty else None

    # INIT HMM
    from bhmm import init_discrete_hmm
    from pyemma.msm.estimators import MaximumLikelihoodMSM
    if self.msm_init == 'largest-strong':
        hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates,
                                     lag=1, reversible=self.reversible,
                                     stationary=True, regularize=True,
                                     method='lcs-spectral',
                                     separate=self.separate)
    elif self.msm_init == 'all':
        hmm_init = init_discrete_hmm(dtrajs_lagged_strided, self.nstates,
                                     lag=1, reversible=self.reversible,
                                     stationary=True, regularize=True,
                                     method='spectral',
                                     separate=self.separate)
    elif isinstance(self.msm_init, MaximumLikelihoodMSM):
        # initial MSM given.
        from bhmm.init.discrete import init_discrete_hmm_spectral
        p0, P0, pobs0 = init_discrete_hmm_spectral(
            self.msm_init.count_matrix_full, self.nstates,
            reversible=self.reversible, stationary=True,
            active_set=self.msm_init.active_set,
            P=self.msm_init.transition_matrix,
            separate=self.separate)
        hmm_init = bhmm.discrete_hmm(p0, P0, pobs0)
        # override observe_subset.
        observe_subset = self.msm_init.active_set
    else:
        raise ValueError('Unknown MSM initialization option: '
                         + str(self.msm_init))

    # ---------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------

    # run EM
    from bhmm.estimators.maximum_likelihood import \
        MaximumLikelihoodEstimator as _MaximumLikelihoodEstimator
    hmm_est = _MaximumLikelihoodEstimator(dtrajs_lagged_strided, self.nstates,
                                          initial_model=hmm_init,
                                          output='discrete',
                                          reversible=self.reversible,
                                          stationary=self.stationary,
                                          accuracy=self.accuracy,
                                          maxit=self.maxit)
    # run
    hmm_est.fit()
    # package in discrete HMM
    self.hmm = bhmm.DiscreteHMM(hmm_est.hmm)

    # get model parameters
    self.initial_distribution = self.hmm.initial_distribution
    transition_matrix = self.hmm.transition_matrix
    observation_probabilities = self.hmm.output_probabilities

    # get estimation parameters
    self.likelihoods = hmm_est.likelihoods  # Likelihood history
    self.likelihood = self.likelihoods[-1]
    self.hidden_state_probabilities = hmm_est.hidden_state_probabilities  # gamma variables
    self.hidden_state_trajectories = hmm_est.hmm.hidden_state_trajectories  # Viterbi path
    self.count_matrix = hmm_est.count_matrix  # hidden count matrix
    self.initial_count = hmm_est.initial_count  # hidden init count
    self._active_set = _np.arange(self.nstates)

    # TODO: it can happen that we loose states due to striding. Should we
    # lift the output probabilities afterwards?
    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged_strided
    self._nstates_obs_full = msmest.number_of_states(dtrajs)
    self._nstates_obs = msmest.number_of_states(dtrajs_lagged_strided)
    self._observable_set = _np.arange(self._nstates_obs)
    self._dtrajs_obs = dtrajs
    self.set_model_params(P=transition_matrix,
                          pobs=observation_probabilities,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    # TODO: perhaps remove connectivity and just rely on .submodel()?
    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity)
def _estimate(self, data):
    r"""Estimate implied timescales (ITS) at a set of lag times.

    Runs ``self.estimator`` once per lag time, discards lag times at which
    the estimation failed, and stores the timescales in ``self._its`` (one
    row per remaining lag time, ``self.nits`` columns, NaN where no
    timescale is available). If the first model is a ``SampledModel``, the
    sampled timescales are stored in ``self._its_samples`` as well.

    Parameters
    ----------
    data : discrete trajectories
        Converted with ``_types.ensure_dtraj_list`` before use.

    Raises
    ------
    RuntimeError
        If the estimation failed at every lag time.
    """
    ### PREPARE AND CHECK DATA
    # TODO: Currently only discrete trajectories are implemented. For a
    # general class this needs to be changed.
    data = _types.ensure_dtraj_list(data)

    # check trajectory lengths
    self._trajlengths = np.array([len(traj) for traj in data])
    maxlength = np.max(self._trajlengths)

    # set lag times by data if not yet set
    if self._lags is None:
        maxlag = 0.5 * np.sum(self._trajlengths) / float(
            len(self._trajlengths))
        self._lags = _generate_lags(maxlag, 1.5)

    # check if some lag times are forbidden.
    if np.max(self._lags) >= maxlength:
        Ifit = np.where(self._lags < maxlength)[0]
        Inofit = np.where(self._lags >= maxlength)[0]
        self.logger.warning(
            'Ignoring lag times that exceed the longest trajectory: '
            + str(self._lags[Inofit]))
        self._lags = self._lags[Ifit]

    ### RUN ESTIMATION
    # construct all parameter sets for the estimator
    param_sets = tuple(param_grid({'lag': self._lags}))

    # the scan reports progress itself, so silence the per-model reporting
    if isinstance(self.estimator, SampledModel):
        self.estimator.show_progress = False

    # run estimation on all lag times
    self._models, self._estimators = estimate_param_scan(
        self.estimator, data, param_sets, failfast=False,
        return_estimators=True, n_jobs=self.n_jobs, progress_reporter=self)

    ### PROCESS RESULTS
    # if some results are None, estimation has failed. Warn and truncate
    # models and lag times
    good = np.array(
        [i for i, m in enumerate(self._models) if m is not None], dtype=int)
    bad = np.array(
        [i for i, m in enumerate(self._models) if m is None], dtype=int)
    if good.size == 0:
        raise RuntimeError(
            'Estimation has failed at ALL lagtimes. Check for errors.')
    if bad.size > 0:
        self.logger.warning(
            'Estimation has failed at lagtimes: ' + str(self._lags[bad])
            + '. Run single-lag estimation at these lags to track down the error.'
        )
        self._lags = self._lags[good]
        # Select by index instead of going through np.array(models), which
        # would try to coerce array-like model objects.
        self._models = [self._models[i] for i in good]

    # timescales
    timescales = [m.timescales() for m in self._models]

    # how many finite timescales do we really have?
    maxnts = max(len(ts[np.isfinite(ts)]) for ts in timescales)
    if self.nits is None:
        self.nits = maxnts
    if maxnts < self.nits:
        self.nits = maxnts
        self.logger.warning(
            'Changed user setting nits to the number of available timescales nits='
            + str(self.nits))

    # sort timescales into matrix
    computed_all = True  # flag if we have found any problems
    # initialize with NaN in order to point out timescales that were not
    # computed (np.nan: the np.NAN alias no longer exists in NumPy 2)
    self._its = np.full((len(self._lags), self.nits), np.nan)
    self._successful_lag_indexes = []
    for i, ts in enumerate(timescales):
        # if there are any finite timescales available, add them
        if np.any(np.isfinite(ts)):
            # copy into array. Leave NaN if there is no timescale
            ncopy = min(len(ts), self.nits)
            self._its[i, :ncopy] = ts[:ncopy]
            self._successful_lag_indexes.append(i)

    if len(self._successful_lag_indexes) < len(self._lags):
        computed_all = False
    if np.any(np.isnan(self._its)):
        computed_all = False

    # timescales samples if available
    if isinstance(self._models[0], SampledModel):
        # samples
        timescales_samples = [
            m.sample_f('timescales') for m in self._models
        ]
        nsamples = np.shape(timescales_samples[0])[0]
        # initialize with NaN in order to point out timescales that were
        # not computed
        self._its_samples = np.full(
            (nsamples, len(self._lags), self.nits), np.nan)

        for i, ts in enumerate(timescales_samples):
            if ts is not None:
                ts = np.vstack(ts)
                ts = ts[:, :self.nits]
                # copy into array. Leave NaN if there is no timescales
                self._its_samples[:, i, :ts.shape[1]] = ts

        if np.any(np.isnan(self._its_samples)):
            computed_all = False

    if not computed_all:
        self.logger.warning(
            'Some timescales could not be computed. Timescales array is smaller than '
            'expected or contains NaNs')
def timescales_msm(dtrajs, lags=None, nits=None, reversible=True,
                   connected=True, errors=None, nsamples=50, n_jobs=1,
                   show_progress=True):
    r""" Implied timescales from Markov state models estimated at a series of lag times.

    Parameters
    ----------
    dtrajs : array-like or list of array-likes
        discrete trajectories

    lags : array-like of integers, optional
        integer lag times at which the implied timescales will be calculated

    nits : int, optional
        number of implied timescales to be computed. Will compute less
        if the number of states are smaller. If None, the number of timescales
        will be automatically determined.

    connected : boolean, optional
        If true compute the connected set before transition matrix estimation
        at each lag separately

    reversible : boolean, optional
        Estimate transition matrix reversibly (True) or nonreversibly (False)

    errors : None | 'bayes', optional
        Specifies whether to compute statistical uncertainties (by default
        not), and which algorithm to use if yes. Currently the only option is:

        * 'bayes' for Bayesian sampling of the posterior

        Attention: Computing errors can be *very* slow if the MSM has many
        states. Moreover there are still unsolved theoretical problems, and
        therefore the uncertainty interval and the maximum likelihood
        estimator can be inconsistent. Use this as a rough guess for
        statistical uncertainties.

    nsamples : int, optional
        The number of approximately independent transition matrix samples
        generated for each lag time for uncertainty quantification.
        Only used if errors is not None.

    n_jobs : int, optional
        how many subprocesses to start to estimate the models for each lag
        time.

    show_progress : bool, optional
        Passed on to the Bayesian estimator (when ``errors='bayes'``) and to
        the implied-timescales object.

    Returns
    -------
    itsobj : :class:`ImpliedTimescales <pyemma.msm.estimators.implied_timescales.ImpliedTimescales>` object

    Raises
    ------
    NotImplementedError
        If ``errors`` is neither None nor 'bayes'.

    Example
    -------
    >>> from pyemma import msm
    >>> dtraj = [0,1,1,2,2,2,1,2,2,2,1,0,0,1,1,1,2,2,1,1,2,1,1,0,0,0,1,1,2,2,1]   # mini-trajectory
    >>> ts = msm.its(dtraj, [1,2,3,4,5])
    >>> print(ts.timescales) # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    [[ 1.5... 0.2...]
     [ 3.1... 1.0...]
     [ 2.03... 1.02...]
     [ 4.63... 3.42...]
     [ 5.13... 2.59...]]

    See also
    --------
    ImpliedTimescales
        The object returned by this function.
    pyemma.plots.plot_implied_timescales
        Implied timescales plotting function. Just call it with the
        :class:`ImpliedTimescales <pyemma.msm.estimators.ImpliedTimescales>`
        object produced by this function as an argument.


    .. autoclass:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
        :members:
        :undoc-members:

        .. rubric:: Methods

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
           :methods:

        .. rubric:: Attributes

        .. autoautosummary:: pyemma.msm.estimators.implied_timescales.ImpliedTimescales
            :attributes:

    References
    ----------
    Implied timescales as a lagtime-selection and MSM-validation approach were
    suggested in [1]_. Error estimation is done either using moving block
    bootstrapping [2]_ or a Bayesian analysis using Metropolis-Hastings Monte
    Carlo sampling of the posterior. Nonreversible Bayesian sampling is done
    by independently sampling Dirichlet distributions of the transition matrix
    rows. A Monte Carlo method for sampling reversible MSMs was introduced
    in [3]_. Here we employ a much more efficient algorithm introduced in [4]_.

    .. [1] Swope, W. C. and J. W. Pitera and F. Suits: Describing protein
        folding kinetics by molecular dynamics simulations: 1. Theory.
        J. Phys. Chem. B 108: 6571-6581 (2004)

    .. [2] Kuensch, H. R.: The jackknife and the bootstrap for general
        stationary observations. Ann. Stat. 17, 1217-1241 (1989)

    .. [3] Noe, F.: Probability Distributions of Molecular Observables computed
        from Markov Models. J. Chem. Phys. 128, 244103 (2008)

    .. [4] Trendelkamp-Schroer, B, H. Wu, F. Paul and F. Noe:
        Estimation and uncertainty of reversible Markov models.
        http://arxiv.org/abs/1507.05990

    """
    # format data
    dtrajs = _types.ensure_dtraj_list(dtrajs)

    connectivity = 'largest' if connected else 'none'

    # MLE or error estimation?
    if errors is None:
        estimator = _ML_MSM(reversible=reversible, connectivity=connectivity)
    elif errors == 'bayes':
        estimator = _Bayes_MSM(reversible=reversible,
                               connectivity=connectivity,
                               nsamples=nsamples,
                               show_progress=show_progress)
    else:
        # str() keeps this a NotImplementedError even for non-string values;
        # the surrounding spaces keep the message readable.
        raise NotImplementedError('Error estimation method ' + str(errors)
                                  + ' currently not implemented')

    # go
    itsobj = _ImpliedTimescales(estimator, lags=lags, nits=nits,
                                n_jobs=n_jobs, show_progress=show_progress)
    itsobj.estimate(dtrajs)
    return itsobj
def rewrite_dtrajs_to_core_sets(dtrajs, core_set, in_place=False):
    r""" Rewrite trajectories that contain unassigned states.

    The given discrete trajectories are rewritten such that states not in the
    core set are -1. Trajectories that begin with unassigned states will be
    truncated here. Index offsets are computed to keep assignment to original
    data.

    Examples
    --------
    Let's assume we want to restrict the core sets to 0, 1 and 3:

    >>> import numpy as np
    >>> dtrajs = [np.array([5, 4, 1, 3, 4, 4, 5, 3, 0, 1]),
    ...           np.array([4, 4, 4, 5]),
    ...           np.array([4, 4, 5, 1, 2, 3])]
    >>> dtraj_core, offsets, n_cores = rewrite_dtrajs_to_core_sets(dtrajs, core_set=[0, 1, 3])
    >>> print(dtraj_core)
    [array([ 1,  3, -1, -1, -1,  3,  0,  1]), array([ 1, -1,  3])]

    We reach the first milestone in the first trajectory after two steps and
    in the third trajectory after three steps:

    >>> print(offsets)
    [2, None, 3]

    Since the second trajectory never visited a core set, it will be removed
    and marked as such in the offsets lists by a 'None'. Each entry
    corresponds to one entry in the input list.

    Parameters
    ----------
    dtrajs: array_like or list of array_like
        Discretized trajectory or list of discretized trajectories.

    core_set: array-like of ints
        Pass an array of micro-states to define the core sets. A list or
        tuple of such arrays is concatenated into a single set.

    in_place: boolean, default=False
        if True, replace the current dtrajs
        if False, return a copy

    Returns
    -------
    dtrajs, offsets, n_cores: list of ndarray(dtype=int), list, int
        The rewritten trajectories (those that never visit a core are
        dropped), one offset per input trajectory (``None`` for dropped
        ones), and the number of distinct core states.
    """
    import copy
    from pyemma.util import types

    dtrajs = types.ensure_dtraj_list(dtrajs)

    if isinstance(core_set, (list, tuple)):
        core_set = list(map(types.ensure_int_vector, core_set))
        core_set = np.unique(np.concatenate(core_set))
    else:
        core_set = np.unique(types.ensure_int_vector(core_set))

    n_cores = len(core_set)

    if not in_place:
        dtrajs = copy.deepcopy(dtrajs)

    # if we have no state definition at the beginning of a trajectory, we
    # store the offset to the first milestone.
    offsets = [0] * len(dtrajs)

    for i, d in enumerate(dtrajs):
        # set non-core states to -1
        # (np.isin supersedes np.in1d, which is deprecated in NumPy 2)
        outside_core_set = ~np.isin(d, core_set)
        if not np.any(outside_core_set):
            continue
        d[outside_core_set] = -1

        # np.where returns ascending indices, so the first one is the first
        # frame inside a core; store it as a plain int.
        where_positive = np.where(d >= 0)[0]
        offsets[i] = int(where_positive[0]) if len(where_positive) > 0 else None

        # traj never reached a core set?
        if offsets[i] is None:
            warnings.warn(
                'The entire trajectory with index {i} never visited a core set!'
                .format(i=i))
        elif offsets[i] > 0:
            warnings.warn(
                'The trajectory with index {i} had to be truncated for not starting in a core.'
                .format(i=i))
            dtrajs[i] = d[offsets[i]:]

    # filter empty dtrajs
    dtrajs = [d for i, d in enumerate(dtrajs) if offsets[i] is not None]

    return dtrajs, offsets, n_cores