def test_iterator(self):
    for c in self.cl:
        for itraj, chunk in c:
            assert types.is_int(itraj)
            assert types.is_int_matrix(chunk)
            assert chunk.shape[0] <= c.chunksize
            assert chunk.shape[1] == c.dimension()

def propagate(self, p0, k): """ Propagates the initial distribution p0 k times Computes the product .. math:: p_k = p_0^T P^k If the lag time of transition matrix :math:`P` is :math:`\tau`, this will provide the probability distribution at time :math:`k \tau`. :param p0: ndarray - initial distribution. Vector of size of the active set :param k: int - number of time steps :return: ndarray - distribution after k steps, vector of size of the active set """ p0 = _types.ensure_ndarray(p0, ndim=1, kind='numeric') assert _types.is_int(k) and k >= 0, 'k must be a non-negative integer' if k == 0 or k == 1: return self.eval(0).propagate(p0, k).real else: pprop = self.eval(0).propagate(p0, 1).real for i in range(1, k): pprop = self.eval(i).propagate(pprop, 1).real return pprop
def propagate(self, p0, k): r""" Propagates the initial distribution p0 k times Computes the product .. math:: p_k = p_0^T P^k If the lag time of transition matrix :math:`P` is :math:`\tau`, this will provide the probability distribution at time :math:`k \tau`. Parameters ---------- p0 : ndarray(n,) Initial distribution. Vector of size of the active set. k : int Number of time steps Returns ---------- pk : ndarray(n,) Distribution after k steps. Vector of size of the active set. """ p0 = _types.ensure_ndarray(p0, ndim=1, size=self.nstates, kind='numeric') assert _types.is_int(k) and k >= 0, 'k must be a non-negative integer' if k == 0: # simply return p0 normalized return p0 / p0.sum() if self.is_sparse: # sparse: we don't have a full eigenvalue set, so just propagate pk = _np.array(p0) for i in range(k): pk = _np.dot(pk.T, self.transition_matrix) else: # dense: employ eigenvalue decomposition self._ensure_eigendecomposition(self.nstates) from pyemma.util.linalg import mdot pk = mdot(p0.T, self.eigenvectors_right(), _np.diag(_np.power(self.eigenvalues(), k)), self.eigenvectors_left()).real # normalize to 1.0 and return return pk / pk.sum()
@fixed_seed.setter
def fixed_seed(self, val):
    from pyemma.util import types
    if isinstance(val, bool) or val is None:
        if val:
            self._fixed_seed = 42
        else:
            self._fixed_seed = random.randint(0, 2**32 - 1)
    elif types.is_int(val):
        if val < 0 or val > 2**32 - 1:
            self.logger.warning("seed has to be non-negative and smaller than 2**32-1."
                                " Seed will be chosen randomly.")
            self.fixed_seed = False
        else:
            self._fixed_seed = val
    else:
        raise ValueError("fixed seed has to be bool or integer")

    self.logger.debug("seed = %i", self._fixed_seed)

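# Usage sketch (illustrative): how the setter above behaves from the caller's
# side, assuming the fixed_seed argument of pyemma.coordinates.cluster_kmeans;
# the data is synthetic.
import numpy as np
import pyemma.coordinates as coor

data = np.random.rand(1000, 2)
cl_fixed = coor.cluster_kmeans(data, k=10, fixed_seed=True)    # seed fixed to 42
cl_exact = coor.cluster_kmeans(data, k=10, fixed_seed=171)     # explicit integer seed
cl_random = coor.cluster_kmeans(data, k=10, fixed_seed=False)  # fresh random seed
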
def test_dimension(self):
    assert types.is_int(self.tica_obj.dimension())
    # Here:
    assert self.tica_obj.dimension() == 1
    # Test other variants
    tica = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=1.0)
    assert tica.dimension() == 2
    tica = api.tica(data=self.X, lag=self.lag, dim=-1, var_cutoff=0.9)
    assert tica.dimension() == 1
    with self.assertRaises(ValueError):
        # trying to set both dim and var_cutoff is forbidden
        api.tica(data=self.X, lag=self.lag, dim=1, var_cutoff=0.9)
    with self.assertRaises(ValueError):
        api.tica(lag=self.lag, var_cutoff=0)
    with self.assertRaises(ValueError):
        api.tica(lag=self.lag, var_cutoff=1.1)

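# Usage sketch (illustrative): choosing the TICA output dimension either by a
# fixed dim or by a kinetic variance cutoff, mirroring the cases tested above.
# The data is synthetic and lag=10 is arbitrary.
import numpy as np
import pyemma.coordinates as api

X = np.random.rand(10000, 2)
tica_fixed = api.tica(data=X, lag=10, dim=1)         # exactly one output dimension
tica_var = api.tica(data=X, lag=10, var_cutoff=0.9)  # as many dimensions as needed
print(tica_var.dimension())                          # to explain 90% of kinetic variance
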
def __init__(self, estimator, lags=None, nits=None, n_jobs=1, show_progress=True):
    r"""Implied timescales for a series of lag times.

    Parameters
    ----------
    estimator : Estimator
        Estimator to be used for estimating timescales at each lag time.
    lags : array-like with integers or None, optional
        integer lag times at which the implied timescales will be calculated.
        If set to None (default), a list of lag times will be generated
        automatically.
    nits : int, optional
        maximum number of implied timescales to be computed and stored. If
        fewer timescales are available, nits will be set to a smaller value
        during estimation. None means the number of timescales will be
        determined automatically.
    n_jobs : int, optional
        how many subprocesses to start to estimate the models for each lag time.
    show_progress : bool, optional, default=True
        Show progress bars during estimation?

    """
    # initialize
    self.estimator = get_estimator(estimator)
    self.nits = nits
    self.n_jobs = n_jobs
    self.show_progress = show_progress

    # set lag times
    if _types.is_int(lags):  # got a single integer. We create a list
        self._lags = _generate_lags(lags, 1.5)
    else:  # got a list of ints or None - otherwise raise exception.
        self._lags = _types.ensure_int_vector_or_None(lags, require_order=True)

    # estimated its. 2D-array with indexing: lagtime, its
    self._its = None
    # sampled its's. 3D-array with indexing: lagtime, its, sample
    self._its_samples = None

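# Usage sketch (illustrative): this estimator is normally constructed through
# the pyemma.msm.its convenience function. The discrete trajectory is made up,
# and the name of the timescales property is assumed.
import pyemma.msm as msm

dtrajs = [[0, 0, 1, 1, 0, 2, 2, 1, 0, 0, 1, 2, 2, 2, 1, 0] * 10]
its = msm.its(dtrajs, lags=[1, 2, 3], nits=2)  # 2 timescales at 3 lag times
print(its.timescales)                          # indexed by (lagtime, timescale)
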
def _progress_register(self, amount_of_work, description='', stage=0):
    """ Registers a progress which can be reported/displayed via a progress bar.

    Parameters
    ----------
    amount_of_work : int
        Amount of steps the underlying algorithm has to perform.
    description : str, optional
        This string will be displayed in the progress bar widget.
    stage : int, optional, default=0
        If the algorithm has multiple different stages (e.g. calculate means
        in the first pass over the data, calculate covariances in the second),
        one needs to estimate different times of arrival.
    """
    if not self.show_progress:
        return

    if not is_int(amount_of_work):
        raise ValueError("amount_of_work has to be of integer type. But is %s"
                         % type(amount_of_work))

    # if we do not have enough work to do for the overhead of a progress bar,
    # we just define a dummy here
    if amount_of_work <= ProgressReporter._pg_threshold:
        class dummy(object):
            pass
        pg = dummy()
        pg.__str__ = lambda: description
        pg.__repr__ = pg.__str__
        pg._dummy = None
        pg.description = ''
    else:
        pg = _ProgressBar(amount_of_work, description=description)

    self._prog_rep_progressbars[stage] = pg
    # pg.description = description
    self._prog_rep_descriptions[stage] = description

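# Usage sketch (illustrative): how an algorithm built on the ProgressReporter
# mix-in would drive this registration. _progress_update and
# _progress_force_finish are assumed companion methods of the mix-in, and
# show_progress is assumed to default to True on it.
class MyAlgorithm(ProgressReporter):

    def run(self, n_iter=100):
        self._progress_register(n_iter, description='optimizing', stage=0)
        for _ in range(n_iter):
            # ... one unit of actual work per iteration ...
            self._progress_update(1, stage=0)
        self._progress_force_finish(stage=0)
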
def __init__(self, test_model, test_estimator, mlags=None, conf=0.95,
             err_est=False, n_jobs=None, show_progress=True):
    # set model and estimator
    # copy the test model, since the estimation of cktest modifies the model.
    from copy import deepcopy
    self.test_model = deepcopy(test_model)
    self.test_estimator = test_estimator

    # set mlags
    try:
        maxlength = np.max([len(dtraj) for dtraj in test_estimator.discrete_trajectories_full])
    except AttributeError:
        maxlength = np.max(test_estimator.trajectory_lengths())
    maxmlag = int(math.floor(maxlength / test_estimator.lag))
    if mlags is None:
        mlags = maxmlag
    if types.is_int(mlags):
        mlags = np.arange(mlags)
    mlags = types.ensure_ndarray(mlags, ndim=1, kind='i')
    if np.any(mlags > maxmlag):
        mlags = mlags[np.where(mlags <= maxmlag)]
        self.logger.warning('Changed mlags as some mlags exceeded maximum trajectory length.')
    if np.any(mlags < 0):
        mlags = mlags[np.where(mlags >= 0)]
        self.logger.warning('Changed mlags as some mlags were negative.')
    self.mlags = mlags

    # set conf and error handling
    self.conf = conf
    self.has_errors = issubclass(self.test_model.__class__, SampledModel)
    if self.has_errors:
        self.test_model.set_model_params(conf=conf)
    self.err_est = err_est
    if err_est and not self.has_errors:
        raise ValueError('Requested errors on the estimated models, '
                         'but the model is not able to calculate errors at all')
    self.n_jobs = n_jobs
    self.show_progress = show_progress

@lags.setter
def lags(self, lags):
    """Sets the lag times at which the models will be estimated.

    Remembers the last non-None value, in order to extend the lag list.
    If the input data during estimation is unchanged, we will only need to
    re-estimate new lag times. If a lag time is removed, we clean up the
    underlying models and derived data (timescales).
    """
    if hasattr(self, '_lags') and self._lags is not None:
        self._last_lags = frozenset(self._lags)
        # remove obsolete models and computed data.
        if self._models and lags is not None:
            survivors = np.array([i for i in self._successful_lag_indexes
                                  if self._models[i].lag in lags])
            if survivors.size == 0:
                self._models = []
                self._its = None
                self._its_samples = None
                self._successful_lag_indexes = None
            else:
                self._models = np.array(self._models)[survivors].tolist()
                self._successful_lag_indexes = np.arange(len(self._models))
                self._its = self._its[survivors]
                if self.samples_available:
                    self._its_samples = self._its_samples[survivors]
    else:
        self._last_lags = set()

    # set lag times
    if _types.is_int(lags):  # got a single integer. We create a list
        self._lags = _generate_lags(lags, 1.5)
    elif lags is None:
        # obtain a series of meaningful lag times from the input trajectory length
        self._lags = None
    else:  # got a list of ints or None - otherwise raise exception.
        self._lags = _types.ensure_int_vector_or_None(lags, require_order=True)
        self._lags.sort()

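# Usage sketch (illustrative): extending the lag list on an existing
# ImpliedTimescales object. Because the setter above remembers the previous
# lags and keeps surviving models, only the new lag times should need
# re-estimation on the next call to the standard Estimator estimate() entry
# point (assumed here). The discrete trajectory is made up.
import pyemma.msm as msm

dtrajs = [[0, 1, 1, 0, 2, 2, 0, 1, 2, 0] * 20]
its = msm.its(dtrajs, lags=[1, 2, 5])
its.lags = [1, 2, 5, 10, 20]  # models for lags 1, 2, 5 survive
its.estimate(dtrajs)          # only lags 10 and 20 are estimated anew
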
def test_chunksize(self):
    assert types.is_int(self.pca_obj.chunksize)

def test_lag(self):
    assert types.is_int(self.tica_obj.lag)
    # Here:
    assert self.tica_obj.lag == self.lag

def test_iterator(self):
    for itraj, chunk in self.pca_obj:
        assert types.is_int(itraj)
        assert types.is_float_matrix(chunk)
        assert chunk.shape[1] == self.pca_obj.dimension()

def test_dimension(self):
    c = self.ass
    assert types.is_int(c.dimension())
    assert c.dimension() == 1

def test_dimension(self):
    for c in self.cl:
        assert types.is_int(c.dimension())
        assert c.dimension() == 1

def __init__(self, model, estimator, mlags=None, conf=0.95, err_est=False,
             n_jobs=1, show_progress=True):
    r"""
    Parameters
    ----------
    model : Model
        Model to be tested
    estimator : Estimator
        Parametrized Estimator that has produced the model
    mlags : int or int-array, default=10
        multiples of lag times for testing the Model, e.g. range(10).
        A single int will trigger a range, i.e. mlags=10 maps to
        mlags=range(10). The setting None will choose mlags automatically
        according to the longest available trajectory.
        Note that you need to be able to do a model prediction for each of
        these lag time multiples, e.g. the value 0 only makes sense if
        _predict_observables(0) will work.
    conf : float, default=0.95
        confidence interval for errors
    err_est : bool, default=False
        if the Estimator is capable of error calculation, will compute
        errors for each tau estimate. This option can be computationally
        expensive.
    n_jobs : int, default=1
        how many jobs to use during calculation
    show_progress : bool, default=True
        Show progressbars for calculation?

    """
    # set model and estimator
    self.test_model = model
    self.test_estimator = estimator

    # set mlags
    maxlength = np.max([len(dtraj) for dtraj in estimator.discrete_trajectories_full])
    maxmlag = int(math.floor(maxlength / estimator.lag))
    if mlags is None:
        mlags = maxmlag
    if types.is_int(mlags):
        mlags = np.arange(mlags)
    mlags = types.ensure_ndarray(mlags, ndim=1, kind='i')
    if np.any(mlags > maxmlag):
        mlags = mlags[np.where(mlags <= maxmlag)]
        self.logger.warning('Changed mlags as some mlags exceeded maximum trajectory length.')
    if np.any(mlags < 0):
        mlags = mlags[np.where(mlags >= 0)]
        self.logger.warning('Changed mlags as some mlags were negative.')
    self.mlags = mlags

    # set conf and error handling
    self.conf = conf
    self.has_errors = issubclass(self.test_model.__class__, SampledModel)
    if self.has_errors:
        self.test_model.set_model_params(conf=conf)
    self.err_est = err_est
    if err_est and not self.has_errors:
        raise ValueError('Requested errors on the estimated models, '
                         'but the model is not able to calculate errors at all')
    self.n_jobs = n_jobs
    self.show_progress = show_progress

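# Usage sketch (illustrative): a Chapman-Kolmogorov test is normally created
# through an estimated model rather than by calling this constructor directly,
# using the cktest() convenience method of pyemma MSM objects. The discrete
# trajectory is made up.
import pyemma.msm as msm

dtrajs = [[0, 0, 1, 1, 2, 2, 0, 1, 2, 0] * 20]
M = msm.estimate_markov_model(dtrajs, lag=1)
ck = M.cktest(nsets=2, mlags=5)  # test predictions at 0..4 multiples of the lag
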
def test_chunksize(self):
    for c in self.cl:
        assert types.is_int(c.chunksize)

def test_dimension(self):
    assert types.is_int(self.pca_obj.dimension())
    # Here:
    assert self.pca_obj.dimension() == 1

def test_iterator(self):
    for itraj, chunk in self.inp:
        assert types.is_int(itraj)
        assert types.is_float_matrix(chunk)
        assert chunk.shape[0] == self.inp.chunksize
        assert chunk.shape[1] == self.inp.dimension()

def test_iterator(self):
    for itraj, chunk in self.pca_obj:
        assert types.is_int(itraj)
        assert types.is_float_matrix(chunk)
        assert chunk.shape[0] <= self.pca_obj.chunksize + self.lag
        assert chunk.shape[1] == self.pca_obj.dimension()

def _estimate(self, dtrajs):
    """
    Parameters
    ----------
    dtrajs : list of int-arrays
        discrete trajectories

    Returns
    -------
    hmsm : :class:`EstimatedHMSM <pyemma.msm.estimators.hmsm_estimated.EstimatedHMSM>`
        Estimated Hidden Markov state model

    """
    # ensure right format
    dtrajs = _types.ensure_dtraj_list(dtrajs)
    # if no initial MSM is given, estimate it now
    if self.msm_init is None:
        # estimate with sparse=False, because we need to do PCCA which is
        # currently not implemented for sparse matrices.
        # estimate with store_data=True, because we need an EstimatedMSM
        msm_estimator = _MSMEstimator(lag=self.lag, reversible=self.reversible, sparse=False,
                                      connectivity=self.connectivity, dt_traj=self.timestep_traj)
        msm_init = msm_estimator.estimate(dtrajs)
    else:
        assert isinstance(self.msm_init, _EstimatedMSM), 'msm_init must be of type EstimatedMSM'
        msm_init = self.msm_init
        self.reversible = msm_init.is_reversible

    # generate lagged observations
    if self.stride == 'effective':
        # by default use lag as stride (=lag sampling), because we currently have
        # no better theory for deciding how many uncorrelated counts we can make
        self.stride = self.lag
        # if we have more than nstates timescales in our MSM, we use the next
        # (neglected) timescale as an estimate of the decorrelation time
        if msm_init.nstates > self.nstates:
            corrtime = int(max(1, msm_init.timescales()[self.nstates - 1]))
            # use the smaller of these two pessimistic estimates
            self.stride = min(self.stride, 2 * corrtime)
    # TODO: Here we always use the full observation state space for the estimation.
    dtrajs_lagged = _lag_observations(dtrajs, self.lag, stride=self.stride)

    # check input
    assert _types.is_int(self.nstates) and 1 < self.nstates <= msm_init.nstates, \
        'nstates must be an int in [2,msmobj.nstates]'
    # if hmm.nstates = msm.nstates there is no problem. Otherwise, check spectral gap
    if msm_init.nstates > self.nstates:
        timescale_ratios = msm_init.timescales()[:-1] / msm_init.timescales()[1:]
        if timescale_ratios[self.nstates - 2] < 2.0:
            self.logger.warning('Requested coarse-grained model with %d metastable states '
                                'at lag=%d. The ratio of relaxation timescales between %d and %d '
                                'states is only %f, while we recommend at least 2. It is possible '
                                'that the resulting HMM is inaccurate. Handle with caution.'
                                % (self.nstates, self.lag, self.nstates, self.nstates + 1,
                                   timescale_ratios[self.nstates - 2]))

    # set things from MSM
    # TODO: dtrajs_obs is set here, but not used in estimation. Estimation is always done with
    # TODO: respect to full observation (see above). This is confusing. Define how we want to do this in gen.
    # TODO: observable set is also not used, it is just saved.
    nstates_obs_full = msm_init.nstates_full
    if self.observe_active:
        nstates_obs = msm_init.nstates
        observable_set = msm_init.active_set
        dtrajs_obs = msm_init.discrete_trajectories_active
    else:
        nstates_obs = msm_init.nstates_full
        observable_set = np.arange(nstates_obs_full)
        dtrajs_obs = msm_init.discrete_trajectories_full

    # TODO: this is redundant with BHMM code because that code is currently not easily accessible and
    # TODO: we don't want to re-estimate. Should be reengineered in bhmm.
    # ---------------------------------------------------------------------------------------
    # PCCA-based coarse-graining
    # ---------------------------------------------------------------------------------------
    # PCCA onto the requested number of metastable states
    pcca = msm_init.pcca(self.nstates)
    # HMM output matrix
    eps = 0.01 * (1.0 / nstates_obs_full)  # default output probability, in order to avoid zero columns
    # Use PCCA distributions, but at least eps to avoid 100% assignment to any state (breaks convergence)
    B_conn = np.maximum(msm_init.metastable_distributions, eps)
    # full state space output matrix
    B = eps * np.ones((self.nstates, nstates_obs_full), dtype=np.float64)
    # expand B_conn to full state space
    # TODO: here we always select the active set, no matter if observe_active=True or False.
    B[:, msm_init.active_set] = B_conn[:, :]
    # TODO: at this point we will have zero observation probabilities for states that are not in the
    # TODO: active set. If these occur in the trajectory, that will mean zero columns in the output
    # TODO: probabilities and crash of forward-backward and sampling algorithms.
    # renormalize B to make it row-stochastic
    B /= B.sum(axis=1)[:, None]

    # coarse-grained transition matrix
    P_coarse = pcca.coarse_grained_transition_matrix
    # take care of unphysical values. First symmetrize
    X = np.dot(np.diag(pcca.coarse_grained_stationary_probability), P_coarse)
    X = 0.5 * (X + X.T)
    # if there are values < 0, set to eps
    X = np.maximum(X, eps)
    # turn into coarse-grained transition matrix
    A = X / X.sum(axis=1)[:, None]

    # ---------------------------------------------------------------------------------------
    # Estimate discrete HMM
    # ---------------------------------------------------------------------------------------
    # lazy import bhmm here in order to avoid dependency loops
    import bhmm
    # initialize discrete HMM
    hmm_init = bhmm.discrete_hmm(A, B, stationary=True, reversible=self.reversible)
    # run EM
    hmm = bhmm.estimate_hmm(dtrajs_lagged, self.nstates, lag=1, initial_model=hmm_init,
                            accuracy=self.accuracy, maxit=self.maxit)
    self.hmm = bhmm.DiscreteHMM(hmm)

    # find observable set
    transition_matrix = self.hmm.transition_matrix
    observation_probabilities = self.hmm.output_probabilities
    # TODO: Cutting down... OK, this can be done
    if self.observe_active:  # cut down observation probabilities to active set
        observation_probabilities = observation_probabilities[:, msm_init.active_set]
        observation_probabilities /= observation_probabilities.sum(axis=1)[:, None]  # renormalize

    # parametrize self
    self._dtrajs_full = dtrajs
    self._dtrajs_lagged = dtrajs_lagged
    self._observable_set = observable_set
    self._dtrajs_obs = dtrajs_obs
    self.set_model_params(P=transition_matrix, pobs=observation_probabilities,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self

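# Usage sketch (illustrative): the estimator above is normally driven through
# the top-level API, assumed here to be pyemma.msm.estimate_hidden_markov_model;
# the discrete trajectory is made up and the property names follow the code above.
import pyemma.msm as msm

dtrajs = [[0, 0, 1, 1, 2, 2, 0, 1, 2, 0] * 50]
hmm = msm.estimate_hidden_markov_model(dtrajs, nstates=2, lag=2)
print(hmm.transition_matrix)           # coarse-grained 2-state transition matrix
print(hmm.observation_probabilities)   # output (emission) probabilities
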
def test_chunksize(self):
    assert types.is_int(self.inp.chunksize)

def test_dimension(self):
    assert types.is_int(self.inp.dimension())

def test_chunksize(self):
    assert types.is_int(self.ass.chunksize)