def test_connected_count_matrix(self):
    """Check ``is_connected`` on the fixture count matrices in both modes."""
    # Directed (default): only C_connected is expected to be reported as connected.
    self.assertFalse(is_connected(self.C_not_connected))
    self.assertTrue(is_connected(self.C_connected))

    # Undirected: C_not_connected is expected to count as connected once
    # edge directions are ignored.
    self.assertTrue(is_connected(self.C_not_connected, directed=False))
def __init__(self, n_states, initial_model=None, reversible=True, stationary=False,
             transition_matrix_sampling_steps=1000, p0_prior='mixed', transition_matrix_prior='mixed',
             output='gaussian', nsamples=100, ):
    """Set up the sampler options and the Dirichlet-style priors.

    Parameters
    ----------
    n_states : int
        Number of hidden states; fixes the expected shape of both priors.
    initial_model : object, optional
        Initial HMM. If given, a copy (``initial_model.copy()``) is stored and
        its ``initial_distribution`` / ``transition_matrix`` are used for the
        ``'mixed'`` priors.
    reversible : bool
        Stored as ``self.reversible``. When True and an initial model is given,
        ``transition_matrix + prior_C`` must be connected.
    stationary : bool
        Stored as ``self.stationary``.
    transition_matrix_sampling_steps : int
        Stored as ``self.transition_matrix_sampling_steps``.
    p0_prior : None, str or ndarray(n_states)
        Prior for the initial distribution. ``None`` / ``'sparse'`` gives
        zeros, ``'uniform'`` gives ones, ``'mixed'`` uses the initial model's
        initial distribution (or ``None`` if no initial model was given), and
        an ndarray of shape ``(n_states,)`` is copied as is.
    transition_matrix_prior : None, str or ndarray(n_states, n_states)
        Prior count matrix, with the same modes as ``p0_prior`` applied to the
        transition matrix.
    output : str
        Stored as ``self.output``.
    nsamples : int
        Stored as ``self.nsamples``.

    Raises
    ------
    ValueError
        If a prior is an array of the wrong shape or an unknown mode string.
    NotImplementedError
        If ``reversible`` is set and the initial model plus prior is disconnected.
    """
    super(BayesianHMMSampler, self).__init__()
    self.reversible = reversible
    self.stationary = stationary
    self.n_states = n_states

    # Use user-specified initial model, if provided.
    if initial_model is not None:
        self.initial_model = initial_model.copy()
    else:
        self.initial_model = None

    # prior initial vector
    # The ndarray case is tested first: comparing an array against a string
    # would otherwise be evaluated element-wise.
    if isinstance(p0_prior, np.ndarray):
        if len(p0_prior.shape) == 1 and p0_prior.shape[0] == self.n_states:
            self.prior_n0 = np.array(p0_prior)
        else:
            raise ValueError(f'initial distribution prior must have dimension {n_states}')
    elif p0_prior is None or p0_prior == 'sparse':
        self.prior_n0 = np.zeros(self.n_states)
    elif p0_prior == 'mixed':
        if initial_model is not None:
            self.prior_n0 = np.array(self.initial_model.initial_distribution)
        else:
            self.prior_n0 = None
    elif p0_prior == 'uniform':
        self.prior_n0 = np.ones(n_states)
    else:
        raise ValueError(f'initial distribution prior mode undefined: {p0_prior}')

    # prior count matrix
    # Every branch here must look at transition_matrix_prior (not p0_prior),
    # so the two priors can be chosen independently of each other.
    if isinstance(transition_matrix_prior, np.ndarray):
        if transition_matrix_prior.shape == (self.n_states, self.n_states):
            self.prior_C = np.array(transition_matrix_prior)
        else:
            # Previously a wrongly shaped array left prior_C undefined.
            raise ValueError(f'transition matrix prior must have shape ({n_states}, {n_states})')
    elif transition_matrix_prior is None or transition_matrix_prior == 'sparse':
        self.prior_C = np.zeros((self.n_states, self.n_states))
    elif transition_matrix_prior == 'mixed':
        if initial_model is not None:
            self.prior_C = np.array(self.initial_model.transition_matrix)
        else:
            self.prior_C = None
    elif transition_matrix_prior == 'uniform':
        self.prior_C = np.ones((n_states, n_states))
    else:
        raise ValueError(f'transition matrix prior mode undefined: {transition_matrix_prior}')

    # check if we work with these options
    # (prior_C is never None here: it is only None when no initial model was given)
    if (reversible and self.initial_model is not None
            and not msmest.is_connected(self.initial_model.transition_matrix + self.prior_C, directed=True)):
        raise NotImplementedError('Trying to sample disconnected HMM with option reversible:\n '
                                  f'{self.initial_model.transition_matrix}\n'
                                  'Use prior to connect, select connected subset, or use reversible=False.')

    # sampling options
    self.transition_matrix_sampling_steps = transition_matrix_sampling_steps
    self.nsamples = nsamples
    self.output = output
def _update_transition_matrix(self, model):
    """Sample a new transition matrix and initial distribution into ``model``.

    The posterior count matrix is ``model.count_matrix() + self.prior_C``.
    One transition matrix is drawn from it with ``msmest.sample_tmatrix``.
    The initial distribution is either the stationary distribution of that
    sample (``self.stationary``) or a Dirichlet draw from the first-timestep
    counts plus ``self.prior_n0``. Both are passed to ``model.update``.

    Raises
    ------
    NotImplementedError
        If ``self.reversible`` is set and the posterior count matrix is not
        connected.
    """
    posterior_counts = model.count_matrix() + self.prior_C

    # check if we work with these options
    if self.reversible and not msmest.is_connected(posterior_counts, directed=True):
        raise NotImplementedError('Encountered disconnected count matrix with sampling option reversible:\n '
                                  f'{posterior_counts}\nUse prior to ensure connectivity or use reversible=False.')

    # ensure consistent sparsity pattern (the estimate might have additional zeros because of underflows)
    # TODO: these steps work around a bug in msmtools. Should be fixed there
    estimate = msmest.transition_matrix(posterior_counts, reversible=self.reversible,
                                        maxiter=10000, warn_not_converged=False)
    vanishing = np.where(estimate + estimate.T == 0)
    posterior_counts[vanishing] = 0

    # run sampler
    sampled_P = msmest.sample_tmatrix(posterior_counts, nsample=1,
                                      nsteps=self.transition_matrix_sampling_steps,
                                      reversible=self.reversible)

    # INITIAL DISTRIBUTION
    if not self.stationary:
        init_counts = model.count_init().astype(float)
        posterior_init = init_counts + self.prior_n0
        support = posterior_init > 0
        # sample the initial distribution from the posterior, only on entries with positive weight
        initial_dist = np.zeros_like(init_counts)
        initial_dist[support] = np.random.dirichlet(posterior_init[support])
    else:
        # initial distribution is consistent with the sampled transition matrix
        initial_dist = _tmatrix_disconnected.stationary_distribution(sampled_P, C=posterior_counts)

    # update HMM with new sample
    model.update(initial_dist, sampled_P)
def _estimate(self, dtrajs):
    """Estimate the transition matrix from discrete trajectories.

    Counts lagged transitions, selects the active set according to
    ``self.connectivity``, estimates the transition matrix on the active
    count matrix and stores the result through ``set_model_params``.

    Raises
    ------
    RuntimeError
        If the active set is empty.
    ValueError
        If ``connectivity == 'none'``, ``reversible`` is set and the active
        count matrix is not connected.
    NotImplementedError
        For any connectivity mode other than ``'largest'`` and ``'none'``.

    Returns
    -------
    self
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    # harvest discrete statistics: reuse a statistics object if one was
    # handed in, otherwise compute and store them
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        stats = dtrajs
    else:
        stats = _DiscreteTrajectoryStats(dtrajs)

    # check if this MSM seems too large to be dense
    if stats.nstates > 4000 and not self.sparse:
        self.logger.warning('Building a dense MSM with ' + str(stats.nstates)
                            + ' states. This can be inefficient or unfeasible in terms of both runtime '
                              'and memory consumption. Consider using sparse=True.')

    # count lagged
    stats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = stats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity != 'largest':
        # any other mode: all visited states are active
        self.active_set = stats.visited_set
    elif self.statdist_constraint is None:
        # statdist not given - full connectivity on all states
        self.active_set = stats.largest_connected_set
    else:
        self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix and number of states
    self._C_active = stats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # back-mapping from full to active set; -1 marks states outside the active set
    self._full2active = _np.full(stats.nstates, -1, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # restrict stationary distribution to active set
    statdist_active = None
    if self.statdist_constraint is not None:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # Validate the connectivity mode before estimating
    if self.connectivity not in ('largest', 'none'):
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)
    if self.connectivity == 'none':
        # reversible mode only possible if active set is connected
        # - in this case all visited states are connected and thus
        # this mode is identical to 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                             'because the set of all visited states is not reversibly connected')

    # Estimate transition matrix
    P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                 mu=statdist_active, maxiter=self.maxiter,
                                 maxerr=self.maxerr)

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
def _estimate(self, dtrajs):
    """Estimate a Bayesian hidden Markov state model by sampling.

    An initial maximum-likelihood HMSM is obtained either from the superclass
    estimator (``self.init_hmsm is None``) or by copying the attributes of
    ``self.init_hmsm``. ``bhmm.bayesian_hmm`` then draws ``self.nsamples``
    models starting from it. The samples are stored through
    ``update_model_params(samples=...)`` and ``self.sampled_trajs``.

    Raises
    ------
    NotImplementedError
        If ``self.reversible`` is set and the hidden count matrix of the
        initial model is not connected.

    Returns
    -------
    The result of ``self.submodel(...)`` for the configured connectivity and
    observation subset.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
        # memorize the observation state for bhmm and reset
        # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
        default_connectivity = self.connectivity
        default_mincount_connectivity = self.mincount_connectivity
        default_observe_nonempty = self.observe_nonempty
        self.connectivity = None
        self.observe_nonempty = False
        self.mincount_connectivity = 0
        # NOTE(review): accuracy is overwritten and not restored afterwards, as before.
        self.accuracy = 1e-2  # this is sufficient for an initial guess
        try:
            super(BayesianHMSM, self)._estimate(dtrajs)
        finally:
            # restore the user's settings even if the ML estimation fails, so a
            # failed call does not leave the estimator with modified parameters
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
    else:  # if given another initialization, must copy its attributes
        # TODO: this is too tedious - need to automatize parameter+result copying between estimators.
        init = self.init_hmsm
        self.nstates = init.nstates
        self.reversible = init.is_reversible
        self.stationary = init.stationary
        # trajectories
        self._dtrajs_full = init._dtrajs_full
        self._dtrajs_lagged = init._dtrajs_lagged
        self._observable_set = init._observable_set
        self._dtrajs_obs = init._dtrajs_obs
        # MLE estimation results
        self.likelihoods = init.likelihoods  # Likelihood history
        self.likelihood = init.likelihood
        self.hidden_state_probabilities = init.hidden_state_probabilities  # gamma variables
        self.hidden_state_trajectories = init.hidden_state_trajectories  # Viterbi path
        self.count_matrix = init.count_matrix  # hidden count matrix
        self.initial_count = init.initial_count  # hidden init count
        self.initial_distribution = init.initial_distribution
        self._active_set = init._active_set
        # update HMM Model
        self.update_model_params(P=init.transition_matrix,
                                 pobs=init.observation_probabilities,
                                 dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

    # check if we have a valid initial model
    import msmtools.estimation as msmest
    if self.reversible and not msmest.is_connected(self.count_matrix):
        raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                  + '\nwith reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                  + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                  + 'or shorter stride (to use more of the data), '
                                  + 'or using a lower value for mincount_connectivity.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    nstates_full = msmest.number_of_states(dtrajs)
    if self.nstates_obs < nstates_full:
        eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
        # fill active states
        B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = self.observation_probabilities

    # HMM sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    from bhmm import discrete_hmm, bayesian_hmm
    hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

    sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                               reversible=self.reversible, stationary=self.stationary,
                               p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                               store_hidden=self.store_hidden, call_back=call_back)

    if self.show_progress:
        self._progress_force_finish(stage=0)

    # Samples
    drawn = [sampled_hmm.sampled_hmms[i] for i in range(self.nsamples)]
    samples = []
    for hmm in drawn:
        # restrict to observable set if necessary
        Bobs = hmm.output_model.output_probabilities[:, self.observable_set]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(_HMSM(hmm.transition_matrix, pobs, pi=hmm.stationary_distribution,
                             dt_model=self.dt_model))

    # store results
    self.sampled_trajs = [hmm.hidden_state_trajectories for hmm in drawn]
    self.update_model_params(samples=samples)

    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # OBSERVATION SET
    observe_subset = 'nonempty' if self.observe_nonempty else None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity)
def _estimate(self, dtrajs):
    """Estimate the MSM, optionally with core sets and a stationary constraint.

    Trajectory statistics come from ``self._get_dtraj_stats``. If both a core
    set and a stationary distribution constraint are set, the constraint is
    expanded to the full state space first. The active set is then chosen
    from ``self.connectivity`` and the transition matrix is estimated on the
    active count matrix.

    Raises
    ------
    ValueError
        If core set and stationary constraint differ in length, or if
        ``connectivity == 'none'`` with ``reversible`` set on a disconnected
        active count matrix.
    RuntimeError
        If the active set is empty.
    NotImplementedError
        For connectivity modes other than ``'largest'`` and ``'none'``.

    Returns
    -------
    self
    """
    # get trajectory counts
    stats = self._get_dtraj_stats(dtrajs)
    self._C_full = stats.count_matrix()  # full count matrix
    self._nstates_full = self._C_full.shape[0]  # number of states

    # check for consistency between statdist constraints and core set
    if self.core_set is not None and self.statdist_constraint is not None:
        if len(self.core_set) != len(self.statdist_constraint):
            raise ValueError('Number of core sets and stationary distribution '
                             'constraints do not match.')

        # rewrite statdist constraints to full set for compatibility reasons
        # TODO: find a more consistent way of dealing with this
        import copy
        constraint_on_cores = copy.deepcopy(self.statdist_constraint)
        self.statdist_constraint = _np.zeros(self._nstates_full)
        self.statdist_constraint[self.core_set] = constraint_on_cores

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity != 'largest':
        # any other mode: all visited states are active
        self.active_set = stats.visited_set
    elif self.statdist_constraint is None:
        # statdist not given - full connectivity on all states
        self.active_set = stats.largest_connected_set
    else:
        self.active_set = self._prepare_input_revpi(self._C_full, self.statdist_constraint)

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    self._is_estimated = True

    # if active set is empty, we can't do anything.
    if _np.size(self.active_set) == 0:
        raise RuntimeError('Active set is empty. Cannot estimate MSM.')

    # active count matrix
    self._C_active = stats.count_matrix(subset=self.active_set)

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()

    self._nstates = self._C_active.shape[0]

    # back-mapping from full to active set; -1 marks states outside the active set
    self._full2active = _np.full(stats.nstates, -1, dtype=int)
    self._full2active[self.active_set] = _np.arange(len(self.active_set))

    # restrict stationary distribution to active set
    statdist_active = None
    if self.statdist_constraint is not None:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # TODO: non-rev estimate of msmtools does not comply with its own api...
    # The stationary distribution is only requested for the reversible,
    # unconstrained case.
    opt_args = {}
    if statdist_active is None and self.reversible:
        opt_args['return_statdist'] = True

    # Validate the connectivity mode before estimating
    if self.connectivity not in ('largest', 'none'):
        raise NotImplementedError(
            'MSM estimation with connectivity=%s is currently not implemented.' % self.connectivity)
    if self.connectivity == 'none':
        # reversible mode only possible if active set is connected
        # - in this case all visited states are connected and thus
        # this mode is identical to 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError('Reversible MSM estimation is not possible with connectivity mode "none", '
                             'because the set of all visited states is not reversibly connected')

    # Estimate transition matrix
    P = msmest.transition_matrix(self._C_active, reversible=self.reversible,
                                 mu=statdist_active, maxiter=self.maxiter,
                                 maxerr=self.maxerr, **opt_args)

    # msmtools returns a tuple when the stationary distribution was requested
    if isinstance(P, tuple):
        P, statdist_active = P

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._connected_sets = stats.connected_sets
    self.set_model_params(P=P, pi=statdist_active, reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))
    return self
def calculate_transition_matrix(transitions, topic_labels):
    """Build a coarse-grained, row-stochastic transition matrix between topics.

    Steps, in order:

    1. Collect all states occurring in ``transitions`` and count the
       ``element[0] -> element[1]`` transitions.
    2. Row-normalize the counts and reduce the matrix with
       ``transform_transition_matrix_connected`` (done twice, renormalizing in
       between).
    3. Build a map from each combined state label (``'topic_9_26'`` and the
       like) to the topic numbers it contains, and coarse-grain with
       ``cg_transition_matrix``.
    4. If the coarse-grained matrix is not connected, reduce it again, apply
       ``softmax`` and drop the removed entries from ``topic_labels``.

    Parameters
    ----------
    transitions : iterable of iterables
        One entry per interview; each interview is a sequence of elements
        whose items ``[0]`` and ``[1]`` are the source and target state labels.
    topic_labels : list
        Labels of the coarse-grained topics. Its length fixes the number of
        columns of the map. NOTE: entries are deleted from this list in place
        when nodes are removed in step 4.

    Returns
    -------
    (transition, topic_labels) : tuple
        The coarse-grained transition matrix and the (possibly shortened)
        ``topic_labels`` list.

    Raises
    ------
    AssertionError
        If one of the internal consistency checks fails.
    """
    # All distinct states, in sorted order; a state's position is its matrix index.
    topic_list = sorted({state
                         for interview in transitions
                         for element in interview
                         for state in element})
    state_index = {state: idx for idx, state in enumerate(topic_list)}

    n_states = len(topic_list)
    transition_matrix = np.zeros([n_states, n_states]).astype(float)

    # Count all transitions
    for interview in transitions:
        for element in interview:
            transition_matrix[state_index[element[0]], state_index[element[1]]] += 1

    # Create the transition matrix with probability values. Rows without any
    # outgoing transition divide 0 by 0; the resulting NaNs are set to 0.
    with np.errstate(divide='ignore', invalid='ignore'):
        transition_matrix_scaled = (transition_matrix.T / transition_matrix.sum(axis=1)).T
    transition_matrix_scaled[np.isnan(transition_matrix_scaled)] = 0

    transition_matrix_scaled, removed_nodes = transform_transition_matrix_connected(
        transition_matrix_scaled)
    # NOTE(review): deletes in the order given by removed_nodes, as before -
    # confirm whether the helper reports indices relative to the shrinking list.
    for element in removed_nodes:
        del topic_list[element]
    assert len(topic_list) == transition_matrix_scaled.shape[0], \
        'state list and transition matrix differ in size after removing nodes'

    transition_matrix_scaled = transition_matrix_scaled.astype(float)
    transition_matrix_scaled = (
        transition_matrix_scaled / transition_matrix_scaled.sum(axis=1, keepdims=True))
    assert np.allclose(transition_matrix_scaled.sum(axis=1), 1)

    # Second reduction pass on the renormalized matrix
    transition_matrix_scaled, removed_nodes = transform_transition_matrix_connected(
        transition_matrix_scaled)
    for element in removed_nodes:
        del topic_list[element]
    assert len(topic_list) == transition_matrix_scaled.shape[0], \
        'state list and transition matrix differ in size after removing nodes'

    assert msmtools.analysis.is_connected(transition_matrix_scaled)
    assert np.allclose(transition_matrix_scaled.sum(axis=1), 1)

    # Map each combined state to the topic numbers in its label, with equal
    # weight per topic (label 'topic_9_26' -> columns 9 and 26, weight 1/2 each).
    binary_map = np.zeros([len(transition_matrix_scaled), len(topic_labels)])
    for i, label in enumerate(topic_list):
        topic_numbers = label.split('_')[1:]
        for topic_number in topic_numbers:
            binary_map[i, int(topic_number)] = 1 / len(topic_numbers)

    transition = cg_transition_matrix(transition_matrix_scaled, binary_map)
    transition[np.isnan(transition)] = 0

    # NOTE(review): the softmax and the label removal are assumed to belong to
    # this branch - confirm against the original layout.
    if not is_connected(transition):
        transition, removed_nodes = transform_transition_matrix_connected(transition)
        transition = softmax(transition, axis=1)
        for element in removed_nodes:
            del topic_labels[element]

    transition = transition / transition.sum(axis=1, keepdims=True)
    assert np.allclose(transition.sum(axis=1), 1)
    assert msmtools.analysis.is_transition_matrix(transition)

    return (transition, topic_labels)
def _estimate(self, dtrajs):
    """Estimate a Bayesian hidden Markov state model by sampling.

    Without ``self.init_hmsm`` the initial model comes from the
    maximum-likelihood superclass estimator. With ``self.init_hmsm`` its
    results are copied, after checking that lag, number of states, data and
    (for a different stride) the set of observed microstates are compatible.
    ``bhmm.bayesian_hmm`` then draws ``self.nsamples`` models, which are stored
    through ``update_model_params(samples=...)`` and ``self.sampled_trajs``.

    Raises
    ------
    UserWarning
        If ``init_hmsm`` has an incompatible lag or number of states, or if the
        chosen stride leads to a different set of observed microstates.
    NotImplementedError
        If ``init_hmsm`` was estimated on different data, or if
        ``self.reversible`` is set and the hidden count matrix is disconnected.

    Returns
    -------
    The result of ``self.submodel(...)`` for the configured connectivity and
    observation subset.
    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)

    if self.init_hmsm is None:  # estimate using maximum-likelihood superclass
        # memorize the observation state for bhmm and reset
        # TODO: more elegant solution is to set Estimator params only temporarily in estimate(X, **kwargs)
        default_connectivity = self.connectivity
        default_mincount_connectivity = self.mincount_connectivity
        default_observe_nonempty = self.observe_nonempty
        self.connectivity = None
        self.observe_nonempty = False
        self.mincount_connectivity = 0
        # NOTE(review): accuracy is overwritten and not restored afterwards, as before.
        self.accuracy = 1e-2  # this is sufficient for an initial guess
        try:
            super(BayesianHMSM, self)._estimate(dtrajs)
        finally:
            # restore the user's settings even if the ML estimation fails, so a
            # failed call does not leave the estimator with modified parameters
            self.connectivity = default_connectivity
            self.mincount_connectivity = default_mincount_connectivity
            self.observe_nonempty = default_observe_nonempty
    else:  # if given another initialization, must copy its attributes
        copy_attributes = ['_nstates', '_reversible', '_pi', '_observable_set', 'likelihoods', 'likelihood',
                           'hidden_state_probabilities', 'hidden_state_trajectories', 'count_matrix',
                           'initial_count', 'initial_distribution', '_active_set']
        check_user_choices = ['lag', '_nstates']

        # check if nstates and lag are compatible
        for attr in check_user_choices:
            if not getattr(self, attr) == getattr(self.init_hmsm, attr):
                raise UserWarning('BayesianHMSM cannot be initialized with init_hmsm with '
                                  'incompatible lag or nstates.')

        if (len(dtrajs) != len(self.init_hmsm.dtrajs_full) or
                not all((_np.array_equal(d1, d2) for d1, d2 in zip(dtrajs, self.init_hmsm.dtrajs_full)))):
            raise NotImplementedError('Bayesian HMM estimation with init_hmsm is currently only implemented ' +
                                      'if applied to the same data.')

        # TODO: implement more elegant solution to copy-pasting effective stride evaluation from ML HMM.
        # EVALUATE STRIDE
        if self.stride == 'effective':
            # by default use lag as stride (=lag sampling), because we currently have no better theory for deciding
            # how many uncorrelated counts we can make
            self.stride = self.lag
            # get a quick estimate from the spectral radius of the nonreversible
            from pyemma.msm import estimate_markov_model
            msm_nr = estimate_markov_model(dtrajs, lag=self.lag, reversible=False, sparse=False,
                                           connectivity='largest', dt_traj=self.timestep_traj)
            # if we have more than nstates timescales in our MSM, we use the next (neglected) timescale as an
            # estimate of the decorrelation time
            if msm_nr.nstates > self.nstates:
                corrtime = max(1, msm_nr.timescales()[self.nstates - 1])
                # use the smaller of these two pessimistic estimates
                self.stride = int(min(self.lag, 2 * corrtime))

        # if stride is different to init_hmsm, check if microstates in lagged-strided trajs are compatible
        if self.stride != self.init_hmsm.stride:
            dtrajs_lagged_strided = _lag_observations(dtrajs, self.lag, stride=self.stride)
            _nstates_obs = _number_of_states(dtrajs_lagged_strided, only_used=True)
            _nstates_obs_full = _number_of_states(dtrajs)

            if _np.setxor1d(_np.concatenate(dtrajs_lagged_strided),
                            _np.concatenate(self.init_hmsm._dtrajs_lagged)).size != 0:
                raise UserWarning('Choice of stride has excluded a different set of microstates than in ' +
                                  'init_hmsm. Set of observed microstates in time-lagged strided trajectories ' +
                                  'must match to the one used for init_hmsm estimation.')

            self._dtrajs_full = dtrajs
            self._dtrajs_lagged = dtrajs_lagged_strided
            self._nstates_obs_full = _nstates_obs_full
            self._nstates_obs = _nstates_obs
            self._observable_set = _np.arange(self._nstates_obs)
            self._dtrajs_obs = dtrajs
        else:
            copy_attributes += ['_dtrajs_full', '_dtrajs_lagged', '_nstates_obs_full', '_nstates_obs',
                                '_observable_set', '_dtrajs_obs']

        # update self with estimates from init_hmsm
        self.__dict__.update(
            {k: i for k, i in self.init_hmsm.__dict__.items() if k in copy_attributes})

        # as mentioned in the docstring, take init_hmsm observed set observation probabilities
        self.observe_nonempty = False

        # update HMM Model
        self.update_model_params(P=self.init_hmsm.transition_matrix,
                                 pobs=self.init_hmsm.observation_probabilities,
                                 dt_model=TimeUnit(self.dt_traj).get_scaled(self.lag))

    # check if we have a valid initial model
    import msmtools.estimation as msmest
    if self.reversible and not msmest.is_connected(self.count_matrix):
        raise NotImplementedError('Encountered disconnected count matrix:\n ' + str(self.count_matrix)
                                  + '\nwith reversible Bayesian HMM sampler using lag=' + str(self.lag)
                                  + ' and stride=' + str(self.stride) + '. Consider using shorter lag, '
                                  + 'or shorter stride (to use more of the data), '
                                  + 'or using a lower value for mincount_connectivity.')

    # here we blow up the output matrix (if needed) to the FULL state space because we want to use dtrajs in the
    # Bayesian HMM sampler. This is just an initialization.
    nstates_full = msmest.number_of_states(dtrajs)
    if self.nstates_obs < nstates_full:
        eps = 0.01 / nstates_full  # default output probability, in order to avoid zero columns
        # full state space output matrix. make sure there are no zero columns
        B_init = eps * _np.ones((self.nstates, nstates_full), dtype=_np.float64)
        # fill active states
        B_init[:, self.observable_set] = _np.maximum(eps, self.observation_probabilities)
        # renormalize B to make it row-stochastic
        B_init /= B_init.sum(axis=1)[:, None]
    else:
        B_init = self.observation_probabilities

    # HMM sampler
    if self.show_progress:
        self._progress_register(self.nsamples, description='Sampling HMSMs', stage=0)

        def call_back():
            self._progress_update(1, stage=0)
    else:
        call_back = None

    from bhmm import discrete_hmm, bayesian_hmm

    if self.init_hmsm is not None:
        hmm_mle = self.init_hmsm.hmm
    else:
        hmm_mle = discrete_hmm(self.initial_distribution, self.transition_matrix, B_init)

    sampled_hmm = bayesian_hmm(self.discrete_trajectories_lagged, hmm_mle, nsample=self.nsamples,
                               reversible=self.reversible, stationary=self.stationary,
                               p0_prior=self.p0_prior, transition_matrix_prior=self.transition_matrix_prior,
                               store_hidden=self.store_hidden, call_back=call_back)

    if self.show_progress:
        self._progress_force_finish(stage=0)

    # Samples
    sample_inp = [(m.transition_matrix, m.stationary_distribution, m.output_probabilities)
                  for m in sampled_hmm.sampled_hmms]

    samples = []
    for P, pi, pobs in sample_inp:  # restrict to observable set if necessary
        Bobs = pobs[:, self.observable_set]
        pobs = Bobs / Bobs.sum(axis=1)[:, None]  # renormalize
        samples.append(_HMSM(P, pobs, pi=pi, dt_model=self.dt_model))

    # store results
    self.sampled_trajs = [sampled_hmm.sampled_hmms[i].hidden_state_trajectories for i in range(self.nsamples)]
    self.update_model_params(samples=samples)

    # deal with connectivity
    states_subset = None
    if self.connectivity == 'largest':
        states_subset = 'largest-strong'
    elif self.connectivity == 'populous':
        states_subset = 'populous-strong'

    # OBSERVATION SET
    observe_subset = 'nonempty' if self.observe_nonempty else None

    # return submodel (will return self if all None)
    return self.submodel(states=states_subset, obs=observe_subset,
                         mincount_connectivity=self.mincount_connectivity)
def set_model_params(self, P=None, pi=None, reversible=None, dt_model='1 step', neig=None):
    """Set or update the basic model parameters.

    The argument list must contain the full list of essential (independent)
    model parameters. It may additionally contain derived parameters, e.g. to
    save the cost of re-computing them.

    Parameters
    ----------
    P : ndarray(n,n)
        transition matrix
    pi : ndarray(n), optional, default=None
        stationary distribution. Can be given if it was already computed,
        e.g. by the estimator.
    reversible : bool, optional, default=None
        whether P is reversible with respect to its stationary distribution.
        If None (default), it is determined from P.
    dt_model : str, optional, default='1 step'
        Description of the physical time corresponding to the model time
        step. May be used by analysis algorithms such as plotting tools to
        pretty-print the axes. By default '1 step', i.e. there is no physical
        time unit. Specify by a number, whitespace and unit. Permitted units
        are (* is an arbitrary string):

        |  'fs',  'femtosecond*'
        |  'ps',  'picosecond*'
        |  'ns',  'nanosecond*'
        |  'us',  'microsecond*'
        |  'ms',  'millisecond*'
        |  's',   'second*'
    neig : int or None
        The number of eigenvalues / eigenvectors to be kept. If None, a
        default is used: all eigenvalues for a dense P, 10 for a sparse P.

    Raises
    ------
    ValueError
        If P is given and is not a transition matrix.
    NotImplementedError
        If P is given and is disconnected.

    Notes
    -----
    Explicitly define all independent model parameters in the argument list
    of this function (by mandatory or keyword arguments).
    """
    import msmtools.analysis as msmana

    have_P = P is not None

    # check input
    if have_P:
        import msmtools.estimation as msmest
        if not msmana.is_transition_matrix(P):
            raise ValueError('T is not a transition matrix.')
        # check connectivity
        # TODO: abusing C-connectivity test for T. Either provide separate T-connectivity test or move to a central
        # TODO: location because it's the same code.
        if not msmest.is_connected(P):
            raise NotImplementedError('Transition matrix T is disconnected. '
                                      'This is currently not supported in the MSM object.')

    # update all parameters
    self.update_model_params(P=P, pi=pi, reversible=reversible, dt_model=dt_model, neig=neig)

    # set ncv for consistency
    if not hasattr(self, 'ncv'):
        self.ncv = None

    # update derived quantities
    from pyemma.util.units import TimeUnit
    self._timeunit_model = TimeUnit(self.dt_model)

    # set P and derived quantities if available
    if have_P:
        from scipy.sparse import issparse
        # set states
        self._nstates = np.shape(P)[0]
        if self.reversible is None:
            self.reversible = msmana.is_reversible(P)
        self.sparse = issparse(P)
        # set or correct eig param
        if neig is None:
            self.neig = 10 if self.sparse else self._nstates
tr = [el for el in window(trajectories)] count_matrix = np.zeros( (unique.shape[0], unique.shape[0])).astype(float) for element in tr: count_matrix[element[0], element[1]] = count_matrix[element[0], element[1]] + float(1) count_matrix = count_matrix + 1e-12 transition_matrix = (count_matrix / count_matrix.sum(axis=1, keepdims=1)) assert np.allclose(transition_matrix.sum(axis=1), 1) assert msmtools.analysis.is_transition_matrix(transition_matrix) assert is_connected(transition_matrix) binary_map = (unique / unique.sum(axis=1, keepdims=1)) new_tra = cg_transition_matrix(transition_matrix, binary_map) new_tra[np.isnan(new_tra)] = 0 new_tra = new_tra + 1e-12 new_tra = (new_tra / new_tra.sum(axis=1, keepdims=1)) np.savetxt('transition_matrix' + str(d + 1), new_tra, fmt='%.8f') assert np.allclose(new_tra.sum(axis=1), 1) mm = train_markov_chain(new_tra) stationary_prob = print_stationary_distributions( mm, features_df.KeywordLabel.to_list()) stationary_probs.append(stationary_prob) pdb.set_trace()
def estimate_transition_matrix_reversible(C, Xinit=None, maxiter=1000000,
                                          maxerr=1e-8, return_statdist=False,
                                          return_conv=False,
                                          warn_not_converged=True):
    """
    iterative method for estimating a maximum likelihood reversible transition matrix

    The iteration equation implemented here is:
        t_ij = (c_ij + c_ji) / ((c_i / x_i) + (c_j / x_j))
    Please note that there is a better (=faster) iteration that has been described in
    Prinz et al, J. Chem. Phys. 134, p. 174105 (2011). We should implement that too.

    Parameters
    ----------
    C : ndarray (n,n)
        count matrix. Must be fully connected; otherwise a ValueError is
        raised.
    Xinit = None : ndarray (n,n)
        initial value for the matrix of absolute transition probabilities.
        Unless set otherwise, the initial value is computed from C by the
        module's ``__initX`` helper. A given array is copied and never
        modified.
    maxiter = 1000000 : int
        maximum number of iterations before the method exits
    maxerr = 1e-8 : float
        convergence tolerance. This specifies the maximum change of the
        Euclidean norm of relative stationary probabilities
        (x_i = sum_k x_ik). The relative stationary probability changes
        e_i = (x_i^(1) - x_i^(2))/(x_i^(1) + x_i^(2)) are used in order to
        track changes in small probabilities. The Euclidean norm of the
        change vector, |e_i|_2, is compared to maxerr.
    return_statdist : bool, default=False
        If set to true, the stationary distribution is also returned
    return_conv : bool, default=False
        If set to true, the likelihood history and the pi_change history is
        returned.
    warn_not_converged : bool, default=True
        Issues a NotConvergedWarning if not converged.

    Returns
    -------
    T or (T,pi) or (T,lhist,pi_changes) or (T,pi,lhist,pi_changes)
    T : ndarray (n,n)
        transition matrix. This is the only return for
        return_statdist = False, return_conv = False
    (pi) : ndarray (n)
        stationary distribution. Only returned if return_statdist = True
    (lhist) : ndarray (k)
        likelihood history. Has the length of the number of iterations
        needed. Only returned if return_conv = True
    (pi_changes) : ndarray (k)
        history of the changes in pi. Has the length of the number of
        iterations needed. Only returned if return_conv = True

    Raises
    ------
    ValueError
        If the count matrix C is not fully connected.

    """
    from msmtools.estimation import is_connected
    from msmtools.estimation import log_likelihood

    # check input -- the exception has to be raised, not merely constructed,
    # otherwise a disconnected count matrix slips through unnoticed
    if not is_connected(C):
        raise ValueError('Count matrix is not fully connected. ' +
                         'Need fully connected count matrix for ' +
                         'reversible transition matrix estimation.')
    converged = False
    n = np.shape(C)[0]
    # initialization
    C2 = C + C.T  # reversibly counted matrix
    nz = np.nonzero(C2)
    csum = np.sum(C, axis=1)  # row sums C
    if Xinit is None:
        X = __initX(C)  # initial X
    else:
        # work on a private copy: X is updated in place below and the
        # caller's array must not change under their feet
        X = np.array(Xinit, dtype=float)
    xsum = np.sum(X, axis=1)  # row sums x
    D = np.zeros((n, n))  # helper matrix, reused in every iteration
    # if convergence history requested, initialize variables
    if return_conv:
        diffs = np.zeros(maxiter)
        # likelihood
        lhist = np.zeros(maxiter)
        T = X / xsum[:, np.newaxis]
        lhist[0] = log_likelihood(C, T)
    # iteration
    i = 1
    while (i < maxiter - 1) and (not converged):
        # c_i / x_i
        c_over_x = csum / xsum
        # d_ij = (c_i/x_i) + (c_j/x_j)
        D[:] = c_over_x[:, np.newaxis]
        D += c_over_x
        # update estimate
        X[nz] = C2[nz] / D[nz]
        X[nz] /= np.sum(X[nz])  # renormalize
        xsumnew = np.sum(X, axis=1)
        # compute difference in pi
        diff = __relative_error(xsum, xsumnew)
        # update pi
        xsum = xsumnew
        # any convergence history wanted?
        if return_conv:
            # update T and likelihood
            T = X / xsum[:, np.newaxis]
            lhist[i] = log_likelihood(C, T)
            diffs[i] = diff
        # converged?
        converged = (diff < maxerr)
        i += 1
    # finalize and return
    T = X / xsum[:, np.newaxis]
    if warn_not_converged and not converged:
        warnings.warn(
            "Reversible transition matrix estimation didn't converge.",
            msmtools.util.exceptions.NotConvergedWarning)
    if return_statdist and return_conv:
        return (T, xsum, lhist[0:i], diffs[0:i])
    if return_statdist:
        return (T, xsum)
    if return_conv:
        return (T, lhist[0:i], diffs[0:i])
    return T  # else just return T
def _estimate(self, dtrajs):
    """Estimate the MSM from discrete trajectories and store it on ``self``.

    Parameters
    ----------
    dtrajs : list containing ndarrays(dtype=int) or ndarray(n, dtype=int) or :class:`pyemma.msm.util.dtraj_states.DiscreteTrajectoryStats`
        discrete trajectories, stored as integer ndarrays (arbitrary size)
        or a single ndarray for only one trajectory.

    Returns
    -------
    self
        This estimator, with count matrices, active set and model
        parameters set.

    Raises
    ------
    ValueError
        If ``connectivity == 'none'``, ``reversible`` is set and the active
        count matrix is not connected.
    NotImplementedError
        If ``connectivity`` is neither ``'largest'`` nor ``'none'``.

    """
    # ensure right format
    dtrajs = ensure_dtraj_list(dtrajs)
    # harvest discrete statistics
    if isinstance(dtrajs, _DiscreteTrajectoryStats):
        dtrajstats = dtrajs
    else:
        # compute and store discrete trajectory statistics
        dtrajstats = _DiscreteTrajectoryStats(dtrajs)
        # check if this MSM seems too large to be dense
        if dtrajstats.nstates > 4000 and not self.sparse:
            # Logger.warn is a deprecated alias; warning() is the documented call
            self.logger.warning(
                'Building a dense MSM with ' + str(dtrajstats.nstates) + ' states. This can be '
                'inefficient or unfeasible in terms of both runtime and memory consumption. '
                'Consider using sparse=True.')

    # count lagged
    dtrajstats.count_lagged(self.lag, count_mode=self.count_mode)

    # full count matrix and number of states
    self._C_full = dtrajstats.count_matrix()
    self._nstates_full = self._C_full.shape[0]

    # set active set. This is at the same time a mapping from active to full
    if self.connectivity == 'largest':
        if self.statdist_constraint is None:
            # statdist not given - full connectivity on all states
            self.active_set = dtrajstats.largest_connected_set
        else:
            # statdist given - simple connectivity on all nonzero probability states
            nz = _np.nonzero(self.statdist_constraint)[0]
            Cnz = dtrajstats.count_matrix(subset=nz)
            self.active_set = nz[msmest.largest_connected_set(
                Cnz, directed=False)]
    else:
        # every other mode: all visited states are active
        self.active_set = dtrajstats.visited_set

    # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
    # is estimated
    self._is_estimated = True

    # active count matrix and number of states
    self._C_active = dtrajstats.count_matrix(subset=self.active_set)
    self._nstates = self._C_active.shape[0]

    # computed derived quantities
    # back-mapping from full to active: -1 marks states outside the active set
    self._full2active = _np.full(dtrajstats.nstates, -1, dtype=int)
    self._full2active[self.active_set] = _np.arange(
        len(self.active_set), dtype=int)

    # restrict stationary distribution to active set
    if self.statdist_constraint is None:
        statdist_active = None
    else:
        statdist_active = self.statdist_constraint[self.active_set]
        statdist_active /= statdist_active.sum()  # renormalize

    # Estimate transition matrix
    if self.connectivity not in ('largest', 'none'):
        # report the actual mode, not the literal text 'self.connectivity'
        raise NotImplementedError(
            'MSM estimation with connectivity=\'%s\' is currently not implemented.'
            % (self.connectivity,))
    if self.connectivity == 'none':
        # reversible mode only possible if active set is connected
        # - in this case all visited states are connected and thus
        # this mode is identical to 'largest'
        if self.reversible and not msmest.is_connected(self._C_active):
            raise ValueError(
                'Reversible MSM estimation is not possible with connectivity mode \'none\', ' +
                'because the set of all visited states is not reversibly connected')
    P = msmest.transition_matrix(self._C_active,
                                 reversible=self.reversible,
                                 mu=statdist_active,
                                 maxiter=self.maxiter,
                                 maxerr=self.maxerr)

    # continue sparse or dense?
    if not self.sparse:
        # converting count matrices to arrays. As a result the
        # transition matrix and all subsequent properties will be
        # computed using dense arrays and dense matrix algebra.
        self._C_full = self._C_full.toarray()
        self._C_active = self._C_active.toarray()
        P = P.toarray()

    # Done. We set our own model parameters, so this estimator is
    # equal to the estimated model.
    self._dtrajs_full = dtrajs
    self._connected_sets = msmest.connected_sets(self._C_full)
    self.set_model_params(P=P, pi=statdist_active,
                          reversible=self.reversible,
                          dt_model=self.timestep_traj.get_scaled(self.lag))

    return self
def is_weakly_connected(self):
    """Return the result of the undirected connectivity test on ``self._Tij``."""
    transition_matrix = self._Tij
    # directed=False: edge direction is ignored by the connectivity test
    return msmest.is_connected(transition_matrix, directed=False)
def is_strongly_connected(self):
    """Return the result of the directed connectivity test on ``self._Tij``."""
    transition_matrix = self._Tij
    # directed=True: edge direction is respected by the connectivity test
    return msmest.is_connected(transition_matrix, directed=True)