Example #1
    def sample_indexes_by_cluster(self, clusters, nsample, replace=True):
        """Samples trajectory/time indexes according to the given sequence of states.

        Parameters
        ----------
        clusters : iterable of integers
            It contains the cluster indexes to be sampled

        nsample : int
            Number of samples per cluster. If replace = False, the number of returned samples per
            cluster may be smaller if fewer than nsample indexes are available for that cluster.

        replace : boolean, optional
            Whether the sample is with or without replacement

        Returns
        -------
        indexes : list of ndarray( (N, 2) )
            List of the sampled indices by cluster.
            Each element is an index array with N rows (N = nsample, or fewer if replace = False),
            with rows consisting of a tuple (i, t), where i is the index of the trajectory and t is
            the time index within the trajectory.
        """

        # Build the catalogue of state indexes (index_states) if it has never been computed
        if len(self._index_states) == 0:
            self._index_states = index_states(self.dtrajs)

        return sample_indexes_by_state(self._index_states[clusters],
                                       nsample,
                                       replace=replace)
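
A minimal usage sketch for the method above, assuming a k-means clustering object estimated with pyemma (the toy data array, the number of clusters, and the sampled cluster indexes are illustrative assumptions, not part of the example):

    import numpy as np
    import pyemma.coordinates as coor

    data = np.random.randn(1000, 2)            # toy 2D data (assumption)
    cluster = coor.cluster_kmeans(data, k=10)  # estimated clustering object

    # draw 20 (trajectory, time) index pairs each for clusters 0 and 3, with replacement
    samples = cluster.sample_indexes_by_cluster([0, 3], 20)
    for arr in samples:
        assert arr.shape == (20, 2)            # rows are (i, t) pairs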
Example #3
 def test_sample_by_state_replace(self):
     dtraj = [0, 1, 2, 3, 2, 1, 0]
     idx = dt.index_states(dtraj)
     sidx = dt.sample_indexes_by_state(idx, 5)
     for i in range(4):
         assert (sidx[i].shape[0] == 5)
         for t in range(sidx[i].shape[0]):
             assert (dtraj[sidx[i][t, 1]] == i)
Example #4
 def test_sample_by_sequence(self):
     dtraj = [0, 1, 2, 3, 2, 1, 0]
     idx = dt.index_states(dtraj)
     seq = [0, 1, 1, 1, 0, 0, 0, 0, 1, 1]
     sidx = dt.sample_indexes_by_sequence(idx, seq)
     assert(np.alltrue(sidx.shape == (len(seq), 2)))
     for t in range(sidx.shape[0]):
         assert(sidx[t, 0] == 0)  # did we pick the right traj?
         assert(dtraj[sidx[t, 1]] == seq[t])  # did we pick the right states?
Example #5
 def test_twotraj(self):
     dtrajs = [[0, 1, 2, 3, 2, 1, 0], [3, 4, 5]]
     # states are indexed across both trajectories
     res = dt.index_states(dtrajs)
     expected = [np.array([[0, 0], [0, 6]]), np.array([[0, 1], [0, 5]]),
                 np.array([[0, 2], [0, 4]]), np.array([[0, 3], [1, 0]]),
                 np.array([[1, 1]]), np.array([[1, 2]])]
     assert(len(res) == len(expected))
     for i in range(len(res)):
         assert(res[i].shape == expected[i].shape)
         assert(np.alltrue(res[i] == expected[i]))
Example #7
 def active_state_indexes(self):
     """
     Ensures that the connected states are indexed and returns the indices
     """
     self._check_is_estimated()
     if not hasattr(self, '_active_state_indexes'):
         from pyemma.util.discrete_trajectories import index_states
         self._active_state_indexes = index_states(self.discrete_trajectories_active)
     return self._active_state_indexes
Example #8
 def test_sample_by_state_replace_subset(self):
     dtraj = [0, 1, 2, 3, 2, 1, 0]
     idx = dt.index_states(dtraj)
     subset = [1, 2]
     sidx = dt.sample_indexes_by_state(idx, 5, subset=subset)
     for i in range(len(subset)):
         assert (sidx[i].shape[0] == 5)
         for t in range(sidx[i].shape[0]):
             assert (dtraj[sidx[i][t, 1]] == subset[i])
Example #9
 def test_onetraj_sub(self):
     dtraj = [0, 1, 2, 3, 2, 1, 0]
     # subset=[2, 3] is a valid subset, so no error is expected
     res = dt.index_states(dtraj, subset=[2, 3])
     expected = [np.array([[0, 2], [0, 4]]), np.array([[0, 3]])]
     assert (len(res) == len(expected))
     for i in range(len(res)):
         assert (res[i].shape == expected[i].shape)
         assert (np.alltrue(res[i] == expected[i]))
Example #11
    def observable_state_indexes(self):
        """
        Ensures that the observable states are indexed and returns the indices
        """
        try:  # if we have this attribute, return it
            return self._observable_state_indexes
        except AttributeError:  # didn't exist? then create it.
            import pyemma.util.discrete_trajectories as dt

            self._observable_state_indexes = dt.index_states(self.discrete_trajectories_obs)
            return self._observable_state_indexes
Example #12
    def active_state_indexes(self):
        """
        Ensures that the connected states are indexed and returns the indices
        """
        self._check_is_estimated()
        try:  # if we have this attribute, return it
            return self._active_state_indexes
        except AttributeError:  # didn't exist? then create it.
            import pyemma.util.discrete_trajectories as dt

            self._active_state_indexes = dt.index_states(self.discrete_trajectories_full, subset=self.active_set)
            return self._active_state_indexes
Example #13
    def test_performance(self):
        import pyemma.util.discrete_trajectories as dt
        state = np.random.RandomState(42)
        n_states = 10000
        dtrajs = [state.randint(0, n_states, size=100000) for _ in range(500)]

        selection = np.random.choice(np.arange(n_states), size=(500,), replace=False)
        with timing('pyemma'):
            out2 = dt.index_states(dtrajs, selection)
        with timing('cpp'):
            out = sample.compute_index_states(dtrajs, selection)

        assert len(out) == len(out2)
        for o1, o2 in zip(out, out2):
            np.testing.assert_array_almost_equal(o1, o2)
Example #14
    def index_clusters(self):
        """Returns trajectory/time indexes for all the clusters

        Returns
        -------
        indexes : list of ndarray( (N_i, 2) )
            For each state, all trajectory and time indexes where this cluster occurs.
            Each matrix has a number of rows equal to the number of occurrences of the corresponding state,
            with rows consisting of a tuple (i, t), where i is the index of the trajectory and t is the time index
            within the trajectory.
        """
        if len(self._dtrajs) == 0:  # nothing assigned yet, doing that now
            self._dtrajs = self.assign()

        if len(self._index_states) == 0:  # has never been run
            self._index_states = index_states(self._dtrajs)

        return self._index_states
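
A minimal sketch of the returned structure, applying index_states directly to a toy discrete trajectory (the trajectory below is an illustrative assumption; the expected output follows the same pattern as the tests above):

    import numpy as np
    from pyemma.util.discrete_trajectories import index_states

    dtraj = np.array([0, 1, 1, 0])
    idx = index_states(dtraj)
    # idx[0] -> array([[0, 0], [0, 3]]): state 0 occurs in trajectory 0 at times 0 and 3
    # idx[1] -> array([[0, 1], [0, 2]]): state 1 occurs in trajectory 0 at times 1 and 2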
Example #15
 def test_big(self):
     dtraj = dt.read_discrete_trajectory(testpath + '2well_traj_100K.dat')
     # just run these to see if there's any exception
     dt.index_states(dtraj)
Example #16
 def test_big(self):
     import pyemma.datasets
     dtraj = pyemma.datasets.load_2well_discrete().dtraj_T100K_dt10
     # just run these to see if there's any exception
     dt.index_states(dtraj)
Example #17
    def _estimate(self, dtrajs):
        if self.E is None or self.w is None or self.m is None:
            raise ValueError("E, w or m was not specified. Stopping.")

        # get trajectory counts. This sets _C_full and _nstates_full
        dtrajstats = self._get_dtraj_stats(dtrajs)
        self._C_full = dtrajstats.count_matrix()  # full count matrix
        self._nstates_full = self._C_full.shape[0]  # number of states

        # set active set. This is at the same time a mapping from active to full
        if self.connectivity == 'largest':
            # statdist not given - full connectivity on all states
            self.active_set = dtrajstats.largest_connected_set
        else:
            # for 'None' and 'all' all visited states are active
            self.active_set = dtrajstats.visited_set

        # FIXME: setting is_estimated before so that we can start using the parameters just set, but this is not clean!
        # is estimated
        self._is_estimated = True

        # if active set is empty, we can't do anything.
        if _np.size(self.active_set) == 0:
            raise RuntimeError('Active set is empty. Cannot estimate AMM.')

        from pyemma.util.discrete_trajectories import index_states
        self._active_state_indexes = index_states(dtrajs,
                                                  subset=self.active_set)

        # active count matrix and number of states
        self._C_active = dtrajstats.count_matrix(subset=self.active_set)
        self._nstates = self._C_active.shape[0]

        # compute derived quantities
        # back-mapping from full to lcs
        self._full2active = -1 * _np.ones(dtrajstats.nstates, dtype=int)
        self._full2active[self.active_set] = _np.arange(len(self.active_set))

        # slice out active states from E matrix
        _dset = list(set(_np.concatenate(dtrajs)))
        _rras = [_dset.index(s) for s in self.active_set]
        self.E_active = self.E[_rras]

        if not self.sparse:
            self._C_active = self._C_active.toarray()
            self._C_full = self._C_full.toarray()

        # reversibly counted
        self._C2 = 0.5 * (self._C_active + self._C_active.T)
        self._nz = _np.nonzero(self._C2)
        self._csum = _np.sum(self._C_active, axis=1)  # row sums C

        # get ranges of Markov model expectation values
        if self.support_ci == 1:
            self.E_min = _np.min(self.E_active, axis=0)
            self.E_max = _np.max(self.E_active, axis=0)
        else:
            # PyEMMA confidence interval calculation fails sometimes with conf=1.0
            self.E_min, self.E_max = _ci(self.E_active, conf=self.support_ci)

        # dimensions of E matrix
        self.n_mstates_active, self.n_exp_active = _np.shape(self.E_active)

        assert self.n_exp_active == len(self.w)
        assert self.n_exp_active == len(self.m)

        self.count_outside = []
        self.count_inside = []
        self._lls = []

        i = 0
        # Determine which experimental values are outside the support as defined by the Confidence interval
        for emi, ema, mm, mw in zip(self.E_min, self.E_max, self.m, self.w):
            if mm < emi or ema < mm:
                self.logger.info(
                    "Experimental value %f is outside the support (%f,%f)" %
                    (mm, emi, ema))
                self.count_outside.append(i)
            else:
                self.count_inside.append(i)
            i = i + 1

        self.logger.info(
            "Total experimental constraints outside support %d of %d" %
            (len(self.count_outside), len(self.E_min)))

        # A number of initializations
        self.P, self.pi = msmest.tmatrix(self._C_active,
                                         reversible=True,
                                         return_statdist=True)
        self.lagrange = _np.zeros(self.m.shape)
        self._pihat = self.pi.copy()
        self._update_mhat()
        self._dmhat = 1e-1 * _np.ones(_np.shape(self.mhat))

        # Determine number of slices of R-tensors computable at once with the given cache size
        self._slicesz = _np.floor(self.max_cache /
                                  (self.P.nbytes / 1.e6)).astype(int)
        # compute first bundle of slices
        self._update_Rslices(0)

        self._ll_old = self._log_likelihood_biased(self._C_active, self.P,
                                                   self.m, self.mhat, self.w)

        self._lls = [self._ll_old]

        # make sure everything is initialized

        self._update_pihat()
        self._update_mhat()

        self._update_Q()
        self._update_X_and_pi()

        self._ll_old = self._log_likelihood_biased(self._C_active, self.P,
                                                   self.m, self.mhat, self.w)
        self._update_G()

        #
        # Main estimation algorithm
        # 2-step algorithm, lagrange multipliers and pihat have different convergence criteria
        # when the lagrange multipliers have converged, pihat is updated until the log-likelihood has converged (changes are smaller than 1e-3).
        # These do not always converge together, but usually within a few steps of each other.
        # A better heuristic for the latter may be necessary. For realistic cases (the two ubiquitin examples in [1])
        # this yielded results very similar to those with more stringent convergence criteria (changes smaller than 1e-9) with convergence times
        # which are seconds instead of tens of minutes.
        #

        converged = False  # Convergence flag for lagrange multipliers
        i = 0
        die = False
        while i <= self.maxiter:
            pihat_old = self._pihat.copy()
            self._update_pihat()
            if not _np.all(self._pihat > 0):
                self._pihat = pihat_old.copy()
                die = True
                self.logger.warning(
                    "pihat does not have a finite probability for all states, terminating"
                )
            self._update_mhat()
            self._update_Q()
            if i > 1:
                X_old = self.X.copy()
                self._update_X_and_pi()
                if _np.any(self.X[self._nz] < 0) and i > 0:
                    die = True
                    self.logger.warning(
                        "Warning: new X is not proportional to C... reverting to previous step and terminating"
                    )
                    self.X = X_old.copy()

            if not converged:
                self._newton_lagrange()
            else:  # once Lagrange multipliers are converged compute likelihood here
                P = self.X / self.pi[:, None]
                _ll_new = self._log_likelihood_biased(self._C_active, P,
                                                      self.m, self.mhat,
                                                      self.w)
                self._lls.append(_ll_new)

            # General case fixed-point iteration
            if len(self.count_outside) > 0:
                if i > 1 and _np.all(
                    (_np.abs(self._dmhat) /
                     self.sigmas) < self.eps) and not converged:
                    self.logger.info(
                        "Converged Lagrange multipliers after %i steps..." % i)
                    converged = True
            # Special case
            else:
                if _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8:
                    self.logger.info(
                        "Converged Lagrange multipliers after %i steps..." % i)
                    converged = True
            # if Lagrange multipliers are converged, check whether log-likelihood has converged
            if converged and _np.abs(self._lls[-2] - self._lls[-1]) < 1e-8:
                self.logger.info("Converged pihat after %i steps..." % i)
                die = True
            if die:
                break
            if i == self.maxiter:
                self.logger.info("Failed to converge within %i iterations. "
                                 "Consider increasing max_iter(now=%i)" %
                                 (i, self.max_iter))
            i += 1

        _P = msmest.tmatrix(self._C_active, reversible=True, mu=self._pihat)

        self._dtrajs_full = dtrajs
        self._connected_sets = msmest.connected_sets(self._C_full)
        self.set_model_params(P=_P,
                              pi=self._pihat,
                              reversible=True,
                              dt_model=self.timestep_traj.get_scaled(self.lag))

        return self
Example #18
    with open("Qtanh_0_05_profile/T_used.dat","r") as fin: 
        T = float(fin.read())

    tempdirs = [ "T_{:.2f}_{}".format(T, x) for x in [1,2,3] ]
    topfile = tempdirs[0] + "/" + topname
    trajfiles = [ x + "/" + trajname for x in tempdirs ]

    # initialize traj input info.
    feat = coor.featurizer(topfile)
    inp = coor.source(trajfiles, feat)

    # Load MSM's that have already been calculated.
    dirs, dtrajs, lagtimes, models = util.load_markov_state_models()

    model_msm = models[7] # lagtime of 200

    # Determine the number of clusters by the number of timescales.
    n_pcca = 2
    n_sample = 100

    # Grab frames from pcca clustering
    model_msm.pcca(n_pcca)
    pcca_dist = model_msm.metastable_distributions
    active_state_indexes = dt.index_states(dtrajs)
    pcca_samples = dt.sample_indexes_by_distribution(active_state_indexes, pcca_dist, n_sample)

    outfiles = [ 'msm/pcca{}.xtc'.format(x) for x in range(1, n_pcca + 1) ]
                    
    coor.save_trajs(inp, pcca_samples, outfiles=outfiles)

Example #19
 def test_subset_error(self):
     dtraj = [0, 1, 2, 3, 2, 1, 0]
     # should be a ValueError because this is not a subset
     with self.assertRaises(ValueError):
         dt.index_states(dtraj, subset=[3, 4, 5])