Beispiel #1
0
    def test(self):
        """Check PCCA lumping against the stored reference results.

        Runs ``PCCA.run_pcca`` on the reference transition matrix and
        microstate assignments, relabels the resulting macrostates so that
        they line up with the reference labelling, and then compares both the
        micro-to-macro mapping and the macrostate assignments.
        """
        num_macro = 5

        TC = get("PCCA_ref/tProb.mtx")
        A = get("PCCA_ref/Assignments.Fixed.h5")['arr_0']

        macro_map, macro_assign = PCCA.run_pcca(num_macro, A, TC)
        r_macro_map = get("PCCA_ref/MacroMapping.dat")

        # ``np.int`` was only an alias for the builtin ``int`` and no longer
        # exists in current NumPy releases, so use the builtin directly.
        macro_map = macro_map.astype(int)
        r_macro_map = r_macro_map.astype(int)

        # The order of macrostates might be different between the reference and
        # new lumping. We therefore find a permutation to match them: the first
        # microstate of each new macrostate tells us its reference label.
        permutation_mapping = np.zeros(macro_assign.max() + 1, 'int')
        for i in range(num_macro):
            j = np.where(macro_map == i)[0][0]
            permutation_mapping[i] = r_macro_map[j]

        macro_map_permuted = permutation_mapping[macro_map]
        # Relabels ``macro_assign`` in place.
        MSMLib.apply_mapping_to_assignments(macro_assign, permutation_mapping)

        r_macro_assign = get("PCCA_ref/MacroAssignments.h5")['arr_0']

        eq(macro_map_permuted, r_macro_map)
        eq(macro_assign, r_macro_assign)
Beispiel #2
0
def classic(trajs,
            n_clusters,
            n_medoid_iters,
            metric,
            dim=2,
            lag_time=1,
            show=False,
            desc=None):
    """Cluster with hybrid k-medoids, then build and analyze an MSM.

    The generators found by the clusterer are plotted, a count matrix is
    built from the clusterer's assignments at ``lag_time``, and the resulting
    transition matrix is passed to ``analyze_msm`` and returned.

    ``desc`` labels the analysis; when omitted, a label mentioning
    ``n_clusters`` is generated.
    """
    label = desc
    if label is None:
        label = "Classic, n_clusters=%d" % n_clusters

    clusterer = clustering.HybridKMedoids(
        metric, trajs, k=n_clusters, local_num_iters=n_medoid_iters)
    generators = clusterer.get_generators_as_traj()

    # Index 0 along the second axis, first `dim` entries of the last axis.
    centroid_coords = generators['XYZList'][:, 0, 0:dim]
    plot_centroids(centroid_coords)
    if show:
        pp.show()

    state_assignments = clusterer.get_assignments()
    counts = msml.get_count_matrix_from_assignments(
        state_assignments, n_clusters, lag_time)
    _, t_matrix, _, _ = msml.build_msm(counts)
    analyze_msm(t_matrix, centroid_coords, label, show=show)

    return t_matrix
Beispiel #3
0
    def build_msm(self, lag_time=None):
        """Build an MSM from the loaded trajectories.

        Every frame of every trajectory is assigned to its nearest generator
        of ``self.clusterer`` (argmin of ``Euclidean2d.one_to_all``), a count
        matrix is built from those assignments, and the transition matrix
        returned by ``msml.build_msm`` is handed back.

        Parameters
        ----------
        lag_time : int, optional
            Lag time for counting transitions. When given it is also stored
            in ``self.good_lag_time``; when omitted, ``self.good_lag_time``
            is used.

        Returns
        -------
        t_matrix
            Transition matrix as returned by ``msml.build_msm``.
        """
        if lag_time is None:
            lag_time = self.good_lag_time
        else:
            self.good_lag_time = lag_time

        # Do assignment
        trajs = get_data.get_shimtraj_from_trajlist(self.traj_list)
        metric = classic.Euclidean2d()

        # Allocate array; -1 marks frames past the end of shorter trajectories.
        n_trajs = len(self.traj_list)
        max_traj_len = max([t.shape[0] for t in self.traj_list])
        assignments = -1 * np.ones((n_trajs, max_traj_len), dtype='int')

        # Prepare generators
        pgens = metric.prepare_trajectory(
            self.clusterer.get_generators_as_traj())

        for i, traj in enumerate(trajs):
            ptraj = metric.prepare_trajectory(traj)

            # `range` instead of the Python-2-only `xrange`, so the method
            # runs on both Python 2 and Python 3.
            for j in range(len(traj)):
                d = metric.one_to_all(ptraj, pgens, j)
                assignments[i, j] = np.argmin(d)

        counts = msml.get_count_matrix_from_assignments(assignments,
                                                        n_states=None,
                                                        lag_time=lag_time)
        _, t_matrix, _, _ = msml.build_msm(counts)
        return t_matrix
Beispiel #4
0
def run_pcca_plus(num_macrostates,
                  assignments,
                  tProb,
                  output_dir,
                  flux_cutoff=0.0,
                  objective_function="crispness",
                  do_minimization=True):
    """Lump microstates with PCCA+ and save the results under ``output_dir``.

    Writes ``Chi.dat``, ``A.dat``, ``MacroMapping.dat`` and
    ``MacroAssignments.h5``. ``arglib.die_if_path_exists`` is consulted for
    all four paths before any work is done. ``assignments`` is passed to
    ``MSMLib.apply_mapping_to_assignments`` with the PCCA+ mapping and then
    saved.
    """
    assignments_path = os.path.join(output_dir, "MacroAssignments.h5")
    mapping_path = os.path.join(output_dir, "MacroMapping.dat")
    chi_path = os.path.join(output_dir, 'Chi.dat')
    a_path = os.path.join(output_dir, 'A.dat')
    arglib.die_if_path_exists(
        [assignments_path, mapping_path, chi_path, a_path])

    logger.info("Running PCCA+...")
    pcca_plus_options = dict(flux_cutoff=flux_cutoff,
                             do_minimization=do_minimization,
                             objective_function=objective_function)
    A, chi, vr, micro_to_macro = lumping.pcca_plus(
        tProb, num_macrostates, **pcca_plus_options)

    MSMLib.apply_mapping_to_assignments(assignments, micro_to_macro)

    np.savetxt(chi_path, chi)
    np.savetxt(a_path, A)
    np.savetxt(mapping_path, micro_to_macro, "%d")
    msmbuilder.io.saveh(assignments_path, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                chi_path, a_path, mapping_path, assignments_path)
Beispiel #5
0
    def build_msm(self, lag_time=None):
        """Build an MSM from the loaded trajectories.

        Assigns each frame of each trajectory to the closest generator of
        ``self.clusterer``, counts transitions at the chosen lag time and
        returns the transition matrix produced by ``msml.build_msm``.

        Parameters
        ----------
        lag_time : int, optional
            Lag time used when counting. If provided, it replaces
            ``self.good_lag_time``; otherwise ``self.good_lag_time`` is used.

        Returns
        -------
        t_matrix
            Transition matrix as returned by ``msml.build_msm``.
        """
        if lag_time is None:
            lag_time = self.good_lag_time
        else:
            self.good_lag_time = lag_time

        # Do assignment
        trajs = get_data.get_shimtraj_from_trajlist(self.traj_list)
        metric = classic.Euclidean2d()

        # Allocate array; entries left at -1 are frames that do not exist
        # because a trajectory is shorter than the longest one.
        n_trajs = len(self.traj_list)
        max_traj_len = max([t.shape[0] for t in self.traj_list])
        assignments = -1 * np.ones((n_trajs, max_traj_len), dtype='int')

        # Prepare generators
        pgens = metric.prepare_trajectory(self.clusterer.get_generators_as_traj())

        for i, traj in enumerate(trajs):
            ptraj = metric.prepare_trajectory(traj)

            # `xrange` only exists on Python 2; `range` works on both.
            for j in range(len(traj)):
                d = metric.one_to_all(ptraj, pgens, j)
                assignments[i, j] = np.argmin(d)

        counts = msml.get_count_matrix_from_assignments(assignments, n_states=None, lag_time=lag_time)
        _, t_matrix, _, _ = msml.build_msm(counts)
        return t_matrix
    def test(self):
        """Compare a fresh PCCA lumping with the stored reference lumping.

        Macrostate labels are arbitrary, so the new labels are first permuted
        onto the reference labels before the mapping and the macrostate
        assignments are compared.
        """
        num_macro = 5

        TC = get("PCCA_ref/tProb.mtx")
        A = get("PCCA_ref/Assignments.Fixed.h5")['arr_0']

        macro_map, macro_assign = PCCA.run_pcca(num_macro, A, TC)
        r_macro_map = get("PCCA_ref/MacroMapping.dat")

        # Use the builtin ``int``: the ``np.int`` alias it replaces has been
        # removed from current NumPy releases.
        macro_map = macro_map.astype(int)
        r_macro_map = r_macro_map.astype(int)

        # The order of macrostates might be different between the reference and
        # new lumping. We therefore find a permutation to match them.
        permutation_mapping = np.zeros(macro_assign.max() + 1, 'int')
        for i in range(num_macro):
            # First microstate that landed in new macrostate i.
            j = np.where(macro_map == i)[0][0]
            permutation_mapping[i] = r_macro_map[j]

        macro_map_permuted = permutation_mapping[macro_map]
        # Relabels ``macro_assign`` in place.
        MSMLib.apply_mapping_to_assignments(macro_assign, permutation_mapping)

        r_macro_assign = get("PCCA_ref/MacroAssignments.h5")['arr_0']

        eq(macro_map_permuted, r_macro_map)
        eq(macro_assign, r_macro_assign)
def test_apply_mapping_to_assignments_1():
    """A mapping that sends every state to 1 must yield all-ones assignments."""
    n_states = 100
    assignments = np.random.randint(n_states, size=(10, 10))
    everything_to_one = np.ones(n_states)

    # The assignments array is modified in place.
    MSMLib.apply_mapping_to_assignments(assignments, everything_to_one)

    expected = np.ones((10, 10))
    eq(assignments, expected)
def test_estimate_rate_matrix_1():
    """Check the rate matrix estimated from seeded two-state random data."""
    np.random.seed(42)
    state_sequences = np.random.randint(2, size=(10, 10))
    transition_counts = MSMLib.get_count_matrix_from_assignments(
        state_sequences)
    rate_matrix = MSMLib.estimate_rate_matrix(transition_counts,
                                              state_sequences)

    expected = np.matrix([[-40.40909091, 0.5],
                          [0.33928571, -50.55357143]])
    eq(rate_matrix.todense(), expected)
Beispiel #9
0
def test_estimate_rate_matrix_1():
    """Rate matrix from seeded random assignments must match stored values."""
    np.random.seed(42)
    trajs = np.random.randint(2, size=(10, 10))
    count_matrix = MSMLib.get_count_matrix_from_assignments(trajs)
    sparse_rates = MSMLib.estimate_rate_matrix(count_matrix, trajs)
    dense_rates = sparse_rates.todense()

    reference = np.matrix([[-40.40909091, 0.5],
                           [0.33928571, -50.55357143]])
    eq(dense_rates, reference)
Beispiel #10
0
def test_apply_mapping_to_assignments_1():
    """Applying an all-ones mapping must turn every assignment into 1."""
    state_count = 100
    random_assignments = np.random.randint(state_count, size=(10, 10))
    ones_mapping = np.ones(state_count)

    # Operates on ``random_assignments`` in place.
    MSMLib.apply_mapping_to_assignments(random_assignments, ones_mapping)

    eq(random_assignments, np.ones((10, 10)))
    def test_1(self):
        """Build an MLE-symmetrized, ergodically trimmed MSM and check all outputs."""
        counts = MSMLib.get_count_matrix_from_assignments(self.assignments, 2)
        rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
            counts, symmetrize="MLE", ergodic_trimming=True)

        expected_rev_counts = np.matrix([[6.46159184, 4.61535527],
                                         [4.61535527, 2.30769762]])
        expected_t_matrix = np.matrix([[0.58333689, 0.41666311],
                                       [0.66666474, 0.33333526]])
        expected_populations = np.array([0.61538595, 0.38461405])
        expected_mapping = np.array([0, 1])

        eq(rev_counts.todense(), expected_rev_counts, decimal=4)
        eq(t_matrix.todense(), expected_t_matrix, decimal=4)
        eq(populations, expected_populations, decimal=5)
        eq(mapping, expected_mapping)
def test_get_count_matrix_from_assignments_3():
    """Check lag-2 counts both without and with a sliding window."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    # (sliding_window flag, expected dense count matrix)
    cases = (
        (False, np.matrix([[5.0, 3.0, 4.0], [2.0, 12.0, 3.0], [4.0, 3.0, 4.0]])),
        (True, np.matrix([[8.0, 9.0, 11.0], [5.0, 18.0, 6.0], [11.0, 5.0, 7.0]])),
    )
    for sliding, expected in cases:
        counts = MSMLib.get_count_matrix_from_assignments(
            assignments, lag_time=2, sliding_window=sliding)
        eq(counts.todense(), expected)
Beispiel #13
0
def run_pcca_plus(num_macrostates, assignments, tProb, flux_cutoff=0.0,
                  objective_function="crispness", do_minimization=True):
    """Run PCCA+ lumping and map ``assignments`` onto the macrostates.

    Returns
    -------
    tuple
        ``(chi, A, MAP, assignments)`` where ``A``, ``chi`` and ``MAP`` come
        from ``lumping.pcca_plus`` and ``assignments`` is the input array
        after ``MSMLib.apply_mapping_to_assignments`` has been applied to it.
    """
    logger.info("Running PCCA+...")

    lumping_result = lumping.pcca_plus(
        tProb,
        num_macrostates,
        flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function)
    A, chi, vr, micro_to_macro = lumping_result

    MSMLib.apply_mapping_to_assignments(assignments, micro_to_macro)

    return chi, A, micro_to_macro, assignments
Beispiel #14
0
def construct_counts_matrix(assignments):
    """Build and return a counts matrix from assignments.

    The lag time, the trimming flag and the symmetrization method are read
    from the ``Project()`` settings object. Counts are collected with a
    sliding window. Ergodic trimming is not supported yet, so requesting it
    raises instead of silently returning untrimmed counts.

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    counts : scipy.sparse.csr_matrix
        transition counts, symmetrized according to ``Project().symmetrize``

    Raises
    ------
    NotImplementedError
        If ``Project().trim`` is truthy.
    ValueError
        If ``Project().symmetrize`` is not 'transpose', 'mle', 'none' or a
        falsy value.
    """
    n_states = np.max(assignments.flatten()) + 1
    raw_counts = MSMLib.get_count_matrix_from_assignments(assignments, n_states,
                                               lag_time=Project().lagtime,
                                               sliding_window=True)

    if Project().trim:
        # The trimming code that used to follow this raise could never run,
        # so it has been removed rather than left as dead code.
        raise NotImplementedError(('Trimming is not yet supported because '
                                   'we need to keep track of the mapping from trimmed to '
                                   ' untrimmed states for joint clustering to be right'))

    logger.info("Ignoring ergodic trimming")
    counts = raw_counts

    if Project().symmetrize == 'transpose':
        logger.debug('Transpose symmetrizing')
        counts = counts + counts.T
    elif Project().symmetrize == 'mle':
        logger.debug('MLE symmetrizing')
        counts = MSMLib.mle_reversible_count_matrix(counts)
    elif Project().symmetrize == 'none' or (not Project().symmetrize):
        logger.debug('Skipping symmetrization')
    else:
        raise ValueError("Could not understand symmetrization method: %s" % Project().symmetrize)

    return counts
Beispiel #15
0
def run_pcca(num_macrostates, assignments, tProb):
    """Lump microstates with PCCA and map ``assignments`` to macrostates.

    Raises ``ValueError`` when the number of distinct states in
    ``assignments`` (ignoring -1 entries) differs from ``tProb.shape[0]``.

    Returns ``(MAP, assignments)``: the mapping from ``lumping.PCCA`` and the
    input assignments after the mapping has been applied to them.
    """
    logger.info("Running PCCA...")

    # Entries equal to -1 carry no data and are excluded from the state count.
    observed = assignments[np.where(assignments != -1)]
    n_observed_states = len(np.unique(observed))
    if n_observed_states != tProb.shape[0]:
        raise ValueError('Different number of states in assignments and tProb!')

    micro_to_macro = lumping.PCCA(tProb, num_macrostates)

    # Relabel the assignments with their macrostate; see
    # MSMLib.apply_mapping_to_assignments for how -1 entries are treated.
    MSMLib.apply_mapping_to_assignments(assignments, micro_to_macro)

    return micro_to_macro, assignments
Beispiel #16
0
def test_get_count_matrix_from_assignments_3():
    """Lag-2 count matrices for seeded data, with and without sliding window."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    def dense_counts(sliding):
        # Count at lag 2 and densify for comparison.
        sparse = MSMLib.get_count_matrix_from_assignments(
            assignments, lag_time=2, sliding_window=sliding)
        return sparse.todense()

    expected_no_window = np.matrix([[5., 3., 4.],
                                    [2., 12., 3.],
                                    [4., 3., 4.]])
    eq(dense_counts(False), expected_no_window)

    expected_window = np.matrix([[8., 9., 11.],
                                 [5., 18., 6.],
                                 [11., 5., 7.]])
    eq(dense_counts(True), expected_window)
def test_trim_states():
    """``trim_states`` on the ergodic-trim indices must match ``ergodic_trim``."""
    dense = np.matrix('2 1 0; 1 2 0; 0 0 1')
    counts = scipy.sparse.csr_matrix(dense)

    # One-step route: ergodic_trim does everything at once.
    reference_trimmed, _mapping = MSMLib.ergodic_trim(counts)

    # Two-step route: find the states to drop, then drop them.
    to_trim = MSMLib.ergodic_trim_indices(counts)
    segmented_trimmed = MSMLib.trim_states(to_trim, counts, assignments=None)

    eq(reference_trimmed.todense(), segmented_trimmed.todense())
Beispiel #18
0
def msm(traj_list, n_clusters, n_medoid_iters=10, lag_time=1, distance_cutoff=None):
    """Cluster the trajectories and build an MSM from the assignments.

    Clustering is delegated to ``cluster``; a count matrix is then built
    from the clusterer's assignments at ``lag_time`` and turned into an MSM
    with ``msml.build_msm``.

    Returns
    -------
    t_matrix
        Transition matrix as returned by ``msml.build_msm``.
    """
    # Call form of print: a single parenthesized argument prints identically
    # on Python 2 and is valid syntax on Python 3.
    print("Building a classic MSM")
    hkm = cluster(traj_list, n_clusters, n_medoid_iters, distance_cutoff)

    counts = msml.get_count_matrix_from_assignments(hkm.get_assignments(), n_clusters, lag_time)
    _, t_matrix, _, _ = msml.build_msm(counts)

    return t_matrix
Beispiel #19
0
def test_get_count_matrix_from_assignments_3():
    """Exact lag-2 counts for seeded data, without and with sliding window."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    no_window = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=2, sliding_window=False)
    expected_no_window = np.matrix([[5., 3., 4.],
                                    [2., 12., 3.],
                                    [4., 3., 4.]])
    npt.assert_equal(no_window.todense(), expected_no_window)

    with_window = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=2, sliding_window=True)
    expected_with_window = np.matrix([[8., 9., 11.],
                                      [5., 18., 6.],
                                      [11., 5., 7.]])
    npt.assert_equal(with_window.todense(), expected_with_window)
def test_apply_mapping_to_assignments_2():
    """Preserve the -1 entries while mapping every other entry."""
    n_states = 100
    assignments = np.random.randint(n_states, size=(10, 10))
    assignments[0, 0] = -1
    everything_to_one = np.ones(n_states)

    # Everything becomes 1 except the -1 placeholder.
    expected = np.ones((10, 10))
    expected[0, 0] = -1

    MSMLib.apply_mapping_to_assignments(assignments, everything_to_one)

    eq(assignments, expected)
Beispiel #21
0
def run_pcca(num_macrostates, assignments, tProb):
    """Lump microstates with PCCA and relabel ``assignments`` accordingly.

    Raises ``ValueError`` if the number of distinct non-(-1) states in
    ``assignments`` is not equal to ``tProb.shape[0]``.

    Returns ``(MAP, assignments)`` where ``MAP`` is the ``microstate_mapping``
    of the fitted ``lumping.PCCA`` object.
    """
    logger.info("Running PCCA...")

    # -1 entries carry no data and do not count as a state.
    has_data = np.where(assignments != -1)
    n_states_seen = len(np.unique(assignments[has_data]))
    if n_states_seen != tProb.shape[0]:
        raise ValueError(
            'Different number of states in assignments and tProb!')

    pcca_model = lumping.PCCA(tProb, num_macrostates)
    micro_to_macro = pcca_model.microstate_mapping

    # Relabel with macrostates; see MSMLib.apply_mapping_to_assignments for
    # how -1 entries are treated.
    MSMLib.apply_mapping_to_assignments(assignments, micro_to_macro)

    return micro_to_macro, assignments
Beispiel #22
0
    def compare_kyle_to_lutz(self, raw_counts):
        """Check that the current MLE code agrees with the older implementation.

        Kyle wrote the most recent MLE code; Lutz wrote the previous one.
        Both are run on the ergodically trimmed counts and their outputs,
        each normalized by its total, must be equal.
        """
        trimmed = MSMLib.ergodic_trim(raw_counts)[0]

        newer = MSMLib.mle_reversible_count_matrix(trimmed)
        newer /= newer.sum()

        older = MSMLib.__mle_reversible_count_matrix_lutz__(trimmed)
        older /= older.sum()

        eq(newer.toarray(), older.toarray())
    def compare_kyle_to_lutz(self, raw_counts):
        """Compare Kyle's (current) MLE estimator with Lutz's (previous) one.

        The raw counts are ergodically trimmed first; each estimator's result
        is divided by its own sum before the two are compared.
        """
        ergodic_counts = MSMLib.ergodic_trim(raw_counts)[0]

        kyle_result = MSMLib.mle_reversible_count_matrix(ergodic_counts)
        kyle_result /= kyle_result.sum()

        lutz_result = MSMLib.__mle_reversible_count_matrix_lutz__(
            ergodic_counts)
        lutz_result /= lutz_result.sum()

        eq(kyle_result.toarray(), lutz_result.toarray())
Beispiel #24
0
def test_apply_mapping_to_assignments_2():
    """The -1 placeholder must survive applying a mapping."""
    state_count = 100
    assignments = np.random.randint(state_count, size=(10, 10))
    assignments[0, 0] = -1
    ones_mapping = np.ones(state_count)

    # All ones, apart from the single entry that had no data.
    wanted = np.ones((10, 10))
    wanted[0, 0] = -1

    MSMLib.apply_mapping_to_assignments(assignments, ones_mapping)

    eq(assignments, wanted)
Beispiel #25
0
def test_estimate_rate_matrix_2():
    """Dense and sparse counts must give the same, known transition matrix."""
    np.random.seed(42)
    dense_counts = np.random.randint(100, size=(4, 4))
    sparse_counts = scipy.sparse.csr_matrix(dense_counts)

    from_dense = MSMLib.estimate_transition_matrix(dense_counts)
    from_sparse = MSMLib.estimate_transition_matrix(sparse_counts)

    expected_rows = [
        [0.22368421, 0.40350877, 0.06140351, 0.31140351],
        [0.24193548, 0.08064516, 0.33064516, 0.34677419],
        [0.22155689, 0.22155689, 0.26047904, 0.29640719],
        [0.23469388, 0.02040816, 0.21428571, 0.53061224],
    ]
    expected = np.array(expected_rows)

    eq(from_dense, expected)
    eq(from_dense, np.array(from_sparse.todense()))
Beispiel #26
0
    def test_1(self):
        """MLE symmetrization with ergodic trimming must reproduce stored values."""
        count_matrix = MSMLib.get_count_matrix_from_assignments(
            self.assignments, 2)
        msm_parts = MSMLib.build_msm(count_matrix,
                                     symmetrize='MLE',
                                     ergodic_trimming=True)
        rev_counts, t_matrix, populations, mapping = msm_parts

        wanted_rev_counts = np.matrix([[6.46159184, 4.61535527],
                                       [4.61535527, 2.30769762]])
        eq(rev_counts.todense(), wanted_rev_counts, decimal=4)

        wanted_t_matrix = np.matrix([[0.58333689, 0.41666311],
                                     [0.66666474, 0.33333526]])
        eq(t_matrix.todense(), wanted_t_matrix, decimal=4)

        eq(populations, np.array([0.61538595, 0.38461405]), decimal=5)
        eq(mapping, np.array([0, 1]))
Beispiel #27
0
def test_estimate_rate_matrix_2():
    """Transition matrix estimation: known values, and dense/sparse agreement."""
    np.random.seed(42)
    counts_as_array = np.random.randint(100, size=(4, 4))
    counts_as_csr = scipy.sparse.csr_matrix(counts_as_array)

    t_dense = MSMLib.estimate_transition_matrix(counts_as_array)
    t_sparse = MSMLib.estimate_transition_matrix(counts_as_csr)

    reference = np.array([
        [0.22368421, 0.40350877, 0.06140351, 0.31140351],
        [0.24193548, 0.08064516, 0.33064516, 0.34677419],
        [0.22155689, 0.22155689, 0.26047904, 0.29640719],
        [0.23469388, 0.02040816, 0.21428571, 0.53061224],
    ])

    eq(t_dense, reference)
    # The sparse code path must agree with the dense one.
    eq(t_dense, np.array(t_sparse.todense()))
Beispiel #28
0
def build_new(centroids,
              trajs,
              fuzziness,
              dist,
              soft=True,
              neigen=4,
              show=False,
              desc=None):
    """Build and analyze an MSM from points and centroids.

    The state list is obtained from ``get_giant_state_list``, turned into a
    count matrix, and then into an MSM whose transition matrix is passed to
    ``analyze_msm``.

    ``soft`` is forwarded to ``get_giant_state_list``: when False the
    membership vectors are 'quantized' to mirror hard clustering, otherwise
    the fuzzy nature of the clusters is used. It also selects the default
    description when ``desc`` is None.
    """
    n_states = len(centroids)
    time_pairs = get_giant_state_list(
        centroids, trajs, fuzziness, dist, soft=soft)
    print("Got state list")

    counts_mat = buildmsm.get_counts_from_pairs(time_pairs, n_states)
    print("Got count matrix")

    _, t_matrix, _, _ = msml.build_msm(counts_mat)

    if desc is None:
        desc = 'New, Fuzzy' if soft else 'New, not-so-fuzzy'
    analyze_msm(t_matrix, centroids, desc=desc, show=show, neigen=neigen)
Beispiel #29
0
def run_pcca(num_macrostates, assignments, tProb, output_dir):
    """Lump microstates with PCCA and save mapping and macro assignments.

    Writes ``MacroMapping.dat`` and ``MacroAssignments.h5`` into
    ``output_dir`` after ``arglib.die_if_path_exists`` has been consulted for
    both paths. ``assignments`` is relabelled via
    ``MSMLib.apply_mapping_to_assignments`` before being saved.
    """
    assignments_path = os.path.join(output_dir, "MacroAssignments.h5")
    mapping_path = os.path.join(output_dir, "MacroMapping.dat")
    arglib.die_if_path_exists([assignments_path, mapping_path])

    logger.info("Running PCCA...")
    micro_to_macro = lumping.PCCA(tProb, num_macrostates)

    # Relabel with macrostates; see MSMLib.apply_mapping_to_assignments for
    # how -1 (no data) entries are treated.
    MSMLib.apply_mapping_to_assignments(assignments, micro_to_macro)

    np.savetxt(mapping_path, micro_to_macro, "%d")
    msmbuilder.io.saveh(assignments_path, assignments)

    logger.info("Saved output to: %s, %s", assignments_path, mapping_path)
Beispiel #30
0
def test_estimate_transition_matrix_1():
    """Check the transition matrix estimated from a fixed 3x3 count matrix."""
    np.random.seed(42)
    counts = np.array([[6, 3, 7],
                       [4, 6, 9],
                       [2, 6, 7]])
    estimated = MSMLib.estimate_transition_matrix(counts)

    expected = np.array([[0.375, 0.1875, 0.4375],
                         [0.21052632, 0.31578947, 0.47368421],
                         [0.13333333, 0.4, 0.46666667]])
    eq(estimated, expected)
Beispiel #31
0
    def test_j_PCCA(self):
        """Run PCCA in the working directory and compare with the reference.

        The macrostate labels of the new lumping are permuted onto the
        reference labels before the mapping and the macro assignments are
        compared.
        """
        working_data = os.path.join(WorkingDir, "Data")
        reference_data = os.path.join(ReferenceDir, "Data")

        t_prob = scipy.io.mmread(os.path.join(working_data, "tProb.mtx"))
        micro_assignments = io.loadh(
            os.path.join(working_data, "Assignments.Fixed.h5"), 'arr_0')
        PCCA.run_pcca(NumMacroStates, micro_assignments, t_prob, working_data)

        new_map = np.loadtxt(
            os.path.join(working_data, "MacroMapping.dat"), 'int')
        ref_map = np.loadtxt(
            os.path.join(reference_data, "MacroMapping.dat"), 'int')

        new_assign = io.loadh(
            os.path.join(working_data, "MacroAssignments.h5"), 'arr_0')
        ref_assign = io.loadh(
            os.path.join(reference_data, "MacroAssignments.h5"), 'Data')

        # The order of macrostates might differ between the reference and the
        # new lumping, so look up the reference label of the first microstate
        # of every new macrostate.
        n_macro = NumMacroStates
        relabel = np.zeros(n_macro, 'int')
        for macro in range(n_macro):
            first_micro = np.where(new_map == macro)[0][0]
            relabel[macro] = ref_map[first_micro]

        new_map_relabelled = relabel[new_map]
        MSMLib.ApplyMappingToAssignments(new_assign, relabel)

        npt.assert_array_almost_equal(new_map_relabelled, ref_map)
        npt.assert_array_almost_equal(new_assign, ref_assign)
Beispiel #32
0
def run_pcca(num_macrostates, assignments, tProb, output_dir):
    """Run PCCA lumping and write its outputs to ``output_dir``.

    Two files are produced: ``MacroMapping.dat`` (the microstate-to-macrostate
    mapping, written as integers) and ``MacroAssignments.h5`` (the input
    assignments after the mapping has been applied to them).
    ``arglib.die_if_path_exists`` is called on both paths first.
    """
    macro_assignments_fn = os.path.join(output_dir, "MacroAssignments.h5")
    macro_map_fn = os.path.join(output_dir, "MacroMapping.dat")
    arglib.die_if_path_exists([macro_assignments_fn, macro_map_fn])

    logger.info("Running PCCA...")
    mapping = lumping.PCCA(tProb, num_macrostates)

    # Map the assignments onto macrostates; see
    # MSMLib.apply_mapping_to_assignments for how -1 entries are treated.
    MSMLib.apply_mapping_to_assignments(assignments, mapping)

    np.savetxt(macro_map_fn, mapping, "%d")
    msmbuilder.io.saveh(macro_assignments_fn, assignments)

    logger.info("Saved output to: %s, %s", macro_assignments_fn, macro_map_fn)
Beispiel #33
0
def ndgrid_msm_likelihood_score(estimator, sequences):
    """Log-likelihood score function for an (NDGrid, MarkovStateModel) pipeline

    Parameters
    ----------
    estimator : sklearn.pipeline.Pipeline
        A pipeline estimator containing an NDGrid followed by a MarkovStateModel
    sequences: list of array-like, each of shape (n_samples_i, n_features)
        Data sequences, where n_samples_i in the number of samples
        in sequence i and n_features is the number of features.

    Returns
    -------
    log_likelihood : float
        Mean log-likelihood per data point: the sum of the transition and
        emission terms divided by the total number of samples.

    Raises
    ------
    NotImplementedError
        If the grid has more than one feature.

    Examples
    --------
    >>> pipeline = Pipeline([
    >>>    ('grid', NDGrid()),
    >>>    ('msm', MarkovStateModel())
    >>> ])
    >>> grid = GridSearchCV(pipeline, param_grid={
    >>>    'grid__n_bins_per_feature': [10, 20, 30, 40]
    >>> }, scoring=ndgrid_msm_likelihood_score)
    >>> grid.fit(dataset)
    >>> print(grid.grid_scores_)

    References
    ----------
    .. [1] McGibbon, R. T., C. R. Schwantes, and V. S. Pande. "Statistical
       Model Selection for Markov Models of Biomolecular Dynamics." J. Phys.
       Chem B. (2014)
    """
    import msmbuilder.MSMLib as msmlib
    from mixtape import cluster

    # Pick the first step of each kind out of the pipeline.
    step_models = [step_model for (_, step_model) in estimator.steps]
    grid = [m for m in step_models if isinstance(m, cluster.NDGrid)][0]
    msm = [m for m in step_models if isinstance(m, MarkovStateModel)][0]

    # NDGrid supports min/max being different along different directions,
    # which makes the bin widths coordinate dependent. Only the 1D case is
    # handled here.
    if grid.n_features != 1:
        raise NotImplementedError("file an issue on github :)")

    log_transmat = np.nan_to_num(np.log(np.asarray(msm.transmat_.todense())))
    bin_width = grid.grid[0, 1] - grid.grid[0, 0]

    transition_ll = 0
    emission_ll = 0
    for X in grid.transform(sequences):
        raw_counts = msmlib.get_counts_from_traj(X, n_states=grid.n_bins)
        mapped_counts = _apply_mapping_to_matrix(raw_counts, msm.mapping_)
        counts = np.asarray(mapped_counts.todense())

        transition_ll += np.multiply(counts, log_transmat).sum()
        emission_ll += -1 * np.log(bin_width) * len(X)

    n_points = sum(len(x) for x in sequences)
    return (transition_ll + emission_ll) / n_points
def main(assfile, lag, nproc):
    """Write resampled count matrices for one sub-sampled assignments file.

    A sliding-window count matrix is built from the assignments at the given
    lag, scaled by ``1/lag``, and its zero entries are replaced by 1. Batches
    of jobs are then handed to ``parallel_get_matrix`` through a process
    pool, and every returned matrix is written to
    ``<dir>/boot-sub<num>/tCounts-<iteration>`` until at least 100 matrices
    have been written.

    Parameters
    ----------
    assfile : str
        Path to an ``...Assignments_sub<num>.h5`` file. ``times.h5`` is read
        from the same directory, and ``ProjectInfo.yaml`` from the part of
        that directory path before ``'Data'``.
    lag : int or str
        Lag time; converted with ``int``.
    nproc : int or str
        Number of worker processes per batch; converted with ``int``.
    """
    lag = int(lag)
    nproc = int(nproc)
    Assignments = io.loadh(assfile)
    num = int(assfile.split('Assignments_sub')[1].split('.h5')[0])
    # Named `data_dir` rather than `dir` so the builtin is not shadowed.
    data_dir = os.path.dirname(assfile)
    newdir = '%s/boot-sub%s' % (data_dir, num)
    ref_sub = numpy.loadtxt('%s/times.h5' % data_dir, usecols=(1,))
    ref_total = numpy.loadtxt('%s/times.h5' % data_dir, usecols=(2,))
    # Column 1 -> column 2 lookup table.
    times = dict(zip(ref_sub, ref_total))

    # Result is unused here; the call is kept so a missing or broken project
    # file still fails at this point as before.
    proj = Project.load_from('%s/ProjectInfo.yaml' % data_dir.split('Data')[0])
    multinom = int(times[num])
    if not os.path.exists(newdir):
        os.mkdir(newdir)
    if 'Data' in Assignments.keys():
        Assignments = Assignments['Data']
    else:
        Assignments = Assignments['arr_0']
    print(Assignments.shape)
    NumStates = max(Assignments.flatten()) + 1
    Counts = MSMLib.get_count_matrix_from_assignments(Assignments, lag_time=int(lag), sliding_window=True)
    Counts = Counts.todense()
    Counts = Counts * (1.0 / lag)
    T = numpy.array(Counts)
    # Replace zero counts by 1.
    T[numpy.where(T == 0)] = 1

    iteration = 0
    # Floor division keeps the Python 2 integer result on Python 3 as well.
    total_iteration = 100 // nproc
    print("%s total iterations" % total_iteration)
    if 100 % nproc != 0:
        remain = 100 % nproc
    else:
        remain = False
    print("iterating thru tCount samples")
    count = 0
    while iteration < 100:
        if count * nproc > 100:
            nproc = remain
        print("sampling iteration %s" % iteration)
        Tfresh = T.copy()
        # zip stops at the shortest input, so at most `nproc` jobs per batch.
        job_args = zip([Tfresh] * nproc, [multinom] * nproc, range(0, NumStates))
        pool = multiprocessing.Pool(processes=nproc)
        try:
            result = pool.map_async(parallel_get_matrix, job_args)
            result.wait()
            matrices = result.get()
        finally:
            # Always stop the workers, even when a job raised.
            pool.terminate()
        for c_matrix in matrices:
            scipy.io.mmwrite('%s/tCounts-%s' % (newdir, iteration), c_matrix)
            iteration += 1
        count += 1
        # The original `"..." % iteration*nproc` formatted first and then
        # repeated the whole string `nproc` times. `iteration` already counts
        # the matrices written so far, so it is reported directly.
        print("done with iteration %s" % iteration)
Beispiel #35
0
def test_get_count_matrix_from_assignments_1():
    """All-zero 10x10 assignments must give a 1x1 count matrix holding 90."""
    all_zero_assignments = np.zeros((10,10))

    counts = MSMLib.get_count_matrix_from_assignments(all_zero_assignments)
    expected = np.matrix([[90.0]])

    npt.assert_equal(counts.todense(), expected)
def test_get_count_matrix_from_assignments_1():
    """Integer all-zero assignments must produce the single count 90."""
    single_state = np.zeros((10, 10), "int")

    sparse_counts = MSMLib.get_count_matrix_from_assignments(single_state)
    dense_counts = sparse_counts.todense()

    eq(dense_counts, np.matrix([[90.0]]))
Beispiel #37
0
def msm(traj_list,
        n_clusters,
        n_medoid_iters=10,
        lag_time=1,
        distance_cutoff=None):
    """Build an MSM from classic clustering of the trajectories.

    ``cluster`` performs the clustering; its assignments are counted at
    ``lag_time`` and the counts are passed to ``msml.build_msm``.

    Returns
    -------
    t_matrix
        Transition matrix as returned by ``msml.build_msm``.
    """
    # Parenthesized single-argument print behaves the same on Python 2 and
    # is valid on Python 3, unlike the print statement.
    print("Building a classic MSM")
    hkm = cluster(traj_list, n_clusters, n_medoid_iters, distance_cutoff)

    counts = msml.get_count_matrix_from_assignments(hkm.get_assignments(),
                                                    n_clusters, lag_time)
    _, t_matrix, _, _ = msml.build_msm(counts)

    return t_matrix
Beispiel #38
0
def test_get_count_matrix_from_assignments_1():
    """One state visited everywhere: the count matrix is [[90]]."""
    zeros_only = np.zeros((10, 10), 'int')
    expected_counts = np.matrix([[90.0]])

    observed_counts = MSMLib.get_count_matrix_from_assignments(
        zeros_only).todense()

    eq(observed_counts, expected_counts)
Beispiel #39
0
def run(LagTime, assignments, Symmetrize='MLE', input_mapping="None", Prior=0.0, OutDir="./Data/"):
    """Build an MSM from assignments and write all of its parts to ``OutDir``.

    Writes ``tProb.mtx``, ``tCounts.mtx``, ``Mapping.dat``,
    ``Assignments.Fixed.h5`` and ``Populations.dat``.
    ``arglib.die_if_path_exists`` is consulted for all of them up front.
    ``assignments`` is modified by the mapping steps before it is saved.
    ``input_mapping`` is applied first unless it is the string ``"None"``.
    ``Prior`` is not used in this function.
    """
    # Output locations, in the order they are reported at the end.
    FnTProb = os.path.join(OutDir, "tProb.mtx")
    FnTCounts = os.path.join(OutDir, "tCounts.mtx")
    FnMap = os.path.join(OutDir, "Mapping.dat")
    FnAss = os.path.join(OutDir, "Assignments.Fixed.h5")
    FnPops = os.path.join(OutDir, "Populations.dat")
    outputlist = [FnTProb, FnTCounts, FnMap, FnAss, FnPops]
    arglib.die_if_path_exists(outputlist)

    def count_assigned():
        # Number of entries that are not the -1 placeholder.
        return len(np.where(assignments.flatten() != -1)[0])

    # Apply the caller's mapping, if one was given.
    if input_mapping != "None":
        MSMLib.apply_mapping_to_assignments(assignments, input_mapping)

    n_states = np.max(assignments.flatten()) + 1
    assigned_before = count_assigned()

    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        assignments,
        lag_time=LagTime,
        symmetrize=Symmetrize,
        sliding_window=True,
        trim=True)

    MSMLib.apply_mapping_to_assignments(assignments, mapping)
    assigned_after = count_assigned()

    # Compose the trimming mapping with the caller's mapping.
    if input_mapping != "None":
        mapping = mapping[input_mapping]

    # Report how much data was discarded in trimming.
    percent = (1.0 - float(assigned_after) / float(assigned_before)) * 100.0
    logger.warning("Ergodic trimming discarded: %f percent of your data", percent)

    # Save all output
    np.savetxt(FnPops, populations)
    np.savetxt(FnMap, mapping, "%d")
    scipy.io.mmwrite(str(FnTProb), t_matrix)
    scipy.io.mmwrite(str(FnTCounts), rev_counts)
    msmbuilder.io.saveh(FnAss, assignments)

    for written in outputlist:
        logger.info("Wrote: %s", written)
Beispiel #40
0
def test_get_count_matrix_from_assignments_1():
    """A 10x10 array of zeros must count as 90 transitions within one state."""
    zero_assignments = np.zeros((10, 10))
    wanted = np.matrix([[90.0]])

    sparse_counts = MSMLib.get_count_matrix_from_assignments(zero_assignments)

    npt.assert_equal(sparse_counts.todense(), wanted)
Beispiel #41
0
def test_get_count_matrix_from_assignments_2():
    """Default-argument counts for seeded three-state data must match."""
    np.random.seed(42)
    three_state_assignments = np.random.randint(3, size=(10, 10))

    counts = MSMLib.get_count_matrix_from_assignments(three_state_assignments)

    expected = np.matrix([[11., 9., 10.],
                          [9., 17., 7.],
                          [10., 7., 10.]])
    eq(counts.todense(), expected)
Beispiel #42
0
def test_estimate_transition_matrix_1():
    """Transition matrix of a small fixed count matrix must match stored values."""
    np.random.seed(42)
    count_rows = [[6, 3, 7], [4, 6, 9], [2, 6, 7]]
    t_matrix = MSMLib.estimate_transition_matrix(np.array(count_rows))

    reference = np.array([
        [0.375, 0.1875, 0.4375],
        [0.21052632, 0.31578947, 0.47368421],
        [0.13333333, 0.4, 0.46666667],
    ])
    eq(t_matrix, reference)
Beispiel #43
0
def run_pcca_plus(num_macrostates, assignments, tProb, output_dir, flux_cutoff=0.0,
                  objective_function="crispness", do_minimization=True):
    """Run PCCA+ lumping and write its four output files to ``output_dir``.

    Files written: ``Chi.dat``, ``A.dat``, ``MacroMapping.dat`` (integers)
    and ``MacroAssignments.h5`` (the input assignments after the PCCA+
    mapping has been applied to them). ``arglib.die_if_path_exists`` is
    called on all four paths before the lumping starts.
    """
    macro_assignments_fn = os.path.join(output_dir, "MacroAssignments.h5")
    macro_map_fn = os.path.join(output_dir, "MacroMapping.dat")
    chi_fn = os.path.join(output_dir, 'Chi.dat')
    a_fn = os.path.join(output_dir, 'A.dat')
    arglib.die_if_path_exists(
        [macro_assignments_fn, macro_map_fn, chi_fn, a_fn])

    logger.info("Running PCCA+...")
    A, chi, vr, mapping = lumping.pcca_plus(
        tProb,
        num_macrostates,
        flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function)

    MSMLib.apply_mapping_to_assignments(assignments, mapping)

    np.savetxt(chi_fn, chi)
    np.savetxt(a_fn, A)
    np.savetxt(macro_map_fn, mapping, "%d")
    msmbuilder.io.saveh(macro_assignments_fn, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                chi_fn, a_fn, macro_map_fn, macro_assignments_fn)
Beispiel #44
0
def build_classic_from_memberships(memberships, lag_time=1):
    """Build a classic msm by turning a membership array into a state list.

    Each row of ``memberships`` is reduced to the index of its largest entry
    and the resulting state sequence is handed to the msmbuilder code
    (``msm.get_counts_from_traj`` and ``msm.build_msm``). Use this for
    comparing quantized versions of the fuzzy count matrix building for
    consistency.

    Parameters
    ----------
    memberships : 2-D array
        One row per entry of the state sequence; the number of states is
        taken from ``memberships.shape[1]``.
    lag_time : int, optional
        Forwarded to ``msm.get_counts_from_traj``.

    Returns
    -------
    rev_counts, t_matrix, populations, mapping
        The four values returned by ``msm.build_msm``.
    """
    n_states = memberships.shape[1]

    # Pick the highest-membership state of every row in one vectorized call
    # rather than looping over the rows in Python.
    states = np.argmax(np.asarray(memberships), axis=1).astype('int')

    counts = msm.get_counts_from_traj(states, n_states, lag_time)
    rev_counts, t_matrix, populations, mapping = msm.build_msm(counts)
    return rev_counts, t_matrix, populations, mapping
Beispiel #45
0
def build_classic_from_memberships(memberships, lag_time=1):
    """Build a classic msm by turning a membership array into a state list.

    Every row of ``memberships`` is quantized to the index of its largest
    entry; the resulting state sequence is then passed through the msmbuilder
    code (``msm.get_counts_from_traj`` followed by ``msm.build_msm``). Use
    this for comparing quantized versions of the fuzzy count matrix building
    for consistency.

    Parameters
    ----------
    memberships : 2-D array
        One row per entry of the state sequence; ``memberships.shape[1]`` is
        used as the number of states.
    lag_time : int, optional
        Forwarded to ``msm.get_counts_from_traj``.

    Returns
    -------
    rev_counts, t_matrix, populations, mapping
        The four values returned by ``msm.build_msm``.
    """
    n_states = memberships.shape[1]

    # Vectorized replacement for the per-row argmax loop; the cast keeps the
    # 'int' dtype the state array was originally allocated with.
    states = np.argmax(np.asarray(memberships), axis=1).astype('int')

    counts = msm.get_counts_from_traj(states, n_states, lag_time)
    rev_counts, t_matrix, populations, mapping = msm.build_msm(counts)
    return rev_counts, t_matrix, populations, mapping
Beispiel #46
0
def classic(trajs, n_clusters, n_medoid_iters, metric, dim=2, lag_time=1, show=False, desc=None):
    """Cluster with hybrid k-medoids, build an MSM from the result and analyze it.

    Parameters
    ----------
    trajs, metric
        Handed to ``clustering.HybridKMedoids``.
    n_clusters : int
        Number of clusters (``k``); also used as the state count when
        building the count matrix and in the default description.
    n_medoid_iters
        Passed as ``local_num_iters`` to the clusterer.
    dim : int, optional
        How many leading coordinates of each generator are kept for plotting.
    lag_time : int, optional
        Lag time used for the count matrix.
    show : bool, optional
        If true, ``pp.show()`` is called after plotting the centroids; the
        flag is also forwarded to ``analyze_msm``.
    desc : str, optional
        Description forwarded to ``analyze_msm``. Defaults to
        ``"Classic, n_clusters=<n_clusters>"``.

    Returns
    -------
    t_matrix
        The transition matrix returned by ``msml.build_msm``.
    """
    description = desc if desc is not None else "Classic, n_clusters=%d" % n_clusters

    clusterer = clustering.HybridKMedoids(
        metric, trajs, k=n_clusters, local_num_iters=n_medoid_iters)
    generators = clusterer.get_generators_as_traj()

    # Index 0 along the second axis, first `dim` entries along the third.
    centroids_nf = generators['XYZList'][:, 0, 0:dim]
    plot_centroids(centroids_nf)
    if show:
        pp.show()

    state_assignments = clusterer.get_assignments()
    counts = msml.get_count_matrix_from_assignments(
        state_assignments, n_clusters, lag_time)
    # Only the transition matrix is needed from the four build_msm outputs.
    _, t_matrix, _, _ = msml.build_msm(counts)
    analyze_msm(t_matrix, centroids_nf, description, show=show)

    return t_matrix
def test_get_count_matrix_from_assignments_2():
    """Seeded random 10x10 three-state assignments must give the reference counts."""
    np.random.seed(42)
    seeded_assignments = np.random.randint(3, size=(10, 10))

    dense_counts = MSMLib.get_count_matrix_from_assignments(
        seeded_assignments).todense()

    reference = np.matrix([
        [11.0, 9.0, 10.0],
        [9.0, 17.0, 7.0],
        [10.0, 7.0, 10.0],
    ])
    eq(dense_counts, reference)
Beispiel #48
0
 def test_4(self):
     """build_msm with lag_time=2, a sliding window and no symmetrization.

     Without symmetrization the "reversible" counts equal the raw counts and
     no populations are returned.
     """
     msm_parts = MSMLib.build_msm(self.assignments, lag_time=2,
                                  symmetrize=None, sliding_window=True)
     counts, rev_counts, t_matrix, populations, mapping = msm_parts

     raw_counts = np.matrix('7 4; 3 2')
     expected_t = np.matrix([[0.63636364, 0.36363636],
                             [0.6, 0.4]])

     npt.assert_array_equal(counts.todense(), raw_counts)
     npt.assert_array_almost_equal(rev_counts.todense(), raw_counts)
     npt.assert_array_almost_equal(t_matrix.todense(), expected_t)
     assert populations is None
     npt.assert_array_equal(mapping, [0, 1])
Beispiel #49
0
def run_pcca_plus(num_macrostates,
                  assignments,
                  tProb,
                  flux_cutoff=0.0,
                  objective_function="crispness",
                  do_minimization=True):
    """Run PCCA+ lumping and apply the resulting mapping to ``assignments``.

    Parameters
    ----------
    num_macrostates, tProb
        Positional arguments for ``lumping.PCCAPlus``.
    assignments
        Passed to ``MSMLib.apply_mapping_to_assignments`` with the microstate
        mapping and returned afterwards.
        NOTE(review): the helper's return value is ignored, so it presumably
        rewrites ``assignments`` in place -- confirm.
    flux_cutoff, objective_function, do_minimization
        Forwarded unchanged to ``lumping.PCCAPlus``.

    Returns
    -------
    chi, A, MAP, assignments
        The ``chi``, ``A`` and ``microstate_mapping`` attributes of the
        PCCAPlus object, followed by the ``assignments`` argument.
    """
    logger.info("Running PCCA+...")

    lumping_options = dict(flux_cutoff=flux_cutoff,
                           do_minimization=do_minimization,
                           objective_function=objective_function)
    lumper = lumping.PCCAPlus(tProb, num_macrostates, **lumping_options)

    A = lumper.A
    chi = lumper.chi
    MAP = lumper.microstate_mapping

    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    return chi, A, MAP, assignments
Beispiel #50
0
 def dump_count_matrix(self, assignfn, lagtime=1, outfn="count_matrix.txt"):
     """Write the sliding-window count matrix of an assignments file as text.

     The assignments are read from the ``'arr_0'`` entry of ``assignfn``. The
     count matrix is converted to COO form and its row indices, column
     indices and values are saved to ``outfn`` with ``np.savetxt``.
     """
     from msmbuilder import io, MSMLib

     assignments = io.loadh(assignfn, 'arr_0')
     # Original author's note: this returns a sparse lil_matrix.
     sparse_counts = MSMLib.get_count_matrix_from_assignments(
         assignments, lag_time=lagtime, sliding_window=True)
     coo = sparse_counts.tocoo()
     triplets = (coo.row, coo.col, coo.data)
     np.savetxt(outfn, triplets)
Beispiel #51
0
 def test_4(self):
     """Sliding-window build_msm at lag_time=2 without symmetrization."""
     build_kwargs = dict(lag_time=2, symmetrize=None, sliding_window=True)
     counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
         self.assignments, **build_kwargs)

     # With symmetrize=None the reversible counts match the raw counts.
     expected_counts = np.matrix('7 4; 3 2')
     npt.assert_array_equal(counts.todense(), expected_counts)
     npt.assert_array_almost_equal(rev_counts.todense(), expected_counts)

     expected_t = np.matrix([[0.63636364, 0.36363636], [0.6, 0.4]])
     npt.assert_array_almost_equal(t_matrix.todense(), expected_t)

     assert populations is None
     npt.assert_array_equal(mapping, [0, 1])
Beispiel #52
0
def test_trim_states():
    """trim_states must agree with ergodic_trim, with and without assignments."""
    # Reference: the (just tested) one-shot ergodic trim.
    counts = scipy.sparse.csr_matrix(np.matrix('2 1 0; 1 2 0; 0 0 1'))
    reference_counts, mapping = MSMLib.ergodic_trim(counts)

    # Segmented method, counts only.
    to_trim = MSMLib.ergodic_trim_indices(counts)
    counts_only = MSMLib.trim_states(to_trim, counts, assignments=None)
    eq(reference_counts.todense(), counts_only.todense())

    # Segmented method again, this time also passing assignments.
    n_states = counts.shape[0]
    assignments = np.array([np.arange(n_states)])
    to_trim = MSMLib.ergodic_trim_indices(counts)
    counts_with_assignments, new_assignments = MSMLib.trim_states(
        to_trim, counts, assignments=assignments)

    # State 2 is strong-disconnected so it is set to -1.
    expected_assignments = np.array([[0, 1, -1]])
    eq(new_assignments, expected_assignments)
def parallel_get_matrix(input):
    """Resample a matrix row by row with multinomial draws and build an MSM.

    For every row index ``i``, the row ``Ttest[i]`` is normalized by its sum,
    ``multinom`` multinomial samples are drawn from it, and the drawn counts
    are accumulated into a sparse ``NumStates x NumStates`` matrix. That
    matrix is then passed to ``MSMLib.build_msm`` with MLE symmetrization and
    ergodic trimming.

    Parameters
    ----------
    input : tuple
        ``(Ttest, multinom, NumStates)`` packed in one argument.
        NOTE(review): the single-tuple signature suggests use with a
        ``map``-style worker pool -- confirm against the caller.

    Returns
    -------
    rev_counts, t_matrix, Populations, Mapping
        The four values returned by ``MSMLib.build_msm``.
    """
    # print() with one argument behaves identically on Python 2 and 3.
    print("working")
    (Ttest, multinom, NumStates) = input

    # Loop-invariant pieces, computed once.
    n_draws = int(multinom)
    column_indices = numpy.arange(0, NumStates)

    newT = scipy.sparse.lil_matrix((int(NumStates), int(NumStates)), dtype='float32')
    # NOTE(review): iterates over shape[1] while indexing rows; equivalent
    # only for a square Ttest -- confirm that is always the case.
    for i in range(0, Ttest.shape[1]):
        transitions = numpy.vstack((numpy.array([i] * NumStates), column_indices))
        row = Ttest[i]
        # Sum the row once instead of once per element.
        row_total = sum(row)
        pvals = numpy.array([x / row_total for x in row])
        counts = numpy.random.multinomial(n_draws, pvals, size=1)
        newT = newT + scipy.sparse.coo_matrix((counts[0], transitions), shape=(NumStates, NumStates))
    rev_counts, t_matrix, Populations, Mapping = MSMLib.build_msm(newT, symmetrize='MLE', ergodic_trimming=True)
    return rev_counts, t_matrix, Populations, Mapping
    def __init__(self):
        """Set tolerances and sample a state trajectory from a two-state model."""
        self.epsilon = 1E-7
        # Confidence for uncertainty estimate.
        # Testing is stochastic; we expect errors 0.1 % of the time.
        self.alpha = 0.001

        max_lag = 100
        self.max_lag = max_lag
        self.times = np.arange(max_lag)
        self.num_steps = 100000

        # Two-state count matrix from which the transition matrix is estimated.
        counts = np.array([[500, 2], [2, 50]])
        self.C = counts
        self.T = MSMLib.estimate_transition_matrix(counts)

        # NOTE(review): the 0 is presumably the starting state -- confirm
        # against msm_analysis.sample.
        sampled = msm_analysis.sample(self.T, 0, self.num_steps)
        self.state_traj = np.array(sampled)
Beispiel #55
0
def GetCorrelationCorrectedEigensystem(T,
                                       NumEigen,
                                       Assignments,
                                       MultiplicativeFactor=6,
                                       CorrelationLength=50):
    """Get the slowest eigenvalues of a system, correcting for non-Markovian bias.

    Eigenvalues estimated using a short lagtime model generally yield
    timescales that are far too fast; often those timescales would be slower
    when estimated with a longer lagtime model. This is particularly bad when
    using PCCA or PCCA+, as sometimes the short-timescale eigenvalues are not
    in correct rank order, which leads to poor state decompositions. This
    function uses an eigenvector correlation function analysis to get the
    'long lagtime' corrected eigenvalues, in the correct order. These
    eigenvalues can then be used to yield a better state decomposition.

    Inputs:
    T: Transition Matrix
    NumEigen: Number of eigenvalues to get
    Assignments: Assignments array

    Optional Arguments:
    MultiplicativeFactor: NumEigen * MultiplicativeFactor eigenvalues are
        computed before correction; the NumEigen slowest corrected ones are
        kept.
    CorrelationLength: forwarded to ReOrderEigensystem.

    Returns:
    eigVals, CorrelationEigVals, eigVecs, eigVecs_Right, each re-ordered by
    the index array from ReOrderEigensystem and truncated to NumEigen
    entries/columns.

    The implied timescales (-1 / log) of both sets of eigenvalues are printed.
    """
    # Compute more eigenvalues than requested, correct them, and afterwards
    # keep only the slowest *corrected* ones.
    n_candidates = NumEigen * MultiplicativeFactor
    eigVals, eigVecs = MSMLib.GetEigenvectors(T, n_candidates)

    # Right eigenvectors: every column divided elementwise by the first
    # column, which the original author identifies as the stationary vector.
    stationary = eigVecs[:, 0]
    eigVecs_Right = eigVecs.copy()
    for column in range(n_candidates):
        eigVecs_Right[:, column] /= stationary

    Ind, CorrelationEigVals = ReOrderEigensystem(
        Assignments,
        eigVals,
        eigVecs_Right,
        CorrelationLength=CorrelationLength)

    # Apply the corrected ordering, then keep the first NumEigen entries.
    slowest = slice(0, NumEigen)
    eigVals = eigVals[Ind][slowest]
    eigVecs = eigVecs[:, Ind][:, slowest]
    eigVecs_Right = eigVecs_Right[:, Ind][:, slowest]
    CorrelationEigVals = CorrelationEigVals[Ind][slowest]

    print(-1 / np.log(eigVals))
    print(-1 / np.log(CorrelationEigVals))

    return eigVals, CorrelationEigVals, eigVecs, eigVecs_Right
Beispiel #56
0
 def test_2(self):
     """build_msm with symmetrize=None: counts pass through, no populations."""
     msm_parts = MSMLib.build_msm(self.assignments, self.lag_time,
                                  symmetrize=None)
     counts, rev_counts, t_matrix, populations, mapping = msm_parts

     expected_counts = np.matrix('7 5; 4 2')
     expected_rev_counts = np.matrix([[7, 5], [4, 2]])
     expected_t = np.matrix([[0.58333333, 0.41666667],
                             [0.66666667, 0.33333333]])

     npt.assert_array_equal(counts.todense(), expected_counts)
     npt.assert_array_almost_equal(rev_counts.todense(), expected_rev_counts)
     npt.assert_array_almost_equal(t_matrix.todense(), expected_t)
     assert populations is None
     npt.assert_array_equal(mapping, [0, 1])
def run(project, assignments, num_confs_per_state, random_source=None):
    """
    Pull random confs from each state in an MSM

    Parameters
    ----------
    project : msmbuilder.Project
        Used to load up the trajectories, get topology
    assignments : np.ndarray, dtype=int
        State membership for each frame
    num_confs_per_state : int
        number of conformations to pull from each state
    random_source : numpy.random.RandomState, optional
        If supplied, random numbers will be pulled from this random source,
        instead of the default, which is np.random. This argument is used
        for testing, to ensure that the random number generator always
        gives the same stream.

    Returns
    -------
    output
        The object returned by ``project.load_conf()``, with its
        ``'XYZList'`` entry replaced by the stacked sampled coordinates.

    Notes
    -----
    A new random_source can be initialized by calling numpy.random.RandomState(seed)
    with whatever seed you like. See http://stackoverflow.com/questions/5836335/consistenly-create-same-random-numpy-array
    for some discussion.

    Frames are drawn with replacement: each draw picks an independent random
    index into the state's frame list.
    """

    if random_source is None:
        random_source = np.random

    # The number of states is one more than the largest state index.
    n_states = assignments.max() + 1
    logger.info("Pulling %s confs for each of %s states", num_confs_per_state,
                n_states)

    inv = MSMLib.invert_assignments(assignments)
    xyzlist = []
    for s in range(n_states):
        trj, frame = inv[s]
        # trj and frame are a list of indices, such that
        # project.load_traj(trj[i])[frame[i]] is a frame assigned to state s
        for _ in range(num_confs_per_state):
            r = random_source.randint(len(trj))
            xyz = Trajectory.read_frame(project.traj_filename(trj[r]),
                                        frame[r])
            xyzlist.append(xyz)

    # xyzlist is now a list of (n_atoms, 3) arrays, and we're going
    # to stack it along the third dimension
    xyzlist = np.dstack(xyzlist)
    # load up the conf to get the topology, then put in the new coordinates
    output = project.load_conf()
    output['XYZList'] = xyzlist

    return output
Beispiel #58
0
 def test_3(self):
     """build_msm with symmetrize='Transpose' against reference values."""
     counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
         self.assignments, self.lag_time, symmetrize='Transpose')

     expected_counts = np.matrix('7 5; 4 2')
     expected_rev_counts = np.matrix([[7, 4.5], [4.5, 2]])
     expected_t = np.matrix([[0.60869565, 0.39130435],
                             [0.69230769, 0.30769231]])
     expected_populations = [0.63888889, 0.36111111]

     npt.assert_array_equal(counts.todense(), expected_counts)
     npt.assert_array_almost_equal(rev_counts.todense(), expected_rev_counts)
     npt.assert_array_almost_equal(t_matrix.todense(), expected_t)
     npt.assert_array_almost_equal(populations, expected_populations)
     npt.assert_array_equal(mapping, [0, 1])
Beispiel #59
0
 def test_1(self):
     """build_msm with symmetrize='MLE' against reference values."""
     counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
         self.assignments, self.lag_time, symmetrize='MLE')

     expected_counts = np.matrix('7 5; 4 2')
     expected_rev_counts = np.matrix([[6.46159184, 4.61535527],
                                      [4.61535527, 2.30769762]])
     expected_t = np.matrix([[0.58333689, 0.41666311],
                             [0.66666474, 0.33333526]])
     expected_populations = [0.61538595, 0.38461405]

     npt.assert_array_equal(counts.todense(), expected_counts)
     npt.assert_array_almost_equal(rev_counts.todense(), expected_rev_counts)
     npt.assert_array_almost_equal(t_matrix.todense(), expected_t)
     npt.assert_array_almost_equal(populations, expected_populations)
     npt.assert_array_equal(mapping, [0, 1])