def test(self):
    """Check PCCA lumping against the stored reference mapping and assignments.

    Macrostate labels produced by a new lumping may be ordered differently
    from the reference, so the new labels are permuted onto the reference
    labelling before the comparison.
    """
    num_macro = 5
    TC = get("PCCA_ref/tProb.mtx")
    A = get("PCCA_ref/Assignments.Fixed.h5")['arr_0']

    macro_map, macro_assign = PCCA.run_pcca(num_macro, A, TC)
    r_macro_map = get("PCCA_ref/MacroMapping.dat")

    # The builtin ``int`` is what the removed ``np.int`` alias pointed to.
    macro_map = macro_map.astype(int)
    r_macro_map = r_macro_map.astype(int)

    # For every new macrostate label, look up the reference label of the
    # first microstate that carries it.
    permutation_mapping = np.zeros(macro_assign.max() + 1, 'int')
    for i in range(num_macro):
        j = np.where(macro_map == i)[0][0]
        permutation_mapping[i] = r_macro_map[j]

    macro_map_permuted = permutation_mapping[macro_map]
    MSMLib.apply_mapping_to_assignments(macro_assign, permutation_mapping)

    r_macro_assign = get("PCCA_ref/MacroAssignments.h5")['arr_0']

    eq(macro_map_permuted, r_macro_map)
    eq(macro_assign, r_macro_assign)
def classic(trajs, n_clusters, n_medoid_iters, metric, dim=2, lag_time=1,
            show=False, desc=None):
    """Cluster with hybrid k-medoids, then build and analyze an MSM.

    The generators are plotted (and shown when ``show`` is true), a count
    matrix is built from the clusterer's assignments at ``lag_time`` and
    handed to ``msml.build_msm``; the resulting transition matrix is passed
    to ``analyze_msm`` and returned.
    """
    if desc is None:
        label = "Classic, n_clusters=%d" % n_clusters
    else:
        label = desc

    clusterer = clustering.HybridKMedoids(metric, trajs, k=n_clusters,
                                          local_num_iters=n_medoid_iters)
    generators = clusterer.get_generators_as_traj()
    # Index 0 along the second axis, first `dim` entries of the third axis.
    centroids_nf = generators['XYZList'][:, 0, 0:dim]

    plot_centroids(centroids_nf)
    if show:
        pp.show()

    assignments = clusterer.get_assignments()
    counts = msml.get_count_matrix_from_assignments(assignments, n_clusters,
                                                    lag_time)
    _rev_counts, t_matrix, _populations, _mapping = msml.build_msm(counts)

    analyze_msm(t_matrix, centroids_nf, label, show=show)
    return t_matrix
def build_msm(self, lag_time=None):
    """Build an MSM from the loaded trajectories.

    When ``lag_time`` is None the stored ``self.good_lag_time`` is used;
    otherwise the given value is used and also stored as the new
    ``self.good_lag_time``.

    Every frame is assigned to the generator with the smallest distance
    under ``classic.Euclidean2d``. Returns the transition matrix from
    ``msml.build_msm``.
    """
    if lag_time is None:
        lag_time = self.good_lag_time
    else:
        self.good_lag_time = lag_time

    # Do assignment
    trajs = get_data.get_shimtraj_from_trajlist(self.traj_list)
    metric = classic.Euclidean2d()

    # Allocate array; -1 marks positions past the end of shorter trajectories.
    n_trajs = len(self.traj_list)
    max_traj_len = max(t.shape[0] for t in self.traj_list)
    assignments = -1 * np.ones((n_trajs, max_traj_len), dtype='int')

    # Prepare generators
    pgens = metric.prepare_trajectory(
        self.clusterer.get_generators_as_traj())

    for i, traj in enumerate(trajs):
        ptraj = metric.prepare_trajectory(traj)
        # `range` replaces the Python-2-only `xrange`.
        for j in range(len(traj)):
            d = metric.one_to_all(ptraj, pgens, j)
            assignments[i, j] = np.argmin(d)

    counts = msml.get_count_matrix_from_assignments(assignments,
                                                    n_states=None,
                                                    lag_time=lag_time)
    _rev_counts, t_matrix, _populations, _mapping = msml.build_msm(counts)
    return t_matrix
def run_pcca_plus(num_macrostates, assignments, tProb, output_dir,
                  flux_cutoff=0.0, objective_function="crispness",
                  do_minimization=True):
    """Run PCCA+ and write its results into ``output_dir``.

    Four files are written: Chi.dat, A.dat, MacroMapping.dat and
    MacroAssignments.h5. ``arglib.die_if_path_exists`` is called on all four
    paths before any work is done. ``assignments`` is passed through
    ``MSMLib.apply_mapping_to_assignments`` and then saved. Nothing is
    returned.
    """
    def _in_output_dir(filename):
        return os.path.join(output_dir, filename)

    MacroAssignmentsFn = _in_output_dir("MacroAssignments.h5")
    MacroMapFn = _in_output_dir("MacroMapping.dat")
    ChiFn = _in_output_dir('Chi.dat')
    AFn = _in_output_dir('A.dat')

    arglib.die_if_path_exists([MacroAssignmentsFn, MacroMapFn, ChiFn, AFn])

    logger.info("Running PCCA+...")
    pcca_kwargs = dict(flux_cutoff=flux_cutoff,
                       do_minimization=do_minimization,
                       objective_function=objective_function)
    A, chi, _vr, MAP = lumping.pcca_plus(tProb, num_macrostates,
                                         **pcca_kwargs)

    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(ChiFn, chi)
    np.savetxt(AFn, A)
    np.savetxt(MacroMapFn, MAP, "%d")
    msmbuilder.io.saveh(MacroAssignmentsFn, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                ChiFn, AFn, MacroMapFn, MacroAssignmentsFn)
def build_msm(self, lag_time=None):
    """Build an MSM from the loaded trajectories.

    ``lag_time`` defaults to ``self.good_lag_time``; an explicit value
    replaces ``self.good_lag_time``. Frames are assigned to their nearest
    generator under ``classic.Euclidean2d`` and the transition matrix
    returned by ``msml.build_msm`` is handed back.
    """
    if lag_time is not None:
        self.good_lag_time = lag_time
    else:
        lag_time = self.good_lag_time

    # Do assignment
    trajs = get_data.get_shimtraj_from_trajlist(self.traj_list)
    metric = classic.Euclidean2d()

    # Allocate array, padded with -1 where a trajectory is shorter than
    # the longest one.
    n_trajs = len(self.traj_list)
    max_traj_len = max(t.shape[0] for t in self.traj_list)
    assignments = -1 * np.ones((n_trajs, max_traj_len), dtype='int')

    # Prepare generators
    generators = self.clusterer.get_generators_as_traj()
    pgens = metric.prepare_trajectory(generators)

    for i, traj in enumerate(trajs):
        ptraj = metric.prepare_trajectory(traj)
        # `xrange` only exists on Python 2; `range` works on both.
        for j in range(len(traj)):
            distances = metric.one_to_all(ptraj, pgens, j)
            assignments[i, j] = np.argmin(distances)

    counts = msml.get_count_matrix_from_assignments(assignments,
                                                    n_states=None,
                                                    lag_time=lag_time)
    _rev_counts, t_matrix, _populations, _mapping = msml.build_msm(counts)
    return t_matrix
def test(self):
    """Compare a fresh PCCA lumping with the stored reference results.

    The order of macrostates might differ between the reference and the
    new lumping, so a permutation matching the two labellings is built
    first and applied to both the mapping and the assignments.
    """
    num_macro = 5
    TC = get("PCCA_ref/tProb.mtx")
    A = get("PCCA_ref/Assignments.Fixed.h5")['arr_0']

    macro_map, macro_assign = PCCA.run_pcca(num_macro, A, TC)
    r_macro_map = get("PCCA_ref/MacroMapping.dat")

    # ``np.int`` was a plain alias of the builtin ``int`` and no longer
    # exists in current NumPy releases.
    macro_map = macro_map.astype(int)
    r_macro_map = r_macro_map.astype(int)

    permutation_mapping = np.zeros(macro_assign.max() + 1, 'int')
    for i in range(num_macro):
        # first microstate carrying new label i -> its reference label
        j = np.where(macro_map == i)[0][0]
        permutation_mapping[i] = r_macro_map[j]

    macro_map_permuted = permutation_mapping[macro_map]
    MSMLib.apply_mapping_to_assignments(macro_assign, permutation_mapping)

    r_macro_assign = get("PCCA_ref/MacroAssignments.h5")['arr_0']

    eq(macro_map_permuted, r_macro_map)
    eq(macro_assign, r_macro_assign)
def test_apply_mapping_to_assignments_1():
    """A mapping that sends every state to 1 must yield all-ones assignments."""
    n_states = 100
    assignments = np.random.randint(n_states, size=(10, 10))
    all_to_one = np.ones(n_states)

    MSMLib.apply_mapping_to_assignments(assignments, all_to_one)

    expected = np.ones((10, 10))
    eq(assignments, expected)
def test_estimate_rate_matrix_1():
    """Regression check of the estimated rate matrix for seeded two-state data."""
    np.random.seed(42)
    assignments = np.random.randint(2, size=(10, 10))

    counts = MSMLib.get_count_matrix_from_assignments(assignments)
    rate_matrix = MSMLib.estimate_rate_matrix(counts, assignments)
    K = rate_matrix.todense()

    correct = np.matrix([[-40.40909091, 0.5],
                         [0.33928571, -50.55357143]])
    eq(K, correct)
def test_1(self):
    """MLE-symmetrized, ergodically trimmed MSM matches the stored values."""
    counts = MSMLib.get_count_matrix_from_assignments(self.assignments, 2)
    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        counts, symmetrize="MLE", ergodic_trimming=True)

    expected_rev_counts = np.matrix([[6.46159184, 4.61535527],
                                     [4.61535527, 2.30769762]])
    eq(rev_counts.todense(), expected_rev_counts, decimal=4)

    expected_t_matrix = np.matrix([[0.58333689, 0.41666311],
                                   [0.66666474, 0.33333526]])
    eq(t_matrix.todense(), expected_t_matrix, decimal=4)

    expected_populations = np.array([0.61538595, 0.38461405])
    eq(populations, expected_populations, decimal=5)

    expected_mapping = np.array([0, 1])
    eq(mapping, expected_mapping)
def test_get_count_matrix_from_assignments_3():
    """Lag-2 count matrices, without and with a sliding window, match stored values."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    cases = [
        (False, np.matrix([[5.0, 3.0, 4.0],
                           [2.0, 12.0, 3.0],
                           [4.0, 3.0, 4.0]])),
        (True, np.matrix([[8.0, 9.0, 11.0],
                          [5.0, 18.0, 6.0],
                          [11.0, 5.0, 7.0]])),
    ]
    for sliding_window, expected in cases:
        counts = MSMLib.get_count_matrix_from_assignments(
            assignments, lag_time=2, sliding_window=sliding_window)
        eq(counts.todense(), expected)
def run_pcca_plus(num_macrostates, assignments, tProb, flux_cutoff=0.0,
                  objective_function="crispness", do_minimization=True):
    """Run PCCA+ on ``tProb`` and apply the resulting map to ``assignments``.

    Returns the tuple ``(chi, A, MAP, assignments)``, where the first three
    come from ``lumping.pcca_plus`` and ``assignments`` is the object that
    was passed in, after ``MSMLib.apply_mapping_to_assignments`` was called
    on it.
    """
    logger.info("Running PCCA+...")

    lumping_result = lumping.pcca_plus(
        tProb,
        num_macrostates,
        flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function,
    )
    A, chi, _vr, MAP = lumping_result

    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    return chi, A, MAP, assignments
def construct_counts_matrix(assignments):
    """Build and return a counts matrix from assignments.

    Raw sliding-window counts are computed at lag time ``Project().lagtime``
    and then symmetrized according to ``Project().symmetrize``
    ('transpose', 'mle', or 'none' / any falsy value for no symmetrization).

    Ergodic trimming is not supported: the mapping from trimmed to untrimmed
    states would have to be tracked for joint clustering to be right.

    Parameters
    ----------
    assignments : np.ndarray
        2D array of MSMBuilder assignments

    Returns
    -------
    counts : scipy.sparse.csr_matrix
        transition counts

    Raises
    ------
    NotImplementedError
        If ``Project().trim`` is set.
    ValueError
        If ``Project().symmetrize`` is not a recognized method.
    """
    n_states = np.max(assignments.flatten()) + 1
    raw_counts = MSMLib.get_count_matrix_from_assignments(
        assignments, n_states, lag_time=Project().lagtime,
        sliding_window=True)

    if Project().trim:
        # The trimming code that used to follow this raise could never run
        # and has been dropped.
        raise NotImplementedError(('Trimming is not yet supported because '
                                   'we need to keep track of the mapping from trimmed to '
                                   ' untrimmed states for joint clustering to be right'))

    logger.info("Ignoring ergodic trimming")
    counts = raw_counts

    symmetrize = Project().symmetrize
    if symmetrize == 'transpose':
        logger.debug('Transpose symmetrizing')
        counts = counts + counts.T
    elif symmetrize == 'mle':
        logger.debug('MLE symmetrizing')
        counts = MSMLib.mle_reversible_count_matrix(counts)
    elif symmetrize == 'none' or (not symmetrize):
        logger.debug('Skipping symmetrization')
    else:
        raise ValueError("Could not understand symmetrization method: %s"
                         % symmetrize)

    return counts
def run_pcca(num_macrostates, assignments, tProb):
    """Lump microstates with PCCA and map ``assignments`` onto macrostates.

    Raises ValueError when the number of distinct assigned states (entries
    equal to -1 are ignored) differs from ``tProb.shape[0]``. Returns
    ``(MAP, assignments)``.
    """
    logger.info("Running PCCA...")

    assigned = assignments[np.where(assignments != -1)]
    n_assigned_states = len(np.unique(assigned))
    if n_assigned_states != tProb.shape[0]:
        raise ValueError('Different number of states in assignments and tProb!')

    MAP = lumping.PCCA(tProb, num_macrostates)

    # Apply the map to the assignments; the helper is expected to leave the
    # -1 entries (frames without data) alone.
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    return MAP, assignments
def test_get_count_matrix_from_assignments_3():
    """Check lag-2 counts for both settings of ``sliding_window``."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    expected_no_window = np.matrix([[5., 3., 4.],
                                    [2., 12., 3.],
                                    [4., 3., 4.]])
    counts_no_window = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=2, sliding_window=False)
    eq(counts_no_window.todense(), expected_no_window)

    expected_window = np.matrix([[8., 9., 11.],
                                 [5., 18., 6.],
                                 [11., 5., 7.]])
    counts_window = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=2, sliding_window=True)
    eq(counts_window.todense(), expected_window)
def test_trim_states():
    """``trim_states`` fed with ``ergodic_trim_indices`` must equal ``ergodic_trim``."""
    dense = np.matrix('2 1 0; 1 2 0; 0 0 1')
    counts = scipy.sparse.csr_matrix(dense)

    # one-shot ergodic trim as the reference
    reference, _mapping = MSMLib.ergodic_trim(counts)

    # the same result via the two-step route
    to_trim = MSMLib.ergodic_trim_indices(counts)
    two_step = MSMLib.trim_states(to_trim, counts, assignments=None)

    eq(reference.todense(), two_step.todense())
def msm(traj_list, n_clusters, n_medoid_iters=10, lag_time=1,
        distance_cutoff=None):
    """Cluster the trajectories and build an MSM from the assignments.

    ``cluster`` is called with the trajectory list and clustering settings;
    a count matrix is built from the clusterer's assignments at ``lag_time``
    and passed to ``msml.build_msm``. Returns the transition matrix.
    """
    # Function-call form prints the same text on Python 2 and is valid on
    # Python 3 as well.
    print("Building a classic MSM")

    hkm = cluster(traj_list, n_clusters, n_medoid_iters, distance_cutoff)

    counts = msml.get_count_matrix_from_assignments(hkm.get_assignments(),
                                                    n_clusters, lag_time)
    _rev_counts, t_matrix, _populations, _mapping = msml.build_msm(counts)
    return t_matrix
def test_get_count_matrix_from_assignments_3():
    """Lag-2 count matrices with and without a sliding window equal the stored ones."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    def dense_counts(sliding_window):
        sparse = MSMLib.get_count_matrix_from_assignments(
            assignments, lag_time=2, sliding_window=sliding_window)
        return sparse.todense()

    npt.assert_equal(dense_counts(False),
                     np.matrix([[5., 3., 4.],
                                [2., 12., 3.],
                                [4., 3., 4.]]))

    npt.assert_equal(dense_counts(True),
                     np.matrix([[8., 9., 11.],
                                [5., 18., 6.],
                                [11., 5., 7.]]))
def test_apply_mapping_to_assignments_2():
    """Applying a mapping must preserve the -1 entries."""
    n_states = 100
    assignments = np.random.randint(n_states, size=(10, 10))
    assignments[0, 0] = -1

    all_to_one = np.ones(n_states)
    MSMLib.apply_mapping_to_assignments(assignments, all_to_one)

    # every entry becomes 1 except the -1 planted at [0, 0]
    expected = np.ones((10, 10))
    expected[0, 0] = -1
    eq(assignments, expected)
def run_pcca(num_macrostates, assignments, tProb):
    """Run PCCA and relabel ``assignments`` with its microstate mapping.

    A ValueError is raised if the count of distinct states in
    ``assignments`` (ignoring -1 entries) is not ``tProb.shape[0]``.
    Returns ``(MAP, assignments)`` with ``MAP`` taken from the PCCA
    object's ``microstate_mapping`` attribute.
    """
    logger.info("Running PCCA...")

    has_data = np.where(assignments != -1)
    distinct_states = np.unique(assignments[has_data])
    if len(distinct_states) != tProb.shape[0]:
        raise ValueError(
            'Different number of states in assignments and tProb!')

    pcca_model = lumping.PCCA(tProb, num_macrostates)
    MAP = pcca_model.microstate_mapping

    # Map the assignments onto macrostates; -1 entries mark frames without
    # data and the helper is expected to leave them untouched.
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    return MAP, assignments
def compare_kyle_to_lutz(self, raw_counts):
    """Check that the current MLE estimator agrees with the older one.

    Kyle wrote the most recent MLE code; the previous implementation was
    written by Lutz. Both are run on the ergodically trimmed counts,
    normalized to unit sum, and compared.
    """
    trim_result = MSMLib.ergodic_trim(raw_counts)
    counts = trim_result[0]

    x_kyle = MSMLib.mle_reversible_count_matrix(counts)
    kyle_total = x_kyle.sum()
    x_kyle /= kyle_total

    x_lutz = MSMLib.__mle_reversible_count_matrix_lutz__(counts)
    lutz_total = x_lutz.sum()
    x_lutz /= lutz_total

    eq(x_kyle.toarray(), x_lutz.toarray())
def test_estimate_rate_matrix_2():
    """Dense and sparse count matrices must give the same transition matrix."""
    np.random.seed(42)
    dense_counts = np.random.randint(100, size=(4, 4))
    sparse_counts = scipy.sparse.csr_matrix(dense_counts)

    from_dense = MSMLib.estimate_transition_matrix(dense_counts)
    from_sparse = MSMLib.estimate_transition_matrix(sparse_counts)

    expected = np.array([
        [0.22368421, 0.40350877, 0.06140351, 0.31140351],
        [0.24193548, 0.08064516, 0.33064516, 0.34677419],
        [0.22155689, 0.22155689, 0.26047904, 0.29640719],
        [0.23469388, 0.02040816, 0.21428571, 0.53061224],
    ])

    eq(from_dense, expected)
    eq(from_dense, np.array(from_sparse.todense()))
def test_1(self):
    """Build an MLE-symmetrized MSM with ergodic trimming and check every output."""
    C = MSMLib.get_count_matrix_from_assignments(self.assignments, 2)
    rc, t, p, m = MSMLib.build_msm(C, symmetrize='MLE',
                                   ergodic_trimming=True)

    # (actual, expected, extra keyword arguments for eq)
    checks = [
        (rc.todense(),
         np.matrix([[6.46159184, 4.61535527], [4.61535527, 2.30769762]]),
         {'decimal': 4}),
        (t.todense(),
         np.matrix([[0.58333689, 0.41666311], [0.66666474, 0.33333526]]),
         {'decimal': 4}),
        (p, np.array([0.61538595, 0.38461405]), {'decimal': 5}),
        (m, np.array([0, 1]), {}),
    ]
    for actual, expected, eq_kwargs in checks:
        eq(actual, expected, **eq_kwargs)
def build_new(centroids, trajs, fuzziness, dist, soft=True, neigen=4,
              show=False, desc=None):
    """Build an MSM from points and centroids, then analyze it.

    Time pairs are obtained from ``get_giant_state_list`` (which receives
    the ``soft`` flag selecting fuzzy or quantized memberships), turned into
    a count matrix and passed to ``msml.build_msm``. The transition matrix
    is handed to ``analyze_msm``; nothing is returned.

    When ``desc`` is None a default description is chosen from ``soft``.
    """
    n_states = len(centroids)

    time_pairs = get_giant_state_list(centroids, trajs, fuzziness, dist,
                                      soft=soft)
    print("Got state list")

    counts_mat = buildmsm.get_counts_from_pairs(time_pairs, n_states)
    print("Got count matrix")

    _rev_counts, t_matrix, _populations, _mapping = msml.build_msm(counts_mat)

    if desc is None:
        desc = 'New, Fuzzy' if soft else 'New, not-so-fuzzy'

    analyze_msm(t_matrix, centroids, desc=desc, show=show, neigen=neigen)
def run_pcca(num_macrostates, assignments, tProb, output_dir):
    """Run PCCA and write the macrostate map and mapped assignments.

    MacroMapping.dat and MacroAssignments.h5 are written into
    ``output_dir``; ``arglib.die_if_path_exists`` is called on both paths
    first. Nothing is returned.
    """
    macro_assignments_path = os.path.join(output_dir, "MacroAssignments.h5")
    macro_map_path = os.path.join(output_dir, "MacroMapping.dat")
    arglib.die_if_path_exists([macro_assignments_path, macro_map_path])

    logger.info("Running PCCA...")
    MAP = lumping.PCCA(tProb, num_macrostates)

    # Map the assignments onto macrostates; the helper is expected to leave
    # -1 entries (frames without data) alone.
    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(macro_map_path, MAP, "%d")
    msmbuilder.io.saveh(macro_assignments_path, assignments)
    logger.info("Saved output to: %s, %s",
                macro_assignments_path, macro_map_path)
def test_estimate_transition_matrix_1():
    """Transition matrix of a small fixed count matrix matches stored values."""
    np.random.seed(42)
    count_matrix = np.array([[6, 3, 7],
                             [4, 6, 9],
                             [2, 6, 7]])
    expected = np.array([[0.375, 0.1875, 0.4375],
                         [0.21052632, 0.31578947, 0.47368421],
                         [0.13333333, 0.4, 0.46666667]])

    t = MSMLib.estimate_transition_matrix(count_matrix)

    eq(t, expected)
def test_j_PCCA(self):
    """Run PCCA into the working directory and compare with the reference.

    The order of macrostates might differ between the reference and the new
    lumping, so a permutation matching them is built and applied before the
    mapping and the assignments are compared.
    """
    work_data = os.path.join(WorkingDir, "Data")
    ref_data = os.path.join(ReferenceDir, "Data")

    TC = scipy.io.mmread(os.path.join(work_data, "tProb.mtx"))
    A = io.loadh(os.path.join(work_data, "Assignments.Fixed.h5"), 'arr_0')
    PCCA.run_pcca(NumMacroStates, A, TC, work_data)

    mm = np.loadtxt(os.path.join(work_data, "MacroMapping.dat"), 'int')
    mm_r = np.loadtxt(os.path.join(ref_data, "MacroMapping.dat"), 'int')
    ma = io.loadh(os.path.join(work_data, "MacroAssignments.h5"), 'arr_0')
    ma_r = io.loadh(os.path.join(ref_data, "MacroAssignments.h5"), 'Data')

    num_macro = NumMacroStates
    permutation_mapping = np.zeros(num_macro, 'int')
    for new_label in range(num_macro):
        # first microstate with this new label -> its reference label
        first_microstate = np.where(mm == new_label)[0][0]
        permutation_mapping[new_label] = mm_r[first_microstate]

    mm_permuted = permutation_mapping[mm]
    MSMLib.ApplyMappingToAssignments(ma, permutation_mapping)

    npt.assert_array_almost_equal(mm_permuted, mm_r)
    npt.assert_array_almost_equal(ma, ma_r)
def ndgrid_msm_likelihood_score(estimator, sequences):
    """Log-likelihood score function for an (NDGrid, MarkovStateModel) pipeline

    Parameters
    ----------
    estimator : sklearn.pipeline.Pipeline
        A pipeline estimator containing an NDGrid followed by a
        MarkovStateModel
    sequences: list of array-like, each of shape (n_samples_i, n_features)
        Data sequences, where n_samples_i in the number of samples in
        sequence i and n_features is the number of features.

    Returns
    -------
    log_likelihood : float
        Mean log-likelihood per data point.

    Raises
    ------
    NotImplementedError
        If the grid has more than one feature.

    Examples
    --------
    >>> pipeline = Pipeline([
    >>>    ('grid', NDGrid()),
    >>>    ('msm', MarkovStateModel())
    >>> ])
    >>> grid = GridSearchCV(pipeline, param_grid={
    >>>    'grid__n_bins_per_feature': [10, 20, 30, 40]
    >>> }, scoring=ndgrid_msm_likelihood_score)
    >>> grid.fit(dataset)
    >>> print(grid.grid_scores_)

    References
    ----------
    .. [1] McGibbon, R. T., C. R. Schwantes, and V. S. Pande. "Statistical
       Model Selection for Markov Models of Biomolecular Dynamics." J. Phys.
       Chem B. (2014)
    """
    import msmbuilder.MSMLib as msmlib
    from mixtape import cluster

    # First pipeline step of each required type.
    grid_steps = [step for (_, step) in estimator.steps
                  if isinstance(step, cluster.NDGrid)]
    msm_steps = [step for (_, step) in estimator.steps
                 if isinstance(step, MarkovStateModel)]
    grid = grid_steps[0]
    msm = msm_steps[0]

    # NDGrid supports min/max being different along different directions,
    # which would make the bin widths coordinate dependent. That case is
    # not implemented; only 1D data is handled.
    if grid.n_features != 1:
        raise NotImplementedError("file an issue on github :)")

    dense_transmat = np.asarray(msm.transmat_.todense())
    logtransmat = np.nan_to_num(np.log(dense_transmat))
    width = grid.grid[0, 1] - grid.grid[0, 0]

    transition_log_likelihood = 0
    emission_log_likelihood = 0
    for X in grid.transform(sequences):
        raw_counts = msmlib.get_counts_from_traj(X, n_states=grid.n_bins)
        mapped_counts = _apply_mapping_to_matrix(raw_counts, msm.mapping_)
        counts = np.asarray(mapped_counts.todense())

        transition_log_likelihood += np.multiply(counts, logtransmat).sum()
        emission_log_likelihood += -np.log(width) * len(X)

    n_points = sum(len(x) for x in sequences)
    total = transition_log_likelihood + emission_log_likelihood
    return total / n_points
def main(assfile, lag, nproc):
    """Write 100 resampled count matrices for one sub-sampled assignments file.

    Parameters
    ----------
    assfile : str
        Path of an assignments file whose name contains
        ``Assignments_sub<num>.h5``; ``<num>`` is parsed from the name.
    lag : int or str
        Lag time (converted with ``int``) used for the count matrix.
    nproc : int or str
        Number of worker processes (converted with ``int``); must be >= 1.

    Notes
    -----
    The file ``times.h5`` next to ``assfile`` is read as text; its second
    column is used as key and its third column as value of a lookup table,
    and the value for ``<num>`` is the per-row sample count handed to the
    workers. Output goes to ``<dir>/boot-sub<num>/tCounts-<i>``.

    Raises
    ------
    ValueError
        If ``nproc`` is smaller than 1.
    """
    lag = int(lag)
    nproc = int(nproc)
    if nproc < 1:
        raise ValueError("nproc must be at least 1, got %s" % nproc)
    n_samples = 100

    loaded = io.loadh(assfile)
    num = int(assfile.split('Assignments_sub')[1].split('.h5')[0])
    data_dir = os.path.dirname(assfile)
    newdir = '%s/boot-sub%s' % (data_dir, num)

    # ndmin=2 keeps the table two-dimensional even for a one-line file.
    table = numpy.loadtxt('%s/times.h5' % data_dir, usecols=(1, 2), ndmin=2)
    times = dict(zip(table[:, 0], table[:, 1]))

    # The loaded project is not used below; the call is kept so that a
    # missing or unreadable project file still fails early.
    Project.load_from('%s/ProjectInfo.yaml' % data_dir.split('Data')[0])
    multinom = int(times[num])

    if not os.path.exists(newdir):
        os.mkdir(newdir)

    if 'Data' in loaded.keys():
        Assignments = loaded['Data']
    else:
        Assignments = loaded['arr_0']
    print(Assignments.shape)

    NumStates = max(Assignments.flatten()) + 1
    Counts = MSMLib.get_count_matrix_from_assignments(
        Assignments, lag_time=lag, sliding_window=True)
    T = numpy.array(Counts.todense() * (1.0 / lag))
    # Empty entries are set to 1 -- presumably a pseudo-count so that no
    # transition has zero sampling probability; TODO confirm.
    T[T == 0] = 1

    total_rounds = -(-n_samples // nproc)  # ceiling division
    print("%s total iterations" % total_rounds)
    print("iterating thru tCount samples")

    iteration = 0
    while iteration < n_samples:
        # The last round is shortened so exactly n_samples files are written.
        batch = min(nproc, n_samples - iteration)
        print("sampling iteration %s" % iteration)

        # Each job is the (matrix, sample count, number of states) triple
        # that parallel_get_matrix unpacks.
        jobs = [(T.copy(), multinom, NumStates)] * batch

        pool = multiprocessing.Pool(processes=batch)
        try:
            result = pool.map_async(parallel_get_matrix, jobs)
            result.wait()
            matrices = result.get()
        finally:
            # Always release the workers, also when a job raised.
            pool.terminate()

        # NOTE(review): every result is written as one matrix, so the
        # worker is expected to return a single count matrix -- confirm
        # against parallel_get_matrix.
        for c_matrix in matrices:
            scipy.io.mmwrite('%s/tCounts-%s' % (newdir, iteration), c_matrix)
            iteration += 1

        print("done with iteration %s" % iteration)
def test_get_count_matrix_from_assignments_1():
    """All-zero assignments must give a 1x1 count matrix holding 90.

    The assignments are created with an integer dtype: they are state
    indices, and a float array is not a valid index array.
    """
    assignments = np.zeros((10, 10), 'int')
    val = MSMLib.get_count_matrix_from_assignments(assignments).todense()
    correct = np.matrix([[90.0]])
    npt.assert_equal(val, correct)
def test_get_count_matrix_from_assignments_1():
    """Integer all-zero assignments give a single count of 90."""
    expected = np.matrix([[90.0]])
    single_state = np.zeros((10, 10), "int")

    counts = MSMLib.get_count_matrix_from_assignments(single_state)

    eq(counts.todense(), expected)
def test_get_count_matrix_from_assignments_1():
    """With every frame in state 0 the count matrix is [[90.0]]."""
    all_zero_assignments = np.zeros((10, 10), 'int')
    sparse_counts = MSMLib.get_count_matrix_from_assignments(
        all_zero_assignments)
    dense_counts = sparse_counts.todense()
    eq(dense_counts, np.matrix([[90.0]]))
def run(LagTime, assignments, Symmetrize='MLE', input_mapping="None",
        Prior=0.0, OutDir="./Data/"):
    """Build an MSM from assignments and write all results into ``OutDir``.

    Parameters
    ----------
    LagTime : int
        Lag time passed to ``MSMLib.build_msm``.
    assignments : np.ndarray
        Assignments array; both mappings are applied to it through
        ``MSMLib.apply_mapping_to_assignments`` before it is saved.
    Symmetrize : str
        Symmetrization method passed to ``MSMLib.build_msm``.
    input_mapping : array-like, "None" or None
        Optional mapping applied to the assignments before the MSM is
        built. The string "None" (the default) or None means no mapping.
    Prior : float
        Not used by this function.
    OutDir : str
        Directory receiving tProb.mtx, tCounts.mtx, Mapping.dat,
        Assignments.Fixed.h5 and Populations.dat.

    Notes
    -----
    ``arglib.die_if_path_exists`` is called on every output path before
    any work is done.
    """
    # set the filenames for output
    FnTProb = os.path.join(OutDir, "tProb.mtx")
    FnTCounts = os.path.join(OutDir, "tCounts.mtx")
    FnMap = os.path.join(OutDir, "Mapping.dat")
    FnAss = os.path.join(OutDir, "Assignments.Fixed.h5")
    FnPops = os.path.join(OutDir, "Populations.dat")

    # make sure none are taken
    outputlist = [FnTProb, FnTCounts, FnMap, FnAss, FnPops]
    arglib.die_if_path_exists(outputlist)

    # Comparing an array with "None" via != is element-wise on current
    # NumPy and cannot be used as a condition, so the sentinel is detected
    # by type first.
    string_types = (str, type(u""))
    no_mapping = input_mapping is None or (
        isinstance(input_mapping, string_types) and input_mapping == "None")
    has_input_mapping = not no_mapping

    # if given, apply mapping to assignments
    if has_input_mapping:
        MSMLib.apply_mapping_to_assignments(assignments, input_mapping)

    n_assigns_before_trim = len(np.where(assignments.flatten() != -1)[0])

    rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        assignments, lag_time=LagTime, symmetrize=Symmetrize,
        sliding_window=True, trim=True)

    MSMLib.apply_mapping_to_assignments(assignments, mapping)
    n_assigns_after_trim = len(np.where(assignments.flatten() != -1)[0])

    # if had input mapping, then update it
    if has_input_mapping:
        mapping = mapping[input_mapping]

    # Report how much data was discarded in trimming; with no assigned
    # frames at all there is nothing to discard.
    if n_assigns_before_trim:
        kept = float(n_assigns_after_trim) / float(n_assigns_before_trim)
        percent = (1.0 - kept) * 100.0
    else:
        percent = 0.0
    logger.warning("Ergodic trimming discarded: %f percent of your data",
                   percent)

    # Save all output
    np.savetxt(FnPops, populations)
    np.savetxt(FnMap, mapping, "%d")
    scipy.io.mmwrite(str(FnTProb), t_matrix)
    scipy.io.mmwrite(str(FnTCounts), rev_counts)
    msmbuilder.io.saveh(FnAss, assignments)

    for output in outputlist:
        logger.info("Wrote: %s", output)
def test_get_count_matrix_from_assignments_1():
    """A single populated state must produce the 1x1 count matrix [[90.0]].

    Assignments are state indices, so the array is built with an integer
    dtype rather than NumPy's default float.
    """
    assignments = np.zeros((10, 10), 'int')
    val = MSMLib.get_count_matrix_from_assignments(assignments).todense()
    correct = np.matrix([[90.0]])
    npt.assert_equal(val, correct)
def test_get_count_matrix_from_assignments_2():
    """Default-argument count matrix of seeded three-state data matches stored values."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    expected = np.matrix([[11., 9., 10.],
                          [9., 17., 7.],
                          [10., 7., 10.]])
    counts = MSMLib.get_count_matrix_from_assignments(assignments)

    eq(counts.todense(), expected)
def test_estimate_transition_matrix_1():
    """Check the transition matrix estimated from a fixed 3x3 count matrix."""
    np.random.seed(42)
    rows = [[6, 3, 7], [4, 6, 9], [2, 6, 7]]
    count_matrix = np.array(rows)

    estimated = MSMLib.estimate_transition_matrix(count_matrix)

    eq(estimated,
       np.array([[0.375, 0.1875, 0.4375],
                 [0.21052632, 0.31578947, 0.47368421],
                 [0.13333333, 0.4, 0.46666667]]))
def run_pcca_plus(num_macrostates, assignments, tProb, output_dir,
                  flux_cutoff=0.0, objective_function="crispness",
                  do_minimization=True):
    """Run PCCA+ and save chi, A, the macrostate map and the assignments.

    The four output files live in ``output_dir``;
    ``arglib.die_if_path_exists`` is called on all of them up front. The
    assignments are passed to ``MSMLib.apply_mapping_to_assignments`` with
    the PCCA+ map before they are written. Nothing is returned.
    """
    out_names = ("MacroAssignments.h5", "MacroMapping.dat",
                 'Chi.dat', 'A.dat')
    MacroAssignmentsFn, MacroMapFn, ChiFn, AFn = [
        os.path.join(output_dir, name) for name in out_names]

    arglib.die_if_path_exists([MacroAssignmentsFn, MacroMapFn, ChiFn, AFn])

    logger.info("Running PCCA+...")
    A, chi, _vr, MAP = lumping.pcca_plus(
        tProb, num_macrostates,
        flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function)

    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    np.savetxt(ChiFn, chi)
    np.savetxt(AFn, A)
    np.savetxt(MacroMapFn, MAP, "%d")
    msmbuilder.io.saveh(MacroAssignmentsFn, assignments)
    logger.info('Saved output to: %s, %s, %s, %s',
                ChiFn, AFn, MacroMapFn, MacroAssignmentsFn)
def build_classic_from_memberships(memberships, lag_time=1):
    """Build a classic MSM by turning a membership array into a state list.

    Each row of ``memberships`` is reduced to the index of its largest
    entry; the resulting state sequence is counted with
    ``msm.get_counts_from_traj`` and passed to ``msm.build_msm``.

    Use this to compare quantized versions of the fuzzy count matrix
    building for consistency.

    Returns ``(rev_counts, t_matrix, populations, mapping)`` exactly as
    produced by ``msm.build_msm``.
    """
    n_frames = memberships.shape[0]
    n_states = memberships.shape[1]

    states = np.zeros(n_frames, dtype='int')
    # enumerate replaces the index loop over the Python-2-only xrange.
    for i, memb in enumerate(memberships):
        states[i] = np.argmax(memb)

    counts = msm.get_counts_from_traj(states, n_states, lag_time)
    rev_counts, t_matrix, populations, mapping = msm.build_msm(counts)
    return rev_counts, t_matrix, populations, mapping
def test_get_count_matrix_from_assignments_2():
    """Count matrix of seeded random three-state assignments equals the stored one."""
    np.random.seed(42)
    assignments = np.random.randint(3, size=(10, 10))

    sparse_counts = MSMLib.get_count_matrix_from_assignments(assignments)
    dense_counts = sparse_counts.todense()

    expected_rows = [[11.0, 9.0, 10.0],
                     [9.0, 17.0, 7.0],
                     [10.0, 7.0, 10.0]]
    eq(dense_counts, np.matrix(expected_rows))
def test_4(self):
    """Unsymmetrized lag-2 sliding-window MSM: check counts, matrix and mapping."""
    counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        self.assignments, lag_time=2, symmetrize=None, sliding_window=True)

    expected_counts = np.matrix('7 4; 3 2')
    npt.assert_array_equal(counts.todense(), expected_counts)
    npt.assert_array_almost_equal(rev_counts.todense(), np.matrix('7 4; 3 2'))

    expected_t_matrix = np.matrix([[0.63636364, 0.36363636],
                                   [0.6, 0.4]])
    npt.assert_array_almost_equal(t_matrix.todense(), expected_t_matrix)

    # no populations are returned when symmetrize is None
    assert populations is None
    npt.assert_array_equal(mapping, [0, 1])
def run_pcca_plus(num_macrostates, assignments, tProb, flux_cutoff=0.0,
                  objective_function="crispness", do_minimization=True):
    """Run PCCA+ and apply its microstate mapping to ``assignments``.

    Returns ``(chi, A, MAP, assignments)``, where ``chi``, ``A`` and ``MAP``
    are the ``chi``, ``A`` and ``microstate_mapping`` attributes of the
    ``lumping.PCCAPlus`` object.
    """
    logger.info("Running PCCA+...")

    lumper = lumping.PCCAPlus(
        tProb,
        num_macrostates,
        flux_cutoff=flux_cutoff,
        do_minimization=do_minimization,
        objective_function=objective_function,
    )
    A = lumper.A
    chi = lumper.chi
    MAP = lumper.microstate_mapping

    MSMLib.apply_mapping_to_assignments(assignments, MAP)

    return chi, A, MAP, assignments
def dump_count_matrix(self, assignfn, lagtime=1, outfn="count_matrix.txt"):
    """Write the sliding-window count matrix of an assignments file as text.

    The 'arr_0' entry of ``assignfn`` is loaded, its count matrix at
    ``lagtime`` is converted to COO form, and the row indices, column
    indices and values are saved to ``outfn`` with ``np.savetxt``.
    """
    from msmbuilder import io
    from msmbuilder import MSMLib

    assignments = io.loadh(assignfn, 'arr_0')

    sparse_counts = MSMLib.get_count_matrix_from_assignments(
        assignments, lag_time=lagtime, sliding_window=True)
    coo_counts = sparse_counts.tocoo()

    triplets = (coo_counts.row, coo_counts.col, coo_counts.data)
    np.savetxt(outfn, triplets)
def test_4(self):
    """Lag-2 MSM without symmetrization reproduces the stored results."""
    result = MSMLib.build_msm(self.assignments, lag_time=2,
                              symmetrize=None, sliding_window=True)
    c, rc, t, p, m = result

    npt.assert_array_equal(c.todense(), np.matrix('7 4; 3 2'))
    npt.assert_array_almost_equal(rc.todense(), np.matrix('7 4; 3 2'))

    expected_transitions = np.matrix([[0.63636364, 0.36363636],
                                      [0.6, 0.4]])
    npt.assert_array_almost_equal(t.todense(), expected_transitions)

    assert p is None
    npt.assert_array_equal(m, [0, 1])
def test_trim_states():
    """``trim_states`` must agree with ``ergodic_trim``, with and without assignments."""
    counts = scipy.sparse.csr_matrix(np.matrix('2 1 0; 1 2 0; 0 0 1'))

    # Reference: the one-shot ergodic trim.
    reference_counts, _mapping = MSMLib.ergodic_trim(counts)

    # Two-step route without assignments.
    to_trim = MSMLib.ergodic_trim_indices(counts)
    counts_only = MSMLib.trim_states(to_trim, counts, assignments=None)
    eq(reference_counts.todense(), counts_only.todense())

    # Two-step route with assignments: one trajectory visiting every state.
    assignments = np.array([np.arange(counts.shape[0])])
    to_trim = MSMLib.ergodic_trim_indices(counts)
    _trimmed_counts, trimmed_assignments = MSMLib.trim_states(
        to_trim, counts, assignments=assignments)

    # State 2 is disconnected from the others, so it is expected as -1.
    expected_assignments = np.array([[0, 1, -1]])
    eq(trimmed_assignments, expected_assignments)
def parallel_get_matrix(input):
    """Resample a count matrix row by row and build an MSM from the sample.

    Parameters
    ----------
    input : tuple
        ``(Ttest, multinom, NumStates)``: the matrix whose rows define the
        sampling probabilities, the number of draws per row, and the number
        of states. ``Ttest`` is expected to be a square 2D array.

    Returns
    -------
    tuple
        ``(rev_counts, t_matrix, Populations, Mapping)`` as returned by
        ``MSMLib.build_msm`` with MLE symmetrization and ergodic trimming.
    """
    print("working")
    (Ttest, multinom, NumStates) = input
    n_states = int(NumStates)
    n_draws = int(multinom)

    newT = scipy.sparse.lil_matrix((n_states, n_states), dtype='float32')
    columns = numpy.arange(0, n_states)

    for i in range(0, Ttest.shape[1]):
        # Normalize the row once, in floating point, instead of
        # recomputing the row sum for every element.
        row = numpy.asarray(Ttest[i], dtype=float)
        pvals = row / row.sum()

        counts = numpy.random.multinomial(n_draws, pvals, size=1)

        # All sampled counts of this pass belong to matrix row i.
        rows = numpy.array([i] * n_states)
        newT = newT + scipy.sparse.coo_matrix(
            (counts[0], (rows, columns)), shape=(n_states, n_states))

    rev_counts, t_matrix, Populations, Mapping = MSMLib.build_msm(
        newT, symmetrize='MLE', ergodic_trimming=True)
    return rev_counts, t_matrix, Populations, Mapping
def __init__(self):
    """Set tolerances and sample a two-state trajectory for the tests.

    A fixed 2x2 count matrix is turned into a transition matrix, from which
    a state trajectory is sampled with ``msm_analysis.sample``.
    """
    max_lag = 100
    num_steps = 100000
    count_matrix = np.array([[500, 2], [2, 50]])

    self.epsilon = 1E-7
    # Confidence for uncertainty estimate. Testing is stochastic; we expect
    # errors 0.1 % of the time.
    self.alpha = 0.001

    self.max_lag = max_lag
    self.times = np.arange(max_lag)
    self.num_steps = num_steps

    self.C = count_matrix
    self.T = MSMLib.estimate_transition_matrix(self.C)

    sampled = msm_analysis.sample(self.T, 0, self.num_steps)
    self.state_traj = np.array(sampled)
def GetCorrelationCorrectedEigensystem(T, NumEigen, Assignments,
                                       MultiplicativeFactor=6,
                                       CorrelationLength=50):
    """Get the slowest eigenvalues of a system, correcting for nonmarkovian bias.

    Inputs:
    T: Transition Matrix
    NumEigen: Number of eigenvalues to get
    Assignments: Assignments array

    Optional Arguments:
    MultiplicativeFactor: NumEigen * MultiplicativeFactor eigenvalues are
        computed before the slowest corrected ones are selected.
    CorrelationLength: passed through to ReOrderEigensystem.

    Returns (eigVals, CorrelationEigVals, eigVecs, eigVecs_Right), each cut
    down to NumEigen entries / columns after re-ordering.

    Eigenvalues estimated using a short lagtime model generally yield
    timescales that are far too fast; often those timescales would be
    slower when estimated with a longer lagtime model. This is particularly
    bad when using PCCA or PCCA+, as sometimes the short-timescale
    eigenvalues are not in correct rank order, which leads to poor state
    decompositions. This function uses an eigenvector correlation function
    analysis to get the 'long lagtime' corrected eigenvalues, in the
    correct order. These can then be used to yield a better state
    decomposition.
    """
    # Compute more eigenvalues than requested, correct them, and keep the
    # slowest *corrected* ones.
    n_candidates = NumEigen * MultiplicativeFactor
    eigVals, eigVecs = MSMLib.GetEigenvectors(T, n_candidates)

    # Right eigenvectors: divide each column by the first column of eigVecs
    # (treated as the stationary vector).
    stationary = eigVecs[:, 0]
    eigVecs_Right = eigVecs.copy()
    for column in range(n_candidates):
        eigVecs_Right[:, column] /= stationary

    order, CorrelationEigVals = ReOrderEigensystem(
        Assignments, eigVals, eigVecs_Right,
        CorrelationLength=CorrelationLength)

    # Re-order everything with the returned ordering.
    CorrelationEigVals = CorrelationEigVals[order]
    eigVals = eigVals[order]
    eigVecs = eigVecs[:, order]
    eigVecs_Right = eigVecs_Right[:, order]

    # Keep the NumEigen slowest eigenvalues and eigenvectors.
    slowest = slice(0, NumEigen)
    eigVals = eigVals[slowest]
    eigVecs = eigVecs[:, slowest]
    eigVecs_Right = eigVecs_Right[:, slowest]
    CorrelationEigVals = CorrelationEigVals[slowest]

    print(-1 / np.log(eigVals))
    print(-1 / np.log(CorrelationEigVals))

    return eigVals, CorrelationEigVals, eigVecs, eigVecs_Right
def test_2(self):
    """MSM built without symmetrization matches the stored counts and matrix."""
    counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        self.assignments, self.lag_time, symmetrize=None)

    npt.assert_array_equal(counts.todense(), np.matrix('7 5; 4 2'))

    expected_rev_counts = np.matrix([[7, 5], [4, 2]])
    npt.assert_array_almost_equal(rev_counts.todense(), expected_rev_counts)

    expected_t_matrix = np.matrix([[0.58333333, 0.41666667],
                                   [0.66666667, 0.33333333]])
    npt.assert_array_almost_equal(t_matrix.todense(), expected_t_matrix)

    assert populations is None
    npt.assert_array_equal(mapping, [0, 1])
def run(project, assignments, num_confs_per_state, random_source=None):
    """
    Pull random confs from each state in an MSM

    Parameters
    ----------
    project : msmbuilder.Project
        Used to load up the trajectories, get topology
    assignments : np.ndarray, dtype=int
        State membership for each frame
    num_confs_per_state : int
        number of conformations to pull from each state
    random_source : numpy.random.RandomState, optional
        If supplied, random numbers will be pulled from this random source,
        instead of the default, which is np.random. This argument is used
        for testing, to ensure that the random number generator always
        gives the same stream.

    Returns
    -------
    output
        The object returned by ``project.load_conf()`` with its 'XYZList'
        entry replaced by the stacked coordinates of the drawn frames.

    Notes
    -----
    A new random_source can be initialized by calling
    numpy.random.RandomState(seed) with whatever seed you like. See
    http://stackoverflow.com/questions/5836335/consistenly-create-same-random-numpy-array
    for some discussion.
    """
    if random_source is None:
        random_source = np.random

    n_states = max(assignments.flatten()) + 1
    # The second number is the state count, so the message says "states".
    logger.info("Pulling %s confs for each of %s states",
                num_confs_per_state, n_states)

    inv = MSMLib.invert_assignments(assignments)
    xyzlist = []
    for s in range(n_states):
        # trj and frame are lists of indices, such that
        # project.load_traj(trj[i])[frame[i]] is a frame assigned to state s
        trj, frame = inv[s]
        for _ in range(num_confs_per_state):
            r = random_source.randint(len(trj))
            xyz = Trajectory.read_frame(project.traj_filename(trj[r]),
                                        frame[r])
            xyzlist.append(xyz)

    # xyzlist is now a list of (n_atoms, 3) arrays, and we're going
    # to stack it along the third dimension
    xyzlist = np.dstack(xyzlist)

    # load up the conf to get the topology, then put in the new coordinates
    output = project.load_conf()
    output['XYZList'] = xyzlist

    return output
def test_3(self):
    """Transpose-symmetrized MSM matches the stored counts, matrix and populations."""
    built = MSMLib.build_msm(self.assignments, self.lag_time,
                             symmetrize='Transpose')
    c, rc, t, p, m = built

    npt.assert_array_equal(c.todense(), np.matrix('7 5; 4 2'))

    symmetrized_counts = np.matrix([[7, 4.5], [4.5, 2]])
    npt.assert_array_almost_equal(rc.todense(), symmetrized_counts)

    transition_matrix = np.matrix([[0.60869565, 0.39130435],
                                   [0.69230769, 0.30769231]])
    npt.assert_array_almost_equal(t.todense(), transition_matrix)

    npt.assert_array_almost_equal(p, [0.63888889, 0.36111111])
    npt.assert_array_equal(m, [0, 1])
def test_1(self):
    """MLE-symmetrized MSM matches the stored counts, matrix and populations."""
    counts, rev_counts, t_matrix, populations, mapping = MSMLib.build_msm(
        self.assignments, self.lag_time, symmetrize='MLE')

    npt.assert_array_equal(counts.todense(), np.matrix('7 5; 4 2'))

    expected_rev_counts = np.matrix([[6.46159184, 4.61535527],
                                     [4.61535527, 2.30769762]])
    npt.assert_array_almost_equal(rev_counts.todense(), expected_rev_counts)

    expected_t_matrix = np.matrix([[0.58333689, 0.41666311],
                                   [0.66666474, 0.33333526]])
    npt.assert_array_almost_equal(t_matrix.todense(), expected_t_matrix)

    npt.assert_array_almost_equal(populations, [0.61538595, 0.38461405])
    npt.assert_array_equal(mapping, [0, 1])