print "Clustering." kmeans = KMeans(n_clusters=1200).fit(reduced_data) Gen_fn = "Gens.npy" np.save(Gen_fn,kmeans.cluster_centers_) if verbose: print "Wrote: %s"%Gen_fn model_dir = "kmeans_model_n_1200" if not os.path.exists(model_dir): os.makedirs(model_dir) model_fn = os.path.join(model_dir,'kmeans-combined.pkl') joblib.dump(kmeans,model_fn) if verbose: print "Saved cluster model to %s"%model_fn if verbose: print "Assigning.." assignments = kmeans.predict(tica_data) if verbose: print "Wrote assignments" np.save('Assignments.npy',assignments) if verbose: print "Building MSMs:" lagtimes = [1,10,20,30,40,50,100,150,200] msmts = [] for lagtime in lagtimes: if verbose: print "\tLagtime: %d"%lagtime msm = MarkovStateModel(lag_time=lagtime).fit(assignments) msmts.append(msm.timescales_) lagtime_fn = "lagtimes.txt" msmts_fn = "ImpliedTimescales.npy"
# --- Cross-validated scan over cluster counts ---
# For each candidate number of microstates `n`, run nFolds-fold
# cross-validation: fit k-means on the training frames, build an MSM on the
# training assignments, and score it on the held-out assignments.
# NOTE(review): `n_clusters` (list of candidate counts), `nFolds`, `lagtime`,
# `results` (presumably initialized to [] upstream -- confirm), and
# `sub_sampling_data` are defined elsewhere in the file.
for n in n_clusters:
    kmeans = KMeans(n_clusters=n, n_jobs=-1)
    print "Clustering data to %d clusters..." % n
    for fold in range(nFolds):
        train_data = []
        test_data = []
        # Split each trajectory independently so every trajectory
        # contributes frames to both the train and test sets.
        for i in range(len(tica_data)):
            cv = KFold(len(tica_data[i]), n_folds=nFolds)
            # NOTE(review): KFold is rebuilt and fully iterated for every
            # trajectory just to select the split matching `fold` --
            # correct but O(nFolds) redundant work per trajectory.
            for current_fold, (train_index, test_index) in enumerate(cv):
                if current_fold == fold:
                    train_data.append(tica_data[i][train_index])
                    test_data.append(tica_data[i][test_index])
        # Fit k-means on a strided subsample of the training frames (cheaper),
        # then assign the full train/test sets with the fitted model.
        reduced_train_data = sub_sampling_data(train_data, stride=100)
        kmeans.fit(reduced_train_data)
        assignments_train = kmeans.predict(train_data)
        assignments_test = kmeans.predict(test_data)
        # MSM at the externally chosen lag time; `score_` is the score on the
        # training data (presumably the GMRQ -- confirm against the
        # MarkovStateModel docs), `score(...)` evaluates the held-out fold.
        msm = MarkovStateModel(lag_time=lagtime)
        msm.fit(assignments_train)
        train_score = msm.score_
        test_score = msm.score(assignments_test)
        results.append({
            'train_score': train_score,
            'test_score': test_score,
            'n_states': n,
            'fold': fold,
            'timescales': msm.timescales_
        })
# Collect all (n, fold) records into a single DataFrame for analysis.
# NOTE(review): placement after both loops inferred from the flattened
# source -- confirm intended indentation.
results = pd.DataFrame(results)