def test_kcenters_2():
    """KCenters with k=2 on three tight groups must pick the two extremes.

    Data sits at (0, 0), (1, 1) and (0.5, 0.5). With random_state=0 the
    initial center is seeded at (0, 0) or (1, 1) — a different seed could
    have started at (0.5, 0.5) — so the fitted centers must be the two
    corners, in either order, and every point-to-center distance is either
    0 (the corners) or sqrt(2)/2 (the midpoint).
    """
    trajs = [np.zeros((10, 2)), np.ones((10, 2)), 0.5 * np.ones((10, 2))]
    model = KCenters(n_clusters=2, random_state=0)
    model.fit(trajs)

    corners = np.array([[0, 0], [1, 1]])
    centers_forward = np.all(model.cluster_centers_ == corners)
    centers_reversed = np.all(model.cluster_centers_ == corners[::-1])
    assert centers_forward or centers_reversed

    # every frame is either exactly on a center or half a diagonal away
    eq(np.unique(np.concatenate(model.distances_)),
       np.array([0, np.sqrt(2) / 2]))
def test_kcenters_1():
    """Check the types and per-trajectory shapes of all fit attributes."""
    lengths = (23, 10)
    model = KCenters(n_clusters=3)
    model.fit([np.random.randn(n, 2) for n in lengths])

    # labels_ and distances_ are lists with one entry per input trajectory
    assert isinstance(model.labels_, list)
    assert isinstance(model.distances_, list)
    assert len(model.labels_) == 2
    eq(model.cluster_centers_.shape, (3, 2))
    for traj_idx, n_frames in enumerate(lengths):
        eq(model.labels_[traj_idx].shape, (n_frames,))
        eq(model.distances_[traj_idx].shape, (n_frames,))
    eq(model.fit_predict([np.random.randn(10, 2)])[0].shape, (10,))
    assert np.all(np.logical_not(np.isnan(model.distances_[0])))
#sample conformations along tIC1 print('now we are sampling representative conformations along tIC1') plt.figure() sampling_along_tIC(resultdir, 'samples_tic1.png', tica_trajs, trajectory_dir, traj_list_array, pdb_name, 1) print("You can use vmd to visualize the tica-dimension-tIC1.xtc file") # In[158]: #step 1.1: split the conformations into hundreds of microstates #perform kCenters on the tIC subspace #input:tICA projections, output:assignments indicating which microstate each conformation is assigned to nMicro = 100 #specified a priori kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0) microstate_sequences = kcenters.fit(tica_trajs) print("output of msm:", microstate_sequences.labels_) plt.figure() plot_states_on_tic_space(resultdir, 'micorstate.png', tica_trajs, microstate_sequences.labels_, 1, 2) # In[159]: #plot the microstate implied timescale, which will show how many macrostates we need plt.figure() lag_times = range(2, 50, 2) msm_timescales = implied_timescales(microstate_sequences.labels_, lag_times, n_timescales=10, msm=MarkovStateModel(
# tica_trajs is the data for your DBSCAN.
# Save the top-4 tICA components per trajectory; these files are the input
# for downstream clustering (DBSCAN, KCenters, ...).
for traj_name, projection in zip(traj_list_array, tica_trajs):
    # strip the file extension (e.g. '.xtc') from the trajectory name
    np.savetxt('tica_projections/%s_tica.txt' % traj_name[:-4],
               projection[:, 0:4])

### after you have the tica projection (tica_trajs), you can do clustering
### like DBSCAN, KCENTERS
exit()  # NOTE(review): early exit — everything below is currently dead code

######################### below: kcenters to get the microstates
nMicro = 100  # number of microstates, chosen a priori
kcenters = KCenters(n_clusters=nMicro, metric='euclidean', random_state=0)
kcenters_sequences = kcenters.fit(tica_trajs)

out_assignment_dir = 'Microassignment/'
# os.makedirs avoids shelling out to `mkdir`; the guard keeps it a no-op
# when the directory already exists (matching mkdir's best-effort behavior)
if not os.path.exists(out_assignment_dir):
    os.makedirs(out_assignment_dir)

# One assignment file per trajectory: rows are per-frame microstate labels.
for traj_index, ifile in enumerate(traj_list_array):
    np.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
               kcenters.labels_[traj_index], fmt='%d')
exit()

from msmbuilder.msm import MarkovStateModel
from msmbuilder.utils import dump
# Append each trajectory filename listed in "trajlist" (one name per line).
# A context manager closes the file deterministically (the original left the
# handle open); print(x) with a single argument is valid Python 2 and 3,
# whereas the original `print x` statements were Python-2 only.
with open("trajlist") as traj_list_file:
    for line in traj_list_file:
        traj_list_array.append(line.strip())
print(traj_list_array)

# Load every trajectory with its topology, restricted to the selected atoms,
# so clustering can use the RMSD metric directly on coordinates.
dataset = []
for trajfile in traj_list_array:
    t = md.load(xtc_file_dir + trajfile, top='test.pdb',
                atom_indices=select_atoms)
    dataset.append(t)
print(dataset)  # ww: check whether they have aligned w.r.t reference

# KCenters with an RMSD metric on the md.Trajectory objects themselves.
kcenters = KCenters(n_clusters=nMicro, metric='rmsd', random_state=0)
kcenters_sequences = kcenters.fit(dataset)

out_assignment_dir = 'Microassignment/'
out_kcenters_distances_dir = 'distances/'
# os.makedirs replaces the shelled-out `mkdir`; the guard keeps behavior
# harmless when the directories already exist
if not os.path.exists(out_assignment_dir):
    os.makedirs(out_assignment_dir)
if not os.path.exists(out_kcenters_distances_dir):
    os.makedirs(out_kcenters_distances_dir)

# Per trajectory: per-frame microstate labels and the distance of each frame
# to its assigned cluster center.
for traj_index, ifile in enumerate(traj_list_array):
    numpy.savetxt("%s/%s_assignment_.txt" % (out_assignment_dir, ifile[:-4]),
                  kcenters.labels_[traj_index], fmt='%d')
    numpy.savetxt("%s/%s_distances_.txt" % (out_kcenters_distances_dir,
                                            ifile[:-4]),
                  kcenters.distances_[traj_index], fmt='%18.5f')
# Visualize the tICA projection (tIC1 vs tIC3) of the train/test split for
# this cross-validation fold.
plt.figure()
draw_tica_projection_cross_validation(
    sub_resultdir,
    'Fold_%d_tica_lagtime_%d_train_data_proj_tIC13.png' %
    (fold, tica_correlation_time),
    train_data_projection, test_data_projection, 1, 3)

# Grid-search over the number of tICs and microstates: cluster the training
# projection, assign both splits, build an MSM on the training sequences,
# and score train and test (cross-validated model selection).
for n_tics in n_tics_range:
    for n_Micro in n_Micro_range:
        print("parameters: fold-", fold,
              ',tica_lagtime-', tica_correlation_time,
              ',n_tics-', n_tics, ',n_Micro-', n_Micro)
        # fixed random_state so folds differ only in the hyper-parameters
        kcenters = KCenters(n_clusters=n_Micro, metric='euclidean',
                            random_state=0)
        kcenters.fit(train_data_projection)
        train_data_sequence = kcenters.predict(train_data_projection)
        test_data_sequence = kcenters.predict(test_data_projection)
        # the parameters may change
        msm = MarkovStateModel(n_timescales=3, lag_time=100,
                               reversible_type='transpose', verbose=False,
                               sliding_window=True, ergodic_cutoff='on')
        msm.fit(train_data_sequence)
        train_score = msm.score(train_data_sequence)
        test_score = msm.score(test_data_sequence)
        # NOTE(review): chunk is cut off mid-expression — the open(...) call
        # continues beyond the visible source.
        f1 = open(
            sub_resultdir +
# NOTE(review): this chunk begins mid-call — the constructor these arguments
# belong to (presumably the `kcen` clusterer) starts before the visible source.
    metric='rmsd',
)

## Try to limit RAM usage
def guestimate_stride():
    # Pick a stride so that roughly 20 frames per requested cluster are fed
    # to fit(); never stride below 1.
    total_data = meta['nframes'].sum()
    want = kcen.n_clusters * 20
    stride = max(1, total_data // want)
    print("Since we have", total_data,
          "frames, we're going to stride by", stride,
          "during fitting, because this is probably adequate for",
          kcen.n_clusters, "clusters")
    return stride

## Fit
# Fit on a strided subsample of every trajectory to bound memory use.
kcen.fit([traj for _, traj in itertrajs(meta, stride=guestimate_stride())])
print(kcen.summarize())

## Save
# Persist the fitted clusterer for this round.
save_generic(kcen, 'clusterer' + str(round_num) +'.pickl')

## Save centroids
def frame(traj_i, frame_i):
    # Load the single frame (traj_i, frame_i) via the metadata table.
    # Note: kmedoids does 0-based, contiguous integers so we use .iloc
    row = meta.iloc[traj_i]
    return md.load_frame(row['traj_fn'], frame_i, top=row['top_fn'])

# Join the one-frame trajectories at the cluster centers into a single
# trajectory; topology compatibility checking is deliberately skipped.
centroids = md.join((frame(ti, fi) for ti, fi in kcen.cluster_ids_),
                    check_topology=False)