Example #1
    def test_plot_voronoi(self):
        kmeans = KMeans(n_clusters=15)
        kmeans.fit([data])

        ax = plot_voronoi(kmeans, xlabel='x', ylabel='y')

        assert isinstance(ax, SubplotBase)
def test_plot_voronoi():
    kmeans = KMeans(n_clusters=15)
    kmeans.fit([data])

    ax = plot_voronoi(kmeans)

    assert isinstance(ax, Subplot)
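Both variants follow the same pattern: fit an msmbuilder KMeans on a list of trajectories, then hand the fitted model to plot_voronoi. A minimal, self-contained smoke test along these lines, assuming plot_voronoi comes from msmexplorer and substituting synthetic data for the undefined data variable:

import numpy as np
from msmbuilder.cluster import KMeans
from msmexplorer import plot_voronoi  # assumption: plot_voronoi is msmexplorer's

rng = np.random.RandomState(42)
data = rng.randn(1000, 2)       # synthetic 2-D "trajectory" standing in for real features

kmeans = KMeans(n_clusters=15)
kmeans.fit([data])              # msmbuilder clusterers take a list of trajectories

ax = plot_voronoi(kmeans, xlabel='x', ylabel='y')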
Example #3
def cluster_kmeans(tica_dir, data_dir, traj_dir, n_clusters, lag_time):
    clusterer_dir = "%s/clusterer_%dclusters.h5" % (tica_dir, n_clusters)
    if os.path.exists(clusterer_dir):
        print("Already clustered")
    else:
        print("Clustering by KMeans")
        reduced_data = verboseload(data_dir)
        trajs = np.concatenate(reduced_data)
        clusterer = KMeans(n_clusters=n_clusters, n_jobs=-1)
        clusterer.fit_transform(reduced_data)
        verbosedump(clusterer, clusterer_dir)
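A hypothetical invocation of the helper above; all paths and parameter values are placeholders:

# All paths below are placeholders; traj_dir and lag_time are accepted but
# unused by the function body above.
cluster_kmeans(tica_dir="analysis/tica",
               data_dir="analysis/tica/tica_data.pkl",
               traj_dir="trajectories",
               n_clusters=300,
               lag_time=5)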
Example #5
def cluster_msm(sequences, n_states, lag_times):
    for n in n_states:
        states = KMeans(n_clusters=n)
        states.fit(sequences)
        io.dump(states, str(n) + 'n_cl.pkl')
        ts = np.zeros(5)
        for lag_time in lag_times:
            msm = MarkovStateModel(lag_time=lag_time, verbose=False, n_timescales=5)
            msm.fit(states.labels_)
            ts1 = msm.timescales_
            ts = np.vstack((ts, ts1))
            io.dump(msm, str(n) + 'n_' + str(lag_time) + 'lt_msm.pkl')
        ts = np.delete(ts, (0), axis=0)
        io.dump(ts, str(n) + 'n_timescales.pkl')
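The inner loop collects one row of implied timescales per lag time; stacking onto an initial zero row and deleting it afterwards is just a way of growing the array. A minimal sketch of the same lag-time scan on synthetic state labels, so it runs stand-alone (the lag-time values are illustrative):

import numpy as np
from msmbuilder.msm import MarkovStateModel

rng = np.random.RandomState(0)
labels = [rng.randint(0, 10, size=5000)]   # stand-in discrete trajectory
rows = []
for lag_time in (1, 5, 10, 50):
    msm = MarkovStateModel(lag_time=lag_time, n_timescales=5, verbose=False)
    msm.fit(labels)
    rows.append(msm.timescales_)
ts = np.vstack(rows)                       # shape: (n_lag_times, 5)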
def fit_protein_kmeans(yaml_file,mini=True,pca=False):
    mdl_dir = yaml_file["mdl_dir"]
    mdl_params = yaml_file["mdl_params"]

    current_mdl_params={}
    for i in mdl_params.keys():
        if i.startswith("cluster__"):
            current_mdl_params[i.split("cluster__")[1]] = mdl_params[i]

    if mini:
        current_mdl_params["batch_size"] = 100*current_mdl_params["n_clusters"]
        kmeans_mdl = MiniBatchKMeans(**current_mdl_params)
    else:
        kmeans_mdl = KMeans(**current_mdl_params)
    data = []

    for protein in yaml_file["protein_list"]:
        with enter_protein_mdl_dir(yaml_file, protein):
            if pca:
                tica_data = verboseload("pca_data.pkl")
            else:
                tica_data = verboseload("tica_data.pkl")
            # get all traj
            sorted_list = sorted(tica_data.keys(), key=keynat)
            data.extend([tica_data[i] for i in sorted_list])

    kmeans_mdl.fit(data)
    kmeans_mdl_path = os.path.join(mdl_dir, "kmeans_mdl.pkl")
    verbosedump(kmeans_mdl, kmeans_mdl_path)
    return
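The cluster__ prefix convention lets a single flat parameter dict configure several pipeline stages; only keys carrying the prefix reach the clusterer. A self-contained illustration of the stripping step (the parameter names here are hypothetical):

# Hypothetical mdl_params dict: only "cluster__" keys become KMeans kwargs
mdl_params = {"cluster__n_clusters": 200, "tica__lag_time": 10}
current_mdl_params = {k.split("cluster__")[1]: v
                      for k, v in mdl_params.items()
                      if k.startswith("cluster__")}
assert current_mdl_params == {"n_clusters": 200}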
def Kmeans_score(dataset, Max_clusters):
    print(
        "Start to analyze the dependence of inertia on the number of clusters\n"
    )
    scores_in = []  # inertia; the elbow indicates a good cluster number
    # silhouette: s = (b - a) / max(a, b), where a is the mean distance between a
    # sample and all other points in the same class and b is the mean distance
    # between a sample and all points in the next nearest cluster
    scores_sc = []
    scores_ch = []  # Calinski-Harabasz (variance ratio criterion): tightness of the clusters
    # SSR/SST variance ratio; since the ratio inherently rises with cluster count,
    # one looks for an "elbow" where adding another cluster adds little new
    # information, as in a scree test
    scores_rt = []
    # Davies-Bouldin: sum of cluster i and j diameters over the distance between
    # centroids i and j; values closer to zero indicate a better partition
    scores_db = []
    for i in range(Max_clusters - 2):
        kmeans_model = KMeans(n_clusters=i + 2,
                              init='k-means++',
                              n_init=10,
                              max_iter=300,
                              tol=0.001,
                              precompute_distances='auto',
                              verbose=0,
                              random_state=None,
                              copy_x=True,
                              n_jobs=1).fit(dataset)
        labels = kmeans_model.labels_
        scores_in.append(kmeans_model.inertia_)
        scores_sc.append(
            metrics.silhouette_score(dataset[0], labels[0],
                                     metric='euclidean'))
        scores_ch.append(metrics.calinski_harabaz_score(dataset[0], labels[0]))
        scores_rt.append(ssr_sst_ratio(dataset[0], labels[0]))
        scores_db.append(metrics.davies_bouldin_score(dataset[0], labels[0]))
    print("Done generating scores for " + str(Max_clusters) + " clusters\n")
    return scores_in, scores_sc, scores_ch, scores_rt, scores_db
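Because n_clusters=i + 2, the scan covers 2 through Max_clusters - 1 clusters. A hedged usage sketch that plots the inertia curve to locate the elbow, assuming dataset is the same list-of-arrays structure the function expects:

import matplotlib.pyplot as plt

max_k = 15
scores_in, scores_sc, scores_ch, scores_rt, scores_db = Kmeans_score(dataset, max_k)
ks = range(2, max_k)                   # matches the n_clusters = i + 2 indexing above
plt.plot(ks, scores_in, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('inertia')
plt.savefig('kmeans_elbow.png')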
def cluster_features(features, clusterer, n_clusters=8):
    '''
    Input
    features : list of arrays, length n_trajs, each of shape (n_samples, n_features)

    Output
    clst : msmbuilder.cluster object, with attributes
        cluster_centers_ : (n_clusters, n_features)
        labels_          : list of arrays, each of shape (n_samples, )
    '''
    if clusterer == 'KMeans':
        from msmbuilder.cluster import KMeans
        clst = KMeans(n_clusters=n_clusters)
    elif clusterer == 'KCenters':
        from msmbuilder.cluster import KCenters
        clst = KCenters(n_clusters=n_clusters)
    elif clusterer == 'KMedoids':
        from msmbuilder.cluster import KMedoids
        clst = KMedoids(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMeans':
        from msmbuilder.cluster import MiniBatchKMeans
        clst = MiniBatchKMeans(n_clusters=n_clusters)
    elif clusterer == 'MiniBatchKMedoids':
        from msmbuilder.cluster import MiniBatchKMedoids
        clst = MiniBatchKMedoids(n_clusters=n_clusters)
    else:
        # fail loudly instead of hitting a NameError on an unknown name
        raise ValueError('Unknown clusterer: %s' % clusterer)
    clst.fit_transform(features)
    return clst
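A quick sanity check of the dispatcher on synthetic data; the expected shapes follow from the docstring:

import numpy as np

rng = np.random.RandomState(1)
features = [rng.randn(500, 4) for _ in range(3)]   # three fake trajectories
clst = cluster_features(features, 'KCenters', n_clusters=8)
print(clst.cluster_centers_.shape)                 # expected: (8, 4)
print(len(clst.labels_), clst.labels_[0].shape)    # expected: 3 (500,)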
Example #10
def cluster_project_wrapper(proj_folder, feature_dict, n_states):

    if os.path.exists(proj_folder + "/assignments.pkl"):
        return verboseload(proj_folder +
                           "/cluster_mdl.pkl"), verboseload(proj_folder +
                                                            "/assignments.pkl")
    elif os.path.exists(proj_folder + "/cluster_mdl.pkl"):
        cluster_mdl = verboseload(proj_folder + "/cluster_mdl.pkl")
    else:
        cluster_mdl = KMeans(n_clusters=n_states)
        cluster_mdl.fit([feature_dict[i] for i in feature_dict.keys()])

    assignments = {}
    for i in feature_dict.keys():
        assignments[i] = cluster_mdl.transform([feature_dict[i]])

    verbosedump(cluster_mdl, proj_folder + "/cluster_mdl.pkl")
    verbosedump(assignments, proj_folder + "/assignments.pkl")
    return cluster_mdl, assignments
def cluster():
    '''
    This function performs K-means clustering on the tICA space and saves assignment
    files for each trajectory. Cluster centers are also saved in the `microstate_centers.txt` file.
    '''
    cluster = KMeans(n_clusters=n_states, n_jobs=-1, verbose=0, max_iter=100, tol=0.0001)
    dataset, ev0, ev1 = [], [], []
    print("Loading projected data...")
    for i in tqdm(range(start_traj, end_traj + 1)):
        a = io.loadh('%s/traj%d_%s.h5' % (proj_path, i, traj_name))['arr_0']
        a = a[:, 0:2]
        dataset.append(a)
        ev0.extend(a[:, 0])
        ev1.extend(a[:, 1])
    print("Clustering %d datapoints..." % len(ev0))
    cluster.fit(dataset)
    for i in range(start_traj, end_traj + 1):
        np.savetxt('%s/assigns_%d.txt' % (out_path, i), np.array(cluster.labels_[i - start_traj]), fmt='%d')
    np.savetxt('%s/microstate_centers.txt' % out_path, np.array(cluster.cluster_centers_))
    print("Saved microstate assignments and microstate centers at %s" % out_path)
    return cluster.cluster_centers_, np.array(ev0), np.array(ev1)
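cluster() reads its configuration from module-level names. A hypothetical settings block showing what it expects to find (all values are placeholders):

# Hypothetical module-level settings assumed by cluster() above
n_states = 100
start_traj, end_traj = 0, 9
proj_path = 'tica_projections'     # directory holding traj<i>_<name>.h5 files
traj_name = 'onto_tica'
out_path = 'microstates'

centers, ev0, ev1 = cluster()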
def clustering(N_cluster_opt, dataset, traj):
    cluster = KMeans(n_clusters=N_cluster_opt,
                     init='k-means++',
                     n_init=10,
                     max_iter=300,
                     tol=0.001,
                     precompute_distances='auto',
                     verbose=0,
                     random_state=None,
                     copy_x=True,
                     n_jobs=2).fit(dataset)
    cluster_centers = cluster.cluster_centers_
    print("center length: " + str(len(cluster_centers)) + "\n")
    clusters = [[] for i in range(0, N_cluster_opt)]
    clusters_xyz = [[] for i in range(0, N_cluster_opt)]
    clusters_xyz_center = []
    fileout_labels = open(
        "./AlleyCat-Ca-constrained/Labels_for_" + str(N_cluster_opt) +
        "_clusters.dat", 'w')
    for i in range(0, len(cluster.labels_[0])):
        fileout_labels.write("snapshot " + str(i + 1) +
                             " corresponds to Cluster " +
                             str(cluster.labels_[0][i] + 1) + "\n")
        for j in range(0, N_cluster_opt):
            if cluster.labels_[0][i] == j:
                clusters[j].append(dataset[0][i])
                clusters_xyz[j].append(traj[i].xyz)
    fileout = open(
        "./AlleyCat-Ca-constrained/population_for_" + str(N_cluster_opt) +
        "_clusters.dat", 'w')
    for l in range(0, N_cluster_opt):
        clusters_xyz_center.append(
            np.average(np.array(clusters_xyz[l]), axis=0)[0])
        fileout.write('The population of cluster ' + str(l) + ' is ' +
                      str(len(clusters[l])) + '\n')
        print('The population of cluster ' + str(l) + ' is ' +
              str(len(clusters[l])))
    fileout_labels.close()
    fileout.close()
    return clusters_xyz, clusters_xyz_center, cluster_centers, clusters, cluster.labels_[
        0]
Example #13
n_samples = 200
topFile='NarK-strip.pdb'

dataset = [] 
ls = []
for i in sorted(glob.glob('*.npy')):
	a = np.load(i)
	b = np.array(a)
	dataset.append(b)
	ls.append(i)
	print(i)
np.save('list', ls)

#trajs = [np.load('data.npy')]
# make cluster of the tICs trajectories
cluster = KMeans(n_clusters=myn_clusters)
cluster.fit(dataset)
l = cluster.labels_

T = []
for trj in glob.glob('*strip.mdcrd'):
	T.append(trj)
T.sort()

# Write the output file, which has information about the population of each cluster,
# plus the trajectory name and frame number of each corresponding frame
asFunctions.writeOPF(l, T, myn_clusters, n_samples)

# Based on information in output file, build the cpptraj input file
asFunctions.CpptrajInGen_commonTop(topFile)
#pickle.dump( cluster , open( "tICCluster.pkl", "wb"))
Example #14
f = DihedralFeaturizer(sincos=False)
dump(f, "raw_featurizer.pkl")

feat = f.transform(trj_list)

dump(feat, "raw_features.pkl")

f = load("./featurizer.pkl")
dump(f, "featurizer.pkl")
df1 = pd.DataFrame(f.describe_features(trj_list[0]))
dump(df1, "feature_descriptor.pkl")
feat = f.transform(trj_list)

dump(feat, "features.pkl")

t = tICA(lag_time=100, n_components=2, kinetic_mapping=False)

tica_feat = t.fit_transform(feat)

dump(t, "tica_mdl.pkl")
dump(tica_feat, "tica_features.pkl")

kmeans_mdl = KMeans(50)
ass = kmeans_mdl.fit_predict(tica_feat)
msm_mdl = MarkovStateModel(100)
msm_mdl.fit(ass)

dump(kmeans_mdl, "kmeans_mdl.pkl")
dump(ass, "assignments.pkl")
dump(msm_mdl, "msm_mdl.pkl")
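This script is the whole pipeline in miniature: featurize, tICA, cluster, MSM. A hedged round-trip sketch reloading the dumped artifacts for downstream analysis, assuming the dump/load calls above are msmbuilder.utils.dump/load:

from msmbuilder.utils import load   # assumption: matches the dump() calls above

kmeans_mdl = load("kmeans_mdl.pkl")
msm_mdl = load("msm_mdl.pkl")
print(msm_mdl.timescales_[:5])      # slowest implied timescales of the fitted MSM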
    def calculate_tica_components(self, cluster_method, calculate_strides=False, feats=None):
          
        '''Load in the features, calculate a given number of tICA components (tica_components) given a
        lagtime (lag_time), and save tICA coordinates and eigenvector data. It then creates and populates
        a list for each desired component, clusters the data, saving normalized populations as populations.dat
        and saving each cluster center as a .pdb. tICA plots are created and saved, and implied timescales are
        calculated, saved, and plotted.
        '''
	
        # tICA parameters
        tica_lagtime = 10  # determine from implied timescales
        tica_components = 8  # how many tICs to compute
        n_clusters = 100  # number of microstates
        n_timescales = tica_components  # plot all eigenvalues --> timescales
        md_time_step = 0.02  # ns
        subsampled_time_step = 1.  # ns; multiplier of timescales and lagtimes in the implied-timescale plot
        stride = int(subsampled_time_step / md_time_step)  # time-step stride for sub-sampling
        equil_time = 1.  # ns
        equil_steps = 1  # int(equil_time / md_time_step); time steps to be removed from start
        lagtimes = np.array([1,2,4,8,16,32,64,128,256,512,1024])
        cluster_method = 'kcenters' # 'kcenters/kmeans'
        all_ticas = list(itertools.permutations(range(1,tica_components+1), 2)) # all combinations
        all_ticas = [[1,2]] # override: just show analysis for first two components
        cluster_percentage_cutoff = 5 # clusters with a relative population less than this
                                  # number will not be labeled on plot i.e. 0 : all clusters labeled
        verbose = False

        print("\nCalculating tICA components...")
        
        # Load in feature files THIS WILL NEED TO BE CHANGED
        if feats is None:
            if calculate_strides:
                self.calculate_stride_distances(stride, equil_steps)
                data = np.load('/home/server/git/fah-scripts/DataAnalysisScripts/stride_dist/stride_dist_%d.npy' % self.proj_num)
            else:
                data = self.data
        else:
            data = np.load(feats)

        features = []
        for run in data:
            for clone in run:
                gen_seq = []
                for gen in clone:
                    if gen is not None and gen[0] is not None:
                        if calculate_strides or feats is not None:
                            gen_seq.append(gen)
                        else:
                            gen_seq.append(gen[::stride])
                if len(gen_seq) > 0:
                    gen_cat = np.concatenate(gen_seq)
                    if calculate_strides:
                        features.append(gen_cat)
                    else:
                        features.append(gen_cat[equil_steps:])
        features = np.asarray(features)
        print(features.shape)
        print(features[0].shape)
        tica_coordinates = tICA(lag_time=tica_lagtime,
            n_components=int(tica_components)).fit_transform(features)
      
        np.save('%s/lag_%d_coord_%d.npy' %(self.tICA_dir, tica_lagtime, tica_components), tica_coordinates)
          
        # Initiate and populate an array for each component    
        for i in range(tica_components):
            exec('tica_' + str(i+1) + ' = []')
          
        for i in tqdm.tqdm(range(len(features))):
            for j in range(len(tica_coordinates[i])):
                for k in range(tica_components):
                    exec('tica_' + str(k+1) + '.append(tica_coordinates[i][j][k])')
            
        # Perform clustering based on the cluster_method parameter.
        if cluster_method == 'kcenters':
            print("Clustering via KCenters...")
            clusters = KCenters(n_clusters)
        elif cluster_method == 'kmeans':
            print("Clustering via KMeans...")
            clusters = KMeans(n_clusters)
        else:
            sys.exit("Invalid cluster_method. Use kmeans or kcenters.")
        
        # Determine cluster assignment for each frame.
        sequences = clusters.fit_transform(tica_coordinates)

        np.save('%s/lag_%d_clusters_%d_sequences.npy' % (self.tICA_dir, tica_lagtime, n_clusters), sequences)
        np.save('%s/lag_%d_clusters_%d_center.npy' % (self.tICA_dir, tica_lagtime, n_clusters),
                clusters.cluster_centers_)

        # Determine cluster populations, normalize the counts, and save as percentages for
        # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
        # Finally, save normalized counts.
        print("\nDetermining cluster populations...")
    
        if not os.path.exists('%s/cluster_centers' % self.tICA_dir):
            os.makedirs('%s/cluster_centers' % self.tICA_dir)
        counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
        normalized_counts =  counts/float(counts.sum())
        percentages = [ i*100 for i in normalized_counts ]
        population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
        np.savetxt('%s/cluster_centers/populations.dat' % self.tICA_dir, normalized_counts)

        # Plot all unique combinations of tICA components
        print("\nPlotting tICA components with cluster centers...")
        all_ticas = list(itertools.permutations(range(1,tica_components+1), 2))
        for j in tqdm.tqdm(range(len(all_ticas))): # For each pair
            if all_ticas[j][0] < all_ticas[j][1]:
                plt.figure(j, figsize=(20,16))
                plt.hexbin(eval("tica_"+str(all_ticas[j][0])), eval("tica_"+str(all_ticas[j][1])), bins='log')
                x_centers = [clusters.cluster_centers_[i][all_ticas[j][0]-1] for i in range(len(clusters.cluster_centers_))]
                y_centers = [clusters.cluster_centers_[i][all_ticas[j][1]-1] for i in range(len(clusters.cluster_centers_))]
                high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
                high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
                plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
                plt.plot(eval("tica_"+str(all_ticas[j][0])+'[0]'), eval("tica_"+str(all_ticas[j][1])+'[0]'), color='k', marker='*',markersize=24)
                plt.xlabel('tic'+str(all_ticas[j][0]))
                plt.ylabel('tic'+str(all_ticas[j][1]))
                plt.title(self.proj_num)
                # Add labels for high-population cluster centers
                for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                    plt.annotate(
                      label,
                      xy = (x, y), xytext = (-15, 15),
                      textcoords = 'offset points', ha = 'right', va = 'bottom',
                      bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                      arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
                plt.savefig('%s/tica_' % (self.tICA_dir) +str(all_ticas[j][0])+'_'+str(all_ticas[j][1])+'.png')
                plt.close()
        
        ###################################################################
        # Remove stale center PDBs from previous runs
        for filename in os.listdir(self.tICA_dir + '/cluster_centers'):
            if filename.endswith('.pdb'):
                os.remove(self.tICA_dir + '/cluster_centers/' + filename)

        # Write out PDBs for each cluster center
        print("Performing cluster analytics and saving center PDBs...\n")
        runs, clones, gens = data.shape[0], data.shape[1], data.shape[2]
        x, y, z = 0, 0, 0
        for i in range(len(features)):
            if i % clones == 0 and i != 0:
                x += 1
            if i % gens == 0:
                y = 0
            n_snapshots = len(clusters.distances_[i])

            # Determine frames that are cluster centers
            cluster_indices = np.arange(n_snapshots)[(clusters.distances_[i] < 1e-6)]
            # Determine number of each cluster, correlates to populations.dat
            cluster_labels = sequences[i][cluster_indices]
            # Save each cluster center as a pdb
            if list(cluster_indices):  # load center-containing xtcs to check length
                traj_cat = []
                print('x: %d, y: %d, z: %d' % (x, y, z))

                while True:
                    try:
                        traj = base_dir + 'PROJ%s/RUN%s/CLONE%s/results%s/traj_comp.xtc' % (self.proj_num, x, y, z)
                        traj_cat.append(md.load(traj, top=self.gro_file))
                        z += 1
                    except:  # no more gens for this clone
                        break
                if len(traj_cat) > 0:
                    trajectory_file = md.join(traj_cat)
                xtc_len = len(trajectory_file)
            y += 1
            z = 0
            for j in range(len(cluster_indices)):
                frames = range(xtc_len)  # map the strided frame number back to xtc frame number
                strided_frames = frames[equil_steps:][::stride]
                xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                cluster_traj = trajectory_file[xtc_frame]
                cluster_traj.save_pdb('%s/cluster_centers/state_%d_%.3f.pdb' % (self.tICA_dir, cluster_labels[j], percentages[cluster_labels[j]]))
                if verbose:
                    print('Successfully saved PDB for cluster: %d, (rel.pop: %.3f)' % (cluster_labels[j], percentages[cluster_labels[j]]))
                    print('traj_file: %s (%d/%d)' % (trajectory_file, i, len(features)))
                    print('frame: %d (%d/%d centers from this trajectory)' % (cluster_indices[j], j, len(cluster_indices)))
                    print('strided: npy_frame/npy_len = %d/%d = %f' % (cluster_indices[j], n_snapshots, cluster_indices[j] / n_snapshots))
                    print('re-mapped: orig_frame/xtc_len = %d/%d = %f\n' % (xtc_frame, xtc_len, xtc_frame / xtc_len))
Example #16
import numpy as np
from msmbuilder.cluster import KMeans, KCenters
import mdtraj.io as io

# 100 microstates
cluster = KMeans(
    n_clusters=100,
    n_jobs=-1,
    verbose=0,
    max_iter=100,
    tol=0.0001,
)

dataset = []
for i in range(4):
    a = io.loadh('../on_tica_l20_s1_%d.h5' % i)['arr_0']
    a = a[:, 0:3]  # using first 3 tICs
    dataset.append(a)
    print(a.shape)
for i in range(20):
    a = io.loadh('../on_tica_l20_s2_%d.h5' % i)['arr_0']
    a = a[:, 0:3]
    dataset.append(a)
    print(a.shape)
for i in range(20):
    a = io.loadh('../on_tica_l20_s3_%d.h5' % i)['arr_0']
    a = a[:, 0:3]
    dataset.append(a)
    print(a.shape)
for i in range(20):
    a = io.loadh('../on_tica_l20_s4_%d.h5' % i)['arr_0']
Example #17
import os           # assumed import; os.path / os.makedirs are used below
import numpy as np  # assumed import; np.load / np.save are used below
import joblib       # assumed import; joblib.dump is used below
from msmbuilder.decomposition import tICA
from msmbuilder.cluster import KMeans, KCenters, KMedoids
from msmbuilder.msm import MarkovStateModel


verbose = True

tica_data=np.load('../../ticas_n_8.npy')

reduced_data = []
for i in range(len(tica_data)):
    reduced_data.append(tica_data[i][::100,:])

if verbose:
    print "Clustering."
kmeans = KMeans(n_clusters=1200).fit(reduced_data)
Gen_fn = "Gens.npy"
np.save(Gen_fn,kmeans.cluster_centers_)
if verbose:
    print "Wrote: %s"%Gen_fn
model_dir = "kmeans_model_n_1200"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
model_fn = os.path.join(model_dir,'kmeans-combined.pkl')
joblib.dump(kmeans,model_fn)
if verbose:
    print "Saved cluster model to %s"%model_fn
if verbose:
    print "Assigning.."
assignments = kmeans.predict(tica_data)
if verbose:
Example #18
        print "%s not exists!" % tica_fn
        continue

    tica_data = np.load(tica_fn)

    results = []

    n_clusters = [100, 200, 400, 600, 800, 1000, 1200, 1500, 2000, 2500, 3000]

    #n_clusters = [1200,1500,2000]

    #n_clusters = [3500,3500,4000,4500,5000,6000]
    lagtime = 50

    for n in n_clusters:
        kmeans = KMeans(n_clusters=n, n_jobs=-1)
        print "Clustering data to %d clusters..." % n
        for fold in range(nFolds):
            train_data = []
            test_data = []
            for i in range(len(tica_data)):
                cv = KFold(len(tica_data[i]), n_folds=nFolds)
                for current_fold, (train_index, test_index) in enumerate(cv):
                    if current_fold == fold:
                        train_data.append(tica_data[i][train_index])
                        test_data.append(tica_data[i][test_index])
            reduced_train_data = sub_sampling_data(train_data, stride=100)
            kmeans.fit(reduced_train_data)
            assignments_train = kmeans.predict(train_data)
            assignments_test = kmeans.predict(test_data)
            msm = MarkovStateModel(lag_time=lagtime)
Example #19
def compute_tica_components():
    '''Load in the features, calculate a given number of tICA components (tica_components) given a
       lagtime (lag_time), and save tICA coordinates and eigenvector data. It then creates and populates
       a list for each desired component, clusters the data, saving normalized populations as populations.dat
       and saving each cluster center as a .pdb. tICA plots are created and saved, and implied timescales are
       calculated, saved, and plotted.
    '''

    verbose = False
    save_pdb = True
    color_by = 'cluster'
    
    if verbose:
        print("\nCalculating tICA components...")
    if not os.path.exists(project_title + '/tica_%d'%n_clusters):
        os.mkdir(project_title + '/tica_%d'%n_clusters)
    
    # load in feature files and determine indices of unbiased ensembles
    feature_files = []
    for i in range(runs):
        run_files = sorted(glob.glob(project_title + '/features/' + "P*R%d_*npy" % i))
        feature_files += run_files
        if i in unbiased_runs:
            unbiased_indices = [len(feature_files) - len(run_files),len(feature_files)]
    features = [np.load(x) for x in feature_files]
    
    # perform tICA calculation and extract score / eigenvectors
    tica_coordinates = tICA(lag_time=tica_lagtime,
        n_components=int(n_components)).fit_transform(features)
    tica_components = tICA(lag_time=tica_lagtime,
        n_components=int(n_components)).fit(features)
    eigenvectors = np.transpose(tica_components.eigenvectors_)
    tica_score = tica_components.score(features)
          
    np.save('%s/tica_%d/tica_coords-lag_%d-comp_%d.npy' %(
        project_title, n_clusters, tica_lagtime, n_components), tica_coordinates)
    np.save('%s/tica_%d/tica_comps-lag_%d-comp_%d.npy' %(
        project_title, n_clusters, tica_lagtime, n_components), tica_components)
    
    # Perform clustering based on the cluster_method parameter.
    if verbose:
        print('Clustering via %s'%cluster_method)
    if cluster_method == 'kcenters':
        clusters = KCenters(n_clusters)
    elif cluster_method == 'kmeans':
        clusters = KMeans(n_clusters)
    elif cluster_method == 'kmedoids':
        clusters = KMedoids(n_clusters)
    else:
        sys.exit('Invalid cluster_method. Use kcenters/kmeans/kmedoids.')
        
    # Cluster unbiased data and fit biased data to these centers
    new_assignments = []
    sequences = clusters.fit_transform(tica_coordinates[unbiased_indices[0]:unbiased_indices[1]])
    for i in tqdm.tqdm_notebook(range(unbiased_indices[0])):
        tica_traj = tica_coordinates[i]
        if isinstance(tica_traj, np.ndarray):
            if not (tica_traj.dtype == 'float32' or tica_traj.dtype == 'float64'):
                tica_traj = tica_traj.astype('float64')
        labels, inertia = msmbuilder.libdistance.assign_nearest(
            tica_traj, clusters.cluster_centers_, metric='euclidean')
        new_assignments.append(labels)

    new_assignments += sequences # tack the unbiased assignments back on to the end.


    np.save('%s/tica_%d/lag_%d_clusters_%d_assignments.npy' %(
        project_title, n_clusters, tica_lagtime, n_clusters), new_assignments)
    np.save('%s/tica_%d/lag_%d_clusters_%d_center.npy' %(
        project_title, n_clusters, tica_lagtime, n_clusters), clusters.cluster_centers_)

    # Determine cluster populations, normalize the counts, and save as percentages for
    # labeling if a cluster contains more than cluster_percentage_cutoff percent of the data.
    # Finally, save normalized counts.
    
    if verbose:
        print("\nDetermining cluster populations...")
    if not os.path.exists('%s/tica_%d/%s_clusters'%(project_title,n_clusters,cluster_method)):
        os.mkdir('%s/tica_%d/%s_clusters'%(project_title,n_clusters,cluster_method))
    if not os.path.exists('%s/tica_%d/plots'%(project_title,n_clusters)):
        os.mkdir('%s/tica_%d/plots'%(project_title,n_clusters))
        
    counts = np.array([len(np.where(np.concatenate(sequences)==i)[0]) for i in range(n_clusters)])
    normalized_counts =  counts/float(counts.sum())
    percentages = [ i*100 for i in normalized_counts ]
    population_labels = [ [i,"%.2f"%percentages[i]] for i in range(len(percentages)) if percentages[i] > cluster_percentage_cutoff ]
    np.savetxt('%s/tica_%d/%s_clusters/populations.dat'
               %(project_title,n_clusters,cluster_method), normalized_counts)

    # Plot all unique combinations of tICA components
    if verbose:
        print("\nPlotting tICA components...")
    tica_coordinates = np.concatenate(tica_coordinates)
    new_assignments = np.concatenate(new_assignments)
    cluster_colors = matplotlib.cm.rainbow(np.linspace(0,1,n_clusters))
    for j in tqdm.tqdm_notebook(range(len(all_ticas)),leave=False): # For each pair
        if all_ticas[j][0] < all_ticas[j][1]:
            plt.figure(j, figsize=(20,16))
            tICx, tICy = all_ticas[j][0]-1, all_ticas[j][1]-1
            plt.hexbin(tica_coordinates[:,tICx],tica_coordinates[:,tICy], bins='log')
            for l in tqdm.tqdm(range(len(tica_coordinates))[::stride*2]):
                if color_by == 'cluster':
                    plt.plot(tica_coordinates[l][tICx], tica_coordinates[l][tICy],
                        color=cluster_colors[new_assignments[l]], linestyle="", marker="o")
            x_centers = [clusters.cluster_centers_[i][tICx] for i in range(len(clusters.cluster_centers_))]
            y_centers = [clusters.cluster_centers_[i][tICy] for i in range(len(clusters.cluster_centers_))]
            high_pop_x_centers = [ x_centers[i] for i in range(len(x_centers)) if percentages[i] > cluster_percentage_cutoff ]
            high_pop_y_centers = [ y_centers[i] for i in range(len(y_centers)) if percentages[i] > cluster_percentage_cutoff ]
            plt.plot(x_centers, y_centers, color='y', linestyle="", marker="o")
            plt.plot(tica_coordinates[:,tICx][0],tica_coordinates[:,tICy][0], color='k', marker='*',markersize=24)
            plt.xlabel('tIC'+str(all_ticas[j][0]))
            plt.ylabel('tIC'+str(all_ticas[j][1]))
            plt.title(project_title)
            # Add labels for high-population cluster centers
            for label, x, y in zip(population_labels, high_pop_x_centers, high_pop_y_centers):
                plt.annotate(
                  label,
                  xy = (x, y), xytext = (-15, 15),
                  textcoords = 'offset points', ha = 'right', va = 'bottom',
                  bbox = dict(boxstyle = 'round,pad=0.5', fc = 'yellow', alpha = 0.5),
                  arrowprops = dict(arrowstyle = '->', connectionstyle = 'arc3,rad=0'))
            plt.savefig('%s/tica_%d/plots/tica_%d_%d.png'%(project_title,n_clusters,
                all_ticas[j][0], all_ticas[j][1]))
            plt.close()

    # Write out PDBs for each cluster center
    if verbose:
        print("Performing cluster analytics and saving center PDBs...\n")
    if save_pdb:
        trajectory_files, feature_files, cluster_features = [],[],[]
        for run in range(runs): # get only xtc files that correlate to cluster-center features
            trajectory_files += [re.sub('features',
                                    'traj_data/RUN%d'%run,re.sub('npy','xtc',x)
                                     ) for x in sorted(glob.glob('%s/features/*R%d_*npy'%(
                                        project_title,run)))]
            feature_files += sorted(glob.glob('%s/features/*R%d_*npy'%(project_title,run)))

        for i in tqdm.tqdm_notebook(range(len(trajectory_files)),leave=False):

                n_snapshots = len(clusters.distances_[i])

                # Determine frames that are cluster centers
                cluster_indices = np.arange(n_snapshots)[ (clusters.distances_[i] < 1e-6) ]

                # Determine number of each cluster, correlates to populations.dat
                cluster_labels = sequences[i][cluster_indices]

                # Save each cluster center as a pdb
                if list(cluster_indices): # load center-containing xtcs to check length
                    xtc_len = len(md.load(trajectory_files[i],top=structure_file))
                    
                # map strided frame number back to xtc frame number
                for j in range(len(cluster_indices)):
                        frames = range(xtc_len) 
                        strided_frames = frames[equil_steps:][::stride]
                        xtc_frame = frames.index(strided_frames[cluster_indices[j]])
                        cluster_traj = md.load_frame(trajectory_files[i], xtc_frame,
                                            top=structure_file)
                        cluster_features.append(np.load(feature_files[i])[cluster_indices[j]])
                        cluster_traj.save_pdb('%s/tica_%d/%s_clusters/state_%d.pdb'
                                            %(project_title,n_clusters,cluster_method,
                                            cluster_labels[j]))
                        
                        # save cluster information
                        with open('%s/tica_%d/cluster.dat' % (project_title, n_clusters), 'a') as f:  # append so each center's record is kept
                            f.write('\nSuccessfully saved PDB for cluster: %d, (rel.pop: %.3f)'%(
                                cluster_labels[j],percentages[cluster_labels[j]]))
                            f.write('traj_file: %s (%d/%d)'%(trajectory_files[i],i,len(features)))
                            f.write('frame: %d (%d/%d centers from this trajectory)'%(
                                cluster_indices[j],j,len(cluster_indices)))
                            f.write('strided: npy_frame/npy_len = %d/%d = %f'%(
                                cluster_indices[j],n_snapshots,cluster_indices[j]/n_snapshots))
                            f.write('re-mapped: orig_frame/xtc_len = %d/%d = %f\n'%(
                                xtc_frame,xtc_len,xtc_frame/xtc_len))
                        
        # save features corresponding to each cluster center
        np.save('%s/tica_%d/cluster_features.npy'%(project_title,n_clusters),cluster_features)
                    
    return tica_score
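Like cluster() earlier, compute_tica_components() pulls its configuration from module scope. A hypothetical settings block listing the globals the function body references (all values are placeholders):

# Hypothetical module-level configuration for compute_tica_components()
project_title = 'my_project'
runs = 4
unbiased_runs = [0]                # runs clustered directly; others are assigned to centers
tica_lagtime = 10
n_components = 8
n_clusters = 100
cluster_method = 'kmeans'          # 'kcenters', 'kmeans', or 'kmedoids'
cluster_percentage_cutoff = 5
all_ticas = [[1, 2]]
stride = 50
equil_steps = 1
structure_file = 'system.pdb'

tica_score = compute_tica_components()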