Example #1
def core_clusters(dynamic_clustering, pdb_file, dcd_pkl_filename):
    # map each cluster id to the list of trajectory frame indices assigned to it
    dynamic_clustering_frames_list = {}  # {cluster_id: [frame indices]}

    for i in dynamic_clustering:
        try:
            dynamic_clustering_frames_list[dynamic_clustering[i]].append(
                int(i))
        except KeyError:
            dynamic_clustering_frames_list[dynamic_clustering[i]] = [int(i)]

    frames = preprocessing.load_residues('reduced_dimensions.pkl')
    distances_of_frames_in_cluster = {}  # {cluster_id: core-distance cutoff}
    avg_structure_in_cluster = {}  # {cluster_id: most probable structure}

    for i in dynamic_clustering_frames_list:
        temp = misc.most_probable_structure_in_cluster(
            dynamic_clustering_frames_list[i], frames, pdb_file, i, "dynamic",
            dcd_pkl_filename)
        total_number_of_structures = len(dynamic_clustering_frames_list[i])

        # distance of each frame in this cluster from its most probable structure
        frame_distances = {}
        for j in range(total_number_of_structures):
            frame_distances[misc.distance(
                frames[dynamic_clustering_frames_list[i][j]],
                temp)] = dynamic_clustering_frames_list[i][j]

        # the median of these distances is the cluster's core-distance cutoff
        median_index = min(total_number_of_structures // 2 + 1,
                           total_number_of_structures - 1)
        distances_of_frames_in_cluster[i] = sorted(frame_distances.keys())[median_index]
        avg_structure_in_cluster[i] = temp
    return distances_of_frames_in_cluster, avg_structure_in_cluster
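A minimal usage sketch, assuming the "dynamic_clustering" shelve produced by dynamic_cluster_trajectory below; "rna.pdb" and "traj.pkl" are placeholder filenames:

# hypothetical usage; filenames are placeholders
clustering = shelve.open("dynamic_clustering")
cutoffs, representatives = core_clusters(dict(clustering), "rna.pdb", "traj.pkl")
clustering.close()
# cutoffs[c] is cluster c's core-distance cutoff;
# representatives[c] is its most probable structure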
Example #2
def get_dynamic_cluster_sequence():
    frames = preprocessing.load_residues('reduced_dimensions.pkl')
    #print frames.shape
    d = shelve.open("dynamic_clustering")
    #print set(d.values())
    cluster_membership = {}
    for i in range(frames.shape[0]):
        try:
            cluster_membership[int(d[str(i)])] += 1
        except KeyError:
            cluster_membership[int(d[str(i)])] = 1
    states = set(d.values())
    transition_matrix = ds.Autovivification()
    for i in states:
        for j in states:
            transition_matrix[i][j] = 0
    for i in range(frames.shape[0] - 1):
        transition_matrix[int(d[str(i)])][int(d[str(i + 1)])] += 1
    # normalize each row so it forms a probability distribution
    cluster_probability = {}
    for i in states:
        sums = 0
        for j in states:
            sums += transition_matrix[i][j]
        cluster_probability[i] = sums
        for j in states:
            transition_matrix[i][j] /= float(sums)
    sequence = get_most_probable_path(d)
    return sequence, transition_matrix
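The counting and row-normalization above amount to a maximum-likelihood estimate of a Markov chain over cluster ids; a self-contained toy version:

import numpy as np

# toy version of the row-normalized transition matrix built above
labels = [0, 0, 1, 1, 0, 2, 2, 2]          # per-frame cluster ids
n = len(set(labels))
T = np.zeros((n, n))
for a, b in zip(labels[:-1], labels[1:]):  # count observed one-step transitions
    T[a, b] += 1
T /= T.sum(axis=1, keepdims=True)          # each row now sums to 1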
Example #3
def tsne(load=True, jump=9):
    #d = shelve.open("dynamic_clustering")
    X = preprocessing.load_residues('reduced_dimensions.pkl')

    #clusters = []
    #for i in range(0, X.shape[0], jump):
    #	clusters.append(int(d[str(i)]))
    X = X[::jump]
    d = {}
    gc.collect()
    #print X.shape, len(clusters)

    if not load:
        model = TSNE(n_components=2, random_state=0, perplexity=40)
        y = model.fit_transform(X)
        joblib.dump(model, 'tsne_model.pkl')
    else:
        clf = joblib.load('tsne_model.pkl')
        y = clf.embedding_

    #print y.shape
    #print np.array(clusters).shape

    """
    # Plot our dataset.
    fig = plt.figure()
    # ax = fig.add_subplot(111, projection='3d')
    p = plt.scatter(y[:, 0], y[:, 1], c=np.array(clusters), cmap=plt.cm.rainbow)
    plt.colorbar(p)
    plt.legend()
    # plt.show()
    plt.savefig("TSNE_40.eps", format='eps', dpi=300)
    """   
    return
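A possible calling pattern given the load flag above: fit once and cache the model, then reuse its stored embedding:

# hypothetical usage: fit t-SNE once, then reuse the cached model
tsne(load=False, jump=9)  # fits TSNE on every 9th frame and writes tsne_model.pkl
tsne(load=True, jump=9)   # reloads tsne_model.pkl and uses its embedding_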
Example #4
def get_path_probability(transition_matrix, path, cluster_representative_index,
                         jump):
    d = shelve.open("dynamic_clustering")
    frames = preprocessing.load_residues('reduced_dimensions.pkl')
    cluster_membership = {}
    for i in range(frames.shape[0]):
        try:
            cluster_membership[int(d[str(i)])] += 1
        except KeyError:
            cluster_membership[int(d[str(i)])] = 1

    initial_val = cluster_membership[int(d[str(0)])] / float(frames.shape[0])
    converted_clusters = [d[str((cluster_representative_index[0]) * jump)]]
    for i in range(1, len(path)):
        first = d[str((cluster_representative_index[path[i - 1]]) * jump)]
        second = d[str((cluster_representative_index[path[i]]) * jump)]
        initial_val *= transition_matrix[first][second]

        # record the path without immediate repeats of the same cluster
        if second != converted_clusters[-1]:
            converted_clusters.append(second)
    return converted_clusters, initial_val
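The probability accumulated above is the chain rule for a first-order Markov chain: the start state's occupancy probability times the product of the step transition probabilities. In isolation:

# chain-rule path probability for a first-order Markov chain
def path_probability(T, path, start_prob):
    prob = start_prob
    for a, b in zip(path[:-1], path[1:]):
        prob *= T[a][b]
    return prob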
Example #5
def cluster_trajectory_kmeans(fit=True):
    if fit:
        X = preprocessing.load_residues('reduced_dimensions.pkl')
        model = KMeans(n_clusters=1000)
        model.fit(X)
        joblib.dump(model, "KMEANS.pkl")
    else:
        model = joblib.load("KMEANS.pkl")
        X = preprocessing.load_residues('reduced_dimensions.pkl')
        mean_cluster_ids = shelve.open("kmeans_trajectory_clustering")  # this structure stores cluster-ids for each frame of the trajectory
        #print X.shape
        for i in range(X.shape[0]):
            prediction = model.predict(X[i].reshape((1, -1)))
            mean_cluster_ids[str(i)] = int(prediction)
        number_of_current_clusters = model.cluster_centers_.shape[0]
        d = dict(mean_cluster_ids)
        mean_cluster_ids.close()
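A plausible two-pass usage, matching the commented pipeline in markov_chain below: fit first, then label every frame:

# hypothetical usage: fit the model, then assign a cluster id to every frame
cluster_trajectory_kmeans(fit=True)   # fits KMeans and writes KMEANS.pkl
cluster_trajectory_kmeans(fit=False)  # writes per-frame ids to the shelve file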
Example #6
def markov_chain(dcd, pdb, filename):
    #mpp.cluster_trajectory_kmeans(fit=True)
    #mpp.cluster_trajectory_kmeans(fit=False)
    #mpp.dynamic_cluster_trajectory(meta_stability_criteria=0.93, pdb_file=pdb, dcd_pkl_filename=filename)

    sequence, transition_matrix = mpp.get_dynamic_cluster_sequence()
    distribution = mpp.equilibrium_distribution(transition_matrix)
    mpp.construct_transition_graph(sequence, transition_matrix, distribution)
    dynamic_clustering = shelve.open("dynamic_clustering")

    start, end = misc.get_cluster_ids_for_start_and_end(
        dynamic_clustering["0"], set(dynamic_clustering.values()), "dynamic")
    path = mpp.get_most_probable_path_in_markov_chain(
        transition_matrix, start, end)
    print path
    dcd_array = pp.load_residues(filename)
    misc.write_dcd(dcd_array, path, "dynamic")
    misc.write_pdb(dcd_array, path, "dynamic")
    return
Example #7
def get_individual_transitions(cluster_representative_point, jump):
    # T[i][j] represents the probability of going from i to j
    transition_matrix = aux.Autovivification()
    mixture_params = shelve.open("EM_params_with_full_covariance")

    data_points = preprocessing.load_residues('reduced_dimensions.pkl')
    energies = preprocessing.create_energy_matrix(get_from_file=1)
    energies = np.sum(energies, axis=1)

    dbscan = joblib.load('dbscan_model.pkl')

    for i in cluster_representative_point.keys():
        covar_matrix = mixture_params["variances"][i]
        prec_chol, log_det = prob_dis.compute_precisions_chol(
            np.array([covar_matrix]))

        parent_temp_indices = misc.find_indices_of_clusters(dbscan.labels_, i)
        parent_temp_transition_probs = aux.Autovivification()

        parent_log_probabilities = prob_dis.log_pdf(
            data_points[::jump][np.array(parent_temp_indices)],
            mixture_params["means"][i], covar_matrix, mixture_params["beta"],
            energies[::jump][np.array(parent_temp_indices)],
            mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])

        for j in cluster_representative_point.keys():
            temp_indices = misc.find_indices_of_clusters(dbscan.labels_, j)

            child_log_probabilities = prob_dis.log_pdf(
                data_points[::jump][np.array(temp_indices)],
                mixture_params["means"][i], covar_matrix,
                mixture_params["beta"],
                energies[::jump][np.array(temp_indices)],
                mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])

            for index in range(parent_log_probabilities.shape[0]):
                # numerically stable pairwise probability:
                # exp(child) / (exp(child) + exp(parent))
                max_val = np.maximum(child_log_probabilities,
                                     np.array([parent_log_probabilities[index]]))
                trans_prob = np.exp(
                    child_log_probabilities - max_val -
                    np.log(np.exp(child_log_probabilities - max_val) +
                           np.exp(np.array([parent_log_probabilities[index]]) -
                                  max_val)))
                # exact ties (child == parent log-density, e.g. a structure
                # paired with itself) are zeroed out
                trans_prob[trans_prob == 0.5] = 0.0

                parent_temp_transition_probs[index][j] = copy.deepcopy(trans_prob)

        # normalize the probabilities for each individual structure
        for index in parent_temp_transition_probs.keys():
            denom = 0.0
            for j in parent_temp_transition_probs[index].keys():
                denom += np.sum(parent_temp_transition_probs[index][j])

            for j in parent_temp_transition_probs[index].keys():
                parent_temp_transition_probs[index][j] /= denom
                transition_matrix[parent_temp_indices[index]][j] = \
                    parent_temp_transition_probs[index][j]

    return transition_matrix
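The trans_prob expression above is a numerically stable two-way softmax, exp(c) / (exp(c) + exp(p)) evaluated in log space; as a standalone sketch:

import numpy as np

# stable two-way softmax: exp(c) / (exp(c) + exp(p)), computed in log space
def pairwise_prob(child_logp, parent_logp):
    m = np.maximum(child_logp, parent_logp)
    return np.exp(child_logp - m -
                  np.log(np.exp(child_logp - m) + np.exp(parent_logp - m)))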
Example #8
def evaluate_metastable_states(jump):
    d = shelve.open("dynamic_clustering")
    X = preprocessing.load_residues('reduced_dimensions.pkl')
    clusters = []
    for i in range(0, X.shape[0], jump):
        clusters.append(int(d[str(i)]))

    model = joblib.load('dbscan_model.pkl')
    not_noise_indices = np.where(model.labels_ != -1)
    new_cluster_labels = model.labels_[not_noise_indices]
    new_ground_truth = (np.array(clusters))[not_noise_indices]

    labels_true = new_ground_truth
    labels_pred = new_cluster_labels
    return metrics.adjusted_mutual_info_score(labels_true, labels_pred)
Example #9
def density_clustering(jump=9, min_number_of_samples=5):
    # clusters the cached t-SNE embedding (already subsampled by `jump`)
    clf = joblib.load('tsne_model.pkl')
    y = clf.embedding_

    # determining eps: the data is of roughly uniform density, so the
    # distance to the 4th nearest neighbour is a reasonable density proxy
    neigh = NearestNeighbors(n_neighbors=4)
    neigh.fit(y)

    plt.clf()
    # distance from each point to its 4th nearest neighbour
    distances = neigh.kneighbors()[0][:, 3]
    plt.plot(np.arange(distances.shape[0]), np.array(sorted(distances)))

    # take the 99th percentile of the sorted 4-NN distances as the eps cutoff
    eps_cutoff = np.array(sorted(distances))[int(0.99 * distances.shape[0])]

    model = DBSCAN(eps=eps_cutoff, min_samples=min_number_of_samples)
    model.fit_predict(y)
    joblib.dump(model, 'dbscan_model.pkl')
    #print set(model.labels_), len(set(model.labels_))

    """
    not_noise_indices = np.where(model.labels_ != -1)
    new_cluster_labels = model.labels_[not_noise_indices]
    new_ground_truth = (np.array(clusters))[not_noise_indices]
    #print new_cluster_labels.shape
    #print set(clusters), len(set(clusters))

    fig = plt.figure()
    # ax = fig.add_subplot(111, projection='3d')
    p = plt.scatter(y[:, 0], y[:, 1], c=np.array(model.labels_), cmap=plt.cm.rainbow)
    plt.colorbar(p)
    # plt.show()
    plt.savefig("DBSCAN.eps", format='eps', dpi=300)

    labels_true = new_ground_truth
    labels_pred = new_cluster_labels
    print "AMI Score: ", metrics.adjusted_mutual_info_score(labels_true, labels_pred)
    """
    return
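The eps selection above, isolated as a helper: take the 99th percentile of each point's distance to its 4th nearest neighbour (the knee of the k-distance plot for roughly uniform density):

import numpy as np
from sklearn.neighbors import NearestNeighbors

# k-distance heuristic for DBSCAN's eps, as used in density_clustering
def estimate_eps(points, k=4, percentile=0.99):
    neigh = NearestNeighbors(n_neighbors=k).fit(points)
    kth = neigh.kneighbors()[0][:, -1]  # distance to each point's k-th neighbour
    return np.sort(kth)[int(percentile * len(kth))]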
Example #10
def most_probable_structure_in_cluster(frame_indices,
                                       frames,
                                       pdb,
                                       cluster_id,
                                       type_of_cluster,
                                       dcd_pkl_filename,
                                       jump=1):
    # finds the most probable location of a cluster in the reduced-dimensional space
    array = frames[frame_indices]
    if array.shape[0] <= 3:
        mean_point = np.mean(array, axis=0)
    else:
        # use grid search cross-validation to optimize the bandwidth
        params = {'bandwidth': np.logspace(-1, 0, 20)}
        grid = GridSearchCV(KernelDensity(), params)
        grid.fit(array)

        # use the best estimator to compute the kernel density estimate
        kde = grid.best_estimator_
        sampling_points = kde.sample(n_samples=10000, random_state=20)
        # the mean of the sampled points estimates the density's centre;
        # the argmax of the density itself would be an alternative choice
        mean_point = np.mean(sampling_points, axis=0)

    closest_structure_index = get_closest_structure_index(
        frame_indices, frames, mean_point)

    # emit VMD "mol addfile" commands for every frame in this cluster
    for x in frame_indices:
        print "mol addfile " + dcd_pkl_filename[:3] + ".dcd first " + str(
            x * jump) + " last " + str(x * jump) + " waitfor all"
    print
    dcd_array = preprocessing.load_residues(dcd_pkl_filename)[::jump]
    temp = dcd_array[closest_structure_index]
    # print "mol addfile " + dcd_pkl_filename[:3] + ".dcd first " + str(closest_structure_index*jump) + " last " + str(closest_structure_index*jump) +" waitfor all"
    write_pdb_file(temp, pdb, cluster_id, type_of_cluster)
    return mean_point
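The density step above in isolation: cross-validate the KDE bandwidth, then estimate a cluster's centre as the mean of points sampled from the fitted density:

import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KernelDensity

# KDE-based cluster-centre estimate, as in most_probable_structure_in_cluster
def kde_centre(points):
    grid = GridSearchCV(KernelDensity(), {'bandwidth': np.logspace(-1, 0, 20)})
    grid.fit(points)
    samples = grid.best_estimator_.sample(n_samples=10000, random_state=20)
    return np.mean(samples, axis=0)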
Example #11
def find_cluster_centres(jump=18, pdb_file="", dcd_pkl_filename="", load=False):
    # finds the representative (most probable) point of each DBSCAN cluster
    dbscan = joblib.load('dbscan_model.pkl')
    X = preprocessing.load_residues('reduced_dimensions.pkl')
    indices = dbscan.core_sample_indices_
    X = X[::jump]  # important: match the frame subsampling used when fitting t-SNE
    #print X.shape
    if not load:
        core_points = X[indices]
        #print core_points.shape
        labels = dbscan.labels_[indices]
        #print labels.shape
        cluster_frames_dict = {}
        for lbl in set(labels):
            cluster_frames_dict[lbl] = []

        for index in range(core_points.shape[0]):
            cluster_frames_dict[labels[index]].append(index)

        cluster_representative_point = {}
        for key in cluster_frames_dict.keys():
            cluster_representative_point[key] = misc.most_probable_structure_in_cluster(
                cluster_frames_dict[key], X, pdb_file, key, "tsne",
                dcd_pkl_filename, jump)

        joblib.dump(cluster_representative_point, "tsne_cluster_representative_point.pkl")

    else:
        cluster_representative_point = joblib.load("tsne_cluster_representative_point.pkl")

    # make_clusters
    cluster_representative_index = {}
    for key in cluster_representative_point.keys():
        frame_indices = misc.find_indices_of_clusters(dbscan.labels_, key)
        # emit VMD commands to load every frame belonging to this cluster
        print "mol new rna.psf"
        for x in frame_indices:
            print "mol addfile " + dcd_pkl_filename[:3] + "-recenter-solute.dcd first " + str(
                x * jump) + " last " + str(x * jump) + " waitfor all"
        print
        temp_index = misc.get_closest_structure_index(
            np.array(frame_indices), X, cluster_representative_point[key])
        cluster_representative_index[key] = temp_index
    return cluster_representative_point, cluster_representative_index
Example #12
def graph_based_method(dcd, pdb, filename, jump, min_number_of_samples):
    gbu.tsne(False, jump=jump)
    gbu.density_clustering(jump, min_number_of_samples)
    cluster_representative_point, cluster_representative_index = gbu.find_cluster_centres(
        jump, pdb, filename, False)

    log_transition_matrix = gbu.get_transition_probabilities(
        cluster_representative_point, cluster_representative_index,
        temp=411, jump=jump, iterations=1000, train_model=False)

    dbscan = joblib.load('dbscan_model.pkl')
    start = dbscan.labels_[0]

    sequence = gbu.get_most_probable_path(dbscan.labels_)

    start, end = misc.get_cluster_ids_for_start_and_end(
        start, set(dbscan.labels_), "tsne")

    path = mpp.get_most_probable_path_in_markov_chain(
        copy.deepcopy(log_transition_matrix), start, end, is_log=True)

    dcd_array = pp.load_residues(filename)
    misc.write_dcd(dcd_array, path, "tsne")
    misc.write_pdb(dcd_array, path, "tsne")

    print "The most reactive path is saved in your current working directory as 'tsne_unfolded_traj.dcd'"

    """
    # compare the results to the MPP algorithm
    sequence, transition_matrix = mpp.get_dynamic_cluster_sequence()
    dynamic_clusters_path, path_prob = mpp.get_path_probability(
        copy.deepcopy(transition_matrix), path, cluster_representative_index, jump=jump)
    print "Graph:", dynamic_clusters_path

    path = mpp.get_most_probable_path_in_markov_chain(
        copy.deepcopy(transition_matrix), dynamic_clusters_path[0], dynamic_clusters_path[-1])
    print "Markov Chain", path
    misc.write_pdb(dcd_array, path, "dynamic")
    """
    return
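A hypothetical end-to-end invocation; the filenames are placeholders, and the keyword values mirror the defaults used elsewhere in these examples:

# hypothetical invocation; "rna.dcd", "rna.pdb" and "traj.pkl" are placeholders
graph_based_method("rna.dcd", "rna.pdb", "traj.pkl",
                   jump=9, min_number_of_samples=5)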
Example #13
def dynamic_cluster_trajectory(meta_stability_criteria=0.9, pdb_file="", dcd_pkl_filename=""):
    model = joblib.load("KMEANS.pkl")
    X = preprocessing.load_residues('reduced_dimensions.pkl')
    #print X.shape
    # stores the k-means cluster id for each frame of the trajectory
    mean_cluster_ids = shelve.open("kmeans_trajectory_clustering")
    if len(mean_cluster_ids) == 0:
        for i in range(X.shape[0]):
            prediction = model.predict(X[i].reshape((1, -1)))
            mean_cluster_ids[str(i)] = int(prediction[0])
    #print "yo"
    number_of_current_clusters = model.cluster_centers_.shape[0]
    d = dict(mean_cluster_ids)
    mean_cluster_ids.close()
    while True:
        cluster_membership = {}
        #print number_of_current_clusters, "--"
        for i in range(X.shape[0]):
            try:
                cluster_membership[int(d[str(i)])] += 1
            except KeyError:
                cluster_membership[int(d[str(i)])] = 1
        states = set(d.values())
        transition_matrix = ds.Autovivification()
        for i in states:
            for j in states:
                transition_matrix[i][j] = 0
        for i in range(X.shape[0] - 1):
            transition_matrix[int(d[str(i)])][int(d[str(i + 1)])] += 1
        # normalize each row so it forms a probability distribution
        cluster_probability = {}
        for i in states:
            sums = 0
            for j in states:
                sums += transition_matrix[i][j]
            cluster_probability[i] = sums
            for j in states:
                transition_matrix[i][j] /= float(sums)
        dynamic_clusters = ds.disjoint(states)
        visited = {}
        for i in states:
            temp_visited = dfs_markov(i, states, transition_matrix,
                                      dynamic_clusters, visited,
                                      meta_stability_criteria)
            visited = copy.deepcopy(temp_visited)
        dynamic_clusters.compress()
        dynamic_clusters.save_structure()
        cluster_ids = dynamic_clusters.get_clusters(cluster_probability)
        new_clusters = cluster_ids.values()
        #print len(set(new_clusters))
        if number_of_current_clusters == len(set(new_clusters)):
            # converged: apply the cluster-core correction
            distances_of_frames_in_cluster, avg_structure_in_cluster = core_clusters(
                d, pdb_file, dcd_pkl_filename)
            #print len(set(d.values())), "original"
            X = preprocessing.load_residues('reduced_dimensions.pkl')
            d_new = {}
            d_new["0"] = d["0"]
            for index in range(1, len(X)):
                # frames farther from their cluster's representative structure
                # than the core cutoff inherit the previous frame's cluster id
                dist = misc.distance(X[index],
                                     avg_structure_in_cluster[d[str(index)]])
                if dist > distances_of_frames_in_cluster[d[str(index)]]:
                    d_new[str(index)] = d_new[str(index - 1)]
                else:
                    d_new[str(index)] = d[str(index)]
            d = d_new
            dynamic_clustering = shelve.open("dynamic_clustering")
            dynamic_clustering.clear()
            for i in d:
                dynamic_clustering[i] = d[i]
            dynamic_clustering.close()
            #print len(set(d.values())), "original"
            sequence = get_most_probable_path(d)
            #print len(sequence)
            #print sequence
            return sequence, transition_matrix
        else:
            #print set(new_clusters)
            number_of_current_clusters = len(set(new_clusters))
            d_new = {}
            for i in range(X.shape[0]):
                d_new[str(i)] = cluster_ids[int(d[str(i)])]
            d = d_new
            dynamic_clustering = shelve.open("dynamic_clustering")
            dynamic_clustering.clear()
            for i in d:
                dynamic_clustering[i] = d[i]
            dynamic_clustering.close()
    return
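ds.Autovivification is not shown in these examples; a minimal stand-in consistent with how it is indexed here is the classic nested-dict recipe:

# minimal Autovivification: nested keys spring into existence on first access
class Autovivification(dict):
    def __missing__(self, key):
        value = self[key] = type(self)()
        return value

T = Autovivification()
T[0][1] = 0.25  # no KeyError: T[0] is created automatically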
Example #14
def get_transition_probabilities(cluster_representative_point,
                                 cluster_representative_index, temp, jump,
                                 iterations, train_model=True):
    data_points = preprocessing.load_residues('reduced_dimensions.pkl')
    energies = preprocessing.create_energy_matrix(get_from_file=1)
    energies = np.sum(energies, axis=1)
    dbscan = joblib.load('dbscan_model.pkl')
    # beta = 1 / (k_B * T), with k_B in kcal/(mol*K)
    beta_val = 1.0 / (0.0019872041 * temp)

    # EM to estimate the parameters of the mixture model
    if train_model:
        init_cov = np.cov(data_points, rowvar=False)
        cov = []
        coefs = []
        means = []
        energies_cluster = []
        for key in cluster_representative_point.keys():
            frame_indices = misc.find_indices_of_clusters(dbscan.labels_, key)
            # cross-validated KDE over this cluster's energies; the mean of
            # samples drawn from it serves as the cluster's characteristic energy
            params = {'bandwidth': np.logspace(-1, 0, 20)}
            grid = GridSearchCV(KernelDensity(), params)
            grid.fit(energies[np.array(frame_indices)].reshape(-1, 1))
            kde = grid.best_estimator_
            sampling_points = kde.sample(n_samples=10000, random_state=20)
            mean_energy = float(np.mean(sampling_points, axis=0))

            cov.append(init_cov)
            means.append(cluster_representative_point[key])
            energies_cluster.append(mean_energy)
            coefs.append(1.0 / len(cluster_representative_point.keys()))

        param_grid = {"means": means, "data": data_points, "beta": beta_val,
                      "covariances": cov, "energy_data": energies,
                      "energy_cluster": energies_cluster, "coef": coefs}

        model = em.Expectation_Maximization(param_grid, threshold=1e-4,
                                            reg_covar=1e-6,
                                            iterations=iterations)
        model.fit()
        # saves params in the shelve file "EM_params_with_full_covariance"
        model.get_params()
        mixture_params = dict(shelve.open("EM_params_with_full_covariance"))
        joblib.dump(mixture_params, "EM_params.pkl")

    mixture_params = joblib.load("EM_params.pkl")
    # T[i][j] represents the probability of going from i to j
    log_transition_matrix = aux.Autovivification()

    for i in cluster_representative_point.keys():
        covar_matrix = mixture_params["variances"][i]
        prec_chol, log_det = prob_dis.compute_precisions_chol(
            np.array([covar_matrix]))
        for j in cluster_representative_point.keys():
            temp_indices = misc.find_indices_of_clusters(dbscan.labels_, j)

            log_probabilities = prob_dis.log_pdf(
                data_points[::jump][np.array(temp_indices)],
                mixture_params["means"][i], covar_matrix,
                mixture_params["beta"],
                energies[::jump][np.array(temp_indices)],
                mixture_params["energy_cluster"][i], prec_chol[0], log_det[0])
            # log-sum-exp over cluster j's members, max-shifted for stability
            max_val = np.amax(log_probabilities)
            total = np.log(np.sum(np.exp(log_probabilities - max_val))) + max_val
            log_transition_matrix[i][j] = total

    return log_transition_matrix
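The max-shifted sum at the end is the standard log-sum-exp trick, marginalizing the per-structure log densities over a cluster without overflow; as a helper:

import numpy as np

# numerically stable log-sum-exp, as used to aggregate log_probabilities above
def logsumexp(log_p):
    m = np.amax(log_p)
    return np.log(np.sum(np.exp(log_p - m))) + m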
Example #15
def get_metastable_states(filename):
    path = gbu.map_PES()
    dcd_array = pp.load_residues(filename)
    misc.write_dcd(dcd_array, path, "tsne")