Example no. 1
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 39299, 38031]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))
    # smoke-test the similarity matrix on the subset
    sim_matrix = cluster.similarity_matrix(active_sites)

    assert cluster.cluster_by_partitioning([], {}) == []
    assert cluster.cluster_by_partitioning([active_sites[0]],
                                           {}) == [[active_sites[0]]]
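The two assertions at the end pin down an edge-case contract: an empty input yields an empty clustering, and a single site yields one singleton cluster. A minimal sketch of guard clauses that would satisfy them, assuming the second argument is an options dict (hypothetical, not the graded implementation):

def cluster_by_partitioning(active_sites, params):
    # Hypothetical guards matching the assertions above; the real
    # implementation and its parameter handling may differ.
    if not active_sites:
        return []
    if len(active_sites) == 1:
        return [[active_sites[0]]]
    # ... k-means-style partitioning over the similarity matrix ...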
Example no. 2
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701, 10814, 13052, 14181, 15813]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    cluster.get_order_residues(active_sites)
    assert len(cluster.cluster_by_partitioning(active_sites, 2)[0]) == 2
    assert len(cluster.cluster_by_partitioning(active_sites, 3)[0]) == 3
Example no. 3
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    assert cluster.cluster_by_partitioning(active_sites) == [["276"], ["4629"],
                                                             ["10701"]]

    # check that a None input is handled without crashing
    assert cluster.cluster_by_partitioning(None) is None
Example no. 4
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # Slice out clusters that received an assignment (length > 1)
    clusters_out = cluster.cluster_by_partitioning(active_sites)
    non_empty_clusters = [i for i in clusters_out if len(i) > 1]
    distances_from_centroids = [i[1][1] for i in non_empty_clusters]
    print('non-empty clusters for k-means is', non_empty_clusters)
    print('non-empty cluster distance from centroids are', distances_from_centroids)

    # Check that there are at most 3 non-empty clusters (e.g. two or three
    # residues may get assigned to the same cluster)
    assert len(non_empty_clusters) <= 3

    # For my particular variant of k-means, check that the distance between
    # the cluster residue and the centroid is >= 0 (nonnegativity)
    for i in distances_from_centroids:
        assert i >= 0
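The i[1][1] indexing above implies a particular return shape: each cluster appears to be a list whose members carry (site, distance-to-centroid) pairs, so the second member's distance sits at [1][1]. A toy illustration of that assumed shape (hypothetical, inferred from the test, not the author's actual structure):

# Assumed cluster shape, inferred from the i[1][1] indexing above:
# a list of (site, distance_to_centroid) pairs.
example_cluster = [("site_a", 0.0), ("site_b", 0.42)]
assert example_cluster[1][1] >= 0  # distance of the second member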
Example no. 5
def test_compareTwoMethods():
    allPossible = []
    for filename in os.listdir("data"):
        allPossible.append(int(filename.split(".")[0]))
    hierTotal = 0
    partTotal = 0
    iterations = 100
    numPDBs = 15
    for i in range(0, iterations):
        indices = random.sample(range(0, len(allPossible)), numPDBs)
        pdb_ids = []
        for j in indices:
            pdb_ids.append(allPossible[j])
        active_sites = []
        for id in pdb_ids:
            filepath = os.path.join("data", "%i.pdb" % id)
            active_sites.append(io.read_active_site(filepath))

        hierScore = cluster.qualityMetric(
            cluster.cluster_hierarchically(active_sites))
        partScore = cluster.qualityMetric(
            cluster.cluster_by_partitioning(active_sites))

        hierTotal += hierScore
        partTotal += partScore
    print("hierScoreAverage: ", hierTotal / float(iterations))
    print("partScoreAverage: ", partTotal / float(iterations))
Example no. 6
def test_partition_clustering():
    # tractable subset
    # pdb_ids = [276, 4629, 10701]
    pdb_ids = [276, 1806, 3458, 3733, 10814, 4629, 10701]
    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))
    assert [] not in cluster.cluster_by_partitioning(active_sites)
    assert len(cluster.cluster_by_partitioning(active_sites)) == 3

    pdb_ids = [276]
    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))
    assert len(cluster.cluster_by_partitioning(active_sites)) == 1
Example no. 7
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    clustering = cluster.cluster_by_partitioning(active_sites, [2])
    # clusters more similar clusters together
    assert get_names(flatten(clustering[0])) in [['276', '4629'], ['10701']]
    assert get_names(flatten(clustering[1])) in [['276', '4629'], ['10701']]

    # the number of clusters returned should equal k
    active_sites = read_active_sites("data")
    assert len(cluster.cluster_by_partitioning(active_sites, [2])) == 2
    assert len(cluster.cluster_by_partitioning(active_sites, [3])) == 3
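get_names and flatten are helpers defined elsewhere in this test module; plausible definitions consistent with how they are used above (hypothetical, not the author's code):

def flatten(nested):
    # Hypothetical helper: recursively flatten nested lists of sites.
    out = []
    for item in nested:
        if isinstance(item, list):
            out.extend(flatten(item))
        else:
            out.append(item)
    return out

def get_names(sites):
    # Hypothetical helper: pull the name attribute off each ActiveSite.
    return [site.name for site in sites]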
Example no. 8
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # this implementation returns a dict of clusters, so count its keys
    assert len(cluster.cluster_by_partitioning(active_sites).keys()) >= 2
Example no. 9
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # clusters are returned as lists of indices into active_sites; a cluster may be empty
    assert cluster.cluster_by_partitioning(active_sites) == [[2], [], [0, 1]]
Example no. 10
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))
    assert cluster.test_format_part(
        cluster.cluster_by_partitioning(active_sites, 2)) == [[276],
                                                              [4629, 10701]]
Example no. 11
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # with k=1 every site lands in the single cluster, in input order
    clusters = cluster.cluster_by_partitioning(active_sites, 1)[0]
    assert [int(c.name) for c in clusters] == [276, 4629, 10701]
Example no. 12
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # the clustering should produce k final clusters (k = 3 in my code)
    assert len(cluster.cluster_by_partitioning(active_sites)) == 3
Example no. 13
def test_partition_clustering():
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # check that the three sites cluster as expected
    assert np.array_equal(
        cluster.cluster_by_partitioning(active_sites, 2)[0], [1, 1, 0])
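A caveat on asserting exact label arrays like [1, 1, 0]: k-means cluster IDs are arbitrary, so such a test can fail on a relabeled but identical partition. A permutation-invariant comparison (hypothetical helper, not part of the assignment):

def same_partition(labels_a, labels_b):
    # Hypothetical helper: compare two clusterings up to renaming of labels
    # by grouping item indices per label and comparing the groups.
    def groups(labels):
        g = {}
        for idx, lab in enumerate(labels):
            g.setdefault(lab, []).append(idx)
        return sorted(g.values())
    return groups(labels_a) == groups(labels_b)

assert same_partition([1, 1, 0], [0, 0, 1])  # same partition, swapped labels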
Example no. 14
def test_partition_clustering():
    # seed the RNG so the clustering result is deterministic
    random.seed(40)
    # tractable subset
    pdb_ids = [276, 4629, 10701]

    active_sites = []
    for id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % id)
        active_sites.append(io.read_active_site(filepath))

    # label[1] holds the per-site cluster assignments
    label = cluster.cluster_by_partitioning(active_sites, 2)
    assert all(label[1] == [0, 0, 1])
Example no. 15
from matplotlib import pyplot as plt
import pandas as pd
import numpy as np
from sklearn.metrics import silhouette_score

from hw2skeleton import io
from hw2skeleton import cluster

active_sites = io.read_active_sites('./data')


number_clusters = []
p_sil_scores = []
h_sil_scores = []
r_sil_scores = []

for i in range(2, 10):
    number_clusters.append(i)

    # clustering all sites by partition
    print('Finding %d clusters by partitioning' % i)
    p_clusters, p_distances = cluster.cluster_by_partitioning(active_sites, i)

    # clustering all sites hierarchically
    print('Finding %d clusters hierarchically' % i)
    h_clusters, h_distances = cluster.cluster_hierarchically(active_sites, i)

    # generating random cluster labels, one per active site (136 in the data set)
    print('Generating %d random cluster labels' % i)
    r_clusters = np.random.choice(range(0, i), 136)

    # to evaluate the clusters I will get the silhouette score for each method of clustering
    # as well as randomly generated cluster labels
    p_sil_scores.append(silhouette_score(p_distances, p_clusters))
    h_sil_scores.append(silhouette_score(h_distances, h_clusters))
    r_sil_scores.append(silhouette_score(p_distances, r_clusters))
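The matplotlib import at the top suggests these score lists feed a plot; a plausible continuation (hypothetical, the original plotting code is not shown here):

plt.plot(number_clusters, p_sil_scores, label='partitioning')
plt.plot(number_clusters, h_sil_scores, label='hierarchical')
plt.plot(number_clusters, r_sil_scores, label='random labels')
plt.xlabel('number of clusters (k)')
plt.ylabel('silhouette score')
plt.legend()
plt.show()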
Example no. 16
#from .utils import Atom, Residue, ActiveSite
from hw2skeleton import io
from hw2skeleton import cluster
import matplotlib.pyplot as plt
import numpy as np

active_sites = io.read_active_sites(
    r"C:\Users\Zoë\Documents\GitHub\hw2-skeleton\data")
#site1 = active_sites[5]
#site2 = active_sites[7]
#print('site1: ', site1.categories)
#print('site2: ', site2.categories)
#sim = cluster.compute_similarity(site1,site2)

# Run for one clustering by kmeans
Pclusters, PmaxDistance = cluster.cluster_by_partitioning(active_sites)
##for i in clusters:
##    print(i.toStr())
io.write_clustering("clusterPk=10", Pclusters)

# Run for just one clustering by agglomerative clustering
Hclusters, distH = cluster.cluster_hierarchically(active_sites)
io.write_clustering("clusterHcutoff=0.3", Hclusters)

## Run for one clustering by agglomerative clustering
#Hclusters, HmaxDist, Hclusterings = cluster.cluster_hierarchically(active_sites)
#io.write_mult_clusterings("clusteringsH1", Hclusterings)

#%%
## Clusterings of multiple k values in kmeans
#kvals = [2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,120,130,136]