Beispiel #1
0
def test_hierarchical_clustering():
    """Smoke-test hierarchical clustering on a small and a larger PDB subset.

    Only the length/structure of the output is validated (via
    hierarchical_structure_check); exact cluster contents are not pinned.
    """

    def _load_sites(pdb_ids):
        # Read each active site from data/<id>.pdb (dedupes the two
        # identical loading loops of the original, and avoids shadowing
        # the builtin `id`).
        return [io.read_active_site(os.path.join("data", "%i.pdb" % pdb_id))
                for pdb_id in pdb_ids]

    # tractable subset
    active_sites = _load_sites([276, 4629, 10701])

    clusters_out = cluster.cluster_hierarchically(active_sites)
    print('the clusters out look like', clusters_out)

    # Check length and structure
    hierarchical_structure_check(clusters_out)

    # Also check what this hierarchical test looks like for a much longer,
    # even-length list:
    active_sites = _load_sites([1806, 3458, 3733, 4629, 6040, 7674, 7780,
                                8208, 8304, 9776, 10701, 10814])

    clusters_out = cluster.cluster_hierarchically(active_sites)
    print('the clusters out look like', clusters_out)

    # Check length and structure
    hierarchical_structure_check(clusters_out)
Beispiel #2
0
def test_hierarchical_clustering():
    """Exercise hierarchical clustering edge cases: empty and singleton input."""
    # tractable subset
    site_ids = [276, 52954, 34088]
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in site_ids
    ]

    sim_matrix = cluster.similarity_matrix(active_sites)

    # An empty input yields no clusters; a single site forms its own cluster.
    assert cluster.cluster_hierarchically([], {}) == []
    assert cluster.cluster_hierarchically(
        [active_sites[0]], {}) == [[active_sites[0]]]
Beispiel #3
0
def test_hierarchical_clustering():
    """Cutting the hierarchy at k should yield k groups in the first slot."""
    # tractable subset (note: 10701 repeats on purpose)
    site_ids = [276, 4629, 10701, 10701, 10814, 13052, 14181, 15813]

    active_sites = []
    for site_id in site_ids:
        path = os.path.join("data", "%i.pdb" % site_id)
        active_sites.append(io.read_active_site(path))

    cluster.get_order_residues(active_sites)

    # Requesting k clusters should produce exactly k of them, for several k.
    for k in (2, 3, 4):
        assert len(cluster.cluster_hierarchically(active_sites, k)[0]) == k
Beispiel #4
0
def test_compareTwoMethods():
    """Compare average clustering quality of the two methods.

    Repeatedly samples random PDB ids from the data directory, clusters
    the corresponding active sites both hierarchically and by
    partitioning, and prints the mean quality score of each method.
    """
    all_possible = [int(filename.split(".")[0])
                    for filename in os.listdir("data")]

    iterations = 100
    num_pdbs = 15
    hier_total = 0.0
    part_total = 0.0

    for _ in range(iterations):
        # Sample ids directly instead of sampling index positions and
        # re-indexing the list (same distribution, less code).
        pdb_ids = random.sample(all_possible, num_pdbs)
        active_sites = [
            io.read_active_site(os.path.join("data", "%i.pdb" % pdb_id))
            for pdb_id in pdb_ids
        ]

        hier_total += cluster.qualityMetric(
            cluster.cluster_hierarchically(active_sites))
        part_total += cluster.qualityMetric(
            cluster.cluster_by_partitioning(active_sites))

    print("hierScoreAverage: ", hier_total / float(iterations))
    print("partScoreAverage: ", part_total / float(iterations))
Beispiel #5
0
def test_hierarchical_clustering():
    """Similar sites group together, and the cluster count honors k."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    # clusters more similar clusters together
    clustering = cluster.cluster_hierarchically(active_sites, [2])
    acceptable = [['4629', '276'], ['10701']]
    assert get_names(flatten(clustering[0])) in acceptable
    assert get_names(flatten(clustering[1])) in acceptable

    # len(clustered_list.unique()==k)
    active_sites = read_active_sites("data")
    for k in (2, 3):
        assert len(cluster.cluster_hierarchically(active_sites, [k])) == k
Beispiel #6
0
def test_hierarchical_clustering():
    """Pin the exact hierarchical clustering result for three known sites."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    # update this assertion
    assert cluster.cluster_hierarchically(active_sites) == [[1, 0], 2]
Beispiel #7
0
def test_hierarchical_clustering():
    """Duplicated sites should be grouped together, in either order.

    Bug fix: the original wrote ``assert X == A or B``, which Python
    parses as ``(X == A) or B`` — and since B is a non-empty list the
    assertion could never fail. Use membership in the set of acceptable
    orderings instead.
    """
    # tractable subset
    pdb_ids = [276, 276, 10701, 10701]

    active_sites = []
    for pdb_id in pdb_ids:
        filepath = os.path.join("data", "%i.pdb" % pdb_id)
        active_sites.append(io.read_active_site(filepath))

    result = cluster.cluster_hierarchically(active_sites, 2)
    assert result in ([[10701, 10701], [276, 276]],
                      [[276, 276], [10701, 10701]])
Beispiel #8
0
def test_hierarchical_clustering():
    """Clustering three sites should yield at least two cluster keys."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    print()
    # The result is a mapping; expect two or more clusters.
    assert len(cluster.cluster_hierarchically(active_sites).keys()) >= 2
Beispiel #9
0
def test_hierarchical_clustering():
    """The clustering produces k final labels (k = 3 in the implementation)."""
    # tractable subset
    active_sites = []
    for site_id in (276, 4629, 10701):
        path = os.path.join("data", "%i.pdb" % site_id)
        active_sites.append(io.read_active_site(path))

    # clusters produce k number of final labels
    assert len(cluster.cluster_hierarchically(active_sites)) == 3  #k = 3 in my code
Beispiel #10
0
def test_hierarchical_clustering():
    """The first two sites share a label; the third stands alone."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    # checking that the three sites cluster as expected
    labels = cluster.cluster_hierarchically(active_sites, 2)[0]
    assert np.array_equal(labels, [0, 0, 1])
Beispiel #11
0
def test_hierarchical_clustering():
    """With a fixed random seed, the label assignment is [0, 0, 1]."""
    random.seed(40)
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    label = cluster.cluster_hierarchically(active_sites)
    assert all(label[1] == [0, 0, 1])
def test_hierarchical_clustering():
    """Three dissimilar sites become singleton clusters; None input is a no-op."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]
    assignment = cluster.cluster_hierarchically(active_sites)

    expected = [["276"], ["4629"], ["10701"]]
    assert cluster.convert_indices_to_active_sites(
        assignment, active_sites) == expected

    # Reload a two-site subset (the result is not used below; kept so the
    # file reads still happen exactly as before).
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629)
    ]

    # check empty active sites doesn't crash
    assert cluster.cluster_hierarchically(None, K=10) is None
def test_hierarchical_clustering():
    """The merge history agglomerates three sites down to a single cluster."""
    # tractable subset
    active_sites = [
        io.read_active_site(os.path.join("data", "%i.pdb" % site_id))
        for site_id in (276, 4629, 10701)
    ]

    # Convert each level of the hierarchy to lists of integer site names.
    clusters = [
        [[int(site.name) for site in group] for group in level]
        for level in cluster.cluster_hierarchically(active_sites, 1)
    ]

    assert clusters == [[[276], [4629], [10701]], [[276], [10701, 4629]],
                        [[276, 10701, 4629]]]
Beispiel #14
0

# Sweep cluster counts 2..9 and score each clustering method with the
# silhouette score, including a random-label baseline for comparison.
number_clusters = []
p_sil_scores = []
h_sil_scores = []
r_sil_scores = []

for k in range(2, 10):

    # clustering all sites by partition
    print('Finding %d clusters by partitioning' % k)
    p_clusters, p_distances = cluster.cluster_by_partitioning(active_sites, k)

    # clustering all sites hierarchically
    print('Finding %d clusters hierarchically' % k)
    h_clusters, h_distances = cluster.cluster_hierarchically(active_sites, k)

    # generating random clusters labels
    print('Generating %d random cluster labels' % k)
    r_clusters = np.random.choice(range(0, k), 136)

    # Silhouette score for each clustering method, plus the randomly
    # generated labels (scored against the partitioning distances).
    p_sil_scores.append(silhouette_score(p_distances, p_clusters))
    h_sil_scores.append(silhouette_score(h_distances, h_clusters))
    r_sil_scores.append(silhouette_score(p_distances, r_clusters))

fig1 = plt.figure(dpi=300)
plt.xlabel("Number of Clusters")
plt.ylabel('Silhouette Score')
plt.title('Evaluating Clustering Methods')
# Load every active site from the local data directory.
active_sites = io.read_active_sites(
    "C:\\Users\Zoë\Documents\GitHub\hw2-skeleton\data")
# Example similarity check between two individual sites:
#site1 = active_sites[5]
#site2 = active_sites[7]
#print('site1: ', site1.categories)
#print('site2: ', site2.categories)
#sim = cluster.compute_similarity(site1,site2)

# Run one clustering by kmeans and write the result out.
Pclusters, PmaxDistance = cluster.cluster_by_partitioning(active_sites)
##for i in clusters:
##    print(i.toStr())
io.write_clustering("clusterPk=10", Pclusters)

# Run one agglomerative (hierarchical) clustering and write it out.
Hclusters, distH = cluster.cluster_hierarchically(active_sites)
io.write_clustering("clusterHcutoff=0.3", Hclusters)

## Alternative: keep every intermediate clustering as well.
#Hclusters, HmaxDist, Hclusterings = cluster.cluster_hierarchically(active_sites)
#io.write_mult_clusterings("clusteringsH1", Hclusterings)

#%%
## Clusterings of multiple k values in kmeans
#kvals = [2,3,4,5,6,7,8,9,10,20,30,40,50,60,70,80,90,100,120,130,136]
#kvalstest = [2,10,100]
#
#numtrials = 10
#allscores = np.zeros((numtrials, len(kvals)))
#for i in range(numtrials):
#    Pclusterings = cluster.cluster_by_partitioning(active_sites)