def test_hierarchical():
    """Check which label values agglomerative clustering produces.

    Only the first 100 entries of ``ligands`` are clustered to keep the
    runtime short.
    """
    five_way = algs.HierarchicalClustering(5)
    five_way_labels = five_way.cluster(ligands[:100])
    # Every label from 1 through 5 has to be present.
    assert all(label in five_way_labels for label in (1, 2, 3, 4, 5))

    # NOTE(review): this clusterer is built with 10 and seed=6, yet only
    # labels 1 and 2 are checked -- confirm the intended cluster count.
    seeded = algs.HierarchicalClustering(10, seed=6)
    seeded_labels = seeded.cluster(ligands[:100])
    assert all(label in seeded_labels for label in (1, 2))
def test_similarity():
    """Silhouette score must exceed 0.80 for both clustering classes.

    The fixture holds twelve 2-D points: seven whose coordinates are all
    below 1 and five whose coordinates are all above 2.  Each clusterer is
    asked for two clusters.
    """
    points = np.array([
        [3.08232755e-01, 7.31276243e-01],
        [1.38059574e-01, 5.96831094e-01],
        [7.17477934e-01, 6.92660634e-01],
        [1.04842083e-01, 5.81815300e-01],
        [2.63517862e-01, 8.56987831e-01],
        [6.82660482e-01, 7.65745298e-01],
        [3.30899459e-01, 1.27005643e-01],
        [2.15388524e+00, 2.76495447e+00],
        [2.02847470e+00, 2.17510569e+00],
        [2.81339552e+00, 2.92175026e+00],
        [2.11079023e+00, 2.70619934e+00],
        [2.51975852e+00, 2.72664963e+00],
    ])
    min_score = 0.80

    # Partition clustering.
    partition = algs.PartitionClustering(
        rawdata=points, n_clusters=2, max_iteration=100)
    partition.runClustering()
    partition_score = algs.SilhouetteScore(partition)
    assert partition_score > min_score

    # Hierarchical clustering on the same points.
    hierarchical = algs.HierarchicalClustering(rawdata=points, n_clusters=2)
    hierarchical.runClustering()
    hierarchical_score = algs.SilhouetteScore(hierarchical)
    assert hierarchical_score > min_score
def test_Hierarchical(ligand_test):
    """Check the distinct-label count in the first ten ``dendogram`` rows.

    Builds a ``HierarchicalClustering('single-linkage', 1)`` object, loads
    ``ligand_test`` into it, clusters, and then requires row ``i`` of the
    resulting ``dendogram`` to contain exactly ``10 - i`` distinct values
    for ``i`` in 0..9.
    """
    clusterer = algs.HierarchicalClustering('single-linkage', 1)
    clusterer.get_data(ligand_test)
    clusterer.cluster()

    # Row 0 should hold 10 distinct labels, row 1 nine, ... row 9 one.
    for row, expected_count in enumerate(range(10, 0, -1)):
        distinct_labels = numpy.unique(clusterer.dendogram[row, :])
        assert len(distinct_labels) == expected_count
def test_hierarchical():
    """Cluster the test ligands into two groups and check the ID ordering.

    Reads ``ligand_information.csv`` through ``read_test_ligands``, builds
    a distance matrix, clusters with a cluster count of 2, and flattens
    the ``ligandID`` of every ligand in cluster order.
    """
    ligands = read_test_ligands('ligand_information.csv')
    distances = algs.makeDistanceMatrix(ligands)

    clusterer = algs.HierarchicalClustering()
    clusters = clusterer.cluster(ligands, distances, 2)

    # Flatten cluster -> ligand -> ligandID, preserving traversal order.
    observed_ids = [
        member.ligandID
        for group in clusters
        for member in group.ligands
    ]
    expected_ids = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]

    assert observed_ids == expected_ids, 'Hierarchical Clustering Test Failed :('
    print('Hierarchical Clustering Test Passed')
def test_quality_metric():
    """Compare ``cluster_quality`` against a pre-computed silhouette score.

    A 7x9 binary matrix is clustered into two groups with
    ``linkage='ward'``; the quality value, rounded to two decimals, must
    equal 0.51.
    """
    binary_rows = np.array([
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0],
    ])

    clusterer = algs.HierarchicalClustering(num_clusters=2, linkage='ward')
    labels = list(clusterer.cluster(binary_rows))

    quality = clusterer.cluster_quality(binary_rows, labels)
    rounded_quality = np.around(quality, decimals=2)
    assert rounded_quality == 0.51
def test_hierarchical():
    """Check single-linkage merging on a hand-written 5x5 proximity matrix.

    A clusterer is created in testing mode for ``desired_k`` of 1 and then
    3.  Its ``proximity_matrix``, ``Nclusters`` and ``element_dict`` are
    overwritten with the control fixture before ``cluster()`` runs, and
    ``cluster_results`` is compared with the expected grouping.
    """
    ligand_dict = algs.read_in_ligands("ligand_information.csv")

    element_names = ["a", "b", "c", "d", "e"]
    ctrl = pd.DataFrame(
        {
            'a': [np.nan, 17, 21, 31, 23],
            'b': [17, np.nan, 30, 34, 21],
            'c': [21, 30, np.nan, 28, 39],
            'd': [31, 34, 28, np.nan, 43],
            'e': [23, 21, 39, 43, np.nan],
        },
        index=element_names)
    ctr1_dict = {'a': 0, 'b': 1, 'c': 2, 'd': 3, 'e': 4}

    # (desired_k, expected cluster_results, failure message)
    scenarios = (
        (1,
         [['d', ' e', ' c', ' a', ' b']],
         "Failing to merge columns correctly -- one column"),
        (3,
         [['d'], ['e'], ['c', ' a', ' b']],
         "Failing to merge columns correctly -- three column"),
    )

    for desired_k, expected, failure_message in scenarios:
        tester = algs.HierarchicalClustering(
            ligand_dict,
            linkage_metric="single",
            distance_metric="euclidean",
            desired_k=desired_k,
            testing=True)
        # Manually overwrite the state under test; the same ctrl and
        # ctr1_dict objects are handed to every scenario.
        tester.proximity_matrix = ctrl
        tester.Nclusters = 5
        tester.element_dict = ctr1_dict

        outcome = tester.cluster()
        assert outcome.cluster_results == expected, failure_message
def test_hierarchical():
    """Cluster five 5-element binary rows and compare the returned labels.

    The clusterer receives the data and the value 0.42 as its two
    positional constructor arguments; ``cluster()`` must return labels
    equal to ``[0, 1, 2, 2, 2]``.
    """
    # Setup
    threshold = 0.42
    binary_rows = np.array([
        [0., 0., 1., 0., 0.],
        [1., 0., 0., 0., 0.],
        [0., 1., 0., 1., 1.],
        [0., 1., 0., 0., 0.],
        [0., 0., 0., 1., 0.],
    ])
    expected_labels = np.array([0, 1, 2, 2, 2])

    # Exercise
    clusterer = algs.HierarchicalClustering(binary_rows, threshold)
    observed_labels = clusterer.cluster()

    # Verify
    np.testing.assert_array_equal(observed_labels, expected_labels)
def test_hierarchical():
    """Check the two-cluster labeling of a 7x9 binary matrix.

    The clusterer is built with ``num_clusters=2`` and ``linkage='ward'``.
    The labeling must use exactly two distinct values and match the
    expected partition under either naming of the two clusters.
    """
    binary_rows = np.array([
        [1, 1, 1, 1, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 1, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 0, 0],
        [0, 0, 0, 0, 1, 1, 0, 1, 0],
        [1, 0, 1, 1, 0, 0, 0, 0, 0],
        [0, 1, 1, 0, 0, 0, 0, 0, 0],
        [1, 1, 0, 0, 0, 0, 0, 0, 0],
    ])

    clusterer = algs.HierarchicalClustering(num_clusters=2, linkage='ward')
    labels = list(clusterer.cluster(binary_rows))

    # The proper number of clusters is created.
    distinct_labels = np.unique(labels)
    assert len(distinct_labels) == 2

    # The partition is correct; which cluster is called 0 does not matter.
    accepted_labelings = (
        [0, 1, 1, 1, 0, 0, 0],
        [1, 0, 0, 0, 1, 1, 1],
    )
    assert labels in accepted_labelings
def test_hierarchical():
    """Check the distance matrix shape and cluster membership for 2 clusters.

    The fixture holds twelve 2-D points: rows 0-6 have coordinates below 1
    and rows 7-11 have coordinates above 2.  After ``runClustering()`` the
    ``DistanceMatrix`` must be square over the points and ``clusters``
    must split the row indices into exactly those two groups.
    """
    points = np.array([
        [3.08232755e-01, 7.31276243e-01],
        [1.38059574e-01, 5.96831094e-01],
        [7.17477934e-01, 6.92660634e-01],
        [1.04842083e-01, 5.81815300e-01],
        [2.63517862e-01, 8.56987831e-01],
        [6.82660482e-01, 7.65745298e-01],
        [3.30899459e-01, 1.27005643e-01],
        [2.15388524e+00, 2.76495447e+00],
        [2.02847470e+00, 2.17510569e+00],
        [2.81339552e+00, 2.92175026e+00],
        [2.11079023e+00, 2.70619934e+00],
        [2.51975852e+00, 2.72664963e+00],
    ])

    clusterer = algs.HierarchicalClustering(rawdata=points, n_clusters=2)
    clusterer.runClustering()

    expected_shape = (len(points), len(points))
    assert clusterer.DistanceMatrix.shape == expected_shape
    assert len(clusterer.clusters) == 2

    # Sort inside each cluster and across clusters so ordering is irrelevant.
    normalized = sorted([sorted(members) for members in clusterer.clusters])
    assert normalized == [[0, 1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11]]
def main():
    """Run the ligand clustering analysis script (Questions 2-7).

    Steps, in order:

    * read ``../ligand_information.csv`` and build the ``Ligand`` object;
    * embed the first 2000 ligands with UMAP, round-trip the embedding
      through ``UmapDimensionalSpace.txt`` and plot it;
    * print silhouette scores for partition clustering with k = 1..9 and
      plot the k = 6 clustering on the embedding;
    * print silhouette scores for hierarchical clustering with k = 1..9
      and plot the k = 4 clustering on the embedding;
    * print a pairwise-distance check on two fixed arrays and the Tanimoto
      coefficient between partition and hierarchical clusters for
      k = 4 and k = 6.

    Side effects: writes ``UmapDimensionalSpace.txt`` in the working
    directory, prints to stdout and calls ``plt.show()``.
    """
    n_ligands = 2000
    LABEL_COLOR_MAP = {1: 'r', 2: 'b', 3: 'g', 4: 'y', 5: 'm', 6: 'c'}

    LigandInformation = pd.read_csv("../ligand_information.csv", sep=",")
    LigandData = Ligand.Ligand(LigandID=LigandInformation['LigandID'],
                               score=LigandInformation['Score'],
                               SMILES=LigandInformation['SMILES'],
                               OnBits=LigandInformation['OnBits'])
    LigandData.OnbitToLong()
    ligand_subset = LigandData.long[0:n_ligands]

    # Question 2: UMAP embedding of the ligands.
    fit = umap.UMAP()
    u = fit.fit_transform(ligand_subset)
    # Save the two embedding columns as two rows and reload them, so that
    # every later step indexes the embedding the same way: u[0], u[1].
    np.savetxt('UmapDimensionalSpace.txt', [u[:, 0], u[:, 1]])
    u = np.loadtxt("UmapDimensionalSpace.txt")
    plt.scatter(u[0], u[1])
    plt.title('UMAP embedding of Ligands')
    plt.show()

    # Question 3 + 4: silhouette score of partition clustering, k = 1..9.
    score = []
    for k in range(1, 10):
        print(k)
        PT = algs.PartitionClustering(ligand_subset, n_clusters=k,
                                      max_iteration=100)
        PT.runClustering()
        score.append(algs.SilhouetteScore(PT))
        del PT
    print(score)

    # Re-run the single best-scoring configuration to get data for plotting.
    # Best score was found when K=6, see
    # Guardado_Miguel_BMI203_HW2_WriteUp.pdf for more info.
    PT_k6 = algs.PartitionClustering(ligand_subset, n_clusters=6,
                                     max_iteration=100)
    PT_k6.runClustering()
    print(PT_k6.clusterassignment)
    label_color = [LABEL_COLOR_MAP[label] for label in PT_k6.clusterassignment]
    print(np.unique(PT_k6.clusterassignment))
    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 6 clusters')
    plt.show()

    # Question 5 + 6: silhouette score of hierarchical clustering, k = 1..9.
    # This scan must use HierarchicalClustering; scoring PartitionClustering
    # here would only repeat the Question 3 + 4 scan.
    score = []
    for k in range(1, 10):
        print(k)
        HC = algs.HierarchicalClustering(ligand_subset, n_clusters=k)
        HC.runClustering()
        score.append(algs.SilhouetteScore(HC))
        print(score)  # running progress output
        del HC
    print(score)

    HC_k4 = algs.HierarchicalClustering(ligand_subset, n_clusters=4)
    HC_k4.runClustering()
    print(HC_k4.clusterassignment)
    label_color = [LABEL_COLOR_MAP[label] for label in HC_k4.clusterassignment]
    print(np.unique(HC_k4.clusterassignment))
    plt.figure(figsize=(20, 10))
    plt.scatter(u[0], u[1], c=label_color)
    plt.title('UMAP embedding of Ligands,2000 Ligands, 4 clusters')
    plt.show()

    # Question 7: sanity-check the pairwise distance on two fixed arrays,
    # then compare partition and hierarchical clusterings.
    arr1 = np.array(
        [0.2035, 0.0933, 0.18810, 0.0485, 0.396001, 0.22705, 0.08660, 0.29346])
    arr2 = np.array([
        0.0953, 0.08660, 0.26153, 0.081803, 0.163898, 0.233848, 0.09873,
        -0.167866
    ])
    print(np.sum(arr1 - arr2))
    print(algs.CalculatePairWiseDistance(arr1, arr2))

    for n_cluster in (4, 6):
        PT = algs.PartitionClustering(ligand_subset, n_clusters=n_cluster,
                                      max_iteration=100)
        PT.runClustering()
        HC = algs.HierarchicalClustering(ligand_subset, n_clusters=n_cluster)
        HC.runClustering()
        print(algs.TanimotoCoeff(PT.clusters, HC.clusters))