def setUpClass(cls):
    cls.matrix = CondensedMatrix(squared_CH_table1)
    cls.clusterings = [
        Clustering([Cluster(None, [0, 1, 2, 3]), Cluster(None, [4, 5])]),
        Clustering([Cluster(None, [0, 1]), Cluster(None, [2, 3]), Cluster(None, [4, 5])])
    ]
    update_medoids(cls.clusterings[0], cls.matrix)
    update_medoids(cls.clusterings[1], cls.matrix)
def test_get_all_clustered_elements(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    self.assertItemsEqual(sorted(clustering.get_all_clustered_elements()), range(17))
def test_cluster_cohe_sep_wo_prot_eval(self):
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusters_1 = [Cluster(None, elements=[0, 1]),
                  Cluster(None, elements=[2]),
                  Cluster(None, elements=[3, 4])]
    clusters_2 = [Cluster(None, elements=[0, 2, 4]),
                  Cluster(None, elements=[1, 3])]
    clusterization_1 = Clustering(clusters_1)
    clusterization_2 = Clustering(clusters_2)
    sep_calctor = SeparationCalculator()
    self.assertEqual(sep_calctor.cluster_separation(clusters_1[0], clusterization_1, 1., distances), 27.0)
    self.assertEqual(sep_calctor.cluster_separation(clusters_1[1], clusterization_1, 1., distances), 24.0)
    self.assertEqual(sep_calctor.cluster_separation(clusters_1[2], clusterization_1, 1., distances), 37.0)
    self.assertEqual(sep_calctor.cluster_separation(clusters_2[0], clusterization_2, 1., distances), 34.0)
    self.assertEqual(sep_calctor.cluster_separation(clusters_2[1], clusterization_2, 1., distances), 34.0)
def test_remove_noise(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    clustering.eliminate_noise(5)
    self.assertEqual(len(clustering.clusters), 2)
def test_to_dic(self):
    clustering = Clustering([Cluster(16, [16]),
                             Cluster(9, [9, 10, 11, 12, 13, 14, 15]),
                             Cluster(0, [0, 1, 2, 3]),
                             Cluster(4, [4, 5, 6, 7, 8])])
    self.assertDictEqual(clustering.to_dic(), {
        'clusters': [
            {'prototype': 9, 'elements': '9:15', 'id': 'cluster_1'},
            {'prototype': 4, 'elements': '4:8', 'id': 'cluster_3'},
            {'prototype': 0, 'elements': '0:3', 'id': 'cluster_2'},
            {'prototype': 16, 'elements': '16', 'id': 'cluster_0'}
        ],
        'total_number_of_elements': 17,
        'number_of_clusters': 4
    })
def test_get_medoids(self):
    clusters = [ClusterMock(range(0, 10)),
                ClusterMock(range(10, 50)),
                ClusterMock(range(50, 80)),
                ClusterMock(range(80, 200))]
    clustering = Clustering(clusters)
    self.assertItemsEqual(clustering.get_medoids("distance_matrix"), [0, 10, 50, 80])
def test_get_percent_of_n_clusters(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    percents = clustering.get_population_percent_of_n_bigger_clusters(3)
    expected_percents = [41.1764705882, 29.4117647059, 23.5294117647]
    for i in range(3):
        self.assertAlmostEqual(percents[i], expected_percents[i], 1)
def test_get_percent_population_of_cluster(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    total = 0
    for i in range(4):
        total = total + clustering.get_population_percent_of_cluster(i)
    self.assertAlmostEqual(total, 100., 2)
def test_remove_cluster(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    c = Cluster(0, [0, 1, 2, 3])
    clustering.eliminate_cluster(c)
    self.assertEqual(len(clustering.clusters), 3)
def test_number_of_clusters_needed_to_get_this_percent_of_elems(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    self.assertEqual(clustering.number_of_clusters_to_get_percent(71), 3)
    self.assertEqual(clustering.number_of_clusters_to_get_percent(70), 2)
    self.assertEqual(clustering.number_of_clusters_to_get_percent(40), 1)
    self.assertEqual(clustering.number_of_clusters_to_get_percent(42), 2)
    self.assertEqual(clustering.number_of_clusters_to_get_percent(100), 4)
def test_load_and_save_to_disk(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    before_saving_elements = clustering.get_all_clustered_elements()
    clustering.save_to_disk("data/saved_clustering_for_test")
    loaded_clustering = Clustering.load_from_disk("data/saved_clustering_for_test")
    after_saving_elements = loaded_clustering.get_all_clustered_elements()
    self.assertItemsEqual(before_saving_elements, after_saving_elements)
    os.remove("data/saved_clustering_for_test")
def test_cluster_is_inside(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    not_in_cluster = Cluster(17, [17, 16])
    in_cluster = Cluster(0, [0, 1, 2, 3])
    clusterization = Clustering(clusters)
    self.assertEqual(clusterization.cluster_index(not_in_cluster), -1)
    self.assertEqual(clusterization.cluster_index(in_cluster), 2)
    self.assertEqual(clusterization.cluster_is_inside(not_in_cluster), False)
    self.assertEqual(clusterization.cluster_is_inside(in_cluster), True)
def test_get_proportional_size_representatives(self):
    clusters = [ClusterMock(range(0, 10)),
                ClusterMock(range(10, 50)),
                ClusterMock(range(50, 80)),
                ClusterMock(range(80, 200))]
    clustering = Clustering(clusters)
    rep = clustering.get_proportional_size_representatives(30, "distance_matrix")
    self.assertItemsEqual(rep, [0, 0, 10, 10, 11, 12, 13, 14,
                                50, 50, 51, 52, 53,
                                80, 80, 81, 82, 83, 84, 85, 86, 87,
                                88, 89, 90, 91, 92, 93, 94, 95, 96])
def perform_clustering(self, kwargs):
    """
    Does the actual clustering.
    """
    cutoff = kwargs["cutoff"]
    try:
        max_clusters = kwargs["max_clusters"]
    except KeyError:
        max_clusters = sys.maxint

    nodes = range(self.condensed_matrix.row_length)
    clusters = []
    elements_already_clustered = 0
    iteration = 0
    # Do it while there are nodes left
    while len(nodes) > 0 and len(clusters) < max_clusters:
        cluster = self.__do_one_iteration(nodes, cutoff)
        clusters.append(cluster)
        elements_already_clustered = elements_already_clustered + cluster.get_size()
        # Sanity check: clustered elements plus remaining nodes can never exceed the total
        if elements_already_clustered + len(nodes) > self.condensed_matrix.row_length:
            print "[ERROR :: GROMOS perform_clustering] ", elements_already_clustered + len(nodes), iteration
            exit(1)
        iteration = iteration + 1
    return Clustering(clusters, details="GROMOS (cutoff = " + str(cutoff) + ")")
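# A minimal usage sketch for the GROMOS perform_clustering above. The class
# name GromosAlgorithm and the parameter values are assumptions; constructing
# the algorithm from a condensed matrix mirrors the KMedoidsAlgorithmClass
# call used elsewhere in this code.
#
#   algorithm = GromosAlgorithm(condensed_matrix)
#   clustering = algorithm.perform_clustering({"cutoff": 0.5, "max_clusters": 10})
#   print clustering.details  # e.g. "GROMOS (cutoff = 0.5)"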
def purge_mixed_clusters_and_do_graph(mixed, pure_clusters_traj1, condensed_distance_matrix, std_devs_from_A, path):
    """
    Removes the traj2 elements from the mixed clusters, gathers statistics for the
    remaining pure traj1 clusters and draws the resulting state graph.
    """
    common.print_and_flush("Purging clusters...")
    # Purge all mixed clusters of elements from traj2
    purged = []
    num_elems_of_traj_2 = []
    for i in range(len(mixed)):
        cluster, elems_in_traj1, elems_in_traj2 = mixed[i]  # @UnusedVariable
        num_elems_of_traj_2.append(len(elems_in_traj2))
        # We rebuild the cluster with only elements of traj 1
        purged.append(Cluster(prototype=None, elements=elems_in_traj1))
    # We also need the traj 1 pure clusters
    purged.extend(pure_clusters_traj1)
    # Those don't have any element of traj 2, so we put 0s in the number of
    # elements list
    num_elems_of_traj_2.extend([0] * len(pure_clusters_traj1))
    # Calculate statistics for the remaining clusters
    for i in range(len(pure_clusters_traj1)):
        medoid = pure_clusters_traj1[i].calculate_medoid(condensed_distance_matrix)
        std_devs_from_A.append(get_distance_std_dev_for_elems(pure_clusters_traj1[i].all_elements, medoid, condensed_distance_matrix))
    common.print_and_flush("Done.\n")
    common.print_and_flush("Trying to draw state graph...")
    do_graph(Clustering(purged, sort=False), num_elems_of_traj_2, std_devs_from_A, path)
    common.print_and_flush("Done.\n")
def perform_clustering(self, kwargs):
    """
    Main loop to perform the DBSCAN algorithm.
    """
    elements_class = [PointClassType.UNCLASSIFIED] * self.number_of_elements
    eps = kwargs["eps"]
    minpts = kwargs["minpts"]
    current_cluster_id = PointClassType.NOISE + 1
    for i in range(self.number_of_elements):
        current_element = i
        if elements_class[current_element] == PointClassType.UNCLASSIFIED:
            last_forms_a_cluster = self.__expand_cluster(current_element, current_cluster_id, eps, minpts, elements_class)
            if last_forms_a_cluster:
                current_cluster_id = current_cluster_id + 1
    # Return the clusters once the clustering is done.
    # NOISE elements form a single cluster with ID = PointClassType.NOISE
    # and will be removed from the clustering.
    clusters = gen_clusters_from_class_list(elements_class, skip_list=[PointClassType.NOISE])
    return Clustering(clusters, details="DBSCAN (eps = " + str(eps) + " minpts = " + str(minpts) + ") " + str(self.number_of_elements) + " elems")
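# A minimal usage sketch for the DBSCAN loop above. The class name
# DBSCANAlgorithm and the eps/minpts values are assumptions; only the
# kwargs keys "eps" and "minpts" are taken from the method itself.
#
#   algorithm = DBSCANAlgorithm(condensed_matrix)
#   clustering = algorithm.perform_clustering({"eps": 0.5, "minpts": 4})
#   # Noise elements are skipped, so the resulting clustering may hold
#   # fewer elements than self.number_of_elements.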
def test_classify(self):
    tags = ["A", "B", "C"]
    clusterings = [Clustering([], "this is of type A"),
                   Clustering([], "this is of type B"),
                   Clustering([], "this is of type C"),
                   Clustering([], "this is of type B"),
                   Clustering([], "this is of type S"),
                   Clustering([], "this is of type A"),
                   Clustering([], "this is of type A"),
                   Clustering([], "this is of type C"),
                   Clustering([], "this is of type D")]
    counter = Clustering.classify(tags, clusterings)
    self.assertEqual(counter['A'], 3)
    self.assertEqual(counter['B'], 2)
    self.assertEqual(counter['C'], 2)
def get_best_clustering(results_file):
    """
    Loads and returns the best clustering from a results file.
    """
    results = convert_to_utf8(json.loads(open(results_file).read()))
    best_clustering_id = results["best_clustering"]
    best_clustering_dic = results["selected"][best_clustering_id]
    return Clustering.from_dic(best_clustering_dic["clustering"])
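# A minimal usage sketch, assuming a pyProCT results file such as the
# "results/results.json" files read elsewhere in this code (the path is
# illustrative):
#
#   best = get_best_clustering("results/results.json")
#   print len(best.clusters)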
def perform_clustering(self, kwargs):
    """
    Performs the hierarchical clustering step and the cluster extraction step.
    If the hierarchical matrix is given, it just calculates the clusters for a
    given cutoff. If the algorithm is called a second time, it reuses the last
    matrix.

    It gets a condensed matrix and calculates the clustering. One can use diverse
    methodologies to do this clustering... With preserve_input=False the matrix
    is destroyed while clustering, but it saves memory. The metric is not needed
    in this case, as we are giving the function the calculated matrix. The method
    is the method used to determine distances when fusing clusters. Methods are
    described in:
    http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html
    """
    try:
        cutoff = kwargs["cutoff"]
    except KeyError:
        cutoff = None
    try:
        hie_mat = kwargs["hie_mat"]
    except KeyError:
        hie_mat = None
    try:
        method = kwargs["method"]
    except KeyError:
        method = 'complete'

    if hie_mat is not None:
        self.hie_mat = hie_mat
    elif self.hie_mat is None:
        self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method=method)

    algorithm_details = "Hierarchical with " + method + " method (cutoff = " + str(cutoff) + ")"

    if cutoff is not None:
        # Apply the cutoff to extract a flat clustering from the linkage matrix
        # (note: this does not always behave as expected)
        group_list = hcluster.fcluster(self.hie_mat, cutoff)
        clusters = gen_clusters_from_class_list(group_list)
        return Clustering(clusters, details=algorithm_details)
    else:
        return None
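# A minimal usage sketch for the hierarchical perform_clustering above. The
# class name HierarchicalAlgorithm and the cutoff values are assumptions;
# what the method itself guarantees is that a second call on the same object
# reuses the stored linkage matrix (self.hie_mat).
#
#   algorithm = HierarchicalAlgorithm(condensed_matrix)
#   clustering_a = algorithm.perform_clustering({"cutoff": 1.0, "method": "complete"})
#   clustering_b = algorithm.perform_clustering({"cutoff": 2.0})  # reuses self.hie_mat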
def test_update_medoids(self):
    clusters = [Cluster(None, [1, 2]),
                Cluster(None, [3, 4]),
                Cluster(None, [5])]
    clustering = Clustering(clusters)
    matrix = CondensedMatrix(squared_CH_table1)
    update_medoids(clustering, matrix)
    for c in clusters:
        self.assertNotEqual(c.prototype, None)
    self.assertItemsEqual([c.prototype for c in clusters], [1, 3, 5])
def test_creation(self):
    # The inner list is a copy but shares the cluster objects
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clustering = Clustering(clusters)
    clusters[1].prototype = -20
    self.assertEqual(clusters[1].prototype, clustering.clusters[1].prototype)
def load_clustering(self, parameters):
    best_clustering = {
        "clustering": Clustering.from_dic(parameters["clustering"]["generation"]["parameters"])
    }
    return ("loaded_clustering", {"loaded_clustering": best_clustering}, {}, None)
def test_mean_cluster_size(self):
    clusters = [Cluster(0, [0, 4, 5, 7, 13]),
                Cluster(1, [1, 16, 17, 18]),
                Cluster(2, [2, 3, 8, 19]),
                Cluster(6, [6, 11, 12, 15]),
                Cluster(9, [9, 10, 14])]
    clustering = Clustering(clusters, "Test Clustering")
    analysisPopulator = AnalysisPopulatorMock("")
    self.assertEqual(4, analysisPopulator.analysis_function_mean_cluster_size(clustering))
def test_mini_evaluation(self):
    calculator = MeanMinimumDistanceCalculator(10)
    clusters = [Cluster(None, elements=[0, 1, 2]),
                Cluster(None, elements=[3, 4])]
    triangle = [1., 2., 3., 4., 5., 6., 7., 8., 9., 10.]
    distances = CondensedMatrix(triangle)
    clustering = Clustering(clusters)
    self.assertEqual(7.0, calculator.evaluate(clustering, distances, 20))
def test_gen_class_list(self):
    clusters = (Cluster(16, [16]),
                Cluster(4, [4, 5, 6, 7, 8]),
                Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clusterization = Clustering(clusters)
    class_list = clusterization.gen_class_list()
    expected_class_list = [2, 2, 2, 2, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 3]
    self.assertItemsEqual(class_list, expected_class_list)

    clusters = (Cluster(0, [0, 1, 2, 3]),
                Cluster(9, [9, 10, 11, 12, 13, 14, 15]))
    clusterization = Clustering(clusters)
    class_list = clusterization.gen_class_list()
    expected_class_list = [1, 1, 1, 1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0]
    self.assertItemsEqual(class_list, expected_class_list)
def test_regression_cohesion_eval(self):
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusters = [Cluster(None, elements=[0, 1]),
                Cluster(None, elements=[2]),
                Cluster(None, elements=[3, 4])]
    clustering = Clustering(clusters)
    cohesion_calctor = CohesionCalculator()
    self.assertEqual(cohesion_calctor.evaluate(clustering, distances), 5.5)
def test_getClusterAndComplementary(self):
    clustering = Clustering([Cluster(1, range(5)),
                             Cluster(5, range(5, 10)),
                             Cluster(10, range(10, 20))])
    A, Acomp = get_cluster_and_complementary(1, clustering.clusters)
    A.sort()
    Acomp.sort()
    self.assertItemsEqual(A, [0, 1, 2, 3, 4])
    self.assertItemsEqual(Acomp, [5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19])
def test_PCA(self):
    """
    Regression test.
    """
    trajectory_handler = TrajectoryHandlerStub(testPCAMetric.not_iterposed_coordsets, 66)
    clustering = Clustering([Cluster(None, range(6)),
                             Cluster(None, range(6, 12))],
                            "a clustering")
    pcaMetric = PCAMetric(trajectory_handler)
    self.assertAlmostEqual(pcaMetric.evaluate(clustering), 1.427748687873, 12)
def testSilhouetteSpecialCase(self):
    clustering = Clustering.from_dic(data.clustering_01)
    mh = MatrixHandler({
        "method": "load",
        "parameters": {
            "path": "data/example_clustering_1_matrix"
        }
    })
    s = SilhouetteCoefficientCalculator()
    matrix = mh.create_matrix(None)
    print s.evaluate(clustering, matrix)
def test_evaluation(self):
    clusterings = [
        {
            "clustering": Clustering([Cluster(None, [0, 1, 2, 3]),
                                      Cluster(None, [4, 5])]),
            "result": 3.74
        },
        {
            "clustering": Clustering([Cluster(None, [0, 1]),
                                      Cluster(None, [2, 3]),
                                      Cluster(None, [4, 5])]),
            "result": 3.705
        },
        {
            "clustering": Clustering([Cluster(None, [0, 1]),
                                      Cluster(None, [2]),
                                      Cluster(None, [3]),
                                      Cluster(None, [4, 5])]),
            "result": 2.91
        },
    ]
    calculator = CalinskiHarabaszCalculator()
    matrix = CondensedMatrix(CH_table1)
    for i in range(len(clusterings)):
        self.assertAlmostEqual(clusterings[i]["result"],
                               calculator.evaluate(clusterings[i]["clustering"], matrix),
                               2)
def repartition_with_kmedoids(cls, initial_cluster, k, submatrix):
    partitioned_clustering = cls.KMedoidsAlgorithmClass(submatrix).perform_clustering({
        "k": k,
        "seeding_type": "RANDOM",
        "tries": 10
    })
    remapped_clusters = []
    for partitioned_cluster in partitioned_clustering.clusters:
        remapped_clusters.append(cls.redefine_cluster_with_map(initial_cluster, partitioned_cluster))
    return Clustering(remapped_clusters)
def test_batch_load(self):
    clusters = ((Cluster(16, [16]), os.path.join(test_data.__path__[0], "training_clustering_1.bin")),
                (Cluster(4, [4, 5, 6, 7, 8]), os.path.join(test_data.__path__[0], "training_clustering_2.bin")),
                (Cluster(0, [0, 1, 2, 3]), os.path.join(test_data.__path__[0], "training_clustering_3.bin")),
                (Cluster(9, [9, 10, 11, 12, 13, 14, 15]), os.path.join(test_data.__path__[0], "training_clustering_4.bin")))
    # Create 4 clusterings of 1 cluster each
    filenames = []
    for cluster, filename in clusters:
        Clustering([cluster]).save_to_disk(filename)
        filenames.append(filename)
    # Then load them and extract their elements
    elements = []
    for filename in filenames:
        elements.extend(Clustering.load_from_disk(filename).get_all_clustered_elements())
    elements_batch = []
    clusterings_batch = Clustering.load_all_from_directory(test_data.__path__[0])
    for clustering, filename in clusterings_batch:
        elements_batch.extend(clustering.get_all_clustered_elements())
    # And clean the house
    for filename in filenames:
        os.remove(filename)
    numpy.testing.assert_equal(sorted(elements), range(17))
    numpy.testing.assert_equal(sorted(elements_batch), range(17))
def test_one_clusterization_silhouette(self):
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusters_1 = [Cluster(None, elements=[0, 1]),
                  Cluster(None, elements=[2]),
                  Cluster(None, elements=[3, 4])]
    clusterization_1 = Clustering(clusters_1)
    sil_calc = SilhouetteCoefficientCalculator()
    expected = [0.5, 0.80000000000000004, -0.55000000000000004, -0.45000000000000001, 0.7142857142857143]
    self.assertItemsEqual(sil_calc._SilhouetteCoefficientCalculator__one_clusterization_partial_silhouette(clusterization_1, distances), expected)
def test_regression_separation_eval(self):
    distances = CondensedMatrix([1., 2., 3., 4., 5., 6., 7., 8., 9., 10.])
    clusters = [Cluster(None, elements=[0, 1]),
                Cluster(None, elements=[2]),
                Cluster(None, elements=[3, 4])]
    clustering = Clustering(clusters)
    sep_calctor = SeparationCalculator()
    self.assertEqual(sep_calctor.evaluate(clustering, distances, [1, 1, 1]),
                     27.0 + 24.0 + 37.0)
    self.assertEqual(sep_calctor.evaluate(clustering, distances),
                     (1 / 0.5) * 27.0 + (1 / 5.0) * 37.0)
def get_best_clustering(self, parameters):
    best_clustering = None
    clustering_results = None
    ##############################
    # Load the clustering
    ##############################
    if parameters["clustering"]["generation"]["method"] == "load":
        best_clustering = {"clustering": Clustering.from_dic(parameters["clustering"]["generation"])}
    ##############################
    # Or generate it
    ##############################
    elif parameters["clustering"]["generation"]["method"] == "generate":
        clustering_results = ClusteringProtocol(self.timer, self.observer).run(parameters,
                                                                               self.matrixHandler,
                                                                               self.workspaceHandler,
                                                                               self.trajectoryHandler)
        abort = False
        if clustering_results is not None:
            best_clustering_id, selected, not_selected, scores = clustering_results  # @UnusedVariable
            #################################
            # Abort if no clusters were found
            #################################
            if best_clustering_id is None:
                abort = True
            else:
                best_clustering = selected[best_clustering_id]
        else:
            abort = True

        if abort:
            self.notify("SHUTDOWN", "Unproductive clustering search. Relax evaluation constraints.")
            print "[FATAL Driver:get_best_clustering] Unproductive clustering search. Exiting..."
            exit()
    return best_clustering, clustering_results
parameters = ProtocolParameters.get_params_from_json(script_str)
# And change other hypothesis-related settings
parameters["clustering"]["evaluation"]["maximum_noise"] = data.noise[dataset_name]
parameters["clustering"]["evaluation"]["minimum_cluster_size"] = data.minsize[dataset_name]
parameters["clustering"]["evaluation"]["minimum_clusters"] = data.num_cluster_ranges[dataset_name][0]
parameters["clustering"]["evaluation"]["maximum_clusters"] = data.num_cluster_ranges[dataset_name][1]
print parameters["clustering"]["evaluation"]["minimum_clusters"], parameters["clustering"]["evaluation"]["maximum_clusters"]

if dataset_name in data.criteria:
    parameters["clustering"]["evaluation"]["evaluation_criteria"] = data.criteria[dataset_name]
else:
    parameters["clustering"]["evaluation"]["evaluation_criteria"] = data.criteria["default"]

Driver(Observer()).run(parameters)

for dataset_name in ['concentric_circles']:  # data.all_datasets
    results_file = os.path.join(os.path.abspath("./tmp/%s" % dataset_name), "results/results.json")
    results = convert_to_utf8(json.loads(open(results_file).read()))
    best = results["best_clustering"]
    clustering = Clustering.from_dic(results["selected"][best]["clustering"])
    vtools.show_2D_dataset_clusters(all_observations[dataset_name],
                                    clustering,
                                    scale=20,
                                    margin=20).save("clustering_images/%s.jpg" % dataset_name, "JPEG")
    print dataset_name, results["selected"][best]["type"], results["selected"][best]["clustering"]["number_of_clusters"], results["selected"][best]["evaluation"]["Noise level"],  # results["selected"][best]["parameters"]
    # Look for the best criteria
    criteria_scores = []
    for criteria in results["scores"]:
        criteria_scores.append((results["scores"][criteria][best], criteria))
    print criteria_scores

print "\nDone"
# RCD_script = copy.deepcopy(template_script)
# RCD_script["global"]["workspace"]["base"] = os.path.join("RDCvsRMSD", "campari", "RDC", "clustering")
# RCD_script["data"]["matrix"]["method"] = "load"
# RCD_script["data"]["matrix"]["parameters"]["path"] = os.path.join("RDCvsRMSD", "campari", "RDC", "matrix")
# RCD_script["data"]["files"].append(os.path.join("RDCvsRMSD", "campari.pdb"))
#
# tools.save_dic_in_json(RCD_script, os.path.join("RDCvsRMSD", "campari", "RDC", "script.json"))
# tools.save_dic_in_json(RMSD_script, os.path.join("RDCvsRMSD", "campari", "RMSD", "script.json"))
#
# os.system("python %s %s " % (PYPROCT, os.path.join("RDCvsRMSD", "campari", "RDC", "script.json")))
# os.system("python %s %s " % (PYPROCT, os.path.join("RDCvsRMSD", "campari", "RMSD", "script.json")))

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "RDC_refined", "clustering", "results", "results.json"))
RDC_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements=5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "RMSD_refined", "clustering", "results", "results.json"))
RMSD_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements=5926)

results = tools.load_dic_in_json(os.path.join("RDCvsRMSD", "campari", "Dihedral", "clustering", "results", "results.json"))
Dihedral_clustering = Clustering.from_dic(results["selected"][results["best_clustering"]]["clustering"]).gen_class_list(number_of_elements=5926)
Dihedral_bad_score = Clustering.from_dic(results["selected"]["clustering_0098"]["clustering"]).gen_class_list(number_of_elements=5926)
Dihedral_medium_score = Clustering.from_dic(results["selected"]["clustering_0056"]["clustering"]).gen_class_list(number_of_elements=5926)
Dihedral_fairly_good_score = Clustering.from_dic(results["selected"]["clustering_0212"]["clustering"]).gen_class_list(number_of_elements=5926)
fig = plt.figure()
ax = fig.gca(projection='3d')

# Plot protein
pdb = prody.parsePDB(params["data"]["files"][0])
if options.show_protein:
    pdb_backbone = pdb.select("name CA").getCoordsets()[0]  # "backbone not hetero"
    ax.plot(pdb_backbone.T[0], pdb_backbone.T[1], pdb_backbone.T[2])

# Get geometric centers and plot ligands
ligand_coords = pdb.select(params["data"]["matrix"]["parameters"]["body_selection"]).getCoordsets()

# Get clustering
if options.clustering_to_see is None:
    options.clustering_to_see = results["best_clustering"]
try:
    clustering = Clustering.from_dic(results["selected"][options.clustering_to_see]["clustering"])
    # Print some info
    print_cluster_info("selected", options.clustering_to_see, results)
except KeyError:
    clustering = Clustering.from_dic(results["not_selected"][options.clustering_to_see]["clustering"])
    # Print some info
    print_cluster_info("not_selected", options.clustering_to_see, results)

# Show all clusters
colors = iter(cm.rainbow(numpy.linspace(0, 1, len(clustering.clusters))))
for cluster in clustering.clusters:
    centers = []
    for i, element in enumerate(cluster.all_elements):
        if options.stride is None or i % options.stride == 0:
            coords = ligand_coords[element]
            centers.append(coords.mean(0))
#--------------------------------
# Prepare the clustering for this drug/protein pair
#--------------------------------
## Load template and modify its contents for this case
CLUSTERING_PATH = os.path.join(RESULTS_PATH, "%s_%s_clustering" % (options.drug, options.protein))
MAX_CLUSTERS = 10
SCRIPT_PATH = os.path.join(RESULTS_PATH, "clustering.json")
OUT_FILE = os.path.join(RESULTS_PATH, "clustering.out")
script = load_dic_in_json(options.template)
script["global"]["workspace"]["base"] = CLUSTERING_PATH
script["data"]["files"].append(FILTERED_PDB_FILE)
script["clustering"]["evaluation"]["maximum_clusters"] = MAX_CLUSTERS
save_dic_in_json(script, SCRIPT_PATH)
os.system("python -m pyproct.main %s > %s" % (SCRIPT_PATH, OUT_FILE))
best_clustering = Clustering.from_dic(get_best_clustering(CLUSTERING_PATH)["clustering"])

#--------------------------------
# Now calculate the values
#--------------------------------
results = {}
for cluster in best_clustering.clusters:
    energies = metrics[1][cluster.all_elements]
    distances = metrics[0][cluster.all_elements]
    results[cluster.id] = {}
    results[cluster.id]["max_energy"] = numpy.max(energies)
    results[cluster.id]["min_energy"] = numpy.min(energies)
    results[cluster.id]["mean_energy"] = numpy.mean(energies)
    results[cluster.id]["mean_distance"] = numpy.mean(distances)
    results[cluster.id]["population"] = len(cluster.all_elements)
    for j in range(0, N):
        if (i, j) in cluster.percents:
            data[str(i)].append(cluster.percents[(i, j)])
        else:
            data[str(i)].append(0)
    return data

if __name__ == '__main__':
    results = convert_to_utf8(json.loads(open(sys.argv[1]).read()))
    best_clustering_id = results["best_clustering"]
    best_clustering_dic = results["selected"][best_clustering_id]
    num_clusters = best_clustering_dic["clustering"]["number_of_clusters"]
    clustering = Clustering.from_dic(best_clustering_dic["clustering"])
    file_frames = int(sys.argv[2])

    # Generate a map element -> interpolation
    index_to_interpolation = {}
    acc = 0
    for i in range(0, file_frames - 1):
        for j in range(i + 1, file_frames):
            for k in range(20):
                index_to_interpolation[acc] = (i, j)
                acc += 1

    for cluster in clustering.clusters:
        colors = iter(cm.rainbow(np.linspace(0, 1, N)))
        theta = radar_factory(N, frame='polygon')