def test_get_intra_cluster_distances(self):
    """
    Checks get_intra_cluster_distances against known values: first for two
    clusters over the CH_table1 matrix, then for the sum of the distances of
    a five-element cluster over a small hand-written matrix.
    """
    ch_matrix = CondensedMatrix(CH_table1)

    # Two-element cluster: exactly one internal distance.
    pair_distances = get_intra_cluster_distances(Cluster(None, [4,5]), ch_matrix)
    numpy.testing.assert_almost_equal(pair_distances, [2.4494897427831779], 5)

    # Three-element cluster: three internal distances.
    triplet_distances = get_intra_cluster_distances(Cluster(None, [1,3,5]), ch_matrix)
    numpy.testing.assert_almost_equal(
        triplet_distances,
        [2.4494897427831779, 3.8729833462074170, 3.8729833462074170],
        5)

    # Hand-written condensed matrix: only the sum of the distances is checked.
    small_matrix = CondensedMatrix(
        [0.0, 1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0])
    expected_distance = 4
    whole_cluster = Cluster(None, range(5))
    total_distance = numpy.sum(
        get_intra_cluster_distances(whole_cluster, small_matrix))
    self.assertEqual(expected_distance, total_distance)
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The first row holds an empty cell followed by the id of every cluster.
    Each following row describes one cluster: its first cell is
    "<id>(<value>)", where the value is the maximum of the cluster's
    intra-cluster distances (0 when there are none); then the cells up to and
    including the diagonal are left empty and the remaining cells hold the
    distance between this cluster's prototype and the prototype of each later
    cluster.

    @param best_clustering: Object whose 'clusters' attribute holds the
    clusters to describe (each one needs 'id' and 'prototype').
    @param matrix: Distance matrix, indexed as matrix[i, j] and also handed to
    get_intra_cluster_distances.
    @param parameters: Object with a get_value method; its "file" entry is the
    base name of the output file ("per_cluster_stats" by default).
    @param results_folder: Folder where the CSV file is created.

    @return: The path of the written file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)
    clusters = best_clustering.clusters

    # The context manager closes the file even if a distance calculation or
    # a write fails midway.
    with open(stats_file_path, "w") as stats_file:
        header_cells = [""] + ["%s" % cluster.id for cluster in clusters]
        stats_file.write(",".join(header_cells) + "\n")

        for i, cluster_i in enumerate(clusters):
            intra_distances = get_intra_cluster_distances(cluster_i, matrix)
            # An empty list means that the cluster has no internal distances.
            radius = max(intra_distances) if intra_distances != [] else 0.

            cells = ["%s(%.2f)" % (cluster_i.id, radius)]
            # Lower triangle and diagonal stay empty.
            cells.extend([""] * (i + 1))
            cells.extend(
                "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                for cluster_j in clusters[i + 1:])
            stats_file.write(",".join(cells) + "\n")

    return stats_file_path
def min_intracluster_distances(cls, clustering, matrix):
    """
    Obtains d_min: the smallest of the intra-cluster distances found in any
    of the clusters of the clustering.

    @param clustering: The clustering being checked.
    @param matrix: The condensed matrix containing all distances.

    @return: d_min's value
    """
    per_cluster_minima = []
    for cluster in clustering.clusters:
        internal_distances = get_intra_cluster_distances(cluster, matrix)
        per_cluster_minima.append(numpy.min(internal_distances))
    return numpy.min(per_cluster_minima)
def ch_cluster_term(cls, cluster, global_mean_distance, matrix):
    """
    Evaluates the term (ng-1)(D-d_g) of the formula for one cluster, where
    d_g is the mean of the squared intra-cluster distances.

    @param cluster: The cluster to use in calculation.
    @param global_mean_distance: 'D'. Is the mean of the n*(n-1)/2 distances
    of all the elements.
    @param matrix: The condensed matrix containing all distances.

    @return: Calculated term.
    """
    n = len(cluster.all_elements)

    # d_g: mean of the squared internal distances of the cluster
    internal_distances = numpy.array(get_intra_cluster_distances(cluster, matrix))
    cluster_mean_distance = mean(internal_distances**2)

    return (n - 1) * (global_mean_distance - cluster_mean_distance)
def WGSS(cls, clusters, matrix):
    """
    "Within group sum of squares" as described by C-H: half of the sum, over
    all clusters, of (n-1) times the mean squared intra-cluster distance,
    n being the number of elements of the cluster.

    @param clusters: An array with all clusters description (usually
    Clustering.clusters)
    @param matrix: The condensed matrix containing all distances.

    @return: The value of WGSS.
    """
    accumulated = 0
    for cluster in clusters:
        size = len(cluster.all_elements)
        squared_distances = numpy.array(
            get_intra_cluster_distances(cluster, matrix))**2
        mean_squared_distance = mean(squared_distances)
        accumulated += (size - 1) * mean_squared_distance
    return accumulated * 0.5
def WGSS(cls, clusters, matrix):
    """
    Computes the C-H "Within group sum of squares".

    Every cluster contributes (n-1) times the mean of its squared
    intra-cluster distances (n is its number of elements); the result is half
    of the sum of those contributions.

    :param clusters: An array with all clusters description (usually
        Clustering.clusters)
    :param matrix: The condensed matrix containing all distances.

    :return: The value of WGSS.
    """
    total = 0
    for current in clusters:
        num_elements = len(current.all_elements)
        distances = numpy.array(get_intra_cluster_distances(current, matrix))
        contribution = (num_elements - 1) * mean(distances**2)
        total += contribution
    return total * 0.5
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The first row holds an empty cell followed by the id of every cluster.
    Each following row describes one cluster: its first cell is
    "<id>(d: <diameter> r: <radius>)", then the cells up to and including the
    diagonal are left empty and the remaining cells hold the distance between
    this cluster's prototype and the prototype of each later cluster.

    The diameter is the maximum intra-cluster distance; the radius is the
    maximum of the values returned by get_distances_of_elements_to for the
    cluster prototype and the cluster elements. Both are 0 when a
    SingularClusterException is raised while calculating them.

    :param best_clustering: Object whose 'clusters' attribute holds the
        clusters to describe (each one needs 'id', 'prototype' and
        'all_elements'). It is handed to update_medoids before the
        statistics are calculated.
    :param matrix: Distance matrix, indexed as matrix[i, j] and also handed
        to the distance helpers.
    :param parameters: Object with a get_value method; its "file" entry is
        the base name of the output file ("per_cluster_stats" by default).
    :param results_folder: Folder where the CSV file is created.

    :return: The path of the written file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # The context manager closes the file even if a calculation or a write
    # fails midway.
    with open(stats_file_path, "w") as stats_file:
        header_cells = [""] + ["%s" % cluster.id
                               for cluster in best_clustering.clusters]
        stats_file.write(",".join(header_cells) + "\n")

        # TODO: Once clusterings and clusters become immutable its medoids will
        # be always updated, then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        # Read the clusters again after the medoid update.
        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(
                    cluster_i.prototype, cluster_i.all_elements, matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                diameter = 0
                radius = 0

            # The row is built after (not in a 'finally' of) the try block, so
            # that an unexpected error propagates untouched instead of writing
            # a row with undefined or stale values.
            cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Lower triangle and diagonal stay empty.
            cells.extend([""] * (i + 1))
            cells.extend(
                "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                for cluster_j in clusters[i + 1:])
            stats_file.write(",".join(cells) + "\n")

    return stats_file_path
def ch_cluster_term(cls, cluster, global_mean_distance, matrix):
    """
    Evaluates the term (ng-1)(D-d_g) of the formula for one cluster, where
    d_g is the mean of the squared intra-cluster distances (taken as 0 when
    a SingularClusterException is raised while calculating it).

    :param cluster: The cluster to use in calculation.
    :param global_mean_distance: 'D'. Is the mean of the n*(n-1)/2 distances
        of all the elements.
    :param matrix: The condensed matrix containing all distances.

    :return: Calculated term.
    """
    n = len(cluster.all_elements)

    # d_g: mean of the squared internal distances of the cluster
    try:
        internal_distances = numpy.array(
            get_intra_cluster_distances(cluster, matrix))
        cluster_mean_distance = mean(internal_distances**2)
    except SingularClusterException:
        cluster_mean_distance = 0

    return (n - 1) * (global_mean_distance - cluster_mean_distance)
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with per-cluster statistics and returns its path.

    The first row holds an empty cell followed by the id of every cluster.
    Each following row describes one cluster: its first cell is
    "<id>(d: <diameter> r: <radius>)", then the cells up to and including the
    diagonal are left empty and the remaining cells hold the distance between
    this cluster's prototype and the prototype of each later cluster.

    The diameter is the maximum intra-cluster distance; the radius is the
    maximum of the values returned by get_distances_of_elements_to for the
    cluster prototype and the cluster elements. Both are 0 when a
    SingularClusterException is raised while calculating them.

    @param best_clustering: Object whose 'clusters' attribute holds the
    clusters to describe (each one needs 'id', 'prototype' and
    'all_elements'). It is handed to update_medoids before the statistics are
    calculated.
    @param matrix: Distance matrix, indexed as matrix[i, j] and also handed to
    the distance helpers.
    @param parameters: Object with a get_value method; its "file" entry is the
    base name of the output file ("per_cluster_stats" by default).
    @param results_folder: Folder where the CSV file is created.

    @return: The path of the written file.
    """
    file_name = parameters.get_value("file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    # Using 'with' guarantees that the file gets closed, even when one of the
    # calculations or writes below raises.
    with open(stats_file_path, "w") as stats_file:
        header_cells = [""]
        header_cells.extend("%s" % cluster.id for cluster in best_clustering.clusters)
        stats_file.write(",".join(header_cells) + "\n")

        # TODO: Once clusterings and clusters become immutable its medoids will
        # be always updated, then this kind of operations will be unnecessary
        update_medoids(best_clustering, matrix)
        #----------------------------------------

        # The clusters are fetched after the medoid update.
        clusters = best_clustering.clusters
        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(
                    cluster_i.prototype, cluster_i.all_elements, matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                diameter = 0
                radius = 0

            # Row creation is not placed in a 'finally' clause: if any other
            # exception is raised above it must propagate as is, without
            # trying to format undefined (or stale) diameter/radius values.
            row_cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Empty cells for the lower triangle and the diagonal.
            row_cells.extend([""] * (i + 1))
            row_cells.extend(
                "%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                for cluster_j in clusters[i + 1:])
            stats_file.write(",".join(row_cells) + "\n")

    return stats_file_path
def min_intracluster_distances(cls, clustering, matrix):
    """
    Obtains d_min: the smallest of the per-cluster minimum internal
    distances, where a cluster for which SingularClusterException is raised
    contributes a 0.

    @param clustering: The clustering being checked.
    @param matrix: The condensed matrix containing all distances.

    @return: d_min's value
    """
    per_cluster_minima = []
    for cluster in clustering.clusters:
        try:
            smallest = numpy.min(get_intra_cluster_distances(cluster, matrix))
        except SingularClusterException:
            # By convention a singular cluster gets 0 as its distance, which
            # also keeps the min functions from failing.
            smallest = 0
        per_cluster_minima.append(smallest)
    return numpy.min(per_cluster_minima)
def min_intracluster_distances(cls, clustering, matrix):
    """
    Computes d_min, the minimum internal distance over all the clusters.
    Clusters for which SingularClusterException is raised count as 0.

    @param clustering: The clustering being checked.
    @param matrix: The condensed matrix containing all distances.

    @return: d_min's value
    """
    minima = []
    for current in clustering.clusters:
        try:
            internal_distances = get_intra_cluster_distances(current, matrix)
            cluster_minimum = numpy.min(internal_distances)
        except SingularClusterException:
            # Convention: the distance of a one-element cluster is 0. Adding
            # it here means that no min function will fail.
            minima.append(0)
        else:
            minima.append(cluster_minimum)
    return numpy.min(minima)