def analyze_clustering(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Writes the clustering-wide measurements into 'analysis'.

    Keys written: "total_num_clusters", "total_num_elements", "overlap", "mixed_overlap" and,
    for every cluster type found, "num_<type>" and "num_<type>_elements".

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    It must hold a "mixed" entry, as that entry is read unconditionally.
    @param distance_matrix: Handed over to OverlapCalculator.calculate_clustering_overlap.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    analysis["total_num_clusters"] = 0
    analysis["total_num_elements"] = 0

    # Overlap of the whole clustering.
    every_cluster = mergeSeparatedClusters(separated_decomposed_clusters)
    analysis["overlap"] = OverlapCalculator.calculate_clustering_overlap(every_cluster, distance_matrix)

    # Overlap restricted to the clusters stored under the "mixed" type.
    only_mixed = mergeSeparatedClusters({"mixed": separated_decomposed_clusters["mixed"]})
    analysis["mixed_overlap"] = OverlapCalculator.calculate_clustering_overlap(only_mixed, distance_matrix)

    for cluster_type in separated_decomposed_clusters:
        clusters_of_type = separated_decomposed_clusters[cluster_type]
        clusters_key = "num_" + cluster_type
        elements_key = "num_" + cluster_type + "_elements"

        analysis[clusters_key] = len(clusters_of_type)
        analysis["total_num_clusters"] += analysis[clusters_key]

        cluster_sizes = [len(getAllElements(clusters_of_type[dc_id])) for dc_id in clusters_of_type]
        analysis[elements_key] = numpy.sum(cluster_sizes)
        analysis["total_num_elements"] += analysis[elements_key]
def analyze_clustering(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Performs the overlap analysis of a clustering (calculates global measurements).

    Keys written into 'analysis': "total_num_clusters", "total_num_elements", "overlap" and,
    for every cluster type found, "num_<type>" and "num_<type>_elements".

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to OverlapCalculator.calculate_global_overlap.
    @param analysis: Dictionary updated in place.

    @return: The last cluster type that was iterated, or None if there was no cluster type at all.
    """
    analysis["total_num_clusters"] = 0
    analysis["total_num_elements"] = 0
    # The trailing 2 and 1 are the 'cluster_method' and 'global_method' arguments of the callee.
    analysis["overlap"] = OverlapCalculator.calculate_global_overlap(
        mergeSeparatedClusters(separated_decomposed_clusters), distance_matrix, 2, 1)

    # Bound before the loop so that the final 'return' cannot fail with an unbound
    # local when 'separated_decomposed_clusters' is empty.
    cluster_type = None
    for cluster_type in separated_decomposed_clusters:
        clusters_of_type = separated_decomposed_clusters[cluster_type]
        clusters_key = "num_" + cluster_type
        elements_key = "num_" + cluster_type + "_elements"

        analysis[clusters_key] = len(clusters_of_type)
        analysis["total_num_clusters"] += analysis[clusters_key]

        analysis[elements_key] = numpy.sum(
            [len(getAllElements(clusters_of_type[dc_id])) for dc_id in clusters_of_type])
        analysis["total_num_elements"] += analysis[elements_key]

    # NOTE(review): the original indentation was lost; the return is assumed to sit after
    # the loop (returning inside it would stop after the first cluster type) - confirm.
    return cluster_type
def calculate_global_overlap(cls, decomposed_clusters, distance_matrix, cluster_method, global_method):
    """
    Calculates the size-weighted mean of the per-cluster overlaps and returns its complement.

    @param decomposed_clusters: Collection of decomposed clusters; it is traversed twice.
    @param distance_matrix: Handed over to 'calculate_cluster_overlap'.
    @param cluster_method: Method selector handed over to 'calculate_cluster_overlap'.
    @param global_method: Not read by this implementation.

    @return: 1 - sum(overlap_i * size_i) / sum(size_i)
    """
    sys.stdout.flush()

    # First pass: one overlap value per cluster.
    overlap_values = []
    for decomposed_cluster in decomposed_clusters:
        overlap_values.append(
            cls.calculate_cluster_overlap(cluster_method, decomposed_cluster, distance_matrix))
    cluster_overlaps = numpy.array(overlap_values)

    # Second pass: number of elements of each cluster, used as weights.
    size_values = []
    for decomposed_cluster in decomposed_clusters:
        size_values.append(len(getAllElements(decomposed_cluster)))
    cluster_sizes = numpy.array(size_values)

    global_size = numpy.sum(cluster_sizes)
    weighted_total = numpy.sum(cluster_overlaps * cluster_sizes)
    overlap = weighted_total / float(global_size)

    # An overlap of 0 is the best one and 1 the worst; the value is reversed so that
    # it is easier to interpret.
    return 1 - overlap
def analyze_clusters(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Stores in 'analysis' one report per cluster, indexed by the cluster id.

    Each report holds "components" (the keys of the decomposed cluster) and "global", which
    contains the "mean", "std", "max" and "num_elements" of the whole cluster plus one
    sub-dictionary with the same four entries for each of its components. Clusters of type
    "mixed" also get "centers_mean_diff" and the inverted overlap under "global".

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to the distance / overlap helpers.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    for cluster_type in separated_decomposed_clusters:
        clusters_of_type = separated_decomposed_clusters[cluster_type]
        for cluster_id in clusters_of_type:
            decomposed_cluster = clusters_of_type[cluster_id]

            global_stats = {}
            analysis[cluster_id] = {
                "components": decomposed_cluster.keys(),
                "global": global_stats
            }

            # Statistics over all the elements of the cluster.
            whole_cluster_stats = calculate_distance_stats(
                getAllElements(decomposed_cluster), distance_matrix)
            global_stats["mean"], global_stats["std"], global_stats["max"] = whole_cluster_stats
            global_stats["num_elements"] = len(getAllElements(decomposed_cluster))

            # Same statistics, this time for each component on its own.
            for traj_id in decomposed_cluster:
                component_stats = {}
                global_stats[traj_id] = component_stats
                per_component = calculate_distance_stats(decomposed_cluster[traj_id], distance_matrix)
                component_stats["mean"], component_stats["std"], component_stats["max"] = per_component
                component_stats["num_elements"] = len(decomposed_cluster[traj_id])

            if cluster_type == "mixed":
                analysis[cluster_id]["centers_mean_diff"] = calculate_mean_center_differences(
                    decomposed_cluster, distance_matrix)
                # NOTE(review): the original indentation was lost; the overlap is assumed to
                # be computed for "mixed" clusters only - confirm.
                # The overlap ranges between 0 and 1, 0 being the best value. It is inverted
                # to get a more understandable range (1 is the best value and 0 the worst).
                raw_overlap = OverlapCalculator.calculate_cluster_overlap(
                    decomposed_cluster, distance_matrix)
                global_stats["overlap"] = 1 - raw_overlap
def calculate_clustering_overlap(cls, decomposed_clusters, distance_matrix):
    """
    Calculates the size-weighted mean of the per-cluster overlaps and returns its complement.

    @param decomposed_clusters: Collection of decomposed clusters; it is traversed twice.
    @param distance_matrix: Handed over to 'calculate_cluster_overlap'.

    @return: 1 - (1 / total_elements) * dot(cluster_overlaps, cluster_sizes)
    """
    # First pass: one overlap value per cluster.
    overlap_values = []
    for decomposed_cluster in decomposed_clusters:
        overlap_values.append(cls.calculate_cluster_overlap(decomposed_cluster, distance_matrix))
    cluster_overlaps = numpy.array(overlap_values)

    # Second pass: number of elements of each cluster, used as weights.
    size_values = []
    for decomposed_cluster in decomposed_clusters:
        size_values.append(len(getAllElements(decomposed_cluster)))
    cluster_sizes = numpy.array(size_values)

    num_elements = numpy.sum(cluster_sizes)
    normalization = 1. / num_elements
    clustering_overlap = normalization * numpy.dot(cluster_overlaps, cluster_sizes)

    # An overlap of 0 is the best one and 1 the worst; the value is reversed so that
    # it is easier to interpret.
    return 1 - clustering_overlap
def analyze_clustering(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Computes the global measurements of a clustering and records them in 'analysis'.

    Recorded keys: "total_num_clusters", "total_num_elements", "overlap", "mixed_overlap"
    plus "num_<type>" and "num_<type>_elements" for each cluster type.

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    The "mixed" entry is read unconditionally, so it has to exist.
    @param distance_matrix: Handed over to OverlapCalculator.calculate_clustering_overlap.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    analysis["total_num_clusters"] = 0
    analysis["total_num_elements"] = 0

    analysis["overlap"] = OverlapCalculator.calculate_clustering_overlap(
        mergeSeparatedClusters(separated_decomposed_clusters),
        distance_matrix)

    # Same measurement, but only for the clusters filed under the "mixed" type.
    mixed_subset = {"mixed": separated_decomposed_clusters["mixed"]}
    analysis["mixed_overlap"] = OverlapCalculator.calculate_clustering_overlap(
        mergeSeparatedClusters(mixed_subset),
        distance_matrix)

    for cluster_type in separated_decomposed_clusters:
        num_clusters_key = "num_" + cluster_type
        num_elements_key = "num_" + cluster_type + "_elements"
        typed_clusters = separated_decomposed_clusters[cluster_type]

        analysis[num_clusters_key] = len(typed_clusters)
        analysis["total_num_clusters"] += analysis[num_clusters_key]

        element_counts = []
        for dc_id in typed_clusters:
            element_counts.append(len(getAllElements(typed_clusters[dc_id])))
        analysis[num_elements_key] = numpy.sum(element_counts)
        analysis["total_num_elements"] += analysis[num_elements_key]
def analyze_clusters(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Performs the per-cluster analysis and stores one report per cluster id in 'analysis'.

    Each report holds "components" (the keys of the decomposed cluster) and "global", which
    contains the "mean", "std", "max" and "num_elements" of the whole cluster plus one
    sub-dictionary with the same four entries for each of its components. Clusters of type
    "mixed" also get "centers_mean_diff" and, under "global", the "overlap" calculated
    with method 2.

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to the distance / overlap helpers.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    for cluster_type in separated_decomposed_clusters:
        clusters_of_type = separated_decomposed_clusters[cluster_type]
        for cluster_id in clusters_of_type:
            decomposed_cluster = clusters_of_type[cluster_id]

            global_stats = {}
            analysis[cluster_id] = {"components": decomposed_cluster.keys(), "global": global_stats}

            # Statistics over all the elements of the cluster.
            mean, std, maximum = calculate_distance_stats(
                getAllElements(decomposed_cluster), distance_matrix)
            global_stats["mean"] = mean
            global_stats["std"] = std
            global_stats["max"] = maximum
            global_stats["num_elements"] = len(getAllElements(decomposed_cluster))

            # Same statistics, this time for each component on its own.
            for traj_id in decomposed_cluster:
                component_stats = {}
                global_stats[traj_id] = component_stats
                mean, std, maximum = calculate_distance_stats(
                    decomposed_cluster[traj_id], distance_matrix)
                component_stats["mean"] = mean
                component_stats["std"] = std
                component_stats["max"] = maximum
                component_stats["num_elements"] = len(decomposed_cluster[traj_id])

            if cluster_type == "mixed":
                analysis[cluster_id]["centers_mean_diff"] = calculate_mean_center_differences(
                    decomposed_cluster, distance_matrix)
                # NOTE(review): the original indentation was lost; the overlap is assumed to
                # be computed for "mixed" clusters only - confirm.
                global_stats["overlap"] = OverlapCalculator.calculate_cluster_overlap(
                    2, decomposed_cluster, distance_matrix)
def analyze_clusters(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Builds, for every cluster, a report that is stored in 'analysis' under the cluster id.

    Report layout:
        "components": the keys of the decomposed cluster.
        "global": "mean", "std", "max" and "num_elements" of the whole cluster, one nested
                  dictionary (same four entries) per component and, for "mixed" clusters,
                  the "overlap" calculated with method 2.
        "centers_mean_diff": only for clusters of type "mixed".

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to the distance / overlap helpers.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    for cluster_type in separated_decomposed_clusters:
        is_mixed = cluster_type == "mixed"
        for cluster_id in separated_decomposed_clusters[cluster_type]:
            decomposed_cluster = separated_decomposed_clusters[cluster_type][cluster_id]

            report = {"components": decomposed_cluster.keys(), "global": {}}
            analysis[cluster_id] = report
            summary = report["global"]

            # Whole-cluster statistics.
            summary["mean"], summary["std"], summary["max"] = calculate_distance_stats(
                getAllElements(decomposed_cluster), distance_matrix)
            summary["num_elements"] = len(getAllElements(decomposed_cluster))

            # Per-component statistics.
            for traj_id in decomposed_cluster:
                summary[traj_id] = {}
                component = summary[traj_id]
                component["mean"], component["std"], component["max"] = calculate_distance_stats(
                    decomposed_cluster[traj_id], distance_matrix)
                component["num_elements"] = len(decomposed_cluster[traj_id])

            if not is_mixed:
                continue

            report["centers_mean_diff"] = calculate_mean_center_differences(
                decomposed_cluster, distance_matrix)
            # NOTE(review): the original indentation was lost; the overlap is assumed to
            # be computed for "mixed" clusters only - confirm.
            summary["overlap"] = OverlapCalculator.calculate_cluster_overlap(
                2, decomposed_cluster, distance_matrix)
def calculate_clustering_overlap(cls, decomposed_clusters, distance_matrix):
    """
    Combines the overlap of every cluster into one value for the whole clustering.

    Each cluster overlap is weighted by the number of elements of its cluster; the weighted
    mean is then subtracted from 1.

    @param decomposed_clusters: Collection of decomposed clusters; it is traversed twice.
    @param distance_matrix: Handed over to 'calculate_cluster_overlap'.

    @return: The complement (1 - x) of the size-weighted mean overlap.
    """
    per_cluster_overlap = [
        cls.calculate_cluster_overlap(cluster, distance_matrix)
        for cluster in decomposed_clusters
    ]
    per_cluster_size = [
        len(getAllElements(cluster))
        for cluster in decomposed_clusters
    ]

    overlaps = numpy.array(per_cluster_overlap)
    sizes = numpy.array(per_cluster_size)

    total_elements = numpy.sum(sizes)
    weighted_mean = (1. / total_elements) * numpy.dot(overlaps, sizes)

    # 0 is the best overlap and 1 the worst one, so the result is flipped to make
    # it easier to understand.
    return 1 - weighted_mean
def calculate_global_overlap(cls, decomposed_clusters, distance_matrix, cluster_method, global_method):
    """
    Combines the overlap of every cluster into one value for the whole clustering.

    Each cluster overlap is weighted by the number of elements of its cluster; the weighted
    mean is then subtracted from 1.

    @param decomposed_clusters: Collection of decomposed clusters; it is traversed twice.
    @param distance_matrix: Handed over to 'calculate_cluster_overlap'.
    @param cluster_method: Method selector handed over to 'calculate_cluster_overlap'.
    @param global_method: Accepted but never read here.

    @return: The complement (1 - x) of the size-weighted mean overlap.
    """
    sys.stdout.flush()

    per_cluster_overlap = [
        cls.calculate_cluster_overlap(cluster_method, cluster, distance_matrix)
        for cluster in decomposed_clusters
    ]
    per_cluster_size = [
        len(getAllElements(cluster))
        for cluster in decomposed_clusters
    ]

    overlaps = numpy.array(per_cluster_overlap)
    sizes = numpy.array(per_cluster_size)

    total_size = numpy.sum(sizes)
    weighted_mean = numpy.sum(overlaps * sizes) / float(total_size)

    # 0 is the best overlap and 1 the worst one, so the result is flipped to make
    # it easier to understand.
    return 1 - weighted_mean
def calculate_cluster_overlap(cls, decomposed_cluster, distance_matrix):
    """
    Calculates the overlap value for a cluster in a range [0,1].

    @param decomposed_cluster: The cluster to score. Only its length is inspected here; it is
    then handed over to 'getAllElements' and 'get_cluster_min_distances'.
    (presumably a mapping from trajectory id to its elements - verify against the callers)
    @param distance_matrix: Handed over to 'get_cluster_min_distances'.

    @return: 1.0 for a cluster with a single component, 0.0 when the biggest minimum distance
    is 0, and sum(min_distances) / (N * max(min_distances)) in any other case, N being the
    number of elements of the cluster.
    """
    # A 'pure' cluster (one single component) penalizes the global overlap.
    if len(decomposed_cluster) == 1:
        return 1.0

    N = len(getAllElements(decomposed_cluster))
    min_distances = cls.get_cluster_min_distances(decomposed_cluster, distance_matrix)
    biggest_min_distance = max(min_distances)

    # All the minimum distances are 0: the overlap is total.
    if biggest_min_distance == 0:
        return 0.0

    return (1./N) * (numpy.sum(min_distances) / biggest_min_distance)
def calculate_cluster_overlap(cls, method, decomposed_cluster, distance_matrix):
    """
    Calculates the overlap value for a cluster in a range [0,1].

    @param method: Selects the formula. With 1 the sum of the minimum distances is divided by
    the sum of the maximum distances. With 2 the element-wise min / max ratios are added up
    and divided by the number of elements of the cluster.
    @param decomposed_cluster: The cluster to score; it is handed over to
    'get_cluster_min_max_distances' and (method 2) to 'getAllElements'.
    @param distance_matrix: Handed over to 'get_cluster_min_max_distances'.

    @return: 1. if no minimum distances were obtained, the value of the chosen formula otherwise.

    @raise SystemExit: If 'method' is neither 1 nor 2. The exception carries the error message,
    so an uncaught failure writes it to stderr and ends the process with a non-zero status.
    """
    min_distances, max_distances = cls.get_cluster_min_max_distances(decomposed_cluster, distance_matrix)

    # Without distances there is nothing to compare, whatever the method is.
    if len(min_distances) == 0:
        return 1.

    if method == 1:
        return numpy.sum(min_distances) / numpy.sum(max_distances)

    if method == 2:
        return numpy.sum(min_distances / max_distances) / len(getAllElements(decomposed_cluster))

    # Raised explicitly instead of 'print' + 'exit()': 'exit' is a helper injected by the
    # 'site' module and, called without arguments, reports success (status 0) on an error.
    # '%s' keeps the message intact even when the selector is not a number.
    raise SystemExit(
        "[ERROR OverlapCalculator::calculate_cluster_overlap] The method nr. %s does not exist." % (method,))
def analyze_clustering(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Computes the global measurements of a clustering and records them in 'analysis'.

    Recorded keys: "total_num_clusters", "total_num_elements", "overlap" plus "num_<type>"
    and "num_<type>_elements" for each cluster type.

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to OverlapCalculator.calculate_global_overlap.
    @param analysis: Dictionary updated in place.

    @return: The cluster type visited last, or None when no cluster type was present.
    """
    analysis["total_num_clusters"] = 0
    analysis["total_num_elements"] = 0

    merged_clusters = mergeSeparatedClusters(separated_decomposed_clusters)
    # 2 and 1 fill the 'cluster_method' and 'global_method' parameters of the callee.
    analysis["overlap"] = OverlapCalculator.calculate_global_overlap(
        merged_clusters, distance_matrix, 2, 1)

    # The loop variable is also the return value: give it a value first, otherwise an
    # empty 'separated_decomposed_clusters' ends in an UnboundLocalError at the 'return'.
    cluster_type = None
    for cluster_type in separated_decomposed_clusters:
        typed_clusters = separated_decomposed_clusters[cluster_type]
        num_clusters_key = "num_" + cluster_type
        num_elements_key = "num_" + cluster_type + "_elements"

        analysis[num_clusters_key] = len(typed_clusters)
        analysis["total_num_clusters"] += analysis[num_clusters_key]

        element_counts = [len(getAllElements(typed_clusters[dc_id])) for dc_id in typed_clusters]
        analysis[num_elements_key] = numpy.sum(element_counts)
        analysis["total_num_elements"] += analysis[num_elements_key]

    # NOTE(review): the indentation of the original was lost; this return is assumed to
    # follow the loop rather than to belong to its body - confirm.
    return cluster_type
def analyze_clusters(cls, separated_decomposed_clusters, distance_matrix, analysis):
    """
    Builds, for every cluster, a report that is stored in 'analysis' under the cluster id.

    Report layout:
        "components": the keys of the decomposed cluster.
        "global": "mean", "std", "max" and "num_elements" of the whole cluster, one nested
                  dictionary (same four entries) per component and, for "mixed" clusters,
                  the inverted "overlap".
        "centers_mean_diff": only for clusters of type "mixed".

    @param separated_decomposed_clusters: Mapping indexed by cluster type and then by cluster id.
    @param distance_matrix: Handed over to the distance / overlap helpers.
    @param analysis: Dictionary updated in place; this function returns nothing.
    """
    for cluster_type in separated_decomposed_clusters:
        is_mixed = cluster_type == "mixed"
        for cluster_id in separated_decomposed_clusters[cluster_type]:
            decomposed_cluster = separated_decomposed_clusters[cluster_type][cluster_id]

            report = {"components": decomposed_cluster.keys(), "global": {}}
            analysis[cluster_id] = report
            summary = report["global"]

            # Whole-cluster statistics.
            summary["mean"], summary["std"], summary["max"] = calculate_distance_stats(
                getAllElements(decomposed_cluster), distance_matrix)
            summary["num_elements"] = len(getAllElements(decomposed_cluster))

            # Per-component statistics.
            for traj_id in decomposed_cluster:
                summary[traj_id] = {}
                component = summary[traj_id]
                component["mean"], component["std"], component["max"] = calculate_distance_stats(
                    decomposed_cluster[traj_id], distance_matrix)
                component["num_elements"] = len(decomposed_cluster[traj_id])

            if not is_mixed:
                continue

            report["centers_mean_diff"] = calculate_mean_center_differences(
                decomposed_cluster, distance_matrix)
            # NOTE(review): the original indentation was lost; the overlap is assumed to
            # be computed for "mixed" clusters only - confirm.
            # The calculator yields values between 0 and 1 where 0 is the best one. The value
            # is inverted so that 1 becomes the best value and 0 the worst.
            summary["overlap"] = 1 - OverlapCalculator.calculate_cluster_overlap(
                decomposed_cluster, distance_matrix)
def calculate_cluster_overlap(cls, decomposed_cluster, distance_matrix):
    """
    Calculates the overlap value for a cluster in a range [0,1].

    Three cases are told apart:
        - the cluster has one single component ('pure' cluster): the result is 1.0.
        - the biggest of the minimum distances is 0: the result is 0.0.
        - otherwise: (1 / N) * (sum of minimum distances / biggest minimum distance), where N
          is the number of elements of the cluster.

    @param decomposed_cluster: The cluster to score; its length tells whether it is 'pure'.
    (presumably a mapping from trajectory id to its elements - verify against the callers)
    @param distance_matrix: Handed over to 'get_cluster_min_distances'.

    @return: The overlap value as described above.
    """
    is_pure = len(decomposed_cluster) == 1
    if is_pure:
        # Pure clusters penalize the global overlap.
        return 1.0

    N = len(getAllElements(decomposed_cluster))
    min_distances = cls.get_cluster_min_distances(decomposed_cluster, distance_matrix)
    max_min_distance = max(min_distances)

    if max_min_distance != 0:
        normalized_sum = numpy.sum(min_distances) / max_min_distance
        return (1. / N) * normalized_sum

    # Every minimum distance is 0, which means that the overlap is total.
    return 0.0
def test_decompose(self):
    """
    Checks Separator.decompose with five clusters over two trajectory ranges.

    The expected result maps every cluster id to a dictionary that groups the elements of
    the cluster by trajectory id; all 16 elements must be present in the decomposition.
    """
    traj_ranges = {"traj_A": (0, 6), "traj_B": (7, 15)}

    members_of_each_cluster = [
        [0, 1, 2],
        [3, 8, 10],
        [14, 4, 15],
        [5, 6],
        [7, 9, 11, 12, 13],
    ]
    clusters = [Cluster(None, members) for members in members_of_each_cluster]
    # The ids are the positions of the clusters, as strings.
    for position, cluster in enumerate(clusters):
        cluster.id = str(position)

    decomposed = Separator.decompose(clusters, traj_ranges)

    all_elements = [
        element
        for cluster_id in decomposed
        for element in getAllElements(decomposed[cluster_id])
    ]

    expected = {
        '0': {'traj_A': [0, 1, 2]},
        '1': {'traj_A': [3], 'traj_B': [8, 10]},
        '2': {'traj_A': [4], 'traj_B': [14, 15]},
        '3': {'traj_A': [5, 6]},
        '4': {'traj_B': [9, 11, 12, 13, 7]},
    }

    self.assertItemsEqual(range(16), sorted(all_elements))
    self.assertDictEqual(expected, decomposed)
def test_decompose(self):
    """
    Decomposes five clusters with Separator.decompose and compares the outcome with the
    expected 'cluster id -> trajectory id -> elements' dictionary. It also checks that the
    decomposition holds exactly the elements 0 to 15.
    """
    traj_ranges = {"traj_A": (0, 6), "traj_B": (7, 15)}

    first = Cluster(None, [0, 1, 2])
    second = Cluster(None, [3, 8, 10])
    third = Cluster(None, [14, 4, 15])
    fourth = Cluster(None, [5, 6])
    fifth = Cluster(None, [7, 9, 11, 12, 13])
    clusters = [first, second, third, fourth, fifth]

    # Each cluster is identified by its position in the list.
    for index, cluster in enumerate(clusters):
        cluster.id = str(index)

    decomposed = Separator.decompose(clusters, traj_ranges)

    all_elements = []
    for cluster_id in decomposed:
        all_elements += getAllElements(decomposed[cluster_id])

    expected = {}
    expected['0'] = {'traj_A': [0, 1, 2]}
    expected['1'] = {'traj_A': [3], 'traj_B': [8, 10]}
    expected['2'] = {'traj_A': [4], 'traj_B': [14, 15]}
    expected['3'] = {'traj_A': [5, 6]}
    expected['4'] = {'traj_B': [9, 11, 12, 13, 7]}

    self.assertItemsEqual(range(16), sorted(all_elements))
    self.assertDictEqual(expected, decomposed)
def calculate_cluster_overlap(cls, method, decomposed_cluster, distance_matrix):
    """
    Calculates the overlap value for a cluster in a range [0,1].

    Available methods:
        1 - sum(min_distances) / sum(max_distances)
        2 - sum(min_distances / max_distances) / number of elements of the cluster

    @param method: Number of the method to use (1 or 2).
    @param decomposed_cluster: The cluster to score; it is handed over to
    'get_cluster_min_max_distances' and (method 2) to 'getAllElements'.
    @param distance_matrix: Handed over to 'get_cluster_min_max_distances'.

    @return: 1. if there are no minimum distances, the result of the chosen method otherwise.

    @raise SystemExit: If the method number is unknown. The message travels inside the
    exception, which makes the interpreter print it to stderr and exit with status 1 when
    nobody catches it.
    """
    min_distances, max_distances = cls.get_cluster_min_max_distances(
        decomposed_cluster, distance_matrix)

    # This check goes before the method dispatch: with no distances the result is 1.
    # whatever the method number is.
    if len(min_distances) == 0:
        return 1.

    if method == 1:
        return numpy.sum(min_distances) / numpy.sum(max_distances)
    elif method == 2:
        num_elements = len(getAllElements(decomposed_cluster))
        return numpy.sum(min_distances / max_distances) / num_elements

    # The former 'print' statement is Python-2-only syntax and the bare 'exit()' (a 'site'
    # helper) finished the process with a success status. Raising SystemExit keeps the same
    # exception type while reporting a failure. '%s' accepts non-numeric selectors too.
    message = "[ERROR OverlapCalculator::calculate_cluster_overlap] The method nr. %s does not exist." % (
        method,)
    raise SystemExit(message)
def test_getAllElements(self):
    """
    Checks that the elements gathered by getAllElements from the 'decomposed_cluster'
    fixture are, once sorted, the numbers 0 to 14.
    """
    gathered_elements = sorted(getAllElements(self.decomposed_cluster))
    expected_elements = range(15)
    numpy.testing.assert_array_equal(gathered_elements, expected_elements)