Ejemplo n.º 1
0
    def get_cluster_min_max_distances(cls, decomposed_cluster,
                                      distance_matrix):
        """
        Calculates, for every element of each class inside the cluster, its
        minimum and maximum distance to the elements of all other classes.

        @param decomposed_cluster: Dict mapping a class id to the list of
        elements of that class inside the cluster.
        @param distance_matrix: The condensed matrix with all pairwise
        distances.

        @return: A tuple (min_distances, max_distances) of numpy arrays.
        Both arrays are empty if the cluster is pure (only one class).
        """
        all_ids = list(decomposed_cluster.keys())
        min_distances = []
        max_distances = []

        # A pure cluster (single class) has no "other class" to measure
        # against, so both arrays stay empty.
        if len(all_ids) > 1:
            for set_id in all_ids:
                # Gather the elements of every class except the current one.
                vs_elements = []
                for vs_id in all_ids:
                    if vs_id != set_id:
                        vs_elements.extend(decomposed_cluster[vs_id])
                for element in decomposed_cluster[set_id]:
                    # Compute the distances once and reuse them for both min
                    # and max (the original evaluated the helper twice).
                    distances = get_distances_of_elements_to(
                        element, vs_elements, distance_matrix)
                    min_distances.append(numpy.min(distances))
                    max_distances.append(numpy.max(distances))
        return numpy.array(min_distances), numpy.array(max_distances)
Ejemplo n.º 2
0
 def cluster_variance(cls, cluster, matrix):
     """
     Returns the variance of the distances between all the elements of the
     cluster and its medoid (prototype).

     Precondition: the cluster medoid (prototype) is already updated.
     """
     distances = get_distances_of_elements_to(cluster.prototype,
                                              cluster.all_elements,
                                              matrix)
     return numpy.var(distances)
Ejemplo n.º 3
0
    def get_cluster_min_distances(cls, decomposed_cluster, distance_matrix):
        """
        Calculates the distances between the elements of all different classes in the cluster
        and returns the minimum distance for each of these elements.
        Some distances will be counted twice. This is OK.

        @param decomposed_cluster: Dict mapping a class id to the list of
        elements of that class inside the cluster.
        @param distance_matrix: The condensed matrix with all pairwise
        distances.

        @return: A numpy array with one minimum inter-class distance per
        element.

        @raise ValueError: If the cluster is pure (has a single class);
        inter-class distances are undefined in that case.
        """
        all_ids = list(decomposed_cluster.keys())

        # Guard clause: a pure cluster has no "other class" to compare with.
        # Fixed message: this function only computes min distances.
        if len(all_ids) <= 1:
            raise ValueError("Asking min distances of a PURE cluster.")

        min_distances = []
        for set_id in all_ids:
            # Gather the elements of every class except the current one.
            vs_elements = []
            for vs_id in all_ids:
                if vs_id != set_id:
                    vs_elements.extend(decomposed_cluster[vs_id])
            for element in decomposed_cluster[set_id]:
                min_distances.append(
                    numpy.min(
                        get_distances_of_elements_to(
                            element, vs_elements, distance_matrix)))
        return numpy.array(min_distances)
Ejemplo n.º 4
0
    def get_cluster_min_max_distances(cls, decomposed_cluster, distance_matrix):
        """
        Calculates, for every element of each class inside the cluster, its
        minimum and maximum distance to the elements of all other classes.

        @param decomposed_cluster: Dict mapping a class id to the list of
        elements of that class inside the cluster.
        @param distance_matrix: The condensed matrix with all pairwise
        distances.

        @return: A tuple (min_distances, max_distances) of numpy arrays.

        @raise ValueError: If the cluster is pure (has a single class);
        inter-class distances are undefined in that case.
        """
        all_ids = list(decomposed_cluster.keys())

        # Guard clause: a pure cluster has no "other class" to compare with.
        if len(all_ids) <= 1:
            raise ValueError("Asking min max distances of a PURE cluster.")

        min_distances = []
        max_distances = []
        for set_id in all_ids:
            # Gather the elements of every class except the current one.
            vs_elements = []
            for vs_id in all_ids:
                if vs_id != set_id:
                    vs_elements.extend(decomposed_cluster[vs_id])
            for element in decomposed_cluster[set_id]:
                # Compute the distances once and reuse them for both min and
                # max (the original evaluated the helper twice per element).
                distances = get_distances_of_elements_to(element, vs_elements,
                                                         distance_matrix)
                min_distances.append(numpy.min(distances))
                max_distances.append(numpy.max(distances))
        return numpy.array(min_distances), numpy.array(max_distances)
Ejemplo n.º 5
0
 def calculate_average_distance_from_prototype(cls, cluster, matrix):
     """
     Returns the average distance of the elements of a cluster with its medoid.
     @param cluster: The cluster from which we want to calculate this distance.
     @param matrix: The condensed matrix containing all distances.
     @return: The calculated value (0. for a single-element cluster).
     """
     proto = cluster.prototype
     # Work on a copy so the cluster itself is not mutated; the medoid is
     # excluded to avoid counting its 0 self-distance in the mean.
     elements_copy = list(cluster.all_elements)
     elements_copy.remove(proto)
     distances = get_distances_of_elements_to(proto, elements_copy, matrix)
     # 'distances == []' is ambiguous/always-False if the helper returns a
     # numpy array; len() handles both lists and arrays safely.
     if len(distances) == 0:
         return 0.
     else:
         return numpy.mean(distances)
Ejemplo n.º 6
0
 def calculate_average_distance_from_prototype(cls, cluster, matrix):
     """
     Returns the average distance of the elements of a cluster with its medoid.
     @param cluster: The cluster from which we want to calculate this distance.
     @param matrix: The condensed matrix containing all distances.
     @return: The calculated value (0. for a single-element cluster).
     """
     proto = cluster.prototype
     # Work on a copy so the cluster itself is not mutated; the medoid is
     # excluded to avoid counting its 0 self-distance in the mean.
     elements_copy = list(cluster.all_elements)
     elements_copy.remove(proto)
     distances = get_distances_of_elements_to(proto, elements_copy, matrix)
     # 'distances == []' is ambiguous/always-False if the helper returns a
     # numpy array; len() handles both lists and arrays safely.
     if len(distances) == 0:
         return 0.
     else:
         return numpy.mean(distances)
Ejemplo n.º 7
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters,
                                results_folder):
    """
    Writes a CSV file with, per cluster, its diameter and radius plus the
    upper-triangle matrix of prototype-to-prototype distances.

    @param best_clustering: The clustering whose clusters are reported.
    @param matrix: The condensed matrix containing all distances.
    @param parameters: Parameters holder; key "file" gives the file name stem
    (default "per_cluster_stats").
    @param results_folder: Folder where the CSV file is created.

    @return: The path of the generated CSV file.
    """
    file_name = parameters.get_value(
        "file", default_value="per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    clusters = best_clustering.clusters

    header_line = ","
    for cluster in clusters:
        header_line += "%s," % cluster.id
    header_line = header_line[:-1] + "\n"

    # TODO: Once clusterings and clusters become inmutable its medoids will be always updated,
    # then this kind of operations will be unnecessary
    update_medoids(best_clustering, matrix)
    #----------------------------------------

    # 'with' guarantees the file is closed even if an exception is raised
    # (the original open/close pair leaked the handle on error).
    with open(stats_file_path, "w") as stats_file:
        stats_file.write(header_line)

        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(
                    cluster_i.prototype, cluster_i.all_elements, matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                # A single-element cluster has no intra-cluster distances.
                diameter = 0
                radius = 0
            # Built outside 'finally' so an unexpected exception propagates
            # instead of triggering a NameError on diameter/radius.
            cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Empty cells for the diagonal and the lower triangle.
            cells.extend([""] * (i + 1))
            # Upper triangle: distances between cluster prototypes.
            cells.extend("%.2f" % matrix[cluster_i.prototype,
                                         cluster_j.prototype]
                         for cluster_j in clusters[i + 1:])
            stats_file.write(",".join(cells) + "\n")
    return stats_file_path
Ejemplo n.º 8
0
def calculate_distance_stats(elements, matrix):
    """
    Calculates the mean, dispersion and radius of all the distances to the central element of a set of
    elements.

    @param elements: The elements we are working with.
    @param matrix: The used condensed matrix.

    @return: Mean, std deviation and radius of all the elements with respect to their central element.
    """
    medoid = Cluster(None, elements).calculate_medoid(matrix)

    # 'elements' still contains the medoid, so the distance set includes the
    # 0 distance of the medoid vs itself.
    distances = get_distances_of_elements_to(medoid, elements, matrix)
    return (numpy.mean(distances),
            numpy.std(distances),
            numpy.max(distances))
Ejemplo n.º 9
0
def calculate_per_cluster_stats(best_clustering, matrix, parameters, results_folder):
    """
    Writes a CSV file with, per cluster, its diameter and radius plus the
    upper-triangle matrix of prototype-to-prototype distances.

    @param best_clustering: The clustering whose clusters are reported.
    @param matrix: The condensed matrix containing all distances.
    @param parameters: Parameters holder; key "file" gives the file name stem
    (default "per_cluster_stats").
    @param results_folder: Folder where the CSV file is created.

    @return: The path of the generated CSV file.
    """
    file_name = parameters.get_value("file", default_value = "per_cluster_stats") + ".csv"
    stats_file_path = os.path.join(results_folder, file_name)

    clusters = best_clustering.clusters

    header_line = ","
    for cluster in clusters:
        header_line += "%s," % cluster.id
    header_line = header_line[:-1] + "\n"

    # TODO: Once clusterings and clusters become inmutable its medoids will be always updated,
    # then this kind of operations will be unnecessary
    update_medoids(best_clustering, matrix)
    #----------------------------------------

    # 'with' guarantees the file is closed even if an exception is raised
    # (the original open/close pair leaked the handle on error).
    with open(stats_file_path, "w") as stats_file:
        stats_file.write(header_line)

        for i, cluster_i in enumerate(clusters):
            try:
                intra_distances = get_intra_cluster_distances(cluster_i, matrix)
                diameter = max(intra_distances)
                distances_from_proto = get_distances_of_elements_to(cluster_i.prototype,
                                                                    cluster_i.all_elements,
                                                                    matrix)
                radius = max(distances_from_proto)
            except SingularClusterException:
                # A single-element cluster has no intra-cluster distances.
                diameter = 0
                radius = 0
            # Built outside 'finally' so an unexpected exception propagates
            # instead of triggering a NameError on diameter/radius.
            cells = ["%s(d: %.2f r: %.2f)" % (cluster_i.id, diameter, radius)]
            # Empty cells for the diagonal and the lower triangle.
            cells.extend([""] * (i + 1))
            # Upper triangle: distances between cluster prototypes.
            cells.extend("%.2f" % matrix[cluster_i.prototype, cluster_j.prototype]
                         for cluster_j in clusters[i + 1:])
            stats_file.write(",".join(cells) + "\n")
    return stats_file_path
Ejemplo n.º 10
0
    def evaluate(self, clustering, matrix):
        """
        Mean is approximated to medoid.
        """
        update_medoids(clustering, matrix)

        # Variance of the whole dataset with respect to its global medoid.
        global_cluster = Cluster(None, clustering.get_all_clustered_elements())
        global_cluster.prototype = global_cluster.calculate_medoid(matrix)
        global_distances = get_distances_of_elements_to(
            global_cluster.prototype, global_cluster.all_elements, matrix)
        global_variance = numpy.var(global_distances)

        # Sum of the per-cluster variances, normalized by the number of
        # clusters and the global variance.
        per_cluster_variances = [self.cluster_variance(cluster, matrix)
                                 for cluster in clustering.clusters]
        sum_ci = numpy.sum(per_cluster_variances)

        return sum_ci / (len(clustering.clusters) * global_variance)
    def get_cluster_min_distances(cls, decomposed_cluster, distance_matrix):
        """
        Calculates the distances between the elements of all different classes in the cluster
        and returns the minimum distance for each of these elements.
        Some distances will be counted twice. This is OK.

        @param decomposed_cluster: Dict mapping a class id to the list of
        elements of that class inside the cluster.
        @param distance_matrix: The condensed matrix with all pairwise
        distances.

        @return: A numpy array with one minimum inter-class distance per
        element.

        @raise ValueError: If the cluster is pure (has a single class);
        inter-class distances are undefined in that case.
        """
        all_ids = list(decomposed_cluster.keys())

        # Guard clause: a pure cluster has no "other class" to compare with.
        # Fixed message: this function only computes min distances.
        if len(all_ids) <= 1:
            raise ValueError("Asking min distances of a PURE cluster.")

        min_distances = []
        for set_id in all_ids:
            # Gather the elements of every class except the current one.
            vs_elements = []
            for vs_id in all_ids:
                if vs_id != set_id:
                    vs_elements.extend(decomposed_cluster[vs_id])
            for element in decomposed_cluster[set_id]:
                min_distances.append(
                    numpy.min(
                        get_distances_of_elements_to(
                            element, vs_elements, distance_matrix)))
        return numpy.array(min_distances)
Ejemplo n.º 12
0
 def test_get_distances_of_elements_to(self):
     # Distances from element 3 to every other element of the test matrix.
     matrix = CondensedMatrix(list(squared_CH_table1))
     result = get_distances_of_elements_to(3, [0, 1, 2, 4, 5], matrix)
     expected = [11.0, 6.0, 6.0, 13.0, 15.0]
     numpy.testing.assert_equal(result, expected)
Ejemplo n.º 13
0
 def cluster_variance(cls, cluster, matrix):
     """
     Returns the variance of the distances between all the elements of the
     cluster and its medoid (prototype).

     Precondition: the cluster medoid (prototype) is already updated.
     """
     distances = get_distances_of_elements_to(cluster.prototype,
                                              cluster.all_elements,
                                              matrix)
     return numpy.var(distances)