Esempio n. 1
0
    def perform_clustering(self, kwargs):
        """
        Runs the DBSCAN main loop: every still-unclassified element is used
        as a seed to try to grow a new cluster.

        @param kwargs: Dictionary with the mandatory keys 'eps'
        (neighborhood radius) and 'minpts' (minimum neighborhood size).

        @return: A Clustering instance (NOISE elements removed).
        """
        eps = kwargs["eps"]
        minpts = kwargs["minpts"]
        # All elements start unclassified; cluster ids start right after
        # the reserved NOISE id.
        element_classes = [PointClassType.UNCLASSIFIED] * self.number_of_elements
        next_cluster_id = PointClassType.NOISE + 1

        for element in range(self.number_of_elements):
            if element_classes[element] != PointClassType.UNCLASSIFIED:
                continue
            # Try to grow a new cluster from this seed; on success, move on
            # to the next cluster id.
            if self.__expand_cluster(element, next_cluster_id, eps, minpts,
                                     element_classes):
                next_cluster_id += 1

        # NOISE elements would form a single cluster with
        # ID = PointClassType.NOISE, so they are skipped.
        clusters = gen_clusters_from_class_list(
            element_classes, skip_list=[PointClassType.NOISE])
        return Clustering(clusters,
                          details="DBSCAN (eps = " + str(eps) + " minpts = " +
                          str(minpts) + ") " + str(self.number_of_elements) +
                          " elems")
Esempio n. 2
0
    def perform_clustering(self, kwargs):
        """
        Performs the hierarchical (linkage) step and the flat-clustering step.

        If a linkage matrix is given through 'hie_mat' it is cached and
        reused; otherwise the linkage matrix is computed from the condensed
        distance matrix and cached in self.hie_mat, so a second call will
        reuse the stored matrix.

        With preserve_input=False the matrix would be destroyed while
        clustering, but it saves memory. The metric is not needed in this
        case, as we hand the function an already calculated matrix. 'method'
        is the criterion used to determine distances when fusing clusters;
        the available methods are described in:
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

        @param kwargs: Dictionary with the optional keys:
            - 'cutoff': Flat clustering threshold. If missing/None, no
              clustering is produced.
            - 'hie_mat': A precomputed linkage matrix to reuse.
            - 'method': Linkage criterion (defaults to 'complete').

        @return: A Clustering instance, or None if no cutoff was given.
        """
        # dict.get replaces the try/except-KeyError triplets.
        cutoff = kwargs.get("cutoff")
        hie_mat = kwargs.get("hie_mat")
        method = kwargs.get("method", 'complete')

        # 'is (not) None' instead of '== None': identity comparison is the
        # correct check and avoids elementwise comparison on numpy arrays
        # (a linkage matrix is a numpy array).
        if hie_mat is not None:
            # A matrix was provided by the caller: cache and reuse it.
            self.hie_mat = hie_mat
        elif self.hie_mat is None:
            # No cached matrix yet: compute and cache the linkage matrix.
            self.hie_mat = hcluster_fast.linkage(
                self.condensed_matrix.get_data(), method=method)

        algorithm_details = "Hierarchical with " + method + " method (cutoff = " + str(
            cutoff) + ")"

        if cutoff is not None:
            # Apply the cutoff to get flat cluster labels (this doesn't work
            # much as expected), then build the clusters from them.
            group_list = hcluster.fcluster(self.hie_mat, cutoff)
            clusters = gen_clusters_from_class_list(group_list)
            return Clustering(clusters, details=algorithm_details)
        else:
            return None
Esempio n. 3
0
    def perform_clustering(self, kwargs):
        """
        Performs the hierarchical clustering (linkage) step and then the
        flat-clustering (cutoff) step.

        If the hierarchical matrix is given ('hie_mat'), it just calculates
        the clusters for the given cutoff; otherwise the linkage matrix is
        calculated once and stored in self.hie_mat so that a second call
        reuses the last matrix.

        With preserve_input=False the matrix would be destroyed while
        clustering, but it saves memory. The metric is not needed here, as
        the function receives an already calculated matrix. 'method' is the
        method used to determine distances when fusing clusters; methods are
        described in:
        http://docs.scipy.org/doc/scipy/reference/generated/scipy.spatial.distance.pdist.html

        @param kwargs: Dictionary with the optional keys 'cutoff',
        'hie_mat' and 'method' ('complete' by default).

        @return: A Clustering instance, or None when no cutoff was given.
        """
        # Use dict.get instead of the three try/except KeyError blocks.
        cutoff = kwargs.get("cutoff")
        hie_mat = kwargs.get("hie_mat")
        method = kwargs.get("method", 'complete')

        # Identity checks ('is None') replace '== None', which would do an
        # elementwise comparison if the matrix is a numpy array.
        if hie_mat is not None:
            self.hie_mat = hie_mat
        elif self.hie_mat is None:
            # Calculate (and cache) the linkage matrix from the condensed
            # distance matrix.
            self.hie_mat = hcluster_fast.linkage(self.condensed_matrix.get_data(), method = method)

        algorithm_details = "Hierarchical with "+method+" method (cutoff = " +str(cutoff)+")"

        if cutoff is not None:
            # Then apply the cutoff, this doesn't work much as expected
            group_list = hcluster.fcluster(self.hie_mat,cutoff)
            # Then let's generate the clusters
            clusters = gen_clusters_from_class_list(group_list)
            return Clustering(clusters,details = algorithm_details)
        else:
            return None
Esempio n. 4
0
    def update_medoids(self):
        """
        Recalculates the medoid of every cluster defined by the current
        class list, once the new clusters have been generated.

        @return: The list of new medoids.
        """
        current_clusters = gen_clusters_from_class_list(self.class_list)
        return [cluster.calculate_medoid(self.condensed_matrix)
                for cluster in current_clusters]
Esempio n. 5
0
 def test_gen_clusters_from_grouping_list(self):
     """
     gen_clusters_from_class_list must rebuild the expected clusters from a
     flat class-membership list (5 classes over 20 elements).
     """
     group_list = [4, 1, 2, 2, 4, 4, 3, 4, 2, 0, 0, 3, 3, 4, 0, 3, 1, 1, 1, 2]
     expected_clusters = [Cluster(0, [0, 4, 5, 7, 13]),
                          Cluster(1, [1, 16, 17, 18]),
                          Cluster(2, [2, 3, 8, 19]),
                          Cluster(6, [6, 11, 12, 15]),
                          Cluster(9, [9, 10, 14])]
     generated = sorted(gen_clusters_from_class_list(group_list),
                        key=lambda c: c.prototype)
     # Same number of clusters, and each one equal to its expected pair.
     self.assertEqual(len(expected_clusters), len(generated))
     for expected, actual in zip(expected_clusters, generated):
         self.assertEqual(expected, actual)
 def perform_clustering(self, kwargs):
     """
     Builds a fake clustering by assigning the elements of the dataset to
     clusters at random, following the cluster size distribution given in
     kwargs["distribution"] (a sequence of percentages).
     """
     distribution = kwargs["distribution"]
     num_of_nodes = self.condensed_matrix.row_length
     # One class label per node, each class sized proportionally to its
     # percentage of the dataset.
     node_class = []
     for class_id, percentage in enumerate(distribution):
         node_class.extend([class_id] * int((percentage / 100.) * num_of_nodes))
     random.shuffle(node_class)
     clusters = gen_clusters_from_class_list(node_class[0:num_of_nodes])
     return Clustering(clusters, details = "Fake Distribution Random (distribution = "+str(distribution)+")")
Esempio n. 7
0
 def perform_clustering(self, kwargs):
     """
     Generates a random clustering whose cluster sizes follow the
     percentage distribution in kwargs["distribution"].
     """
     distribution = kwargs["distribution"]
     total_nodes = self.condensed_matrix.row_length
     labels = []
     current_label = 0
     for percentage in distribution:
         # Each percentage maps to a proportional (truncated) label count.
         count = int((percentage / 100.) * total_nodes)
         labels.extend([current_label] * count)
         current_label += 1
     random.shuffle(labels)
     clusters = gen_clusters_from_class_list(labels[:total_nodes])
     return Clustering(clusters, details = "Fake Distribution Random (distribution = "+str(distribution)+")")
Esempio n. 8
0
    def perform_clustering(self, kwargs):
        """
        Creates a clustering where the clusters have been created by random
        assignment of the elements in the dataset. It will create a random
        number of clusters if "max_num_of_clusters" is defined, or an exact
        number of clusters if "num_clusters" is defined (2 clusters when
        neither key is present).

        @param kwargs: Dictionary with the optional keys "num_clusters" and
        "max_num_of_clusters".

        @return: A Clustering instance with the randomly generated clusters.
        """
        num_of_nodes = self.condensed_matrix.row_length
        max_num_of_clusters = 0

        if "num_clusters" in kwargs:
            num_of_clusters = kwargs["num_clusters"]
        elif "max_num_of_clusters" in kwargs:
            max_num_of_clusters = kwargs["max_num_of_clusters"]
            num_of_clusters = random.randint(1, max_num_of_clusters)
        else:
            num_of_clusters = 2

        # '//' keeps the original py2 integer-division semantics and still
        # yields an int under py3 (a float here would break list repetition
        # below). Narrow ZeroDivisionError replaces the original bare except.
        try:
            elements_per_cluster = max(1, num_of_nodes // num_of_clusters)
        except ZeroDivisionError:
            elements_per_cluster = 1

        node_class = []
        for i in range(num_of_clusters):
            node_class.extend([i] * elements_per_cluster)

        # Pad with cluster 0 so every node ends up classified.
        while len(node_class) < num_of_nodes:
            node_class.append(0)

        random.seed()
        random.shuffle(node_class)

        clusters = gen_clusters_from_class_list(node_class)

        return Clustering(
            clusters,
            details="Random (max_num_of_clusters = %d, num_of_clusters = %d )"
            % (max_num_of_clusters, num_of_clusters))
Esempio n. 9
0
    def perform_clustering(self, kwargs):
        """
        Does the actual k-medoids clustering.

        @param kwargs: Dictionary with this mandatory keys:
            - 'k': Number of clusters to generate.
            - 'seeding_type': One of the initial medoid selectors available
              (@see seeding_types() ). If seeding type is 'GROMOS',
              'seeding_max_cutoff' must also be defined, containing the
              cutoff that the GROMOS algorithm will use. Default is
              EQUIDISTANT.
        """
        self.k = kwargs["k"]
        self.seeding_type = kwargs.get("seeding_type", "EQUIDISTANT")

        # Only the GROMOS seeding needs a cutoff; use a sentinel otherwise.
        if self.seeding_type == 'GROMOS':
            self.seeding_max_cutoff = kwargs["seeding_max_cutoff"]
        else:
            self.seeding_max_cutoff = -1.0

        # Pick the starting medoids, then alternate cluster assignment and
        # medoid recalculation until the medoids stop changing or the
        # iteration budget is spent.
        current_medoids = self.seeding(self.k, self.seeding_max_cutoff, self.seeding_type)

        last_medoids = []
        iteration = 0
        while not self.convergence(current_medoids, last_medoids) and iteration < KMedoidsAlgorithm.MAX_ITERATIONS:
            self.cluster_update(current_medoids, self.condensed_matrix)
            # Keep a reference to the previous medoids for the next
            # convergence check.
            last_medoids = current_medoids
            current_medoids = self.update_medoids()
            iteration += 1

        algorithm_details = ("K-Medoids algorithm with k =" + str(int(self.k)) +
                             " and %s initial seeding" % self.seeding_to_str())
        clusters = gen_clusters_from_class_list(self.class_list)

        # Use each cluster's medoid as its representative element.
        for cluster in clusters:
            cluster.set_prototype(cluster.calculate_medoid(self.condensed_matrix))

        return Clustering(clusters, details=algorithm_details)
Esempio n. 10
0
 def perform_clustering(self, kwargs):
     """
     Main DBSCAN loop: each unclassified element is used as a seed to try
     to expand a new cluster.
     """
     eps = kwargs["eps"]
     minpts = kwargs["minpts"]
     classes = [PointClassType.UNCLASSIFIED] * self.number_of_elements
     # Cluster ids start right after the reserved NOISE id.
     cluster_id = PointClassType.NOISE + 1

     for element in range(self.number_of_elements):
         if classes[element] != PointClassType.UNCLASSIFIED:
             continue
         # __expand_cluster labels the reachable neighborhood and reports
         # whether a real cluster was formed from this seed.
         if self.__expand_cluster(element, cluster_id, eps, minpts, classes):
             cluster_id += 1

     # NOISE elements would form a single cluster with
     # ID = PointClassType.NOISE, so they are removed from the clustering.
     clusters = gen_clusters_from_class_list(classes, skip_list = [PointClassType.NOISE])
     return Clustering(clusters,details="DBSCAN (eps = "+str(eps)+" minpts = "+str(minpts)+") "+str(self.number_of_elements)+" elems")
Esempio n. 11
0
    def perform_clustering(self, kwargs):
        """
        Creates a clustering where the clusters have been created by random
        assignment of the elements in the dataset. It will create a random
        number of clusters if "max_num_of_clusters" is defined, or an exact
        number of clusters if "num_clusters" is defined (2 clusters when
        neither key is present).

        @param kwargs: Dictionary with the optional keys "num_clusters" and
        "max_num_of_clusters".

        @return: A Clustering instance with the randomly generated clusters.
        """
        num_of_nodes = self.condensed_matrix.row_length
        max_num_of_clusters = 0

        if "num_clusters" in kwargs:
            num_of_clusters = kwargs["num_clusters"]
        elif "max_num_of_clusters" in kwargs:
            max_num_of_clusters = kwargs["max_num_of_clusters"]
            num_of_clusters = random.randint(1, max_num_of_clusters)
        else:
            num_of_clusters = 2

        # Integer division ('//') preserves py2 behavior and avoids a float
        # repeat count under py3; the narrow except replaces a bare 'except:'.
        try:
            elements_per_cluster = max(1, num_of_nodes // num_of_clusters)
        except ZeroDivisionError:
            elements_per_cluster = 1

        node_class = []
        for i in range(num_of_clusters):
            node_class.extend([i] * elements_per_cluster)

        # Pad with cluster 0 so every node gets a class.
        while len(node_class) < num_of_nodes:
            node_class.append(0)

        random.seed()
        random.shuffle(node_class)

        clusters = gen_clusters_from_class_list(node_class)

        return Clustering(clusters, details = "Random (max_num_of_clusters = %d, num_of_clusters = %d )"%(max_num_of_clusters, num_of_clusters))
    def perform_clustering(self, kwargs):
        """
        Does the actual clustering by doing a k-medoids clustering of the first k eigenvector rows.

        @param kwargs: Dictionary with this mandatory keys:
            - 'k': Number of clusters to generate. Must be <= than max_clusters

        @return: a Clustering instance with the clustered data.
        """
        # Mandatory parameter
        k = int(kwargs["k"])

        if k > self.max_clusters:
            print "[ERROR SpectralClusteringAlgorithm::perform_clustering] this algorithm was defined to generate at most %d clusters."%self.max_clusters,

        algorithm_details = "Spectral algorithm with k = %d and sigma squared = %.3f" %(int(k), self.sigma_sq)

        if self.use_k_medoids:
            # The row vectors we have are in R^k (so k length)
            eigen_distances = CondensedMatrix(pdist(self.eigenvectors[:,:k]))
            k_medoids_args = {
                              "k":k,
                              "seeding_max_cutoff":-1,
                              "seeding_type": "RANDOM"
                              }

            k_medoids_alg = KMedoidsAlgorithm(eigen_distances)
            clustering = k_medoids_alg.perform_clustering(k_medoids_args)
            clustering.details = algorithm_details
            # BUGFIX: return the clustering we just annotated. The original
            # re-ran the (randomized) k-medoids step and returned that second
            # result, discarding the details set above.
            return clustering
        else:
            centroid, labels = scipy.cluster.vq.kmeans2(self.eigenvectors[:,:k],
                                                        k,
                                                        iter = 1000,
                                                        minit = 'random')
            # Only the labels are needed; drop the centroids.
            del centroid
            clusters = gen_clusters_from_class_list(labels)
            return Clustering(clusters,details = algorithm_details)
Esempio n. 13
0
    def perform_clustering(self, kwargs):
        """
        Does the actual clustering by doing a k-medoids clustering of the first k eigenvector rows.

        @param kwargs: Dictionary with this mandatory keys:
            - 'k': Number of clusters to generate. Must be <= than max_clusters

        @return: a Clustering instance with the clustered data.
        """
        # Mandatory parameter
        k = int(kwargs["k"])

        if k > self.max_clusters:
            print "[ERROR SpectralClusteringAlgorithm::perform_clustering] this algorithm was defined to generate at most %d clusters." % self.max_clusters,

        algorithm_details = "Spectral algorithm with k = %d and sigma squared = %.3f" % (
            int(k), self.sigma_sq)

        if self.use_k_medoids:
            # The row vectors we have are in R^k (so k length)
            eigen_distances = CondensedMatrix(pdist(self.eigenvectors[:, :k]))
            k_medoids_args = {
                "k": k,
                "seeding_max_cutoff": -1,
                "seeding_type": "RANDOM"
            }

            k_medoids_alg = KMedoidsAlgorithm(eigen_distances)
            clustering = k_medoids_alg.perform_clustering(k_medoids_args)
            clustering.details = algorithm_details
            # BUGFIX: return the annotated clustering. The original returned
            # a second, fresh perform_clustering() run, which both discarded
            # the details set above and produced a different (random) result.
            return clustering
        else:
            centroid, labels = scipy.cluster.vq.kmeans2(
                self.eigenvectors[:, :k], k, iter=1000, minit='random')
            # Only the labels are needed; drop the centroids.
            del centroid
            clusters = gen_clusters_from_class_list(labels)
            return Clustering(clusters, details=algorithm_details)