Beispiel #1
0
    def process(self):
        """!
        @brief Runs the clustering pipeline in line with rules of BIRCH algorithm.

        @details The private insert-data and extract-features steps are executed first. Centroids of the
                  extracted features are then grouped by single-link agglomerative clustering, and every
                  input point is attached to the cluster of the centroid with the smallest squared
                  Euclidean distance to it.

        @return (birch) Returns itself (BIRCH instance).

        @see get_clusters()

        """

        self.__insert_data()
        self.__extract_features()

        centroids = [entry.get_centroid() for entry in self.__features]

        clustering = agglomerative(centroids, self.__number_clusters, type_link.SINGLE_LINK).process()
        self.__cf_clusters = clustering.get_clusters()

        # Label of the cluster each centroid belongs to, indexed by centroid position.
        centroid_labels = (
            cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, self.__cf_clusters, centroids)
            .set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
            .get_clusters()
        )

        self.__clusters = [[] for _ in self.__cf_clusters]
        for point_index, point in enumerate(self.__pointer_data):
            squared_distances = numpy.sum(numpy.square(numpy.subtract(centroids, point)), axis=1)
            nearest_entry = numpy.argmin(squared_distances)
            self.__clusters[centroid_labels[nearest_entry]].append(point_index)

        return self
Beispiel #2
0
    def getIndexRepresentorTwoDimensionData(self):
        """Return an index-list encoder over eight 2-D points split into two clusters."""
        points = [
            [5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0],
            [8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5],
        ]
        index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

        encoder = cluster_encoder(
            type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, points)
        return encoder
def aggl_cluster(df, n_clusters, link, hover_text):
    """Cluster the feature columns of ``df`` agglomeratively and plot the result.

    Args:
        df: DataFrame holding the feature columns plus the ``hover_text`` column.
        n_clusters: Number of clusters handed to ``agglomerative``.
        link: Linkage name: ``"centroid"``, ``"single"`` or ``"complete"``; any
            other value selects average linkage.
        hover_text: Name of the column that is excluded from the clustered data.

    Returns:
        The result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``.
    """
    datadf = df.loc[:, df.columns != hover_text]
    # NOTE: the features are cast to int64 before clustering.
    data_list = datadf.to_numpy(dtype="int64").tolist()

    # All linkage constants come from the ``type_link`` enum.  ``agglomerative``
    # is the clustering class called below, so it has no ``type_link`` attribute
    # to look the constants up on.
    if link == "centroid":
        typelink = type_link.CENTROID_LINK
    elif link == "single":
        typelink = type_link.SINGLE_LINK
    elif link == "complete":
        typelink = type_link.COMPLETE_LINK
    else:
        typelink = type_link.AVERAGE_LINK

    aggl_instance = agglomerative(data_list, n_clusters, typelink)
    aggl_instance.process()
    clusters = aggl_instance.get_clusters()
    reps = aggl_instance.get_cluster_encoding()

    # Convert the clustering result into one label per data point.
    encoder = cluster_encoder(reps, clusters, data_list)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    label = np.array(encoder.get_clusters(), dtype='int32')

    data_array = np.array(data_list)
    if len(datadf.columns) == 2:
        return scat2d(data_array, label, hover_text, df)
    return scat3d(data_array, label, hover_text, df)
Beispiel #4
0
    def getIndexRepresentorDoubleData(self):
        """Return an index-list encoder over eight float values split into two clusters."""
        values = [5.4562, 5.1235, 4.9235, 4.8712, 8.3451, 8.4215, 8.6535, 8.7345]
        index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

        encoder = cluster_encoder(
            type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, values)
        return encoder
Beispiel #5
0
    def testLabelsToIndexListAndObjectListMissedPoint(self):
        """Check that a NaN-labelled point is left out of index-list and object-list encodings."""
        labels = [0, 0, float('NaN'), 1, 1]
        points = [[5.1, 5.2], [5.2, 5.1], [14.1, 76.0], [8.1, 8.0], [8.4, 8.2]]

        # (target encoding, clusters expected after the conversion)
        scenarios = (
            (type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
             [[0, 1], [3, 4]]),
            (type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
             [[[5.1, 5.2], [5.2, 5.1]], [[8.1, 8.0], [8.4, 8.2]]]),
        )

        for target_encoding, expected in scenarios:
            encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LABELING,
                                      labels, points)
            encoder.set_encoding(target_encoding)
            actual = encoder.get_clusters()

            self.assertEqual(len(expected), len(actual))
            self.assertEqual(expected, actual)
Beispiel #6
0
    def templateEncoderProcedures(ccore_flag):
        """Run CURE on SAMPLE_SIMPLE3 and cycle its result through every encoding.

        The cluster list obtained from CURE must contain four clusters.
        """
        points = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        algorithm = cure(points, 4, 5, 0.5, ccore=ccore_flag)
        algorithm.process()

        clusters = algorithm.get_clusters()
        source_encoding = algorithm.get_cluster_encoding()

        encoder = cluster_encoder(source_encoding, clusters, points)
        conversion_order = (
            type_encoding.CLUSTER_INDEX_LABELING,
            type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
            type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
        )
        for target_encoding in conversion_order:
            encoder.set_encoding(target_encoding)

        assert len(clusters) == 4
 def templateEncoderProcedures(filename, initial_centers, number_clusters, ccore_flag):
     """Run K-Means on the sample file and cycle its result through every encoding.

     The cluster list obtained from K-Means must contain `number_clusters` clusters.
     """
     points = read_sample(filename)

     algorithm = kmeans(points, initial_centers, 0.025, ccore_flag)
     algorithm.process()

     clusters = algorithm.get_clusters()
     source_encoding = algorithm.get_cluster_encoding()

     encoder = cluster_encoder(source_encoding, clusters, points)
     conversion_order = (
         type_encoding.CLUSTER_INDEX_LABELING,
         type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
         type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
     )
     for target_encoding in conversion_order:
         encoder.set_encoding(target_encoding)

     assertion.eq(number_clusters, len(clusters))
Beispiel #8
0
 def templateEncoderProcedures(filename, initial_centers, number_clusters, ccore_flag):
     """Cluster the sample file with K-Means and exercise all encoder conversions.

     The number of clusters returned by K-Means is compared with `number_clusters`.
     """
     data = read_sample(filename)

     clusterer = kmeans(data, initial_centers, 0.025, ccore_flag)
     clusterer.process()

     clusters = clusterer.get_clusters()
     original_encoding = clusterer.get_cluster_encoding()

     converter = cluster_encoder(original_encoding, clusters, data)
     for next_encoding in (type_encoding.CLUSTER_INDEX_LABELING,
                           type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                           type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
         converter.set_encoding(next_encoding)

     assertion.eq(number_clusters, len(clusters))
    def templateEncoderProcedures(ccore_flag):
        """Cluster SAMPLE_SIMPLE3 with CURE and exercise all encoder conversions.

        The number of clusters returned by CURE is compared with four.
        """
        data = read_sample(SIMPLE_SAMPLES.SAMPLE_SIMPLE3)

        clusterer = cure(data, 4, 5, 0.5, ccore=ccore_flag)
        clusterer.process()

        clusters = clusterer.get_clusters()
        original_encoding = clusterer.get_cluster_encoding()

        converter = cluster_encoder(original_encoding, clusters, data)
        for next_encoding in (type_encoding.CLUSTER_INDEX_LABELING,
                              type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                              type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
            converter.set_encoding(next_encoding)

        assertion.eq(4, len(clusters))
    def templateEncoderProcedures(sample, initial_centers, number_clusters,
                                  ccore_flag):
        """Cluster the sample file with K-Means and exercise all encoder conversions.

        `sample` is the path handed to `read_sample`. The number of clusters
        returned by K-Means must equal `number_clusters`.
        """
        points = read_sample(sample)

        kmeans_instance = kmeans(points, initial_centers, 0.025, ccore_flag)
        kmeans_instance.process()

        clusters = kmeans_instance.get_clusters()
        source_encoding = kmeans_instance.get_cluster_encoding()

        encoder = cluster_encoder(source_encoding, clusters, points)
        for target_encoding in (type_encoding.CLUSTER_INDEX_LABELING,
                                type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                                type_encoding.CLUSTER_INDEX_LIST_SEPARATION):
            encoder.set_encoding(target_encoding)

        assert len(clusters) == number_clusters
Beispiel #11
0
    def testObjectListToLabelsMissedPoint(self):
        """Check that a point absent from every object-list cluster is labelled NaN."""
        object_clusters = [[[5.1, 5.2], [5.2, 5.1]], [[8.1, 8.0], [8.4, 8.2]]]
        points = [[5.1, 5.2], [5.2, 5.1], [14.1, 76.0], [8.1, 8.0], [8.4, 8.2]]

        encoder = cluster_encoder(type_encoding.CLUSTER_OBJECT_LIST_SEPARATION,
                                  object_clusters, points)
        encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)

        expected = [0, 0, float('NaN'), 1, 1]
        actual = encoder.get_clusters()

        self.assertEqual(len(expected), len(actual))

        # NaN never compares equal to itself, so NaN labels are checked separately.
        for expected_label, actual_label in zip(expected, actual):
            if math.isnan(expected_label):
                self.assertTrue(math.isnan(actual_label))
            else:
                self.assertEqual(expected_label, actual_label)
Beispiel #12
0
    def testIndexListToLabelsMissedPoint(self):
        """Check that a point absent from every index-list cluster is labelled NaN."""
        # Index 7 (the last point) is intentionally not part of any cluster.
        index_clusters = [[0, 1, 2, 3], [4, 5, 6]]
        points = [
            [5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0],
            [8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5],
        ]

        encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION,
                                  index_clusters, points)
        encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)

        expected = [0, 0, 0, 0, 1, 1, 1, float('NaN')]
        actual = encoder.get_clusters()

        self.assertEqual(len(expected), len(actual))

        # NaN never compares equal to itself, so NaN labels are checked separately.
        for expected_label, actual_label in zip(expected, actual):
            if math.isnan(expected_label):
                self.assertTrue(math.isnan(actual_label))
            else:
                self.assertEqual(expected_label, actual_label)
def dbscan_cluster(df, eps, neighbours, hover_text):
    """Cluster the feature columns of ``df`` with DBSCAN and plot the result.

    Args:
        df: DataFrame holding the feature columns plus the ``hover_text`` column.
        eps: Connectivity radius handed to ``dbscan``.
        neighbours: Neighbour count handed to ``dbscan``.
        hover_text: Name of the column that is excluded from the clustered data.

    Returns:
        The result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``. Points that belong to no cluster
        are passed to the plot with label ``-1``.
    """
    datadf = df.loc[:, df.columns != hover_text]
    # NOTE: the features are cast to int64 before clustering.
    data_list = datadf.to_numpy(dtype="int64").tolist()

    dbscan_instance = dbscan(data_list, eps, neighbours)
    dbscan_instance.process()
    clusters = dbscan_instance.get_clusters()
    reps = dbscan_instance.get_cluster_encoding()

    encoder = cluster_encoder(reps, clusters, data_list)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)

    # cluster_encoder labels points that are in no cluster as NaN, and NaN
    # cannot be cast to an integer dtype (numpy raises ValueError).  Map those
    # points to -1 first so that noise no longer crashes the conversion.
    raw_labels = np.asarray(encoder.get_clusters(), dtype='float64')
    label = np.where(np.isnan(raw_labels), -1, raw_labels).astype('int32')

    data_array = np.array(data_list)
    if len(datadf.columns) == 2:
        return scat2d(data_array, label, hover_text, df)
    return scat3d(data_array, label, hover_text, df)
def kmeans_cluster(df, n_clusters, tolerance, metric, hover_text):
    """Cluster the feature columns of ``df`` with K-Means and plot the result.

    Args:
        df: DataFrame holding the feature columns plus the ``hover_text`` column.
        n_clusters: Number of centers requested from the k-means++ initializer.
        tolerance: Stop tolerance handed to ``kmeans``.
        metric: ``"manhattan"`` selects the Manhattan metric; any other value
            selects the squared Euclidean metric.
        hover_text: Name of the column that is excluded from the clustered data.

    Returns:
        The result of ``scat2d`` when exactly two feature columns remain,
        otherwise the result of ``scat3d``.
    """
    datadf = df.loc[:, df.columns != hover_text]
    # NOTE: the features are cast to int64 before clustering.
    data_list = datadf.to_numpy(dtype="int64").tolist()

    if metric == "manhattan":
        metric_obj = distance_metric(type_metric.MANHATTAN)
    else:
        metric_obj = distance_metric(type_metric.EUCLIDEAN_SQUARE)

    centers = kmeans_plusplus_initializer(data_list, n_clusters).initialize()

    # The fourth positional parameter of ``kmeans`` is the ccore flag, so the
    # metric has to be passed by keyword; positionally it would be interpreted
    # as a truthy flag and the selected metric would be ignored.
    kmeans_instance = kmeans(data_list, centers, tolerance, metric=metric_obj)
    kmeans_instance.process()
    clusters = kmeans_instance.get_clusters()
    reps = kmeans_instance.get_cluster_encoding()

    # Convert the clustering result into one label per data point.
    encoder = cluster_encoder(reps, clusters, data_list)
    encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING)
    label = np.array(encoder.get_clusters(), dtype='int32')

    data_array = np.array(data_list)
    if len(datadf.columns) == 2:
        return scat2d(data_array, label, hover_text, df)
    return scat3d(data_array, label, hover_text, df)
Beispiel #15
0
 def getIndexRepresentor(self):
     """Return an index-list encoder over eight integer values split into two clusters."""
     values = [10, 11, 13, 12, 64, 65, 65, 68]
     index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

     encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, values)
     return encoder
def cluster_nodes(visualisation=False):
    """Run K-medoids on five scenario files and return medoid-to-point distances.

    For every scenario file the sample is clustered with k = 2..10 using the
    Manhattan metric. Initial medoids are derived from k-means++ initial
    points, each run is timed, and the run with the highest value returned by
    ``calculate_silhouette`` is kept. For that run, the Manhattan distance from
    each medoid to every other point of its cluster is collected.

    The scenario files are looked up via
    ``os.path.abspath("kmedoids_cluster_nodes.py")``, i.e. relative to the
    current working directory.

    Args:
        visualisation: Forwarded to ``calculate_silhouette``; when truthy the
            best clustering of every scenario is also shown with
            ``cluster_visualizer``.

    Returns:
        A list with one entry per scenario. Each entry is a list with one
        element per cluster of the best run, and each of those is a list of
        ``[medoid_index, point_index, distance]`` triples.
    """
    # (kmedoids_cluster_nodes.py
    # template_clustering()

    # K-medoids clustering using points as data
    
    # Get the nodes sample from the data file
    # samplePath = os.path.dirname(os.path.abspath("kmedoids_cluster_nodes.py")) + os.sep + "nodes-test1.data"
    
    #scenarios = ["scenario1.data", "scenario2.data", "scenario3.data", "scenario4.data", "scenario5.data"]
    scenarios = ["scenario1.data", "scenario2.data", "scenario3.data", "scenario4.data", "scenario5.data"]
    # The string below is disabled code kept as a bare string statement; it is never executed.
    """
    #random.seed(35344796)
    random.seed(35334096)
    scenario4 = []
    for i in range(10):
        scenario4.append(random.randrange(1000))
    
    print(scenario4)
    """
    
    # Earlier hard-coded initial medoids, kept for reference only (all disabled):
    # [564, 162, 959, 271, 663, 992, 566, 883, 438, 118] # 1k
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17, 25], [8, 12, 22, 28]];
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [9, 12, 25], [103, 16, 196, 76, 80, 41, 47, 52, 112, 199], [10461, 7157, 11717, 2709, 13116, 16811, 2041, 19481, 9130, 12817]];
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [9, 12, 25], [103, 16, 196, 76, 80, 41, 47, 52, 112, 199], [8257, 3356, 10812, 2440, 14783, 10547, 11063, 11980, 6929, 18896]]
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [9, 12, 25], [103, 16, 196, 76, 80, 41, 47, 52, 112, 199], [1129, 324, 1919, 542, 1326, 1985, 1133, 1767, 877, 237], [103, 16, 196, 76, 80]]
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [9, 12, 25], [103, 16, 196, 76, 80, 41, 47, 52, 112, 199], [564, 162, 959, 271, 663, 992, 566, 883, 438, 118]]
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [9, 12, 25], [103, 16, 196, 76, 80, 41, 47, 52, 112, 199]];
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [15, 25, 28, 32, 10]]; 0.2476339234441543
    #initial_medoids = [[8, 12, 17, 25], [8, 12, 17], [13, 23, 28, 32, 8]];

    # Result accumulator: one entry per scenario (see "Returns" in the docstring).
    scenarioClustersDistanceList = []

    for scenarioIndex in range(0, len(scenarios)):
        # Both a performance counter and wall-clock time are recorded for the whole scenario.
        total_time_start = time.perf_counter()
        total_wall_time_start = time.time()
        samplePath = os.path.dirname(os.path.abspath("kmedoids_cluster_nodes.py")) + os.sep + scenarios[scenarioIndex]
        sample = read_sample(samplePath)

        print("\nScenario", scenarioIndex+1, "\nSample:", samplePath, "\n")

        # Use Manhattan distance
        metric = distance_metric(type_metric.MANHATTAN);

        # One tuple per k:
        # (silhouette, k, clustering_time, clustering_wall_time, clusters, medoids)
        silhouettes = []
        
        # Run the clustering once for every k in 2..10, calculate the silhouette value
        # for each run and choose the clustering with the best value
        for k in range(2, 11):
            # Disabled alternative (bare string statement, never executed): random medoids
            """random.seed(35334096)
            random_medoids = []
            for i in range(k):
                random_medoids.append(random.randrange(len(sample)))
"""
            # Initialise the clustering algorithm with the k-means++ algorithm
            # Initialise the random generator with a seed for reproducibility
            # NOTE(review): this only helps if the initializer draws from the `random`
            # module - not visible here, confirm.
            random.seed(35334096)
            initial_points = kmeans_plusplus_initializer(sample, k).initialize()
            #print(sample)
            #print("Type of initial points:", type(initial_points))
            #print("Initial points:", initial_points)

            # Translate the initial points (coordinates) into indexes into `sample`.
            # NOTE(review): only the first two coordinates are compared, and
            # sample.index(point) returns the first equal point, so duplicate points
            # in the sample yield repeated indexes and more than k entries - verify.
            initial_medoids = []
            for point in sample:
                #print("Sample point:", point)
                for initial_point in initial_points:
                    #print("Single point type:", type(initial_point))
                    if(point[0] == initial_point[0] and point[1] == initial_point[1]):
                        initial_medoids.append(sample.index(point))
            #print("Initial medoids:", initial_medoids)
            

            #print("Random medoids:", random_medoids)
            
            # Initiate the k-medoids algorithm with the sample and the initial medoids
            #kmedoids_instance = kmedoids(sample, initial_medoids[scenarioIndex], 0.001, metric=metric, ccore = True);
            kmedoids_instance = kmedoids(sample, initial_medoids, 0.001, metric=metric, ccore = True);

            # Start performance counter
            time_start = time.perf_counter()
            wall_time_start = time.time()

            # Perform actual clustering
            kmedoids_instance.process()

            # Stop performance counter
            time_end = time.perf_counter()
            wall_time_end = time.time()

            # Calculate execution time and wall time
            clustering_time = time_end - time_start
            clustering_wall_time = wall_time_end - wall_time_start
            print("Execution time for clustering for k=" + str(k) + ":", clustering_time, "\nWall time for clustering for k=" + str(k) + ":", clustering_wall_time)
            
            # by default k-medoids returns representation CLUSTER_INDEX_LIST_SEPARATION
            clusters = kmedoids_instance.get_clusters()
            medoids = kmedoids_instance.get_medoids();
            #print("Clusters before changing encoding:", clusters)
            type_repr = kmedoids_instance.get_cluster_encoding();
            #print("Representator type:", type_repr)
            
            encoder = cluster_encoder(type_repr, clusters, sample);
        
            # change representation from index list to label list
            encoder.set_encoding(type_encoding.CLUSTER_INDEX_LABELING);
            #kmedoids_instance.process
            # NOTE(review): this binds the method itself (it is not called) and the
            # name is only referenced from a disabled print below.
            type_repr2 = encoder.get_encoding;

            # Cluster representation converted from a list of sample indexes to their respective labels
            cluster_labels = encoder.get_clusters()
            
            #print("Representator type afterwards:", type_repr2)
            #print("Cluster labels", cluster_labels)

            #print("Number of medoids:", len(medoids))
            #[[float(y) for y in x] for x in l]
            # Coordinates of the medoids (copied from the sample by medoid index)
            medoidPoints = [[point for point in sample[index]] for index in medoids]
            #print("Medoid points:", medoidPoints)

            # Calculate the silhouette value and remember it together with this run's results
            silhouettes.append((calculate_silhouette(sample, cluster_labels, medoidPoints, k, scenarioIndex, visualisation), k, clustering_time, clustering_wall_time, clusters, medoids))
            
            # Calculate Silhouette value
            #sil_val = silhouette_value(kmedoids_instance.get_clusters(), sample)
            #print("pyclustering silhouette value for", k, "clusters:", sil_val)
        
        # Pick the run whose first tuple item (the silhouette value) is the largest
        best_silhouette, best_k, best_time, best_wall_time, best_clusters, best_medoids = max(silhouettes,key=itemgetter(0))
        print("The best silhouette value of", best_silhouette, "was achieved with k=" + str(best_k) + "\nExecution time of best clustering: " + str(best_time) + "\nWall time of best clustering: " + str(best_wall_time))
        #print("Best clusters:", best_clusters)
        #print("Best medoids:", best_medoids)

        # Run clustering and print result of clustering as well as execution time
        #(ticks, result) = timedcall(kmedoids_instance.process);
        #print( "\nExecution time:", time);
        #clusters = kmedoids_instance.get_clusters();
        #medoids = kmedoids_instance.get_medoids();
        #print("Clusters:", clusters);
        #print("Medoids:", medoids)

        # Generate visualisation
        if(visualisation):
            title = "K-medoids clustering - Scenario " + str(scenarioIndex+1)
            visualizer = cluster_visualizer(1, titles=[title]);
            visualizer.append_clusters(best_clusters, sample, 0);
            #visualizer.append_cluster([ sample[index] for index in initial_medoids[index] ], marker = '*', markersize = 15);
            visualizer.append_cluster(best_medoids, data=sample, marker='*', markersize=15, color="black");
            visualizer.show(visible_axis = False, visible_grid = False);

        # Post-processing
        
        # Calculate Manhattan distance from medoid to all points in the cluster
        metric = distance_metric(type_metric.MANHATTAN);
        clusterList = []
        #print("Number of clusters:", len(best_clusters),)
        # best_clusters[index] is paired with best_medoids[index] by position
        for index in range(0, len(best_clusters)):
            #print("Index: ", index)
            medoidPoint = sample[best_medoids[index]]
            #print("Medoid point array: ", medoidPoint)
            #print("Cluster index array: ", clusters[index])
            nodeList = []
            for currentClusterIndex in best_clusters[index]:
                # Make sure not to compare the medoid to itself
                if best_medoids[index] != currentClusterIndex:
                    # Get the point array of the current cluster to compare to the medoid
                    currentClusterPoint = sample[currentClusterIndex]
                    #print("Current cluster point from sample:", currentClusterPoint)
                    
                    # Calculate the Manhattan distance between the medoid and the current point to compare with
                    distance = metric(medoidPoint, currentClusterPoint)

                    # Append the result to a list as the index of the medoid, the index of the current point and the distance between them
                    nodeList.append([best_medoids[index], currentClusterIndex, distance])
                    #print("Distance between ", medoidPoint, " and ", currentClusterPoint, " is: ", distance)
                    
            clusterList.append(nodeList)
            
        scenarioClustersDistanceList.append(clusterList)

        total_time_end = time.perf_counter()
        total_wall_time_end = time.time()
        print("\nTotal scenario execution time:", total_time_end - total_time_start, "\nTotal scenario wall time:", total_wall_time_end - total_wall_time_start, "\n\n----")
    
    return scenarioClustersDistanceList
    
    """# K-medoids clustering using distance matrix
Beispiel #17
0
 def getIndexRepresentorTwoDimensionData(self):
     """Return an index-list encoder over eight 2-D points split into two clusters."""
     points = [
         [5.1, 5.2], [5.2, 5.1], [5.4, 5.2], [5.1, 5.0],
         [8.1, 8.0], [8.4, 8.2], [8.3, 8.4], [8.5, 8.5],
     ]
     index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

     encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, points)
     return encoder
Beispiel #18
0
    def getIndexRepresentor(self):
        """Return an index-list encoder over eight integer values split into two clusters."""
        values = [10, 11, 13, 12, 64, 65, 65, 68]
        index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

        encoder = cluster_encoder(
            type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, values)
        return encoder
Beispiel #19
0
 def getIndexRepresentorDoubleData(self):
     """Return an index-list encoder over eight float values split into two clusters."""
     values = [5.4562, 5.1235, 4.9235, 4.8712, 8.3451, 8.4215, 8.6535, 8.7345]
     index_groups = [[0, 1, 2, 3], [4, 5, 6, 7]]

     encoder = cluster_encoder(type_encoding.CLUSTER_INDEX_LIST_SEPARATION, index_groups, values)
     return encoder