# Shared imports for the code excerpts below. BSAS, kmeans_clustering,
# internal_criteria, external_criteria and image_segm_utility are the
# repository's own modules.
import numpy as np
import matplotlib.pyplot as plt
from scipy import ndimage
from sklearn.datasets import make_blobs, make_moons
from tqdm import tqdm


def relative_validity_hard_large_data(X):
    '''
    Conducts successive executions of k-means for a range of cluster numbers
    and calculates the Davies-Bouldin index of each clustering. Variant of
    relative_validity_hard intended for large data sets, where only the
    cheaper Davies-Bouldin index is computed.

    Parameters:
        X ((N x m) numpy array): a data set of N instances and m features

    Returns:
        no_of_clusters_list: the different values of the clusters number
        DB: the array holding the values of the Davies-Bouldin index
    '''
    # Initialization
    no_of_clusters_list = [i for i in range(2, 11)]
    DB = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same; the only parameter that should change
    # is the number of clusters.
    clustered_data, centroids_BSAS, total_clusters_ = BSAS.basic_sequential_scheme(X)
    for i, total_clusters in tqdm(enumerate(no_of_clusters_list)):
        if len(centroids_BSAS) < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values come from BSAS
            centroids[:len(centroids_BSAS), :] = centroids_BSAS
            # The remaining centroids are random instances of the data set
            random_indices = np.random.randint(
                len(X), size=total_clusters - len(centroids_BSAS))
            centroids[len(centroids_BSAS):, :] = X[random_indices, :]
        elif len(centroids_BSAS) > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        elif len(centroids_BSAS) == total_clusters:
            centroids = centroids_BSAS
        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)
        DB[i] = Davies_Bouldin(X_, centroids)
    return no_of_clusters_list, DB
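# A minimal usage sketch, not part of the module: plot the Davies-Bouldin
# curve returned above and pick the k that minimises it, since lower DB
# values indicate compact, well-separated clusters. The demo dataset and
# plotting code are illustration-only assumptions.
X_demo, _ = make_blobs(n_samples=500, centers=4, n_features=2, random_state=0)
ks, DB_values = relative_validity_hard_large_data(X_demo)
best_k = ks[int(np.argmin(DB_values))]  # DB is minimised at the best k
plt.plot(ks, DB_values, marker='o')
plt.xlabel('number of clusters')
plt.ylabel('Davies-Bouldin index')
plt.title('best k = {}'.format(best_k))
plt.show()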
def gap_index(X, no_of_clusters):
    log_W = _gap_index_calculation(X)
    # Create an array to hold the logW values of the 100 Monte Carlo simulations
    log_W_sample = np.zeros(100)
    N = len(X)
    m = len(X[0]) - 1
    # Monte Carlo simulation - create the datasets (random position hypothesis)
    for i in range(100):
        random_data = np.empty((N, m))
        # Each feature j of the reference data is drawn uniformly over that
        # feature's observed range; the feature loop needs its own index so
        # that the simulation counter i is not overwritten
        for j in range(m):
            max_value = np.amax(X[:, j])
            min_value = np.min(X[:, j])
            temp = (max_value - min_value) * np.random.random(size=(N, 1)) + min_value
            random_data[:, [j]] = temp
        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            random_data, no_of_clusters)
        log_W_sample[i] = _gap_index_calculation(X_)
    Gap = np.average(log_W_sample) - log_W
    return Gap
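# The helper _gap_index_calculation is not shown in this excerpt. A plausible
# sketch, following Tibshirani et al.'s gap statistic, is given below: it
# computes log(W) where W is the pooled within-cluster dispersion. This is an
# assumption about the helper's behaviour, not the module's actual code.
def _gap_index_calculation_sketch(X):
    # X carries the cluster id in its last column, as in the functions above
    W = 0.0
    for cluster_id in np.unique(X[:, -1]):
        members = X[X[:, -1] == cluster_id, :-1]
        n_r = len(members)
        if n_r < 2:
            continue
        # Sum of squared Euclidean distances over all ordered pairs in the cluster
        diffs = members[:, None, :] - members[None, :, :]
        D_r = np.sum(diffs ** 2)
        W += D_r / (2.0 * n_r)
    return np.log(W)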
def testMoons(self):
    no_of_clusters = 2
    # Create the dataset
    X, y = make_moons(n_samples=300, shuffle=True, noise=0.1, random_state=10)
    # Run the clustering algorithm
    X, centroids, centroids_history = kmeans_clustering.kmeans(X, no_of_clusters)
    # Plotting
    plot_data(X, no_of_clusters, centroids, centroids_history)
    # Examine cluster validity with statistical tests
    initial_gamma, list_of_gammas, result = internal_criteria.internal_validity(
        X, no_of_clusters, kmeans_clustering.kmeans)
    initial_indices, list_of_indices, result_list = external_criteria.external_validity(
        X, no_of_clusters, y, kmeans_clustering.kmeans)
    # Histogram of gammas from internal and external criteria
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    hist_external_criteria(initial_indices, list_of_indices, result_list)
    plt.show()
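# The internal and external criteria above are Monte Carlo hypothesis tests:
# the statistic of the actual clustering (e.g. initial_gamma) is compared
# against the distribution of the same statistic over randomly generated
# datasets (list_of_gammas). A hedged sketch of a typical accept/reject rule
# follows; the module's exact decision logic is not shown in this excerpt.
def monte_carlo_test_sketch(initial_stat, random_stats, significance=0.05):
    random_stats = np.sort(np.asarray(random_stats))
    # Reject the randomness hypothesis when the real statistic lies in the
    # upper tail of the Monte Carlo distribution
    threshold = random_stats[int((1 - significance) * len(random_stats))]
    return initial_stat > threshold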
def testBlobs(self):
    no_of_clusters = 4
    # Create the dataset
    X, y = make_blobs(n_samples=500, centers=no_of_clusters,
                      n_features=2, random_state=185)
    # Run a sequential algorithm (BSAS) first to obtain initial centroids,
    # then run the clustering algorithm
    clustered_data, centroids, total_clusters = BSAS.basic_sequential_scheme(X)
    X, centroids, centroids_history = kmeans_clustering.kmeans(
        X, no_of_clusters, centroids_initial=centroids)
    # Plotting
    plot_data(X, no_of_clusters, centroids, centroids_history)
    # Examine cluster validity with statistical tests
    initial_gamma, list_of_gammas, result = internal_criteria.internal_validity(
        X, no_of_clusters, kmeans_clustering.kmeans)
    initial_indices, list_of_indices, result_list = external_criteria.external_validity(
        X, no_of_clusters, y, kmeans_clustering.kmeans)
    # Histograms of gammas from the internal and external criteria
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    hist_external_criteria(initial_indices, list_of_indices, result_list)
    plt.show()
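# BSAS.basic_sequential_scheme is used above only to obtain data-driven
# initial centroids for k-means. A minimal sketch of the classic Basic
# Sequential Algorithmic Scheme follows; theta and max_clusters are
# hypothetical parameters (the module's own function takes just X).
def bsas_sketch(X, theta, max_clusters):
    centroids = [X[0].astype(float)]
    counts = [1]
    labels = np.zeros(len(X), dtype=int)
    for idx in range(1, len(X)):
        x = X[idx]
        distances = [np.linalg.norm(x - c) for c in centroids]
        nearest = int(np.argmin(distances))
        if distances[nearest] > theta and len(centroids) < max_clusters:
            # Too far from every existing cluster: start a new one
            centroids.append(x.astype(float))
            counts.append(1)
            labels[idx] = len(centroids) - 1
        else:
            # Assign to the nearest cluster and update its centroid incrementally
            counts[nearest] += 1
            centroids[nearest] += (x - centroids[nearest]) / counts[nearest]
            labels[idx] = nearest
    return labels, np.array(centroids), len(centroids)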
def testImageSegmentation(self):
    image = ndimage.imread('..//..//images//181091.jpg')
    image = image.astype(np.int32, copy=False)
    # Algorithm execution: run BSAS first to get estimates for the centroids
    number_of_clusters = 3
    clustered_data, centroids, total_clusters = BSAS.basic_sequential_scheme(image)
    X_, centroids, centroids_history = kmeans_clustering.kmeans(
        image, no_of_clusters=number_of_clusters, centroids_initial=centroids)
    # Merging procedure
    X_ = image_segm_utility.merging_procedure(X_, 500)
    # Calculate the Rand index to test similarity to external data
    original_image = '181091.jpg'
    seg_file = '181091.seg'
    external_info = image_segm_utility.insert_clusters(original_image, seg_file)
    rand_index = image_segm_utility.rand_index_calculation(X_, external_info)
    print(rand_index)
    # Draw the clustered image
    draw_clustered_image(X_, image.shape, rand_index)
    plt.show()
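# rand_index_calculation is the module's own helper. What is certain is the
# index it reports: RI = (a + b) / C(n, 2), where a counts point pairs placed
# in the same cluster by both labelings and b counts pairs separated by both.
# A minimal sketch over two flat label arrays (illustration only):
def rand_index_sketch(labels_a, labels_b):
    n = len(labels_a)
    agreements = 0
    for p in range(n):
        for q in range(p + 1, n):
            same_a = labels_a[p] == labels_a[q]
            same_b = labels_b[p] == labels_b[q]
            if same_a == same_b:
                agreements += 1
    return agreements / (n * (n - 1) / 2)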
def testTobeErased(self):
    image = ndimage.imread('..//..//images//231015.jpg')
    image = image.astype(np.int32, copy=False)
    number_of_clusters = 2
    X_, centroids, centroids_history = kmeans_clustering.kmeans(
        image, no_of_clusters=number_of_clusters)
    X_ = image_segm_utility.merging_procedure(X_)
def relative_validity_hard(X):
    '''
    Defines several values for the number-of-clusters parameter of k-means,
    conducts successive executions of the algorithm with those values, and
    calculates the corresponding relative validity indices.

    Parameters:
        X ((N x m) numpy array): a data set of N instances and m features

    Returns:
        no_of_clusters_list: the different values of the clusters number
        DI, DB, SI, GI: the arrays holding the values of the relative indices
    '''
    # Initialization
    no_of_clusters_list = [i for i in range(2, 11)]
    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same; the only parameter that should change
    # is the number of clusters.
    clustered_data, centroids_BSAS, total_clusters_ = BSAS.basic_sequential_scheme(X)
    for i, total_clusters in tqdm(enumerate(no_of_clusters_list)):
        if len(centroids_BSAS) < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values come from BSAS
            centroids[:len(centroids_BSAS), :] = centroids_BSAS
            # The remaining centroids are random instances of the data set
            random_indices = np.random.randint(
                len(X), size=total_clusters - len(centroids_BSAS))
            centroids[len(centroids_BSAS):, :] = X[random_indices, :]
        elif len(centroids_BSAS) > total_clusters:
            centroids = centroids_BSAS[:total_clusters, :]
        elif len(centroids_BSAS) == total_clusters:
            centroids = centroids_BSAS
        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)
        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters, kmeans_clustering.kmeans)
    return no_of_clusters_list, DI, DB, SI, GI
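# A minimal usage sketch, assuming the imports at the top of this file:
# compute all four relative indices with the single-argument variant above
# and inspect where each one signals the best k. Dunn, Silhouette and Gap
# are maximised at a good clustering, while Davies-Bouldin is minimised.
X_demo, _ = make_blobs(n_samples=500, centers=4, n_features=2, random_state=0)
ks, DI_v, DB_v, SI_v, GI_v = relative_validity_hard(X_demo)
fig, axes = plt.subplots(2, 2)
for ax, values, name in zip(axes.ravel(),
                            (DI_v, DB_v, SI_v, GI_v),
                            ('Dunn', 'Davies-Bouldin', 'Silhouette', 'Gap')):
    ax.plot(ks, values, marker='o')
    ax.set_title(name)
plt.tight_layout()
plt.show()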
def relative_validity_hard(X, no_of_clusters):
    '''
    Variant of relative_validity_hard that also plots the clustering produced
    at the expected number of clusters, so that it can be compared against
    the signals of the relative indices.
    '''
    # Initialization
    no_of_clusters_list = [i for i in range(2, 11)]
    DI = np.zeros(len(no_of_clusters_list))
    DB = np.zeros(len(no_of_clusters_list))
    SI = np.zeros(len(no_of_clusters_list))
    GI = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same; the only parameter that should change
    # is the number of clusters.
    clustered_data, centroids_BSAS, total_clusters_ = BSAS.basic_sequential_scheme(X)
    for i, total_clusters in tqdm(enumerate(no_of_clusters_list)):
        if len(centroids_BSAS) < total_clusters:
            centroids = np.zeros((total_clusters, len(X[0])))
            # First centroids values come from BSAS
            centroids[:len(centroids_BSAS), :] = centroids_BSAS
            # The remaining centroids are random instances of the data set
            random_indices = np.random.randint(
                len(X), size=total_clusters - len(centroids_BSAS))
            centroids[len(centroids_BSAS):, :] = X[random_indices, :]
        elif len(centroids_BSAS) > total_clusters:
            # Keep only as many BSAS centroids as the current number of clusters
            centroids = centroids_BSAS[:total_clusters, :]
        elif len(centroids_BSAS) == total_clusters:
            centroids = centroids_BSAS
        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)
        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters)
        # Plot just one clustering effort, the expected one, in order to
        # compare it with the indices' signals
        if total_clusters == no_of_clusters:
            plot_data(X_, total_clusters, centroids, centroids_history)
    return no_of_clusters_list, DI, DB, SI, GI
def gap_index(X, no_of_clusters, algorithm):
    '''
    Calculates the Gap index of a clustered dataset.

    Parameters:
        X ((N x m + 1) numpy array): a clustered data set of N instances,
            m features and the cluster id at the last column of each vector
        no_of_clusters: the number of clusters
        algorithm: the function object representing the algorithm that called
            the function

    Returns:
        The Gap index
    '''
    log_W = _gap_index_calculation(X)
    # Create an array to hold the logW values of the 100 Monte Carlo simulations
    log_W_sample = np.zeros(100)
    N = len(X)
    m = len(X[0]) - 1
    # Monte Carlo simulation - create the datasets (random position hypothesis)
    for i in range(100):
        random_data = np.empty((N, m))
        for j in range(m):
            max_value = np.amax(X[:, j])
            min_value = np.min(X[:, j])
            temp = (max_value - min_value) * np.random.random(size=(N, 1)) + min_value
            random_data[:, [j]] = temp
        # Cluster the reference data with the same algorithm that produced X;
        # only the k-means branch is handled here
        if algorithm == kmeans_clustering.kmeans:
            X_, centroids, centroids_history = kmeans_clustering.kmeans(
                random_data, no_of_clusters)
        log_W_sample[i] = _gap_index_calculation(X_)
    Gap = np.average(log_W_sample) - log_W
    return Gap
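# Hedged usage note: Gap as computed above is E[log W_reference] - log W_data,
# so larger values mean the clustering is tighter than expected under the
# random position hypothesis. A simplified selection rule picks the k with
# the largest Gap; Tibshirani et al.'s original rule also accounts for the
# simulation standard error, which this function does not return.
X_demo, _ = make_blobs(n_samples=300, centers=3, n_features=2, random_state=1)
candidate_ks = list(range(2, 7))
gaps = []
for k in candidate_ks:
    clustered, cents, hist = kmeans_clustering.kmeans(X_demo, k)
    gaps.append(gap_index(clustered, k, kmeans_clustering.kmeans))
best_k = candidate_ks[int(np.argmax(gaps))]
print('k maximising the Gap index:', best_k)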