def monte_carlo(data, no_of_clusters, algorithm, no_of_samples=100):
    '''
    Creates sampling distributions of uniformly distributed data and calls the
    algorithm passed as argument in order to cluster each distribution and
    calculate its Gamma statistic.

    Every random data set has as many rows as data. Its i-th column is drawn
    uniformly from the [min, max] range of the i-th column of data. The last
    column of data is left out (presumably the cluster id column appended by
    the clustering algorithms - verify against the caller).

    Parameters:
        data((N x m) 2-d numpy array): a data set of N instances and m features
        no_of_clusters(integer): the number of clusters
        algorithm: the algorithm function to be used to cluster the data
        no_of_samples(integer): the number of sampling distributions to create

    Returns:
        list_of_gammas(list): the Gamma statistics of all the monte carlo
        sample distributions

    Raises:
        ValueError: if algorithm is not one of the supported clustering
        functions
    '''
    N = len(data)
    m = len(data[0]) - 1

    # The range of every feature is the same for all samples, so compute it once
    min_values = [np.min(data[:, i]) for i in range(m)]
    max_values = [np.amax(data[:, i]) for i in range(m)]

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    list_of_gammas = []

    # Being able to redraw a sample when a sequential scheme returns no
    # clustering is the reason we use a while instead of a for loop
    while len(list_of_gammas) < no_of_samples:
        random_data = np.empty((N, m))
        for i in range(m):
            spread = max_values[i] - min_values[i]
            random_data[:, i] = spread * np.random.random(size=N) + min_values[i]

        if algorithm == fuzzy_clustering.fuzzy:
            X, centroids, ita, centroids_history, partition_matrix = algorithm(
                random_data, no_of_clusters)
        elif algorithm == possibilistic_clustering.possibilistic:
            # The possibilistic algorithm is fed with the ita and the centroids
            # of a fuzzy clustering run on the same data
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                random_data, no_of_clusters)
            X, centroids, centroids_history, typicality_matrix = algorithm(
                random_data, no_of_clusters, ita, centroids_initial=centroids)
        elif algorithm == kmeans_clustering.kmeans:
            X, centroids, centroids_history = algorithm(
                random_data, no_of_clusters)
        elif algorithm == BSAS.basic_sequential_scheme:
            X, centroids, no_of_clusters = algorithm(random_data)
            if X is None:
                continue
        elif algorithm == TTSS.two_threshold_sequential_scheme:
            X, centroids, no_of_clusters = algorithm(random_data)
            if X is None:
                continue
        elif algorithm == MST.minimum_spanning_tree:
            X, no_of_clusters = algorithm(random_data)
        elif algorithm == DTA.minimum_spanning_tree_variation:
            X, no_of_clusters = algorithm(random_data)
        else:
            # Without this branch X would be unbound (or stale) below
            raise ValueError(
                'Unsupported clustering algorithm: {!r}'.format(algorithm))

        list_of_gammas.append(gamma(X))

    return list_of_gammas
def testMoons(self):
    '''
    Clusters a make_moons data set with the fuzzy algorithm, plots the
    clustering and shows the histograms of the internal and external
    validity tests.
    '''
    total_clusters = 2

    # Build the data set
    X, y = make_moons(n_samples=300, shuffle=True, noise=0.05,
                      random_state=10)

    # Cluster the data
    fuzzy_output = fuzzy_clustering.fuzzy(X, no_of_clusters=total_clusters)
    X, centroids, ita, centroids_history, partition_matrix = fuzzy_output

    # Plot the clustering
    plot_data(X, total_clusters, centroids, centroids_history)

    # Statistical tests of cluster validity
    internal_outcome = internal_criteria.internal_validity(
        X, total_clusters, fuzzy_clustering.fuzzy)
    external_outcome = external_criteria.external_validity(
        X, total_clusters, y, fuzzy_clustering.fuzzy)

    # Histograms of the statistics of the internal and external criteria
    initial_gamma, list_of_gammas, result = internal_outcome
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    initial_indices, list_of_indices, result_list = external_outcome
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def relative_validity_possibilistic(X):
    '''
    Defines the several values of the possibilistic parameter. Then conducts
    successive executions of the algorithm by passing to it those values and
    calculates all the proper relative indices.

    For every number of clusters in 2..10 and every value of q, a fuzzy
    clustering run provides the ita and the centroids that initialise the
    possibilistic run; the indices are then computed on the row-normalised
    typicality matrix.

    Parameters:
        X(numpy array): a data set whose last axis holds the m features; all
        the leading axes are flattened into N instances

    Returns:
        no_of_clusters_list: the different values of the clusters number
        values_of_q: the different values of the q parameter
        PC, PE, XB, FS: the arrays holding the values of the relative
        indices, one row per number of clusters and one column per q value
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    values_of_q = [1.25, 1.5, 1.75]

    # Conversion to 2-D array. np.prod of an empty shape is 1, so a 1-D input
    # becomes a single instance instead of raising
    m = X.shape[-1]
    N = int(np.prod(X.shape[:-1]))
    X = X.reshape(N, m)

    # Initialize arrays to hold the indices. We use separate arrays for easier
    # modification of the code if needed. If we wanted to use one array then
    # this would be a 3 - dimensional array.
    indices_shape = (len(no_of_clusters_list), len(values_of_q))
    PC = np.zeros(indices_shape)
    PE = np.zeros(indices_shape)
    XB = np.zeros(indices_shape)
    FS = np.zeros(indices_shape)

    # Candidate coordinates of the initial centroids; they depend only on X
    min_value, max_value = np.min(X), np.max(X)
    candidate_values = np.arange(min_value, max_value, 0.1)

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the total number of iterations
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        # IMPORTANT: The centroids must remain the same for every run of the
        # algorithm with the same no_of_clusters
        centroids_shape = (total_clusters, m)
        if candidate_values.size >= total_clusters * m:
            centroids_initial = np.random.choice(
                candidate_values, size=centroids_shape, replace=False)
        else:
            # Too few candidates to sample without replacement (narrow data
            # range): draw from the continuous range instead of raising
            centroids_initial = np.random.uniform(
                min_value, max_value, size=centroids_shape)

        for j, q_value in enumerate(values_of_q):  # the q value is set here
            # When X returns it has one more column that needs to be erased,
            # so the returned data are kept apart from X.
            # A copy is passed in case the algorithm updates its initial
            # centroids in place - TODO confirm whether it does
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                X, total_clusters, centroids_initial.copy(), q=q_value)
            X_, centroids, centroids_history, typicality_matrix = possibilistic_clustering.possibilistic(
                X, total_clusters, ita, centroids_initial=centroids, q=q_value)

            # Calculate indices
            # In order to calculate indices we pass them the normalized
            # typicality matrix, see Yang, Wu, Unsupervised possibilistic
            # learning
            typicality_matrix_norm = typicality_matrix / np.sum(
                typicality_matrix, axis=1, keepdims=True)

            PC[i, j] = partition_coefficient(X, typicality_matrix_norm)
            PE[i, j] = partition_entropy(X, typicality_matrix_norm)
            XB[i, j] = Xie_Beni(X, centroids, typicality_matrix_norm)
            # NOTE(review): q is fixed to 2 here instead of q_value - confirm
            # that this is intended
            FS[i, j] = fukuyama_sugeno(X, centroids, typicality_matrix_norm,
                                       q=2)

    return no_of_clusters_list, values_of_q, PC, PE, XB, FS
def testBlobs(self):
    '''
    Clusters a make_blobs data set with the fuzzy algorithm, plots the
    clustering and shows the histograms of the internal and external
    validity tests.
    '''
    no_of_clusters = 4
    clusters_number_to_execute = 4

    # Build the data set
    X, y = make_blobs(n_samples=500, centers=no_of_clusters, n_features=2,
                      random_state=46)

    # Cluster the data
    fuzzy_output = fuzzy_clustering.fuzzy(
        X, no_of_clusters=clusters_number_to_execute)
    X, centroids, ita, centroids_history, partition_matrix = fuzzy_output

    # Plot the clustering
    plot_data(X, clusters_number_to_execute, centroids, centroids_history)

    # Statistical tests of cluster validity
    internal_outcome = internal_criteria.internal_validity(
        X, clusters_number_to_execute, fuzzy_clustering.fuzzy)
    external_outcome = external_criteria.external_validity(
        X, clusters_number_to_execute, y, fuzzy_clustering.fuzzy)

    # Histograms of the statistics of the internal and external criteria
    initial_gamma, list_of_gammas, result = internal_outcome
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    initial_indices, list_of_indices, result_list = external_outcome
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def testImageSegmentation(self):
    '''
    Segments the image 181091.jpg with fuzzy clustering, merges the resulting
    clusters, prints the Rand index of the segmentation against the external
    segmentation file 181091.seg and draws the clustered image.
    '''
    # scipy.ndimage.imread has been removed from SciPy; pyplot's imread loads
    # the image file into a numpy array as well
    image = plt.imread('..//..//images//181091.jpg')
    image = image.astype(np.int32, copy=False)

    # Algorithm execution.
    clusters_number_to_execute = 28
    # NOTE(review): the BSAS results are not used below (centroids is
    # overwritten by the fuzzy run) - confirm whether total_clusters was meant
    # to replace the hard-coded number of clusters
    clustered_data, centroids, total_clusters = BSAS.basic_sequential_scheme(
        image)
    X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
        image, no_of_clusters=clusters_number_to_execute)

    ###################################################################
    # Merging procedure
    X_ = image_segm_utility.merging_procedure(X_, 500)

    # Calculate the Rand Index to test similarity to external data
    original_image = '181091.jpg'
    seg_file = '181091.seg'
    external_info = image_segm_utility.insert_clusters(
        original_image, seg_file)
    rand_index = image_segm_utility.rand_index_calculation(
        X_, external_info)
    print(rand_index)

    # Draw the clustered image
    draw_clustered_image(X_, image.shape, rand_index)
    plt.show()
def testMoons(self):
    '''
    Clusters a make_moons data set with the possibilistic algorithm, plots
    the clustering and shows the histograms of the internal and external
    validity tests.
    '''
    no_of_clusters = 2

    # Build the data set
    X, y = make_moons(n_samples=300, shuffle=True, noise=0.1,
                      random_state=10)

    # The fuzzy run comes first: its ita and centroids are handed over to the
    # possibilistic algorithm
    fuzzy_output = fuzzy_clustering.fuzzy(X, no_of_clusters)
    X_, centroids, ita, centroids_history, partition_matrix = fuzzy_output
    possibilistic_output = possibilistic_clustering.possibilistic(
        X, no_of_clusters, ita, centroids_initial=centroids)
    X, centroids, centroids_history, typicality_matrix = possibilistic_output

    # Plot the clustering
    plot_data(X, centroids, no_of_clusters, centroids_history)

    # Statistical tests of cluster validity
    internal_outcome = internal_criteria.internal_validity(
        X, no_of_clusters, possibilistic_clustering.possibilistic)
    external_outcome = external_criteria.external_validity(
        X, no_of_clusters, y, possibilistic_clustering.possibilistic)

    # Histograms of the statistics of the internal and external criteria
    initial_gamma, list_of_gammas, result = internal_outcome
    hist_internal_criteria(initial_gamma, list_of_gammas, result)
    initial_indices, list_of_indices, result_list = external_outcome
    hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def testBlobs(self):
    '''
    Clusters a make_blobs data set with the possibilistic algorithm and plots
    the clustering. The cluster validity tests are currently disabled.
    '''
    no_of_clusters = 3

    # Build the data set
    X, y = make_blobs(n_samples=100, centers=no_of_clusters, n_features=2,
                      random_state=None)

    # The fuzzy run comes first in order to get ita; its centroids initialise
    # the possibilistic algorithm
    fuzzy_output = fuzzy_clustering.fuzzy(X, no_of_clusters)
    X_, centroids, ita, centroids_history, partition_matrix = fuzzy_output
    possibilistic_output = possibilistic_clustering.possibilistic(
        X, no_of_clusters, ita, centroids_initial=centroids)
    X, centroids, centroids_history, typicality_matrix = possibilistic_output

    # Plot the clustering
    plot_data(X, centroids, no_of_clusters, centroids_history)

    # Disabled: examine Cluster Validity with statistical tests
    # initial_gamma, list_of_gammas, result = internal_criteria.internal_validity(X, no_of_clusters, possibilistic_clustering.possibilistic)
    # initial_indices, list_of_indices, result_list = external_criteria.external_validity(X, no_of_clusters, y, possibilistic_clustering.possibilistic)
    #
    # Disabled: histogram of gammas from internal criteria
    # hist_internal_criteria(initial_gamma, list_of_gammas, result)
    # hist_external_criteria(initial_indices, list_of_indices, result_list)

    plt.show()
def relative_validity_fuzzy(X, no_of_clusters):
    '''
    Constructs the framework into which successive executions of the
    algorithm take place.

    The fuzzy algorithm is run for every number of clusters in 2..10 and
    every value of q and the four relative indices of each run are stored.
    The run with q = 1.25 and no_of_clusters clusters is also plotted.

    Parameters:
        X((m x n) 2-d numpy array): a data set of m instances and n features
        no_of_clusters: the number of clusters whose clustering is plotted

    Returns:
        no_of_clusters_list: the different number of clusters tried
        values_of_q: the different values of q that were tried.
        PC, PE, XB, FS : the arrays holding the values of the four indices,
        one row per number of clusters and one column per q value
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    values_of_q = [1.25, 1.5, 2, 2.5, 3, 3.5, 5]

    # Initialize arrays to hold the indices. We use separate arrays for easier
    # modification of the code if needed. If we wanted to use one array then
    # this would be a 3 - dimensional array.
    indices_shape = (len(no_of_clusters_list), len(values_of_q))
    PC = np.zeros(indices_shape)
    PE = np.zeros(indices_shape)
    XB = np.zeros(indices_shape)
    FS = np.zeros(indices_shape)

    # Candidate coordinates of the initial centroids; they depend only on X
    min_value, max_value = np.min(X), np.max(X)
    candidate_values = np.arange(min_value, max_value, 0.1)
    no_of_features = len(X[0])

    # tqdm wraps the list itself (not the enumerate object) so that it knows
    # the total number of iterations
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        # IMPORTANT: The centroids must remain the same for every run of the
        # algorithm with the same no_of_clusters
        centroids_shape = (total_clusters, no_of_features)
        if candidate_values.size >= total_clusters * no_of_features:
            centroids_initial = np.random.choice(
                candidate_values, size=centroids_shape, replace=False)
        else:
            # Too few candidates to sample without replacement (narrow data
            # range): draw from the continuous range instead of raising
            centroids_initial = np.random.uniform(
                min_value, max_value, size=centroids_shape)

        for j, q_value in enumerate(values_of_q):  # the q value is set here
            # When X returns it has one more column that needs to be erased,
            # so the returned data are kept apart from X.
            # A copy is passed in case the algorithm updates its initial
            # centroids in place - TODO confirm whether it does
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                X, total_clusters, centroids_initial.copy(), q=q_value)

            # Calculate indices
            PC[i, j] = partition_coefficient(X, partition_matrix)
            PE[i, j] = partition_entropy(X, partition_matrix)
            XB[i, j] = Xie_Beni(X, centroids, partition_matrix)
            # NOTE(review): q is fixed to 2 here instead of q_value - confirm
            # that this is intended
            FS[i, j] = fukuyama_sugeno(X, centroids, partition_matrix, q=2)

            # Print just one clustering effort, the correct one in order to
            # compare it with the indices' signals
            if q_value == 1.25 and total_clusters == no_of_clusters:
                plot_data(X_, centroids, total_clusters, centroids_history)

    return no_of_clusters_list, values_of_q, PC, PE, XB, FS
def monte_carlo(data, no_of_clusters, algorithm, no_of_samples=100):
    '''
    Creates sampling distributions of uniformly distributed data and calls the
    appropriate functions in order to cluster each distribution and calculate
    its Gamma statistic.

    Every random data set has as many rows as data. Its i-th column is drawn
    uniformly from the [min, max] range of the i-th column of data. The last
    column of data is left out (presumably the cluster id column appended by
    the clustering algorithms - verify against the caller).

    Parameters:
        data((m x n) 2-d numpy array): a data set of m instances and n features
        no_of_clusters(integer): the number of clusters
        algorithm: the algorithm function to be used to cluster the data
        no_of_samples(integer): the number of sampling distributions to create

    Returns:
        list_of_gammas(list): the Gamma statistics of all the monte carlo
        sample distributions

    Raises:
        ValueError: if algorithm is not one of the supported clustering
        functions
    '''
    N = len(data)
    m = len(data[0]) - 1

    # The range of every feature is the same for all samples, so compute it once
    min_values = [np.min(data[:, i]) for i in range(m)]
    max_values = [np.amax(data[:, i]) for i in range(m)]

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    list_of_gammas = []
    pbar = tqdm(range(no_of_samples))
    pbar.set_description('Monte carlo sim. - internal indices')
    for _ in pbar:
        random_data = np.empty((N, m))
        for i in range(m):
            spread = max_values[i] - min_values[i]
            random_data[:, i] = spread * np.random.random(size=N) + min_values[i]

        if algorithm == fuzzy_clustering.fuzzy:
            X, centroids, ita, centroids_history, partition_matrix = algorithm(
                random_data, no_of_clusters)
        elif algorithm == possibilistic_clustering.possibilistic:
            # The possibilistic algorithm is fed with the ita and the centroids
            # of a fuzzy clustering run on the same data
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                random_data, no_of_clusters)
            X, centroids, centroids_history, typicality_matrix = algorithm(
                random_data, no_of_clusters, ita, centroids_initial=centroids)
        elif algorithm == kmeans_clustering.kmeans:
            X, centroids, centroids_history = algorithm(
                random_data, no_of_clusters)
        elif algorithm == BSAS.basic_sequential_scheme:
            X, centroids, no_of_clusters = algorithm(random_data)
        elif algorithm == TTSS.two_threshold_sequential_scheme:
            X, centroids, no_of_clusters = algorithm(random_data)
        else:
            # Without this branch X would be unbound (or stale) below
            raise ValueError(
                'Unsupported clustering algorithm: {!r}'.format(algorithm))

        list_of_gammas.append(gamma(X))

    return list_of_gammas