def monte_carlo(data, no_of_clusters, algorithm, no_of_samples=100):
    ''' Creates no_of_samples (100 by default) data sets of uniformly distributed data and calls the
        algorithm passed as argument in order to cluster each of them and calculate its Gamma statistic.

        Every random data set has as many rows as data and one column less than data: the last
        column of data is not sampled (presumably it holds the cluster ids appended by a previous
        clustering run - TODO confirm against the callers). Each sampled column is uniform between
        the minimum and the maximum of the corresponding column of data.

    Parameters:
        data(2-d numpy array): N rows; all the columns but the last one are treated as features
        no_of_clusters(integer): the number of clusters, passed to the algorithms that take it
        algorithm: the algorithm function to be used to cluster the data
        no_of_samples(integer): the number of random data sets whose Gamma statistic is collected

    Returns:
        list_of_gammas(list): the Gamma statistics of all the monte carlo sample distributions

    Raises:
        ValueError: if algorithm is not one of the clustering functions handled below

    '''
    N = len(data)
    m = len(data[0]) - 1

    # The value range of every feature is the same for all the samples, so it is computed only once
    min_values = np.min(data[:, :m], axis=0)
    value_ranges = np.amax(data[:, :m], axis=0) - min_values

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    list_of_gammas = []
    j = 0
    # A while loop is used (instead of a for loop) because a run is repeated, without being
    # counted, whenever a sequential scheme returns no clustering (X is None).
    while j < no_of_samples:
        random_data = np.empty((N, m))
        for i in range(m):
            random_data[:, i] = value_ranges[i] * np.random.random(
                size=N) + min_values[i]

        if algorithm == fuzzy_clustering.fuzzy:
            X, _, _, _, _ = algorithm(random_data, no_of_clusters)
        elif algorithm == possibilistic_clustering.possibilistic:
            # The possibilistic algorithm is seeded with the outcome (centroids, ita) of a fuzzy run
            _, centroids, ita, _, _ = fuzzy_clustering.fuzzy(
                random_data, no_of_clusters)
            X, _, _, _ = algorithm(random_data,
                                   no_of_clusters,
                                   ita,
                                   centroids_initial=centroids)
        elif algorithm == kmeans_clustering.kmeans:
            X, _, _ = algorithm(random_data, no_of_clusters)
        elif (algorithm == BSAS.basic_sequential_scheme
              or algorithm == TTSS.two_threshold_sequential_scheme):
            # These schemes report their own number of clusters; it is discarded on purpose so that
            # the no_of_clusters argument is not overwritten.
            X, _, _ = algorithm(random_data)
            if X is None:
                continue
        elif (algorithm == MST.minimum_spanning_tree
              or algorithm == DTA.minimum_spanning_tree_variation):
            X, _ = algorithm(random_data)
        else:
            raise ValueError(
                'monte_carlo does not support the given clustering algorithm: {!r}'
                .format(algorithm))

        list_of_gammas.append(gamma(X))
        print(j)  # progress output: index of the sample that was just processed
        j += 1

    return list_of_gammas
Example #2
0
    def testMoons(self):
        ''' Clusters a make_moons data set with the fuzzy algorithm, plots the clustering and
            shows the histograms produced by the internal and external validity tests.
        '''
        clusters_number_to_execute = 2

        # Data set and the labels that make_moons returns with it
        samples, labels = make_moons(n_samples=300,
                                     shuffle=True,
                                     noise=0.05,
                                     random_state=10)

        # Fuzzy clustering of the samples
        fuzzy_output = fuzzy_clustering.fuzzy(
            samples, no_of_clusters=clusters_number_to_execute)
        samples, centroids, ita, centroids_history, partition_matrix = fuzzy_output

        # Plot of the clustered data
        plot_data(samples, clusters_number_to_execute, centroids,
                  centroids_history)

        # Statistical tests of cluster validity: internal first, then external (uses the labels)
        internal_output = internal_criteria.internal_validity(
            samples, clusters_number_to_execute, fuzzy_clustering.fuzzy)
        initial_gamma, list_of_gammas, result = internal_output
        external_output = external_criteria.external_validity(
            samples, clusters_number_to_execute, labels,
            fuzzy_clustering.fuzzy)
        initial_indices, list_of_indices, result_list = external_output

        # Histograms for both kinds of criteria
        hist_internal_criteria(initial_gamma, list_of_gammas, result)
        hist_external_criteria(initial_indices, list_of_indices, result_list)

        plt.show()
Example #3
0
def relative_validity_possibilistic(X):
    ''' Defines the values of the number of clusters and of the q parameter to try. Then conducts
        successive executions of the possibilistic algorithm (each one seeded by a fuzzy run) for
        every combination of those values and calculates the relative indices of every execution.

        Parameters:
            X(numpy array): a data set whose last axis holds the m features; all the leading axes
                            are flattened, so that X is handled as an (N x m) array

        Returns:
            no_of_clusters_list: the different values of the clusters number (2 to 10)
            values_of_q: the different values of the q parameter
            PC, PE, XB, FS: the arrays holding the values of the relative indices, each one of
                            shape (len(no_of_clusters_list), len(values_of_q))
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    values_of_q = [1.25, 1.5, 1.75]

    # The initial value 1 keeps the product defined even if X has no leading axes (1-d input)
    N = reduce(lambda x, y: x * y, X.shape[:-1], 1)
    m = X.shape[-1]

    # Conversion to 2-D array
    X = X.reshape(N, m)

    # Initialize arrays to hold the indices. We use separate arrays for easier modification of the code if needed.
    # If we wanted to use one array then this would be a 3 - dimensional array.
    indices_shape = (len(no_of_clusters_list), len(values_of_q))
    PC = np.zeros(indices_shape)
    PE = np.zeros(indices_shape)
    XB = np.zeros(indices_shape)
    FS = np.zeros(indices_shape)

    # Candidate coordinates of the initial centroids: a grid with step 0.1 over the value range
    # of the whole data set. It does not depend on the number of clusters, so it is built once.
    lowest_value, highest_value = np.min(X), np.max(X)
    candidate_values = np.arange(lowest_value, highest_value, 0.1)

    # tqdm wraps the list (not the enumerate object) so that it knows the total number of steps
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        # IMPORTANT: The centroids must remain the same for every run of the algorithm with the same no_of_clusters
        centroids_shape = (total_clusters, m)
        if candidate_values.size >= total_clusters * m:
            centroids_initial = np.random.choice(candidate_values,
                                                 size=centroids_shape,
                                                 replace=False)
        else:
            # Sampling without replacement needs at least as many candidates as coordinates.
            # When the value range is too narrow for that, draw continuous values instead.
            centroids_initial = np.random.uniform(lowest_value,
                                                  highest_value,
                                                  size=centroids_shape)

        for j, q_value in enumerate(values_of_q):  # the q parameter is set here

            # According to the original note, the data set returned by the algorithms (X_) has one
            # more column than X; it is not used and the indices are calculated on X.
            # A copy of the initial centroids is passed, so that a possible in-place update inside
            # the algorithm cannot change the starting point of the next runs.
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                X, total_clusters, centroids_initial.copy(), q=q_value)
            X_, centroids, centroids_history, typicality_matrix = possibilistic_clustering.possibilistic(
                X, total_clusters, ita, centroids_initial=centroids, q=q_value)

            # Calculate indices
            # In order to calculate indices we pass them the normalized typicality matrix, see Yang, Wu, Unsupervised possibilistic learning
            # (every row is divided by its sum)
            typicality_matrix_norm = typicality_matrix / np.sum(
                typicality_matrix, axis=1).reshape(len(X), 1)

            PC[i, j] = partition_coefficient(X, typicality_matrix_norm)
            PE[i, j] = partition_entropy(X, typicality_matrix_norm)
            XB[i, j] = Xie_Beni(X, centroids, typicality_matrix_norm)
            # NOTE(review): q is fixed to 2 here and does not follow q_value - confirm it is intended
            FS[i, j] = fukuyama_sugeno(X,
                                       centroids,
                                       typicality_matrix_norm,
                                       q=2)

    return no_of_clusters_list, values_of_q, PC, PE, XB, FS
Example #4
0
    def testBlobs(self):
        ''' Clusters a make_blobs data set with the fuzzy algorithm, plots the clustering and
            shows the histograms produced by the internal and external validity tests.
        '''
        no_of_clusters = 4
        clusters_number_to_execute = 4

        # Data set (generated with no_of_clusters centers) and the labels returned with it
        samples, labels = make_blobs(n_samples=500,
                                     centers=no_of_clusters,
                                     n_features=2,
                                     random_state=46)

        # Fuzzy clustering of the samples
        fuzzy_output = fuzzy_clustering.fuzzy(
            samples, no_of_clusters=clusters_number_to_execute)
        samples, centroids, ita, centroids_history, partition_matrix = fuzzy_output

        # Plot of the clustered data
        plot_data(samples, clusters_number_to_execute, centroids,
                  centroids_history)

        # Statistical tests of cluster validity: internal first, then external (uses the labels)
        internal_output = internal_criteria.internal_validity(
            samples, clusters_number_to_execute, fuzzy_clustering.fuzzy)
        initial_gamma, list_of_gammas, result = internal_output
        external_output = external_criteria.external_validity(
            samples, clusters_number_to_execute, labels,
            fuzzy_clustering.fuzzy)
        initial_indices, list_of_indices, result_list = external_output

        # Histograms for both kinds of criteria
        hist_internal_criteria(initial_gamma, list_of_gammas, result)
        hist_external_criteria(initial_indices, list_of_indices, result_list)

        plt.show()
Example #5
0
    def testImageSegmentation(self):
        ''' Segments the image 181091.jpg with the fuzzy algorithm, merges the resulting clusters,
            prints the Rand index against the external segmentation file and draws the result.
        '''
        # NOTE(review): if ndimage is scipy.ndimage, imread was deprecated in SciPy 1.0 and removed
        # in SciPy 1.2, so this line needs an older SciPy or another image reader.
        pixels = ndimage.imread('..//..//images//181091.jpg')
        pixels = pixels.astype(np.int32, copy=False)

        # Algorithm execution.
        clusters_number_to_execute = 28
        # The outcome of the sequential scheme is not used further down in this test
        bsas_output = BSAS.basic_sequential_scheme(pixels)
        clustered_data, centroids, total_clusters = bsas_output
        fuzzy_output = fuzzy_clustering.fuzzy(
            pixels, no_of_clusters=clusters_number_to_execute)
        segmented, centroids, ita, centroids_history, partition_matrix = fuzzy_output

        ###################################################################
        # Merging procedure

        segmented = image_segm_utility.merging_procedure(segmented, 500)

        # Rand index: similarity of the clustering to the external segmentation data
        original_image = '181091.jpg'
        seg_file = '181091.seg'
        external_info = image_segm_utility.insert_clusters(
            original_image, seg_file)
        rand_index = image_segm_utility.rand_index_calculation(
            segmented, external_info)
        print(rand_index)

        # Draw the clustered image
        draw_clustered_image(segmented, pixels.shape, rand_index)
        plt.show()
Example #6
0
    def testMoons(self):
        ''' Clusters a make_moons data set with the possibilistic algorithm (seeded by a fuzzy run),
            plots the clustering and shows the histograms of the internal and external validity tests.
        '''
        no_of_clusters = 2

        # Data set and the labels that make_moons returns with it
        samples, labels = make_moons(n_samples=300,
                                     shuffle=True,
                                     noise=0.1,
                                     random_state=10)

        # The fuzzy run supplies the centroids and the ita values the possibilistic run starts from
        fuzzy_output = fuzzy_clustering.fuzzy(samples, no_of_clusters)
        X_, centroids, ita, centroids_history, partition_matrix = fuzzy_output
        possibilistic_output = possibilistic_clustering.possibilistic(
            samples, no_of_clusters, ita, centroids_initial=centroids)
        samples, centroids, centroids_history, typicality_matrix = possibilistic_output

        # Plot of the clustered data
        plot_data(samples, centroids, no_of_clusters, centroids_history)

        # Statistical tests of cluster validity: internal first, then external (uses the labels)
        internal_output = internal_criteria.internal_validity(
            samples, no_of_clusters, possibilistic_clustering.possibilistic)
        initial_gamma, list_of_gammas, result = internal_output
        external_output = external_criteria.external_validity(
            samples, no_of_clusters, labels,
            possibilistic_clustering.possibilistic)
        initial_indices, list_of_indices, result_list = external_output

        # Histograms for both kinds of criteria
        hist_internal_criteria(initial_gamma, list_of_gammas, result)
        hist_external_criteria(initial_indices, list_of_indices, result_list)

        plt.show()
Example #7
0
    def testBlobs(self):
        ''' Clusters a make_blobs data set with the possibilistic algorithm (seeded by a fuzzy run)
            and plots the clustering. The validity tests of this case are currently disabled.
        '''
        no_of_clusters = 3

        # Data set; random_state=None means that a different data set is generated on every run
        samples, labels = make_blobs(n_samples=100,
                                     centers=no_of_clusters,
                                     n_features=2,
                                     random_state=None)

        # First run fuzzy clustering to get ita (and the centroids the possibilistic run starts from)
        fuzzy_output = fuzzy_clustering.fuzzy(samples, no_of_clusters)
        X_, centroids, ita, centroids_history, partition_matrix = fuzzy_output
        possibilistic_output = possibilistic_clustering.possibilistic(
            samples, no_of_clusters, ita, centroids_initial=centroids)
        samples, centroids, centroids_history, typicality_matrix = possibilistic_output

        # Plot of the clustered data
        plot_data(samples, centroids, no_of_clusters, centroids_history)

        # Disabled validity tests (X and y are the clustered data and the labels of this test):
        #
        # # Examine Cluster Validity with statistical tests
        # initial_gamma, list_of_gammas, result = internal_criteria.internal_validity(X, no_of_clusters, possibilistic_clustering.possibilistic)
        # initial_indices, list_of_indices, result_list = external_criteria.external_validity(X, no_of_clusters, y,  possibilistic_clustering.possibilistic)
        #
        # # Histogram of gammas from internal criteria
        # hist_internal_criteria(initial_gamma, list_of_gammas, result)
        # hist_external_criteria(initial_indices, list_of_indices, result_list)

        plt.show()
Example #8
0
def relative_validity_fuzzy(X, no_of_clusters):
    ''' Constructs the framework into which successive executions of the
        fuzzy algorithm take place: one execution for every combination of
        number of clusters and q value, followed by the calculation of the
        relative indices of that execution.

        Parameters:
        X((m x n) 2-d numpy array): a data set of m instances and n features
        no_of_clusters: the number of clusters whose clustering (for q = 1.25) is plotted

        Returns:
        no_of_clusters_list: the different number of clusters tried (2 to 10)
        values_of_q: the different values of q that were tried.
        PC, PE, XB, FS : the arrays holding the values of the four indices, each one of
                         shape (len(no_of_clusters_list), len(values_of_q))
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    values_of_q = [1.25, 1.5, 2, 2.5, 3, 3.5, 5]

    # Initialize arrays to hold the indices. We use separate arrays for easier modification of the code if needed.
    # If we wanted to use one array then this would be a 3 - dimensional array.
    indices_shape = (len(no_of_clusters_list), len(values_of_q))
    PC = np.zeros(indices_shape)
    PE = np.zeros(indices_shape)
    XB = np.zeros(indices_shape)
    FS = np.zeros(indices_shape)

    # Candidate coordinates of the initial centroids: a grid with step 0.1 over the value range
    # of the whole data set. It does not depend on the number of clusters, so it is built once.
    no_of_features = len(X[0])
    lowest_value, highest_value = np.min(X), np.max(X)
    candidate_values = np.arange(lowest_value, highest_value, 0.1)

    # tqdm wraps the list (not the enumerate object) so that it knows the total number of steps
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):
        # IMPORTANT: The centroids must remain the same for every run of the algorithm with the same no_of_clusters
        centroids_shape = (total_clusters, no_of_features)
        if candidate_values.size >= total_clusters * no_of_features:
            centroids_initial = np.random.choice(candidate_values,
                                                 size=centroids_shape,
                                                 replace=False)
        else:
            # Sampling without replacement needs at least as many candidates as coordinates.
            # When the value range is too narrow for that, draw continuous values instead.
            centroids_initial = np.random.uniform(lowest_value,
                                                  highest_value,
                                                  size=centroids_shape)

        for j, q_value in enumerate(values_of_q):  # the q parameter is set here

            # According to the original note, the data set returned by the algorithm (X_) has one
            # more column than X; the indices are calculated on X and X_ is only used for plotting.
            # A copy of the initial centroids is passed, so that a possible in-place update inside
            # the algorithm cannot change the starting point of the next runs.
            X_, centroids, ita, centroids_history, partition_matrix = fuzzy_clustering.fuzzy(
                X, total_clusters, centroids_initial.copy(), q=q_value)

            # Calculate indices
            PC[i, j] = partition_coefficient(X, partition_matrix)
            PE[i, j] = partition_entropy(X, partition_matrix)
            XB[i, j] = Xie_Beni(X, centroids, partition_matrix)
            # NOTE(review): q is fixed to 2 here and does not follow q_value - confirm it is intended
            FS[i, j] = fukuyama_sugeno(X, centroids, partition_matrix, q=2)

            # Print just one clustering effort, the correct one in order to compare it with the indices' signals
            if q_value == 1.25 and total_clusters == no_of_clusters:
                plot_data(X_, centroids, total_clusters, centroids_history)

    return no_of_clusters_list, values_of_q, PC, PE, XB, FS
Example #9
0
def monte_carlo(data, no_of_clusters, algorithm, no_of_samples=100):
    ''' Creates no_of_samples (100 by default) data sets of uniformly distributed data and calls the
        appropriate functions in order to cluster each of them and calculate its Gamma statistic.

        Every random data set has as many rows as data and one column less than data: the last
        column of data is not sampled (presumably it holds the cluster ids appended by a previous
        clustering run - TODO confirm against the callers). Each sampled column is uniform between
        the minimum and the maximum of the corresponding column of data.

    Parameters:
        data(2-d numpy array): N rows; all the columns but the last one are treated as features
        no_of_clusters(integer): the number of clusters, passed to the algorithms that take it
        algorithm: the algorithm function to be used to cluster the data
        no_of_samples(integer): the number of random data sets to create and cluster

    Returns:
        list_of_gammas(list): the Gamma statistics of all the monte carlo sample distributions

    Raises:
        ValueError: if algorithm is not one of the clustering functions handled below

    '''
    N = len(data)
    m = len(data[0]) - 1

    # The value range of every feature is the same for all the samples, so it is computed only once
    min_values = np.min(data[:, :m], axis=0)
    value_ranges = np.amax(data[:, :m], axis=0) - min_values

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    list_of_gammas = []
    pbar = tqdm(range(no_of_samples))
    pbar.set_description('Monte carlo sim. - internal indices')
    for _sample in pbar:
        random_data = np.empty((N, m))
        for i in range(m):
            random_data[:, i] = value_ranges[i] * np.random.random(
                size=N) + min_values[i]

        if algorithm == fuzzy_clustering.fuzzy:
            X, _, _, _, _ = algorithm(random_data, no_of_clusters)
        elif algorithm == possibilistic_clustering.possibilistic:
            # The possibilistic algorithm is seeded with the outcome (centroids, ita) of a fuzzy run
            _, centroids, ita, _, _ = fuzzy_clustering.fuzzy(
                random_data, no_of_clusters)
            X, _, _, _ = algorithm(random_data,
                                   no_of_clusters,
                                   ita,
                                   centroids_initial=centroids)
        elif algorithm == kmeans_clustering.kmeans:
            X, _, _ = algorithm(random_data, no_of_clusters)
        elif (algorithm == BSAS.basic_sequential_scheme
              or algorithm == TTSS.two_threshold_sequential_scheme):
            # These schemes report their own number of clusters; it is discarded on purpose so that
            # the no_of_clusters argument is not overwritten.
            X, _, _ = algorithm(random_data)
        else:
            raise ValueError(
                'monte_carlo does not support the given clustering algorithm: {!r}'
                .format(algorithm))

        list_of_gammas.append(gamma(X))

    return list_of_gammas