Beispiel #1
0
def relative_validity_hard_large_data(X):
    ''' Runs kmeans once for every cluster count from 2 to 10 and stores the value returned by
        Davies_Bouldin for each run.

        BSAS is executed a single time up front and its centroids seed every kmeans run, so the
        requested number of clusters is the only parameter that changes between runs.

        Parameters:
            X(numpy array): the data set; indexed as X[rows, :], with len(X[0]) used as the number of features

        Returns:
            no_of_clusters_list: the different values of the clusters number
            DB: the array holding the Davies_Bouldin result for each of those values
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    DB = np.zeros(len(no_of_clusters_list))

    # Centroids must remain the same. The only parameter that should change is the number of clusters
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so it can read the length and show a total
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if bsas_count < total_clusters:
            # Too few BSAS centroids: keep them first and fill the rest with randomly picked data vectors
            missing = total_clusters - bsas_count
            random_indices = np.random.randint(len(X), size=missing)
            centroids = np.zeros((total_clusters, len(X[0])))
            centroids[:bsas_count, :] = centroids_BSAS
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            # Too many BSAS centroids: keep only the first total_clusters of them
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, _ = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DB[i] = Davies_Bouldin(X_, centroids)

    return no_of_clusters_list, DB
Beispiel #2
0
def gap_index(X, no_of_clusters):
    ''' Calculates the Gap index of a clustered dataset by comparing it against 100 uniformly
        random reference datasets clustered with kmeans.

        Parameters:
            X(numpy array): the clustered data set; the last column is excluded from the features
                            (presumably it holds the cluster id -- confirm against the caller)
            no_of_clusters: the number of clusters passed to kmeans for every reference dataset

        Returns:
            The average of the reference values minus the value of X, both computed
            by _gap_index_calculation
    '''
    no_of_simulations = 100

    log_W = _gap_index_calculation(X)
    # Create an array to hold the logW values of the monte carlo simulations
    log_W_sample = np.zeros(no_of_simulations)

    N = len(X)
    m = len(X[0]) - 1

    # The per-feature value ranges do not change between simulations, so compute them once
    min_values = [np.min(X[:, j]) for j in range(m)]
    max_values = [np.amax(X[:, j]) for j in range(m)]

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    for i in range(no_of_simulations):
        random_data = np.empty((N, m))

        # The feature index must be distinct from the simulation index i; sharing the name made
        # every simulation write its result into the same slot of log_W_sample
        for j in range(m):
            spread = max_values[j] - min_values[j]
            random_data[:, [j]] = spread * np.random.random(
                size=(N, 1)) + min_values[j]

        X_, _, _ = kmeans_clustering.kmeans(random_data, no_of_clusters)
        log_W_sample[i] = _gap_index_calculation(X_)

    Gap = np.average(log_W_sample) - log_W

    return Gap
    def testMoons(self):
        ''' Clusters a two-moons dataset with kmeans, plots the result and shows the
            histograms of the internal and external validity criteria.
        '''
        cluster_count = 2

        # Dataset: 300 noisy samples with a fixed seed
        samples, labels = make_moons(n_samples=300,
                                     shuffle=True,
                                     noise=0.1,
                                     random_state=10)

        # Clustering
        clustered, centers, centers_trace = kmeans_clustering.kmeans(
            samples, cluster_count)

        # Scatter plot of the clustering outcome
        plot_data(clustered, cluster_count, centers, centers_trace)

        # Statistical validity tests: internal first, then external against the labels
        gamma_0, gammas, internal_outcome = internal_criteria.internal_validity(
            clustered, cluster_count, kmeans_clustering.kmeans)
        indices_0, indices, external_outcomes = external_criteria.external_validity(
            clustered, cluster_count, labels, kmeans_clustering.kmeans)

        # Histograms for both families of criteria
        hist_internal_criteria(gamma_0, gammas, internal_outcome)
        hist_external_criteria(indices_0, indices, external_outcomes)

        plt.show()
    def testBlobs(self):
        ''' Clusters a four-blob dataset with kmeans seeded by BSAS centroids, plots the
            result and shows the histograms of the internal and external validity criteria.
        '''
        cluster_count = 4

        # Dataset: 500 two-dimensional samples with a fixed seed
        samples, labels = make_blobs(n_samples=500,
                                     centers=cluster_count,
                                     n_features=2,
                                     random_state=185)

        # A sequential pass supplies the initial centroids for kmeans
        _, seed_centers, _ = BSAS.basic_sequential_scheme(samples)
        clustered, centers, centers_trace = kmeans_clustering.kmeans(
            samples, cluster_count, centroids_initial=seed_centers)

        # Scatter plot of the clustering outcome
        plot_data(clustered, cluster_count, centers, centers_trace)

        # Statistical validity tests: internal first, then external against the labels
        gamma_0, gammas, internal_outcome = internal_criteria.internal_validity(
            clustered, cluster_count, kmeans_clustering.kmeans)
        indices_0, indices, external_outcomes = external_criteria.external_validity(
            clustered, cluster_count, labels, kmeans_clustering.kmeans)

        # Histograms for both families of criteria
        hist_internal_criteria(gamma_0, gammas, internal_outcome)
        hist_external_criteria(indices_0, indices, external_outcomes)

        plt.show()
    def testImageSegmentation(self):
        ''' Segments an image with BSAS-seeded kmeans, merges the clusters, prints the Rand index
            against the external segmentation file and draws the clustered image.
        '''
        # NOTE(review): ndimage.imread is no longer shipped by recent SciPy releases -- confirm
        # the SciPy version this suite is pinned to before relying on it
        pixels = ndimage.imread('..//..//images//181091.jpg').astype(
            np.int32, copy=False)

        # BSAS goes first to provide centroid estimates for kmeans
        cluster_count = 3
        _, seed_centers, _ = BSAS.basic_sequential_scheme(pixels)
        segmented, _, _ = kmeans_clustering.kmeans(
            pixels,
            no_of_clusters=cluster_count,
            centroids_initial=seed_centers)

        # Merging procedure
        segmented = image_segm_utility.merging_procedure(segmented, 500)

        # Rand index: similarity between our segmentation and the external one
        reference = image_segm_utility.insert_clusters('181091.jpg',
                                                       '181091.seg')
        rand_index = image_segm_utility.rand_index_calculation(
            segmented, reference)
        print(rand_index)

        # Draw the clustered image
        draw_clustered_image(segmented, pixels.shape, rand_index)
        plt.show()
    def testTobeErased(self):
        ''' Runs kmeans with two clusters on an image and feeds the result to the merging procedure.
            Nothing is asserted or plotted.
        '''
        pixels = ndimage.imread('..//..//images//231015.jpg').astype(
            np.int32, copy=False)

        cluster_count = 2
        segmented, _, _ = kmeans_clustering.kmeans(
            pixels, no_of_clusters=cluster_count)

        # The merged result is not inspected; the call only has to complete
        image_segm_utility.merging_procedure(segmented)
Beispiel #7
0
def relative_validity_hard(X):
    ''' Defines the several values of the kmeans parameter. Then conducts successive executions of the
        algorithm by passing to it those values and calculates all the proper relative indices.

        BSAS is executed a single time up front and its centroids seed every kmeans run, so the
        requested number of clusters is the only parameter that changes between runs.

        Parameters:
            X((N x m) numpy array): a data set of N instances and m features

        Returns:
            no_of_clusters_list: the different values of the clusters number
            DI, DB, SI, GI: the arrays holding the values of the relative indices
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    runs = len(no_of_clusters_list)

    DI = np.zeros(runs)
    DB = np.zeros(runs)
    SI = np.zeros(runs)
    GI = np.zeros(runs)

    # Centroids must remain the same. The only parameter that should change is the number of clusters
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so it can read the length and show a total
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if bsas_count < total_clusters:
            # Too few BSAS centroids: keep them first and fill the rest with randomly picked data vectors
            missing = total_clusters - bsas_count
            random_indices = np.random.randint(len(X), size=missing)
            centroids = np.zeros((total_clusters, len(X[0])))
            centroids[:bsas_count, :] = centroids_BSAS
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            # Too many BSAS centroids: keep only the first total_clusters of them
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, _ = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters, kmeans_clustering.kmeans)

    return no_of_clusters_list, DI, DB, SI, GI
Beispiel #8
0
def relative_validity_hard(X, no_of_clusters):
    ''' Runs kmeans once for every cluster count from 2 to 10, calculates the relative indices of
        each run and plots the run whose cluster count equals no_of_clusters.

        BSAS is executed a single time up front and its centroids seed every kmeans run, so the
        requested number of clusters is the only parameter that changes between runs.

        Parameters:
            X(numpy array): the data set; indexed as X[rows, :], with len(X[0]) used as the number of features
            no_of_clusters: the cluster count whose clustering should be plotted

        Returns:
            no_of_clusters_list: the different values of the clusters number
            DI, DB, SI, GI: the arrays holding the values of the relative indices
    '''
    # Initialization
    no_of_clusters_list = list(range(2, 11))
    runs = len(no_of_clusters_list)

    DI = np.zeros(runs)
    DB = np.zeros(runs)
    SI = np.zeros(runs)
    GI = np.zeros(runs)

    # Centroids must remain the same. The only parameter that should change is the number of clusters
    _, centroids_BSAS, _ = BSAS.basic_sequential_scheme(X)
    bsas_count = len(centroids_BSAS)

    # tqdm wraps the list itself (not the enumerate object) so it can read the length and show a total
    for i, total_clusters in enumerate(tqdm(no_of_clusters_list)):

        if bsas_count < total_clusters:
            # Too few BSAS centroids: keep them first and fill the rest with randomly picked data vectors
            missing = total_clusters - bsas_count
            random_indices = np.random.randint(len(X), size=missing)
            centroids = np.zeros((total_clusters, len(X[0])))
            centroids[:bsas_count, :] = centroids_BSAS
            centroids[bsas_count:, :] = X[random_indices, :]
        elif bsas_count > total_clusters:
            # Truncate to the cluster count of THIS run; slicing with no_of_clusters would hand
            # kmeans a centroid array whose size does not match total_clusters
            centroids = centroids_BSAS[:total_clusters, :]
        else:
            centroids = centroids_BSAS

        X_, centroids, centroids_history = kmeans_clustering.kmeans(
            X, total_clusters, centroids_initial=centroids)

        DI[i] = Dunn_index(X_)
        DB[i] = Davies_Bouldin(X_, centroids)
        SI[i] = silhouette_index(X_)
        GI[i] = gap_index(X_, total_clusters)

        # Print just one clustering effort, the correct one in order to compare it with the indices' signals
        if total_clusters == no_of_clusters:
            # NOTE(review): other plot_data call sites pass the cluster count before the centroids --
            # confirm this argument order against plot_data's signature
            plot_data(X_, centroids, total_clusters, centroids_history)

    return no_of_clusters_list, DI, DB, SI, GI
Beispiel #9
0
def gap_index(X, no_of_clusters, algorithm):
    ''' Calculates the Gap index of a clustered dataset.

        Parameters:
        X((N x m + 1) numpy array): a clustered data set of N instances, m features and the cluster id at the last column of each vector
        no_of_clusters: the number of clusters
        algorithm: the function object representing the algorithm that called the function.
                   Only kmeans_clustering.kmeans is supported

        Returns:
        The Gap index

        Raises:
        ValueError: if algorithm is anything other than kmeans_clustering.kmeans
    '''
    # Reject unsupported algorithms up front; otherwise the reference clustering below would
    # never be produced and the loop would fail on an unbound variable
    if algorithm != kmeans_clustering.kmeans:
        raise ValueError(
            'gap_index only supports kmeans_clustering.kmeans as the algorithm')

    no_of_simulations = 100

    log_W = _gap_index_calculation(X)
    # Create an array to hold the logW values of the monte carlo simulations
    log_W_sample = np.zeros(no_of_simulations)

    N = len(X)
    m = len(X[0]) - 1

    # The per-feature value ranges do not change between simulations, so compute them once
    min_values = [np.min(X[:, j]) for j in range(m)]
    max_values = [np.amax(X[:, j]) for j in range(m)]

    # Monte Carlo simulation - create the datasets (random position hypothesis)
    for i in range(no_of_simulations):
        random_data = np.empty((N, m))

        for j in range(m):
            spread = max_values[j] - min_values[j]
            random_data[:, [j]] = spread * np.random.random(
                size=(N, 1)) + min_values[j]

        X_, _, _ = kmeans_clustering.kmeans(random_data, no_of_clusters)
        log_W_sample[i] = _gap_index_calculation(X_)

    Gap = np.average(log_W_sample) - log_W

    return Gap