Example no. 1
import pytest

from gap_statistic import OptimalK


def test_bad_init_config():
    """
    A custom clustering function cannot be combined with the Rust backend.
    """
    with pytest.raises(ValueError):
        OptimalK(parallel_backend="rust",
                 clusterer=lambda x, k: print("just testing"))
Example no. 2
import numpy as np

from gap_statistic import OptimalK


def gap_optimalk(matrix):
    """Return the optimal number of clusters for matrix via the gap statistic."""
    optimalk = OptimalK(parallel_backend='joblib')
    k = optimalk(matrix, cluster_array=np.arange(1, 20))
    print('\nOptimal number of clusters is', k)

    return k
Example no. 3
def test_dunders():
    """
    Test that the implemented dunder methods don't raise errors.
    """
    from gap_statistic import OptimalK
    optimalK = OptimalK()
    optimalK.__str__()
    optimalK.__repr__()
    optimalK._repr_html_()
Example no. 4
import pytest
from sklearn.datasets import make_blobs


def test_optimalk_cluster_array_values_error():
    """
    Test ValueError when cluster_array contains values less than 1.
    """
    from gap_statistic import OptimalK

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend=None, n_jobs=-1)

    # Create data
    X, y = make_blobs(n_samples=int(1e3), n_features=2, centers=3)

    with pytest.raises(ValueError) as excinfo:
        optimalK(X, cluster_array=[0, -1, 1, 2, 3])
    assert "cluster_array contains values less than 1" in str(excinfo.value)
Example no. 5
import pytest
from sklearn.datasets import make_blobs


def test_optimalk_cluster_array_empty_error():
    """
    Test ValueError when cluster_array is empty.
    """
    from gap_statistic import OptimalK

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend=None, n_jobs=-1)

    # Create data
    X, y = make_blobs(n_samples=int(1e3), n_features=2, centers=3)

    with pytest.raises(ValueError) as excinfo:
        optimalK(X, cluster_array=[])
    assert "The supplied cluster_array has no values." in str(excinfo.value)
Example no. 6
import pytest
from sklearn.datasets import make_blobs


def test_optimalk_cluster_array_vs_data_sizes_error():
    """
    Test ValueError when cluster_array is larger than the dataset.
    """
    import numpy as np
    from gap_statistic import OptimalK

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend=None, n_jobs=-1)

    # Create data
    X, y = make_blobs(n_samples=5, n_features=2, centers=3)

    with pytest.raises(ValueError) as excinfo:
        optimalK(X, cluster_array=np.arange(1, 10))
    assert "The number of suggested clusters to try" in str(excinfo.value)
Example no. 7
    def _estimate_k(self, include_bic: bool, include_gap: bool):
        """
        Estimate the best k (number of clusters) using various methods.

        Returns
        -------
        k : int
            An average estimate from three methods: BIC, the gap statistic and
            GMeans Gaussians.
            Note: the data will be L2-normalised before proceeding.

        """
        gmeans = GMeans(random_state=None, max_depth=500)
        gmeans.fit(self.data)
        k_gaussian = len(unique(gmeans.labels_))

        if include_gap:
            # Define a custom clusterer for the Gap statistic
            def ms(X, k):
                c = MeanShift()
                c.fit(X)
                return c.cluster_centers_, c.predict(X)
            gap = OptimalK(clusterer=ms)
            k_gap = gap(X=self.data,
                        cluster_array=range(2, len(self.data)-1))

            if include_bic:
                k_bic = len(unique(self._cluster_xmeans()))
                est_k = round((k_bic + k_gap + k_gaussian) / 3)
                return (est_k,
                        [est_k, k_bic, k_gap, k_gaussian])
            else:
                est_k = round((k_gap + k_gaussian) / 2)
                return (est_k,
                        [est_k, k_gap, k_gaussian])
        # TODO: None gap stats would generate errors when averaging
        # to form k-trends
        else:
            if include_bic:
                k_bic = len(unique(self._cluster_xmeans()))
                est_k = round((k_bic + k_gaussian) / 2)
                return (est_k,
                        [est_k, k_bic, None, k_gaussian])
            else:
                est_k = round(k_gaussian)
                return (est_k,
                        [est_k, None, k_gaussian])
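The method above calls a `_cluster_xmeans` helper that is not shown. A minimal sketch of what such a helper might look like, assuming pyclustering's X-Means implementation (which grows k by splitting clusters under a BIC criterion); the body below is an assumption, not the original code:

import numpy as np
from pyclustering.cluster.center_initializer import kmeans_plusplus_initializer
from pyclustering.cluster.xmeans import xmeans

def _cluster_xmeans(self):
    """Hypothetical helper: cluster self.data with X-Means and return one
    label per data point, so len(unique(...)) gives the BIC-chosen k."""
    initial_centers = kmeans_plusplus_initializer(self.data, 2).initialize()
    instance = xmeans(self.data, initial_centers, kmax=50)
    instance.process()
    # get_clusters() yields one index list per cluster; flatten to labels
    labels = np.zeros(len(self.data), dtype=int)
    for label, indices in enumerate(instance.get_clusters()):
        labels[indices] = label
    return labels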
Example no. 8
def test_cluster(data, k_min=200, k_max=380, k_increment=100, n_references=5):
    gap, reference_inertia, ondata_inertia = compute_gap(KMeans(), data, k_min=k_min, k_max=k_max,
                                                         k_increment=k_increment, n_references=n_references)

    k_values = range(k_min, k_max + 1, k_increment)
    plt.plot(k_values, reference_inertia,
             '-o', label='reference')
    plt.plot(k_values, ondata_inertia,
             '-o', label='data')
    plt.xlabel('k')
    plt.ylabel('log(inertia)')
    plt.savefig('gap_clustering.jpg')  # save before show(), which clears the figure
    plt.show()

    # Define the OptimalK instance, but pass in our own clustering function
    optimalk = OptimalK(clusterer=special_clustering_func)
    # Use the callable instance as normal.
    n_clusters = optimalk(data, n_refs=3, cluster_array=range(k_min, k_max, k_increment))
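The `compute_gap` function called above is not shown. A minimal sketch consistent with its call signature, following the standard gap-statistic recipe of comparing log-inertia on the data against log-inertia on uniform reference samples (an assumed implementation, not the original):

import numpy as np

def compute_gap(clustering, data, k_min=1, k_max=10,
                k_increment=1, n_references=5):
    """Sketch: gap(k) = mean log-inertia on reference data - log-inertia on data."""
    reference_inertia, ondata_inertia = [], []
    for k in range(k_min, k_max + 1, k_increment):
        clustering.n_clusters = k
        # Reference inertia: average over uniform samples drawn from the
        # data's bounding box
        ref = [clustering.fit(np.random.uniform(data.min(axis=0),
                                                data.max(axis=0),
                                                data.shape)).inertia_
               for _ in range(n_references)]
        reference_inertia.append(np.mean(np.log(ref)))
        ondata_inertia.append(np.log(clustering.fit(data).inertia_))
    gap = np.array(reference_inertia) - np.array(ondata_inertia)
    return gap, reference_inertia, ondata_inertia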
Example no. 9
import numpy as np
from sklearn.datasets import make_blobs
from gap_statistic import OptimalK


def test_optimalk_rust_ext():
    """
    Test core functionality of OptimalK using the Rust backend.
    """

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend="rust", n_jobs=1)

    # Create data
    X, y = make_blobs(n_samples=int(1e3), n_features=2, centers=3)

    suggested_clusters = optimalK(X, n_refs=3, cluster_array=np.arange(1, 10))

    assert np.allclose(
        suggested_clusters, 3,
        atol=2), "Correct clusters is {}, OptimalK suggested {}".format(
            3, suggested_clusters)
Example no. 10
def test_optimalk(parallel_backend, n_jobs, n_clusters):
    """
    Test core functionality of OptimalK using all backends.
    """
    import numpy as np
    from sklearn.datasets import make_blobs
    from gap_statistic import OptimalK

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend=parallel_backend, n_jobs=n_jobs)

    # Create data
    X, y = make_blobs(n_samples=int(1e3), n_features=2, centers=3)

    suggested_clusters = optimalK(X, n_refs=3, cluster_array=np.arange(1, 10))

    assert np.allclose(suggested_clusters, n_clusters, atol=2), ('Correct clusters is {}, OptimalK suggested {}'
                                                                 .format(n_clusters, suggested_clusters))
Example no. 11
import numpy as np
from sklearn.datasets import make_blobs
from gap_statistic import OptimalK


def test_alternative_clustering_method(ClusterModel):
    """
    Test that users can supply an alternative clustering method via dependency injection.
    """
    def clusterer(X: np.ndarray, k: int, another_test_arg):
        """
        Function to wrap a sklearn model as a clusterer for OptimalK.
        The first two arguments are always the data matrix and k; any extra
        keyword arguments can be supplied through clusterer_kwargs.
        """
        m = ClusterModel()
        m.fit(X)
        assert another_test_arg == "test"
        return m.cluster_centers_, m.predict(X)

    optimalk = OptimalK(
        n_jobs=-1,
        parallel_backend="joblib",
        clusterer=clusterer,
        clusterer_kwargs={"another_test_arg": "test"},
    )
    X, y = make_blobs(n_samples=50, n_features=2, centers=3)
    n_clusters = optimalk(X, n_refs=3, cluster_array=np.arange(1, 5))
    assert isinstance(n_clusters, int)
Example no. 12
"""Calculate number of clusters by use of the 
Gap statistic. Uses: https://github.com/milesgranger/gap_statistic
and based on their Example.ipynb.
"""

import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from gap_statistic import OptimalK
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Initialise OptimalK class
#optimalK = OptimalK(parallel_backend='rust')
optimalK = OptimalK()
print(optimalK)  # display the OptimalK configuration

# Make some test data
#X, y = make_blobs(n_samples=int(1e5), n_features=2, centers=3, random_state=25)
#print('Data shape: ', X.shape)
#print(X, type(X))
#X = np.array([[100., 1.], [200.,1.],[220.,1.],[230.,1.], [500.,1.], [600.,1.]])
X = np.array([[100.], [200.], [220.], [230.], [580.], [600.]])
#X = np.array([[100.],[200.],[300.],[400.], [500.], [600.]])
#X = np.array([[100.],[180.],[300.],[410.], [500.], [610.]])
print(X, type(X))
# Call OptimalK to determine best number of clusters
print('Calculating optimal number of clusters')
n_clusters = optimalK(X, cluster_array=np.arange(1, 6), n_refs=100)
print('Optimal clusters: ', n_clusters)
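The script above only prints the chosen k. The per-k gap values behind that decision are exposed on the instance as `optimalK.gap_df` (a DataFrame with `n_clusters` and `gap_value` columns, as Example no. 15 also uses); a short follow-up might be:

# Inspect and plot the gap values computed by the call above
print(optimalK.gap_df[['n_clusters', 'gap_value']])
plt.plot(optimalK.gap_df.n_clusters, optimalK.gap_df.gap_value, linewidth=3)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.show()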
Example no. 13
# -*- coding: utf-8 -*-
"""
Test performance of the gap statistic on normally distributed data.
@author: ysirotin
"""

import numpy as np
import scipy as sc
from gap_statistic import OptimalK
import matplotlib.pyplot as plt

optimalK = OptimalK(parallel_backend='rust')

a1 = 0
mu1 = 0.0
sig1 = 0.1

a2 = 0
mu2 = 1.0
sig2 = 0.1

a3 = 0
mu3 = 2.0
sig3 = 0.1

N = 10000

# three bumps
x = np.linspace(-2, 3, 100)

fig, ax = plt.subplots(2, 2)
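The snippet breaks off before any data are generated or plotted. A plausible continuation, assuming the intent is to sample the three Gaussian components and let OptimalK recover k = 3 (hypothetical, not from the original):

# Hypothetical continuation: sample the three components and estimate k
samples = np.concatenate([
    np.random.normal(mu1, sig1, N),
    np.random.normal(mu2, sig2, N),
    np.random.normal(mu3, sig3, N),
]).reshape(-1, 1)
n_clusters = optimalK(samples, cluster_array=np.arange(1, 6))
print('Optimal clusters:', n_clusters)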
Example no. 14
import numpy as np
from sklearn import cluster, metrics
from kneed import KneeLocator
from gap_statistic import OptimalK


def kmeans_find_num_clusters(X, method='elbow', n_clust_min=2, n_clust_max=20, inc=1):

    if method in ['elbow', 'silhouette', 'pred_strength']:
        # For the silhouette coefficient method, minimum number of clusters must be 2:
        if method == 'silhouette':
            n_clust_min = max(n_clust_min, 2)

        # Initialize lists for different parameters:
        results_list = []

        # Create train and test sets for the prediction strength:
        if method == 'pred_strength':
            np.random.seed(42)
            msk = np.random.rand(X.shape[0]) < 0.8
            X_train, X_test = X[msk, :], X[~msk, :]

        for jj in range(n_clust_min, n_clust_max+1, inc):
            # Run k-Means:
            model = cluster.KMeans(n_clusters=jj, random_state=42, verbose=0)
            model.fit(X)

            if method == 'elbow':
                # Save the inertia statistic from the clustering algorithm:
                results_list.append(model.inertia_)

            elif method == 'silhouette':
                # Calculate and save the Silhouette score for the current clustering:
                silh_coef = metrics.silhouette_score(X, model.labels_, metric='euclidean')
                results_list.append(silh_coef)

            elif method == 'pred_strength':
                # Calculate prediction strength:
                model_train = cluster.KMeans(n_clusters=jj, random_state=42).fit(X_train)
                model_test = cluster.KMeans(n_clusters=jj, random_state=42).fit(X_test)
                pred_str = get_prediction_strength(jj, model_train.cluster_centers_, X_test, model_test.labels_)
                results_list.append(pred_str)

        if method == 'elbow':
            # Use elbow of inertia curve as initial guess for optimal cluster number:
            num_clusters = np.arange(n_clust_min, n_clust_max + 1, inc)
            # sec_derivative = np.zeros(len(results_list))
            # for ii in range(1, len(results_list) - 1):
            #     sec_derivative[ii] = results_list[ii+1] + results_list[ii-1] - 2 * results_list[ii]
            # best_clust_num = num_clusters[1 + np.argmax(sec_derivative[1:-1])]
            # print('Best cluster number (by inertia - OLD): {}'.format(best_clust_num))

            kneedle = KneeLocator(num_clusters, results_list, S=1.0, curve="convex", direction="decreasing")
            # print('Knee / Elbow:', round(kneedle.knee, 2), round(kneedle.elbow, 2))
            best_clust_num = int(round(kneedle.elbow))
            # print('Best cluster number (by inertia): {}'.format(best_clust_num))
        
        elif method == 'silhouette':
            best_clust_num = np.nanargmax(np.array(results_list)) + n_clust_min

        elif method == 'pred_strength':
            xx = np.where(np.array(results_list) > 0.8)[0]
            best_clust_num = xx[-1] + n_clust_min

    elif method == 'gap_stat':
        optimalK = OptimalK()
        best_clust_num = optimalK(X, cluster_array=np.arange(n_clust_min, n_clust_max+1, inc))
        results_list = optimalK.gap_df["gap_value"].to_list()

    print('Best cluster number (by {}): {}'.format(method, best_clust_num))

    return results_list, best_clust_num
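The `pred_strength` branch relies on a `get_prediction_strength` helper that is not shown. A minimal sketch following Tibshirani and Walther's definition of prediction strength (an assumed implementation matching the call signature above):

import numpy as np

def get_prediction_strength(k, train_centers, x_test, test_labels):
    """Hypothetical helper: for each test cluster, the fraction of point
    pairs that the training centroids also place in a common cluster;
    prediction strength is the minimum over test clusters."""
    # Assign each test point to its nearest training centroid
    dists = np.linalg.norm(x_test[:, None, :] - train_centers[None, :, :], axis=2)
    train_labels = np.argmin(dists, axis=1)
    strengths = []
    for c in range(k):
        idx = np.where(test_labels == c)[0]
        n_c = len(idx)
        if n_c <= 1:
            continue  # singleton clusters contribute no pairs
        same = train_labels[idx][:, None] == train_labels[idx][None, :]
        strengths.append((same.sum() - n_c) / (n_c * (n_c - 1)))
    return min(strengths) if strengths else 0.0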
Example no. 15
# CH-Index
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans

k = [2, 3, 4, 5, 6, 7, 8]
scores = []

for i in k:
    y_pred = KMeans(n_clusters=i, max_iter=1000,
                    random_state=43).fit_predict(X)
    score = metrics.calinski_harabasz_score(X, y_pred)
    scores.append(score)
    print(score)

plt.plot(k, scores, 'o-')
plt.title('CALINSKI-HARABASZ')
plt.show()

# Gap Statistic
# https://github.com/milesgranger/gap_statistic/blob/master/Example.ipynb
from gap_statistic import OptimalK
optimalK = OptimalK(parallel_backend=None)
n_clusters = optimalK(X, cluster_array=np.arange(1, 10))

plt.plot(optimalK.gap_df.n_clusters, optimalK.gap_df.gap_value, linewidth=3)
plt.scatter(
    optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].n_clusters,
    optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].gap_value)

plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
Example no. 16
sel = sqlalchemy.select([Customers])\
    .where(Customers.id == customer_id)
with Insert.engine.begin() as connection:
    res = connection.execute(sel).fetchone()
    correct_k = res.correct_k

#%%

# Dimensionality reduction
X_reduced_mds = UnsupervisedCluster._dimensionality_reduction(X,
                                                              method='MDS',
                                                              n_components=2)
_max_clusters = UnsupervisedCluster._get_max_nc(X_reduced_mds)

# Optimalk clustering
optimalK = OptimalK(parallel_backend='multiprocessing')
optimalk_result_MDS = optimalK(X_reduced_mds,
                               cluster_array=np.arange(1, _max_clusters, 1))
optimalk_gap_values_MDS = optimalK.gap_df
optimalk_result_X = optimalK(X.astype(np.float32),
                             cluster_array=np.arange(1, _max_clusters, 1))
optimalk_gap_values_X = optimalK.gap_df

# Optimalk2 clustering
optimalK2 = OptimalKCluster()
optimalk_result_MDS2, optimalk_gap_values_MDS2 = optimalK2.optimalK(
    X_reduced_mds, nrefs=5, max_clusters=_max_clusters)
optimalk_result_X2, optimalk_gap_values_X2 = optimalK2.optimalK(
    X, nrefs=5, max_clusters=_max_clusters)

#%%
Example no. 17
#%% Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from gap_statistic import OptimalK
#%% Importing the dataset
dataset = pd.read_csv('clustering/pcs.csv', index_col=0)
X = dataset.values
names = dataset.index
#%% Using the elbow method to find the optimal number of clusters
wcss = []
for i in range(1, 20):
    print(i)
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=14)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)
plt.plot(range(1, 20), wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
#%% Gap Statistic
optimalK = OptimalK(n_jobs=4, parallel_backend='joblib')
n_clusters = optimalK(X, cluster_array=np.arange(1, 50))
test = optimalK.gap_df
optimalK.plot_results()
#%% Training the K-Means model on the dataset
best_model = []
best_wcss = 160000
Example no. 18
def gap_stat(data, cluster_nums):
    from gap_statistic import OptimalK

    optimalK = OptimalK()
    return optimalK(data, cluster_array=cluster_nums)
Example no. 19
    dist = DistanceMetric.get_metric(metric)
    print("MDS Metric: {}".format(metric))

    for i in range(nDifferentDataSet):

        data = generateOneClusterData(DEFAULT_NUMBER_OF_FEATURES,
                                      DEFAULT_NUMBER_OF_RECORDS_PER_CLASS,
                                      DEFAULT_FEATURE_MEAN_RANGE,
                                      i,
                                      distribution="normal")
        precomputedMetricData = dist.pairwise(data)

        mds = MDS(n_components=8, n_jobs=-1, dissimilarity="precomputed")
        mdsData = mds.fit_transform(precomputedMetricData)

        optimalK = OptimalK(parallel_backend='joblib', n_jobs=-1)
        clusterCount = optimalK(mdsData,
                                n_refs=3,
                                cluster_array=np.arange(1, 10))
        clusterCounts[i] = clusterCount
        stress[i, j] = mds.stress_

    meanClusterCount[j] = np.mean(clusterCounts)
    stdClusterCount[j] = np.std(clusterCounts)

    meanStress[j] = np.mean(stress[:, j])
    stdStress[j] = np.std(stress[:, j])

saveDir = os.path.join("data", "MDS-stressPerMetric.npy")
np.save(saveDir, stress)
Example no. 20
def cluster_optimal_number(self, matrix):
    optimalk = OptimalK(parallel_backend='joblib')
    k = optimalk(matrix, cluster_array=np.arange(1, 30))
    print('\nOptimal number of clusters is:', k)
    self.optimal_cluster_nb = k
    return self.optimal_cluster_nb