def test_bad_init_config():
    """
    A user-supplied clustering function combined with the Rust backend
    must be rejected with a ValueError at construction time.
    """
    def custom_clusterer(x, k):
        # Never expected to run: construction should fail first.
        return print("just testing")

    with pytest.raises(ValueError):
        OptimalK(parallel_backend="rust", clusterer=custom_clusterer)
def gap_optimalk(matrix):
    """
    Run OptimalK (joblib backend) on `matrix`, trying k = 1..19.

    Prints the value returned by OptimalK and returns it.
    """
    candidate_ks = np.arange(1, 20)
    estimator = OptimalK(parallel_backend='joblib')
    k = estimator(matrix, cluster_array=candidate_ks)
    print('\nOptimal number of clusters is ', k)
    return k
def test_dunders():
    """
    Smoke test: the representation methods of OptimalK can be called
    without raising.
    """
    from gap_statistic import OptimalK

    instance = OptimalK()
    # Call each representation hook explicitly, in the same order as before.
    for method_name in ("__str__", "__repr__", "_repr_html_"):
        getattr(instance, method_name)()
def test_optimalk_cluster_array_values_error():
    """
    OptimalK must raise ValueError when cluster_array holds values below 1.
    """
    from gap_statistic import OptimalK

    # Estimator under test, no parallel backend.
    estimator = OptimalK(parallel_backend=None, n_jobs=-1)

    # Synthetic data: 1000 two-dimensional samples around 3 centres.
    features, _ = make_blobs(n_samples=1000, n_features=2, centers=3)

    invalid_cluster_counts = [0, -1, 1, 2, 3]
    with pytest.raises(ValueError) as excinfo:
        estimator(features, cluster_array=invalid_cluster_counts)

    expected_fragment = "cluster_array contains values less than 1"
    assert expected_fragment in str(excinfo.value)
def test_optimalk_cluster_array_empty_error():
    """
    OptimalK must raise ValueError when cluster_array is empty.
    """
    from gap_statistic import OptimalK

    # Estimator under test, no parallel backend.
    estimator = OptimalK(parallel_backend=None, n_jobs=-1)

    # Synthetic data: 1000 two-dimensional samples around 3 centres.
    features, _ = make_blobs(n_samples=1000, n_features=2, centers=3)

    with pytest.raises(ValueError) as excinfo:
        estimator(features, cluster_array=[])

    expected_fragment = "The supplied cluster_array has no values."
    assert expected_fragment in str(excinfo.value)
def test_optimalk_cluster_array_vs_data_sizes_error():
    """
    OptimalK must raise ValueError when more clusters are requested
    than there are samples in the dataset.
    """
    import numpy as np
    from gap_statistic import OptimalK

    # Estimator under test, no parallel backend.
    estimator = OptimalK(parallel_backend=None, n_jobs=-1)

    # Only 5 samples, fewer than the largest k requested below.
    features, _ = make_blobs(n_samples=5, n_features=2, centers=3)
    too_many_clusters = np.arange(1, 10)

    with pytest.raises(ValueError) as excinfo:
        estimator(features, cluster_array=too_many_clusters)

    expected_fragment = "The number of suggested clusters to try"
    assert expected_fragment in str(excinfo.value)
def _estimate_k(self, include_bic: bool, include_gap: bool):
    """
    Estimate the best k (number of clusters) by averaging several methods.

    A G-Means estimate on ``self.data`` is always computed. Depending on
    the flags it is averaged with a gap-statistic estimate (OptimalK driven
    by a MeanShift clusterer) and/or an estimate derived from
    ``self._cluster_xmeans()``.

    Note (from the original author): the data is expected to be
    L2-normalised before this is called; nothing here enforces that.

    Parameters
    ----------
    include_bic : bool
        Include the estimate derived from ``self._cluster_xmeans()``.
    include_gap : bool
        Include the gap-statistic estimate.

    Returns
    -------
    tuple
        ``(est_k, details)`` where ``est_k`` is the rounded mean of the
        estimates that were computed and ``details`` is:

        * ``[est_k, k_bic, k_gap, k_gaussian]`` when ``include_bic`` is
          true (``k_gap`` is ``None`` if the gap statistic was skipped);
        * ``[est_k, k_gap, k_gaussian]`` otherwise (same ``None`` rule).
    """
    gmeans = GMeans(random_state=None, max_depth=500)
    gmeans.fit(self.data)
    k_gaussian = len(unique(gmeans.labels_))

    k_gap = None
    if include_gap:
        # Custom clusterer for the gap statistic. MeanShift chooses its own
        # number of clusters, so the `k` supplied by OptimalK is not used.
        def ms(X, k):
            c = MeanShift()
            c.fit(X)
            return c.cluster_centers_, c.predict(X)

        gap = OptimalK(clusterer=ms)
        k_gap = gap(X=self.data, cluster_array=range(2, len(self.data) - 1))

    k_bic = len(unique(self._cluster_xmeans())) if include_bic else None

    # Average only the estimates that were actually computed. The original
    # divided the BIC-only sum of two estimates by 3, biasing it low.
    computed = [k for k in (k_bic, k_gap, k_gaussian) if k is not None]
    est_k = round(sum(computed) / len(computed))

    # TODO: None gap stats would generate errors when averaging
    # to form k-trends
    if include_bic:
        return (est_k, [est_k, k_bic, k_gap, k_gaussian])
    return (est_k, [est_k, k_gap, k_gaussian])
def test_cluster(data, k_min=200, k_max=380, k_incerement=100, n_references=5):
    """
    Plot reference vs. on-data inertia from `compute_gap`, save the plot,
    then run OptimalK with a custom clustering function.

    Parameters
    ----------
    data :
        Dataset handed to `compute_gap`.
    k_min, k_max, k_incerement :
        Bounds and step of the cluster counts; forwarded to `compute_gap`
        and used for OptimalK's `cluster_array`.
    n_references :
        Forwarded to `compute_gap`.
    """
    gap, reference_inertia, ondata_inertia = compute_gap(
        KMeans(), data,
        k_min=k_min, k_max=k_max,
        k_incerement=k_incerement,
        n_references=n_references)

    plt.plot(range(1, k_max + 1), reference_inertia, '-o', label='reference')
    plt.plot(range(1, k_max + 1), ondata_inertia, '-o', label='data')
    plt.xlabel('k')
    plt.ylabel('log(inertia)')
    # Save before showing: once show() returns the figure may already be
    # closed, in which case savefig() would write an empty image.
    plt.savefig('gap_clustering.jpg')
    plt.show()

    # Define the OptimalK instance, but pass in our own clustering function
    optimalk = OptimalK(clusterer=special_clustering_func)

    # Use the callable instance as normal.
    # NOTE(review): `X` is not a parameter of this function, so it is looked
    # up in an enclosing scope. Confirm whether `data` was intended.
    optimalk(X, n_refs=3, cluster_array=range(k_min, k_max, k_incerement))
def test_optimalk_rust_ext():
    """
    Test core functionality of OptimalK using the Rust backend.
    """
    expected_clusters = 3

    # Estimator under test: Rust extension, single job.
    estimator = OptimalK(parallel_backend="rust", n_jobs=1)

    # Synthetic data: 1000 two-dimensional samples around 3 centres.
    features, _ = make_blobs(n_samples=1000, n_features=2, centers=3)

    suggested_clusters = estimator(features, n_refs=3, cluster_array=np.arange(1, 10))

    # The third positional argument of np.allclose is `rtol`, so this is a
    # relative tolerance of 2, not an absolute one.
    failure_message = "Correct clusters is {}, OptimalK suggested {}".format(
        3, suggested_clusters)
    assert np.allclose(suggested_clusters, expected_clusters, 2), failure_message
def test_optimalk(parallel_backend, n_jobs, n_clusters):
    """
    Test core functionality of OptimalK using all backends.

    Parameters
    ----------
    parallel_backend :
        Backend name passed to OptimalK.
    n_jobs :
        Number of jobs passed to OptimalK.
    n_clusters :
        Cluster count the suggestion is compared against.
    """
    import numpy as np
    # `sklearn.datasets.samples_generator` was removed from scikit-learn;
    # `make_blobs` is available from `sklearn.datasets` directly.
    from sklearn.datasets import make_blobs
    from gap_statistic import OptimalK

    # Create optimalK instance
    optimalK = OptimalK(parallel_backend=parallel_backend, n_jobs=n_jobs)

    # Create data
    X, y = make_blobs(n_samples=int(1e3), n_features=2, centers=3)

    suggested_clusters = optimalK(X, n_refs=3, cluster_array=np.arange(1, 10))

    # The third positional argument of np.allclose is `rtol`, so this is a
    # relative tolerance of 2, not an absolute one.
    assert np.allclose(suggested_clusters, n_clusters, 2), (
        'Correct clusters is {}, OptimalK suggested {}'
        .format(n_clusters, suggested_clusters))
def test_alternative_clusting_method(ClusterModel):
    """
    Users can inject their own clustering callable, plus extra keyword
    arguments for it, into OptimalK.
    """

    def clusterer(X: np.ndarray, k: int, another_test_arg):
        """
        Adapt a model class to the clusterer signature used by OptimalK.

        The data matrix and k come first; `another_test_arg` arrives via
        `clusterer_kwargs`. `k` itself is not used here.
        """
        model = ClusterModel()
        model.fit(X)
        assert another_test_arg == "test"
        centers = model.cluster_centers_
        labels = model.predict(X)
        return centers, labels

    extra_kwargs = {"another_test_arg": "test"}
    optimalk = OptimalK(
        n_jobs=-1,
        parallel_backend="joblib",
        clusterer=clusterer,
        clusterer_kwargs=extra_kwargs,
    )

    features, _ = make_blobs(n_samples=50, n_features=2, centers=3)
    n_clusters = optimalk(features, n_refs=3, cluster_array=np.arange(1, 5))

    assert isinstance(n_clusters, int)
"""Calculate number of clusters by use of the Gap statistic. Uses: https://github.com/milesgranger/gap_statistic and based on their Example.ipynb. """ import sys import numpy as np import pandas as pd import matplotlib.pyplot as plt from gap_statistic import OptimalK from sklearn.datasets.samples_generator import make_blobs from sklearn.cluster import KMeans # Initialise OptimalK class #optimalK = OptimalK(parallel_backend='rust') optimalK = OptimalK() optimalK # Make some test data #X, y = make_blobs(n_samples=int(1e5), n_features=2, centers=3, random_state=25) #print('Data shape: ', X.shape) #print(X, type(X)) #X = np.array([[100., 1.], [200.,1.],[220.,1.],[230.,1.], [500.,1.], [600.,1.]]) X = np.array([[100.], [200.], [220.], [230.], [580.], [600.]]) #X = np.array([[100.],[200.],[300.],[400.], [500.], [600.]]) #X = np.array([[100.],[180.],[300.],[410.], [500.], [610.]]) print(X, type(X)) # Call OptimalK to determine best number of clusters print('Calculating optimal number of clusters') n_clusters = optimalK(X, cluster_array=np.arange(1, 6), n_refs=100) print('Optimal clusters: ', n_clusters)
# -*- coding: utf-8 -*-
"""
Test performance of gap statistic on normal data.

@author: ysirotin
"""

import numpy as np
import scipy as sc
from gap_statistic import OptimalK
import matplotlib.pyplot as plt

# Gap-statistic estimator backed by the Rust extension.
optimalK = OptimalK(parallel_backend='rust')

# One (a, mu, sig) triple per component.
# Presumably amplitude, mean and standard deviation - confirm against usage.
a1, mu1, sig1 = 0, 0.0, 0.1
a2, mu2, sig2 = 0, 1.0, 0.1
a3, mu3, sig3 = 0, 2.0, 0.1

N = 10000

# two bumps
x = np.linspace(-2, 3, 100)

# 2x2 grid of axes.
fig, ax = plt.subplots(2, 2)
def kmeans_find_num_clusters(X, method='elbow', n_clust_min=2, n_clust_max=20, inc=1):
    """
    Pick a number of clusters for `X` using one of several criteria.

    Parameters
    ----------
    X :
        Data matrix; indexed as ``X[mask, :]`` and passed to KMeans.
    method : str
        One of ``'elbow'``, ``'silhouette'``, ``'pred_strength'`` or
        ``'gap_stat'``.
    n_clust_min, n_clust_max : int
        Inclusive range of cluster counts to try. For ``'silhouette'`` the
        lower bound is raised to at least 2.
    inc : int
        Step between the cluster counts tried.

    Returns
    -------
    tuple
        ``(results_list, best_clust_num)``: the per-k statistic (inertia,
        silhouette score, prediction strength or gap value) and the chosen
        number of clusters.

    Raises
    ------
    ValueError
        If `method` is not one of the supported names.
    IndexError
        For ``'pred_strength'`` when no candidate exceeds the 0.8 threshold.
    """
    kmeans_methods = ('elbow', 'silhouette', 'pred_strength')
    if method not in kmeans_methods and method != 'gap_stat':
        # Fail early instead of hitting an unbound variable further down.
        raise ValueError(
            "Unknown method {!r}; expected one of {}".format(
                method, kmeans_methods + ('gap_stat',)))

    if method == 'gap_stat':
        optimalK = OptimalK()
        best_clust_num = optimalK(
            X, cluster_array=np.arange(n_clust_min, n_clust_max + 1, inc))
        results_list = optimalK.gap_df["gap_value"].to_list()
    else:
        # For the silhouette coefficient method, minimum number of clusters must be 2:
        if method == 'silhouette':
            n_clust_min = max(n_clust_min, 2)

        # Candidate cluster counts; position i matches results_list[i].
        num_clusters = np.arange(n_clust_min, n_clust_max + 1, inc)
        results_list = []

        # Create train and test sets for the prediction strength:
        if method == 'pred_strength':
            np.random.seed(42)
            msk = np.random.rand(X.shape[0]) < 0.8
            X_train, X_test = X[msk, :], X[~msk, :]

        for jj in range(n_clust_min, n_clust_max + 1, inc):
            if method == 'pred_strength':
                # Only the train/test models are needed here, so the
                # full-data fit is skipped for this method.
                model_train = cluster.KMeans(n_clusters=jj, random_state=42).fit(X_train)
                model_test = cluster.KMeans(n_clusters=jj, random_state=42).fit(X_test)
                pred_str = get_prediction_strength(
                    jj, model_train.cluster_centers_, X_test, model_test.labels_)
                results_list.append(pred_str)
                continue

            # Run k-Means on the full data:
            model = cluster.KMeans(n_clusters=jj, random_state=42, verbose=0)
            model.fit(X)
            if method == 'elbow':
                # Save the inertia statistic from the clustering algorithm:
                results_list.append(model.inertia_)
            else:
                # Silhouette score for the current clustering:
                silh_coef = metrics.silhouette_score(X, model.labels_, metric='euclidean')
                results_list.append(silh_coef)

        if method == 'elbow':
            # Use elbow of inertia curve as the optimal cluster number:
            kneedle = KneeLocator(num_clusters, results_list, S=1.0,
                                  curve="convex", direction="decreasing")
            best_clust_num = int(round(kneedle.elbow))
        elif method == 'silhouette':
            # Map the index back through the candidates so inc != 1 works.
            best_clust_num = num_clusters[np.nanargmax(np.array(results_list))]
        else:
            # Largest k whose prediction strength exceeds 0.8.
            xx = np.where(np.array(results_list) > 0.8)[0]
            best_clust_num = num_clusters[xx[-1]]

    print('Best cluster number (by {}): {}'.format(method, best_clust_num))
    return results_list, best_clust_num
# CH-Index
# Score a KMeans clustering for each candidate k and plot the scores.
k = [2, 3, 4, 5, 6, 7, 8]
scores = []
for i in k:
    y_pred = KMeans(n_clusters=i, max_iter=1000, random_state=43).fit_predict(X)
    # `calinski_harabaz_score` was a misspelled alias that scikit-learn
    # removed; `calinski_harabasz_score` is the supported name.
    score = metrics.calinski_harabasz_score(X, y_pred)
    scores.append(score)
    print(score)
plt.plot(k, scores, 'o-')
plt.title('CALINSKI-HARABASZ')
plt.show()

# Gap Statistic
# https://github.com/milesgranger/gap_statistic/blob/master/Example.ipynb
from gap_statistic import OptimalK

# Pass the None object rather than the string 'None' to request no
# parallel backend.
optimalK = OptimalK(parallel_backend=None)
n_clusters = optimalK(X, cluster_array=np.arange(1, 10))

# Gap curve, with the selected cluster count highlighted.
plt.plot(optimalK.gap_df.n_clusters, optimalK.gap_df.gap_value, linewidth=3)
plt.scatter(
    optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].n_clusters,
    optimalK.gap_df[optimalK.gap_df.n_clusters == n_clusters].gap_value)
plt.xlabel('Cluster Count')
plt.ylabel('Gap Value')
plt.title('Gap Values by Cluster Count')
plt.show()
# Fetch the known-correct cluster count for this customer.
sel = (
    sqlalchemy.select([Customers])
    .where(Customers.id == customer_id)
)
with Insert.engine.begin() as connection:
    res = connection.execute(sel).fetchone()
correct_k = res.correct_k

#%%

# Dimensionality reduction
X_reduced_mds = UnsupervisedCluster._dimensionality_reduction(
    X, method='MDS', n_components=2)
_max_clusters = UnsupervisedCluster._get_max_nc(X_reduced_mds)

# Optimalk clustering, on the reduced data and then on the raw data.
# gap_df is read right after each call, before the next call runs.
optimalK = OptimalK(parallel_backend='multiprocessing')

optimalk_result_MDS = optimalK(
    X_reduced_mds,
    cluster_array=np.arange(1, _max_clusters, 1))
optimalk_gap_values_MDS = optimalK.gap_df

optimalk_result_X = optimalK(
    X.astype(np.float32),
    cluster_array=np.arange(1, _max_clusters, 1))
optimalk_gap_values_X = optimalK.gap_df

# Optimalk2 clustering: same two inputs through the project's OptimalKCluster.
optimalK2 = OptimalKCluster()
optimalk_result_MDS2, optimalk_gap_values_MDS2 = optimalK2.optimalK(
    X_reduced_mds, nrefs=5, max_clusters=_max_clusters)
optimalk_result_X2, optimalk_gap_values_X2 = optimalK2.optimalK(
    X, nrefs=5, max_clusters=_max_clusters)

#%%
#%% Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
from gap_statistic import OptimalK

#%% Importing the dataset
# The first CSV column is the index; the remaining columns are the features.
dataset = pd.read_csv('clustering/pcs.csv', index_col=0)
names = dataset.index
X = dataset.values

#%% Using the elbow method to find the optimal number of clusters
# Record the within-cluster sum of squares (inertia) for k = 1..19.
k_range = range(1, 20)
wcss = []
for i in k_range:
    print(i)
    kmeans = KMeans(n_clusters=i, init='k-means++', random_state=14)
    kmeans.fit(X)
    wcss.append(kmeans.inertia_)

plt.plot(k_range, wcss)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()

#%% Gap Statistic
optimalK = OptimalK(n_jobs=4, parallel_backend='joblib')
n_clusters = optimalK(X, cluster_array=np.arange(1, 50))
test = optimalK.gap_df
optimalK.plot_results()

#%% Training the K-Means model on the dataset
best_model = []
best_wcss = 160000
def gap_stat(data, cluster_nums):
    """
    Run a default-configured OptimalK on `data` and return its result.

    `cluster_nums` is passed through as OptimalK's `cluster_array`.
    """
    from gap_statistic import OptimalK

    estimator = OptimalK()
    suggested_k = estimator(data, cluster_array=cluster_nums)
    return suggested_k
dist = DistanceMetric.get_metric(metric) print("MDS Metric: {}".format(metric)) for i in range(nDifferentDataSet): data = generateOneClusterData(DEFAULT_NUMBER_OF_FEATURES, DEFAULT_NUMBER_OF_RECORDS_PER_CLASS, DEFAULT_FEATURE_MEAN_RANGE, i, distribution="normal") precomputedMetricData = dist.pairwise(data) mds = MDS(n_components=8, n_jobs=-1, dissimilarity="precomputed") mdsData = mds.fit_transform(precomputedMetricData) optimalK = OptimalK(parallel_backend='joblib', n_jobs=-1) clusterCount = optimalK(mdsData, n_refs=3, cluster_array=np.arange(1, 10)) clusterCounts[i] = clusterCount stress[i, j] = mds.stress_ meanClusterCount[j] = np.mean(clusterCounts) stdClusterCount[j] = np.std(clusterCounts) meanStress[j] = np.mean(stress[:, j]) stdStress[j] = np.std(stress[:, j]) saveDir = os.path.join("data", "MDS-stressPerMetric.npy") np.save(saveDir, stress)
def cluster_optimal_number(self, matrix):
    """
    Run OptimalK (joblib backend) on `matrix`, trying k = 1..29.

    Prints the result, stores it on `self.optimal_cluster_nb` and returns
    the stored value.
    """
    candidate_ks = np.arange(1, 30)
    estimator = OptimalK(parallel_backend='joblib')
    k = estimator(matrix, cluster_array=candidate_ks)
    print('\nOptimal number of clusters is:', k)
    self.optimal_cluster_nb = k
    return self.optimal_cluster_nb