def test_edge():
    """Check that one cluster fitted to identical samples sits on that sample.

    Every row of the helper frame is the point (0, 1), so the single
    cluster center returned by ``fit`` must be (0, 1) as well.
    """
    test_df = pd.DataFrame({'X1': np.zeros(10), 'X2': np.ones(10)})
    centers, _ = fit(test_df, 1)

    # Compare element-wise.  ``a.all() == b.all()`` only compares two
    # booleans and therefore passes for almost any pair of arrays.
    assert np.allclose(centers, np.array([[0, 1]])), "Centers did not match"
def test_center():
    """Check the cluster centers returned by ``fit`` on two small data sets.

    * A diagonal of nine points (0..8) with one cluster must give (4, 4).
    * The iris petal length/width columns with two clusters must give the
      two reference centers listed below.
    """
    # Helper data
    iris = datasets.load_iris()  # loading the iris dataset
    features = iris.data
    test_df1 = pd.DataFrame({'X1': features[:, 2], 'X2': features[:, 3]})
    test_df2 = pd.DataFrame({'X1': np.arange(9), 'X2': np.arange(9)})

    # getting centers of clusters
    centers1, _ = fit(test_df2, 1)
    centers2, _ = fit(test_df1, 2)

    expected2 = np.array([[4.92525253, 1.68181818],
                          [1.49215686, 0.2627451]])
    centers2 = np.asarray(centers2, dtype=float)
    assert centers2.shape == expected2.shape, "Centers did not match"

    # Cluster numbering is not assumed to be stable across runs, so both
    # arrays are ordered by their first coordinate before comparing.
    centers2 = centers2[np.argsort(centers2[:, 0])]
    expected2 = expected2[np.argsort(expected2[:, 0])]

    # Element-wise comparison; ``a.all() == b.all()`` would only compare two
    # booleans.  The reference values are given to 8 decimals.
    assert np.allclose(centers2, expected2, atol=1e-6), \
        "Centers did not match"
    assert np.allclose(centers1, np.array([[4, 4]])), "Centers did not match"
def silhouette(X, k_array):
    """
    Compute and plot silhouette scores for several candidate k values.

    For every k in ``k_array`` the data is clustered with ``fit`` and the
    resulting labels are scored with ``sil_score``.

    Parameters
    ----------
    X : 2-d array, shape=(n_samples, n_features)
      - The data to be clustered. Must have an integer or floating dtype.

    k_array : array
      - A non-empty array of all contending k values (integers).

    Returns
    -------
    pandas Series
      - The silhouette scores, in the same order as k_array.

    Altair chart object
      - An Altair line chart displaying silhouette scores with their
        corresponding k values.

    Raises
    ------
    ValueError
        If X is not numeric, or if k_array is empty or contains a
        non-integer value.

    Examples
    --------
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> k_array = [2, 3, 4, 5]
    >>> scores, chart = silhouette(X, k_array)
    """
    # Inspect the dtype through np.asarray so that DataFrames (which have
    # no ``.dtype``) and every integer/float width are handled uniformly.
    x_dtype = np.asarray(X).dtype
    if not (np.issubdtype(x_dtype, np.integer)
            or np.issubdtype(x_dtype, np.floating)):
        raise ValueError("Input X must be numeric")

    if len(k_array) == 0:
        raise ValueError("Input k_array must not be empty")

    # Validate every candidate, not just the first one.
    if not all(isinstance(k, (int, np.integer)) for k in k_array):
        raise ValueError("Input k_array must be type int")

    rows = []
    for k in k_array:
        _, labels = fit(X, k)
        rows.append([k, sil_score(X, labels)])

    scores = pd.DataFrame(rows, columns=["k", "Score"])

    chart = (alt.Chart(scores).mark_line().encode(
        alt.X('k:O', axis=alt.Axis(title='k')),
        alt.Y('Score:Q', axis=alt.Axis(title='Silhouette score')),
    ).properties(title="Silhouette scores", width=800))

    return (scores["Score"], chart)
def test_exceptions():
    """Check that ``fit`` rejects invalid arguments with ``ValueError``.

    Each case below passes one invalid argument.  The test fails with an
    explicit ``AssertionError`` when ``fit`` returns normally instead of
    raising, so a missing validation can no longer pass unnoticed.
    """
    # Helper data
    test_df4 = "this is a python package"
    test_df2 = pd.DataFrame({'X1': np.arange(9), 'X2': np.arange(9)})
    test_df5 = pd.DataFrame({'X1': [1, 2, 3, 4], 'X2': [1, 2, "A", "BC"]})
    K = -2
    num_init = 0
    max_iteration = 4.5

    # (positional args, keyword args, description of what is invalid)
    invalid_calls = [
        ((test_df4, 2), {},
         "data is not in a dataframe or numpy array"),
        ((test_df2, K), {},
         "the number of clusters is negative"),
        ((test_df5, 2), {},
         "data contains non-numeric values"),
        ((test_df2, 1), {"n_init": num_init},
         "n_init is zero"),
        ((test_df2, 1), {"max_iter": max_iteration},
         "max_iter is not an integer"),
    ]

    # checking the exception handling of the function
    for args, kwargs, reason in invalid_calls:
        try:
            fit(*args, **kwargs)
        except ValueError:
            continue
        raise AssertionError("Should throw an error if " + reason)
def test_label():
    """Check that ``fit`` separates two well-separated groups of points.

    The helper data holds five points on the diagonal at 5..9 followed by
    five points at 15..19, so with two clusters the first five samples must
    share one label and the last five must share the other.
    """
    # Helper data
    values = np.concatenate((np.arange(5, 10), np.arange(15, 20)), axis=0)
    test_df3 = pd.DataFrame({'X1': values, 'X2': values})

    # getting the labels for the helper data
    _, labels = fit(test_df3, 2)
    labels = np.asarray(labels).ravel()

    assert labels.shape[0] == 10, "labels did not match"

    # Compare the partition rather than literal label values: which group is
    # called 0 and which 1 is not assumed to be fixed.  (The previous
    # ``a.all() == b.all()`` check only compared two booleans.)
    first_group, second_group = labels[:5], labels[5:]
    assert np.all(first_group == first_group[0]), "labels did not match"
    assert np.all(second_group == second_group[0]), "labels did not match"
    assert first_group[0] != second_group[0], "labels did not match"
def elbow(X, centers_list):
    """
    Creates a plot of inertia vs number of cluster centers as per the
    elbow method. Calculates and returns the inertia values for all
    cluster centers. Useful for identifying the optimal number of clusters
    while using k-means clustering algorithm.

    Inertia for a given k is the sum, over all clusters, of the squared
    Euclidean distances between each sample and the center of the cluster
    it was assigned to by ``fit``.

    Parameters
    ----------
    X : array-like, shape=(n_samples, n_features)
        Input data that is to be clustered. Must be a pandas dataframe or
        a numpy array with an integer or floating dtype.
    centers_list : list or 1-d array-like
        A list of all possible numbers of cluster centers. Every entry
        must be an integer-valued number between 1 and n_samples.

    Returns
    -------
    tuple
        A tuple of an altair plot object containing a line plot of
        k (number of cluster centers) vs inertia, and a list with the
        inertia for every k in the order of centers_list.

    Raises
    ------
    ValueError
        If either argument has the wrong container type, X has fewer than
        two samples or is one-dimensional, X or centers_list hold
        non-numeric values, a number of centers is not an integer, or a
        number of centers is outside ``[1, n_samples]``.

    Examples
    --------
    >>> from Kmeans_python.elbow import elbow
    >>> import numpy as np
    >>> X = np.array([[1, 2], [1, 4], [1, 0],
    ...               [10, 2], [10, 4], [10, 0]])
    >>> centers = [2, 3, 4, 5]
    >>> chart, inertia = elbow(X, centers)
    >>> len(inertia)
    4
    """
    # Check if number of centers is contained in an array or list
    if not isinstance(centers_list, (list, np.ndarray)):
        raise ValueError("Invalid input type for list of numbers of "
                         "clusters. centers_list must be list or a numpy "
                         "array.")

    # Ensure input arguments are valid
    if not isinstance(X, (pd.DataFrame, np.ndarray)):
        raise ValueError("Invalid input type for samples. X must be "
                         "pandas dataframe or a numpy array.")

    # Check if there are at least two samples (a 0-d array has no samples)
    if X.ndim == 0 or X.shape[0] < 2:
        raise ValueError("At least two samples should be there in data")

    # Prompt user to reshape if data has only one feature
    if X.ndim == 1:
        raise ValueError("If you have only one feature in the dataset "
                         "please reshape your data using X.reshape(-1, 1)")

    # Check that every number of centers is a real number.  Booleans are
    # excluded explicitly because ``bool`` is a subclass of ``int``.
    candidates = list(np.reshape(centers_list, -1))
    if not candidates:
        raise ValueError("centers_list must contain at least one number "
                         "of clusters.")
    numeric_types = (int, float, np.integer, np.floating)
    for k in candidates:
        if isinstance(k, bool) or not isinstance(k, numeric_types):
            raise ValueError("Invalid input type for centers. "
                             "Centers_list must contain only numeric "
                             "values.")

    # Check if all number of centers are integers.  ``is_integer`` is also
    # False for NaN/inf and, unlike int() vs ceil(), for negative fractions.
    for k in candidates:
        if not float(k).is_integer():
            raise ValueError("Number of centers should be integers")

    # Check if data points are numbers: any integer or float dtype is
    # accepted; object/string/bool data is rejected as a whole.
    x_dtype = np.asarray(X).dtype
    if not (np.issubdtype(x_dtype, np.integer)
            or np.issubdtype(x_dtype, np.floating)):
        raise ValueError("Invalid input type for samples. X must contain "
                         "only numeric values.")

    # Convert all integer types to int
    centers_list = [int(k) for k in candidates]

    # Check if the range of number of centers is valid
    if min(centers_list) < 1 or max(centers_list) > X.shape[0]:
        raise ValueError("Invalid values in list of numbers of clusters. "
                         "Number of clusters should be between 1 and "
                         "number of samples")

    if isinstance(X, pd.DataFrame):
        X = X.to_numpy()

    # Iterate through centers list and get inertia
    inertia = []
    for k in centers_list:
        # Fit Kmeans algorithm to get cluster centers and labels
        centers, labels = fit(X, k, n_init=10, max_iter=200)
        centers = np.asarray(centers)
        labels = np.asarray(labels)

        # Accumulate over *all* clusters; keeping only the value of the
        # last cluster would under-report the inertia.
        total = 0.0
        for cluster in range(k):
            members = X[labels == cluster]
            total += np.sum((members - centers[cluster]) ** 2)
        inertia.append(float(total))

    # Save results to a dataframe
    results = pd.DataFrame({"k": centers_list, "inertia": inertia})

    # Create a plot object of K vs Inertia
    p = alt.Chart(results).mark_line().encode(
        alt.X("k:Q", title="k"),
        alt.Y("inertia:Q", title="Inertia")).properties(
        title="Optimal K Using Elbow Method",
        width=700,
        height=300).configure_axis(
            labelFontSize=20,
            titleFontSize=20).configure_title(fontSize=20)

    return p, inertia