def test_k_means_perfect_init():
    km = KMeansConstrained(init=centers.copy(), n_clusters=n_clusters,
                           random_state=42, n_init=1)
    km.fit(X)
    _check_fitted_model(km)
def test_k_means_n_init():
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 2))

    # two regression tests on bad n_init argument
    # previous bug: n_init <= 0 threw non-informative TypeError (#3858)
    assert_raises_regex(ValueError, "n_init", KMeansConstrained(n_init=0).fit, X)
    assert_raises_regex(ValueError, "n_init", KMeansConstrained(n_init=-1).fit, X)
def test_k_means_copyx():
    # Check that copy_x=False returns a nearly equal X after de-centering.
    my_X = X.copy()
    km = KMeansConstrained(copy_x=False, n_clusters=n_clusters, random_state=42)
    km.fit(my_X)
    _check_fitted_model(km)

    # check that my_X was de-centered back to the original values
    assert_array_almost_equal(my_X, X)
def test_transform():
    km = KMeansConstrained(n_clusters=n_clusters)
    km.fit(X)
    X_new = km.transform(km.cluster_centers_)

    for c in range(n_clusters):
        # each center is at distance zero from itself ...
        assert_equal(X_new[c, c], 0)
        for c2 in range(n_clusters):
            if c != c2:
                # ... and at a strictly positive distance from every other center
                assert_greater(X_new[c, c2], 0)
def test_k_means_fortran_aligned_data():
    # Check that KMeansConstrained works even when X is Fortran-aligned.
    X = np.asfortranarray([[0, 0], [0, 1], [0, 1]])
    centers = np.array([[0, 0], [0, 1]])
    labels = np.array([0, 1, 1])
    km = KMeansConstrained(n_init=1, init=centers, random_state=42,
                           n_clusters=2)
    km.fit(X)
    assert_array_equal(km.cluster_centers_, centers)
    assert_array_equal(km.labels_, labels)
def test_k_means_init_centers():
    # Check that KMeansConstrained does not silently mutate the user-provided
    # init array, even when the input data and the init centers share a dtype
    X_small = np.array([[1.1, 1.1], [-7.5, -7.5], [-1.1, -1.1], [7.5, 7.5]])
    init_centers = np.array([[0.0, 0.0], [5.0, 5.0], [-5.0, -5.0]])
    for dtype in [np.int32, np.int64, np.float32, np.float64]:
        X_test = dtype(X_small)
        init_centers_test = dtype(init_centers)
        assert_array_equal(init_centers, init_centers_test)
        km = KMeansConstrained(init=init_centers_test, n_clusters=3, n_init=1)
        km.fit(X_test)
        assert_equal(False, np.may_share_memory(km.cluster_centers_,
                                                init_centers))
def fit(self, X):
    n_samples, n_features = X.shape
    assert self.size_max * self.n_clusters >= n_samples

    clf = KMeansConstrained(self.n_clusters,
                            size_min=self.size_min,
                            size_max=self.size_max,
                            distance_func=self.distance_func)
    clf.fit(X)

    self.clf = clf
    self.cluster_centers_ = self.clf.cluster_centers_
    self.labels_ = self.clf.labels_
def test_float_precision():
    km = KMeansConstrained(n_init=1, random_state=30)

    inertia = {}
    X_new = {}
    centers = {}

    for dtype in [np.float64, np.float32]:
        X_test = X.astype(dtype)
        km.fit(X_test)
        # dtype of cluster centers has to be the dtype of the input data
        assert_equal(km.cluster_centers_.dtype, dtype)
        inertia[dtype] = km.inertia_
        X_new[dtype] = km.transform(X_test)
        centers[dtype] = km.cluster_centers_
        # ensure the extracted row is a 2d array
        assert_equal(km.predict(X_test[:1]), km.labels_[0])
        if hasattr(km, 'partial_fit'):
            km.partial_fit(X_test[0:3])
            # dtype of cluster centers has to stay the same after partial_fit
            assert_equal(km.cluster_centers_.dtype, dtype)

    # compare arrays with low precision since the difference between 32 and
    # 64 bit sometimes makes a difference up to the 4th decimal place
    assert_array_almost_equal(inertia[np.float32], inertia[np.float64],
                              decimal=4)
    assert_array_almost_equal(X_new[np.float32], X_new[np.float64],
                              decimal=4)
    assert_array_almost_equal(centers[np.float32], centers[np.float64],
                              decimal=4)
def test_sparse_k_means_init_centers():
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeansConstrained(n_clusters=3,
                                size_min=50).fit(X).cluster_centers_

    # Fitting starting from a local optimum shouldn't change the solution
    np.testing.assert_allclose(
        centers,
        KMeansConstrained(n_clusters=3, size_min=50,
                          init=centers, n_init=1).fit(X).cluster_centers_)
def test_sparse_validate_centers():
    from sklearn.datasets import load_iris

    iris = load_iris()
    X = iris.data

    # Get a local optimum
    centers = KMeansConstrained(n_clusters=4).fit(X).cluster_centers_

    # Test that a ValueError is raised for validate_center_shape
    classifier = KMeansConstrained(n_clusters=3, init=centers, n_init=1)
    # raw strings so the escaped parentheses form a valid regex rather
    # than invalid escape sequences
    msg = (r"The shape of the initial centers \(\(4L?, 4L?\)\) "
           r"does not match the number of clusters 3")
    assert_raises_regex(ValueError, msg, classifier.fit, X)
def test_k_means_plus_plus_init_2_jobs():
    if sys.version_info[:2] < (3, 4):
        raise SkipTest(
            "Possible multi-process bug with some BLAS under Python < 3.4")

    km = KMeansConstrained(init="k-means++", n_clusters=n_clusters, n_jobs=2,
                           random_state=42).fit(X)
    _check_fitted_model(km)
def test_k_means_non_collapsed():
    # Check that k-means with a bad initialization does not yield a singleton.
    # Starting with bad centers that are quickly ignored should not result in
    # the centers being repositioned to the center of mass, which would lead
    # to collapsed centers and, in turn, make the clustering dependent on
    # numerical instabilities.
    my_X = np.array([[1.1, 1.1], [0.9, 1.1], [1.1, 0.9], [0.9, 1.1]])
    array_init = np.array([[1.0, 1.0], [5.0, 5.0], [-5.0, -5.0]])
    km = KMeansConstrained(init=array_init, n_clusters=3, random_state=42,
                           n_init=1)
    km.fit(my_X)

    # centers must not have collapsed
    assert_equal(len(np.unique(km.labels_)), 3)

    centers = km.cluster_centers_
    # np.linalg.norm returns a scalar here, so plain comparisons suffice
    assert np.linalg.norm(centers[0] - centers[1]) >= 0.1
    assert np.linalg.norm(centers[0] - centers[2]) >= 0.1
    assert np.linalg.norm(centers[1] - centers[2]) >= 0.1
# note: requires `from collections import Counter` at module level
def __fit_clusters(self, column: np.ndarray) -> List[float]:
    """
    Fit the clusters for a given feature.

    Arguments:
        column (np.ndarray): All the values for a single feature.

    Returns:
        The cluster centers for this feature.
    """
    column = np.sort(column)
    distinct_counter = Counter(column)
    max_clusters = sum(min(count, self.__min_cluster_size)
                       for count in distinct_counter.values()) // self.__min_cluster_size

    for num_clusters in range(max_clusters, 0, -1):
        clustering = KMeansConstrained(n_clusters=num_clusters,
                                       size_min=self.__min_cluster_size,
                                       random_state=self.__random_generator)
        # KMeansConstrained expects 2-D input, so promote the column to (n, 1)
        clusters = clustering.fit_predict(column[:, np.newaxis])
        if self.__correct_clustering(column, clusters):
            return self.__cluster_centers(column, clusters)
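# The `column[:, np.newaxis]` promotion above matters because KMeansConstrained,
# like scikit-learn estimators, expects a 2-D (n_samples, n_features) array.
# A minimal standalone sketch of the same call on one feature; the data and
# the size bound below are made up for illustration:
import numpy as np
from k_means_constrained import KMeansConstrained

column = np.array([1.0, 1.1, 0.9, 5.0, 5.2, 4.8, 9.0, 9.1, 8.9])

# promote the 1-D feature to shape (9, 1); size_min=3 forces every
# cluster to contain at least three values
clustering = KMeansConstrained(n_clusters=3, size_min=3, random_state=0)
labels = clustering.fit_predict(column[:, np.newaxis])

print(labels)                       # e.g. [0 0 0 1 1 1 2 2 2]; label order may vary
print(clustering.cluster_centers_)  # one center per cluster, shape (3, 1)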
def fit(self, X):
    n_samples, n_features = X.shape
    minsize = n_samples // self.n_clusters
    maxsize = (n_samples + self.n_clusters - 1) // self.n_clusters

    clf = KMeansConstrained(self.n_clusters,
                            size_min=minsize,
                            size_max=maxsize,
                            distance_func=self.distance_func)
    if minsize != maxsize:
        warnings.warn(
            "Cluster minimum and maximum size are {} and {}, respectively"
            .format(minsize, maxsize))
    clf.fit(X)

    self.clf = clf
    self.cluster_centers_ = self.clf.cluster_centers_
    self.labels_ = self.clf.labels_
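# The two integer divisions above compute floor(n_samples / n_clusters) and
# ceil(n_samples / n_clusters), which together force the clusters to be as
# equal-sized as possible. A quick check with made-up numbers:
n_samples, n_clusters = 10, 3
minsize = n_samples // n_clusters                     # 10 // 3 == 3
maxsize = (n_samples + n_clusters - 1) // n_clusters  # 12 // 3 == 4
assert minsize * n_clusters <= n_samples <= maxsize * n_clusters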
def test_k_means_new_centers():
    # Explore the part of the code where a new center is reassigned
    X = np.array([[0, 0, 1, 1],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0],
                  [0, 0, 0, 0],
                  [0, 0, 0, 0],
                  [0, 1, 0, 0]])
    labels = [0, 1, 2, 1, 1, 2]
    bad_centers = np.array([[+0, 1, 0, 0],
                            [.2, 0, .2, .2],
                            [+0, 0, 0, 0]])

    km = KMeansConstrained(n_clusters=3, init=bad_centers, n_init=1,
                           max_iter=10, random_state=1)
    for i in range(2):
        km.fit(X)
        this_labels = km.labels_
        # Reorder the labels so that the first instance is in cluster 0,
        # the second in cluster 1, ...
        this_labels = np.unique(this_labels,
                                return_index=True)[1][this_labels]
        np.testing.assert_array_equal(this_labels, labels)
def test_score():
    km1 = KMeansConstrained(n_clusters=n_clusters, max_iter=1,
                            random_state=42, n_init=1)
    s1 = km1.fit(X).score(X)
    km2 = KMeansConstrained(n_clusters=n_clusters, max_iter=10,
                            random_state=42, n_init=1)
    s2 = km2.fit(X).score(X)
    assert_greater(s2, s1)
def subgroup_by_cluster_constrained(object_states: List[ObjectState],
                                    n_member: int = 5) -> List[List[int]]:
    """Generate subgroups via constrained K-means clustering on object positions

    Args:
        object_states (List[ObjectState]): array of object states
        n_member (int, optional): max number of members per subgroup.
            Defaults to 5.

    Returns:
        List[List[int]]: 2D array; each row contains the indices of objects
        that belong to the same subgroup
    """
    n_cluster = math.ceil(len(object_states) / n_member)
    features = [[obj.x, obj.y] for obj in object_states]
    kmeans = KMeansConstrained(n_clusters=n_cluster,
                               size_max=min(n_member, len(object_states)),
                               random_state=42)
    labels = kmeans.fit_predict(features)

    groups = []
    for label in set(labels):
        indices = np.flatnonzero(labels == label)
        groups.append(indices.tolist())
    return groups
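# A hypothetical invocation sketch: ObjectState comes from the surrounding
# codebase, so a made-up stand-in with x/y attributes is used here purely
# for illustration.
from dataclasses import dataclass

@dataclass
class FakeObjectState:  # stand-in for the real ObjectState
    x: float
    y: float

states = [FakeObjectState(x=float(i % 4), y=float(i // 4)) for i in range(12)]
groups = subgroup_by_cluster_constrained(states, n_member=5)  # requests ceil(12 / 5) == 3 clusters

assert all(len(g) <= 5 for g in groups)                         # size_max respected
assert sorted(i for g in groups for i in g) == list(range(12))  # every object assigned exactly once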
def test_k_means_explicit_init_shape():
    # test for sensible errors when giving explicit init
    # with wrong number of features or clusters
    rnd = np.random.RandomState(0)
    X = rnd.normal(size=(40, 3))

    # mismatch of number of features
    km = KMeansConstrained(n_init=1, init=X[:, :2], n_clusters=len(X))
    msg = "does not match the number of features of the data"
    assert_raises_regex(ValueError, msg, km.fit, X)
    # for callable init
    km = KMeansConstrained(n_init=1,
                           init=lambda X_, k, random_state: X_[:, :2],
                           n_clusters=len(X))
    assert_raises_regex(ValueError, msg, km.fit, X)

    # mismatch of number of clusters
    msg = "does not match the number of clusters"
    km = KMeansConstrained(n_init=1, init=X[:2, :], n_clusters=3)
    assert_raises_regex(ValueError, msg, km.fit, X)
    # for callable init
    km = KMeansConstrained(n_init=1,
                           init=lambda X_, k, random_state: X_[:2, :],
                           n_clusters=3)
    assert_raises_regex(ValueError, msg, km.fit, X)
def test_n_init():
    # Check that increasing the number of initializations improves quality
    n_runs = 5
    n_init_range = [1, 5, 10]
    inertia = np.zeros((len(n_init_range), n_runs))
    for i, n_init in enumerate(n_init_range):
        for j in range(n_runs):
            km = KMeansConstrained(n_clusters=n_clusters, init="random",
                                   n_init=n_init, random_state=j).fit(X)
            inertia[i, j] = km.inertia_

    inertia = inertia.mean(axis=1)
    failure_msg = ("Inertia %r should be decreasing"
                   " when n_init is increasing.") % list(inertia)
    for i in range(len(n_init_range) - 1):
        # mean inertia is a scalar here, so a plain comparison suffices
        assert inertia[i] >= inertia[i + 1], failure_msg
def test_predict():
    km = KMeansConstrained(n_clusters=n_clusters, random_state=42)
    km.fit(X)

    # sanity check: predict centroid labels
    pred = km.predict(km.cluster_centers_)
    assert_array_equal(pred, np.arange(n_clusters))

    # sanity check: re-predict labeling for training set samples
    pred = km.predict(X)
    assert_array_equal(pred, km.labels_)

    # re-predict labels for training set using fit_predict
    pred = km.fit_predict(X)
    assert_array_equal(pred, km.labels_)
def test_k_means_plus_plus_init():
    km = KMeansConstrained(init="k-means++", n_clusters=n_clusters,
                           random_state=42).fit(X)
    _check_fitted_model(km)
# -*- coding: utf-8 -*-
"""
Created on Sat Oct 17 19:13:48 2020

@author: lcota
"""
import numpy as np
from k_means_constrained import KMeansConstrained

# example data (the original snippet left X undefined; any 2-D array works)
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])

clf = KMeansConstrained(n_clusters=2, size_min=2, size_max=5, random_state=0)
clf.fit(X)
clf.cluster_centers_
clf.predict([[0, 0], [4, 4]])
!pip install k-means-constrained

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from k_means_constrained import KMeansConstrained

df = pd.read_csv("https://raw.githubusercontent.com/JavierLilly/Proyecto_Eco/main/BDC_DATA.csv")

# Standardize the coordinates
data = df[['lat', 'lon']].values.astype('float32', copy=False)
scaler = StandardScaler().fit(data)
data_scal = scaler.transform(data)
df_ = df
df_[['lat', 'lon']] = data_scal

# Build the clustering model with min/max cluster sizes
coor = df_[['lat', 'lon']]
model = KMeansConstrained(n_clusters=6, size_min=600, size_max=700,
                          random_state=5565280).fit(coor)
y = model.predict(coor)  # prediction
df_['cluster'] = y

# Plot all points with frequency >= 1
cdict = {0: 'red', 1: 'black', 2: 'yellow', 3: 'green', 4: 'blue', 5: 'grey'}
plt.figure(figsize=(10, 10))
sns.set()
for g in np.unique(y):
    plt.scatter(coor['lat'][y == g], coor['lon'][y == g],
                c=cdict[g], label=g, s=60)
# plt.scatter(df['lat'][df['Frecuencia']==2], df['lon'][df['Frecuencia']==2], c='purple', s=80, alpha=.5)
# plt.scatter(df['lat'][df['Frecuencia']==3], df['lon'][df['Frecuencia']==3], c='brown', s=150)
plt.legend()

# Reduce the data
if len(scannedSides) > 5:
    for i in range(len(scannedSides)):
        scannedSidesWithLabels[sideLabels[i]] = scannedSides[i]

    # Map over scanned sides and get an array of all BGR values for each square
    allCubes = []
    for face in scannedSides:
        for square in face:
            allCubes.append([
                square["avgColor"][0],
                square["avgColor"][1],
                square["avgColor"][2]
            ])

    # https://joshlk.github.io/k-means-constrained/
    # Run constrained k-means: 6 color clusters, each with exactly 9 squares
    kmeans = KMeansConstrained(n_clusters=6, size_min=9, size_max=9)
    labels = kmeans.fit_predict(allCubes)

    # Object to hold all colors
    cube = {
        "front": [],
        "left": [],
        "back": [],
        "right": [],
        "up": [],
        "down": []
    }

    # Loop over the cluster data and get the cube map based on cluster
    for i in range(len(labels)):
def test_max_iter_error():
    km = KMeansConstrained(max_iter=-1)
    assert_raise_message(ValueError, 'Number of iterations should be',
                         km.fit, X)
def test_fit_transform():
    X1 = KMeansConstrained(n_clusters=3, random_state=51).fit(X).transform(X)
    X2 = KMeansConstrained(n_clusters=3, random_state=51).fit_transform(X)
    assert_array_equal(X1, X2)
def make_groups(df, total_students, students_per_group):
    df.drop(["Name", "Email"], axis=1, inplace=True)
    df = pd.get_dummies(df, columns=['Year', 'Interests'], drop_first=False)

    def encode(df):
        def skill_encoder(df):
            for i in range(len(df.iloc[:, 2])):
                if df.iloc[i, 2] == 4:
                    df.iloc[i, 2] = 2
                elif df.iloc[i, 2] == 5:
                    df.iloc[i, 2] = 1

        def availability_encoder(df):
            for i in range(len(df.iloc[:, 1])):
                if df.iloc[i, 1] == "00:00 - 6:00":
                    df.iloc[i, 1] = 0
                elif df.iloc[i, 1] == "6:00 - 12:00":
                    df.iloc[i, 1] = 1
                elif df.iloc[i, 1] == "12:00 - 18:00":
                    df.iloc[i, 1] = 2
                elif df.iloc[i, 1] == "18:00 - 24:00":
                    df.iloc[i, 1] = 3

        def timezone_encoder(df):
            for i in range(len(df.iloc[:, 0])):
                if df.iloc[i, 0] == "GMT–8 (Pacific Time)":
                    df.iloc[i, 0] = 0
                elif df.iloc[i, 0] == "GMT–6 (CST)":
                    df.iloc[i, 0] = 1
                elif df.iloc[i, 0] == "GMT–5 (EST)":
                    df.iloc[i, 0] = 2
                elif df.iloc[i, 0] == "GMT–3 (South America)":
                    df.iloc[i, 0] = 3
                elif df.iloc[i, 0] == "GMT+0 (GMT)":
                    df.iloc[i, 0] = 4
                elif df.iloc[i, 0] == "GMT+1 (CET)":
                    df.iloc[i, 0] = 5
                elif df.iloc[i, 0] == "GMT+3 (Eastern Europe/Middle East)":
                    df.iloc[i, 0] = 6
                elif df.iloc[i, 0] == "GMT+5 (South Asia)":
                    df.iloc[i, 0] = 7
                elif df.iloc[i, 0] == "GMT+8 (East Asia)":
                    df.iloc[i, 0] = 8
                elif df.iloc[i, 0] == "GMT+10 (Australia)":
                    df.iloc[i, 0] = 9
                elif df.iloc[i, 0] == "GMT+12 (New Zealand)":
                    df.iloc[i, 0] = 10

        # encode the raw timezone strings first, then cast to int
        # (casting to int before encoding would fail on the "GMT…" strings)
        df["Timezone"] = df["Timezone"].astype(str)
        timezone_encoder(df)
        df["Timezone"] = df["Timezone"].astype(int)
        df["Availability"] = df["Availability"].astype(str)
        availability_encoder(df)
        df["Skill"] = df["Skill"].astype(int)
        skill_encoder(df)
        return df

    df = encode(df)

    n_groups = total_students // students_per_group
    if total_students % students_per_group == 0:
        min_students = students_per_group
        max_students = students_per_group
    else:
        n_groups += 1
        # the leftover students form the smallest allowed group
        min_students = total_students - (students_per_group *
                                         (total_students // students_per_group))
        max_students = students_per_group

    groups = KMeansConstrained(n_clusters=n_groups,
                               size_min=min_students,
                               size_max=max_students)
    groups.fit_predict(df)
    return groups.labels_.astype(int).tolist()
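# A hypothetical invocation sketch: the column names and category strings
# follow what the encoders above expect, but the sample records are made up.
import pandas as pd

roster = pd.DataFrame({
    "Name": ["A", "B", "C", "D", "E", "F"],
    "Email": ["a@x", "b@x", "c@x", "d@x", "e@x", "f@x"],
    "Timezone": ["GMT–8 (Pacific Time)", "GMT–5 (EST)", "GMT+0 (GMT)",
                 "GMT+1 (CET)", "GMT+5 (South Asia)", "GMT+8 (East Asia)"],
    "Availability": ["00:00 - 6:00", "6:00 - 12:00", "12:00 - 18:00",
                     "18:00 - 24:00", "6:00 - 12:00", "12:00 - 18:00"],
    "Skill": [1, 2, 3, 4, 5, 3],
    "Year": [1, 2, 3, 1, 2, 3],
    "Interests": ["web", "ml", "web", "ml", "games", "games"],
})

labels = make_groups(roster, total_students=6, students_per_group=3)
# 6 students, 3 per group -> two groups of exactly 3 members each
assert len(labels) == 6 and len(set(labels)) == 2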
def test_k_means_random_init():
    km = KMeansConstrained(init="random", n_clusters=n_clusters,
                           random_state=42)
    km.fit(X)
    _check_fitted_model(km)
def test_k_means_invalid_init():
    km = KMeansConstrained(init="invalid", n_init=1, n_clusters=n_clusters)
    assert_raises(ValueError, km.fit, X)