def test_seuclidean():
    """Fit/predict/transform with the 'seuclidean' metric must emit no warnings."""
    data = np.array([0, 0, 0, 1]).reshape((4, 1))
    with pytest.warns(None) as captured:
        model = KMedoids(2, metric="seuclidean", method="pam")
        model.fit(data)
        model.predict(data)
        model.transform(data)
    assert len(captured) == 0
def cross_validation(n=200):
    """Average the accuracy/diff scores of the sklearn KMedoids model over *n* runs.

    Relies on the module-level ``X_train``/``X_test`` splits and the
    ``callculate_accuracy`` helper (name kept as defined elsewhere in the file).

    Returns:
        Tuple ``(mean_test_accuracy, mean_test_diff,
                 mean_train_accuracy, mean_train_diff)``.
    """
    accuracy_sum_train = accuracy_sum_test = 0
    diff_sum_train = diff_sum_test = 0
    # With a fixed random_state the KMedoids fit is fully deterministic, so
    # fitting once outside the loop produces exactly the same labels as
    # refitting on every iteration — but n times cheaper.
    model_sklearn = KMedoids(n_clusters=3, random_state=12412).fit(X_train)
    labels_test = model_sklearn.predict(X_test)
    labels_train = model_sklearn.predict(X_train)
    for _ in range(n):
        accuracy, diff = callculate_accuracy('K-medoid, Own Implementation',
                                             labels_test)
        accuracy_sum_test += accuracy
        diff_sum_test += diff
        accuracy, diff = callculate_accuracy('K-medoid, Own Implementation',
                                             labels_train)
        accuracy_sum_train += accuracy
        diff_sum_train += diff
    return (accuracy_sum_test / n, diff_sum_test / n,
            accuracy_sum_train / n, diff_sum_train / n)
def test_kmedoids_fit_predict_transform():
    """fit_predict / fit_transform must agree with separate fit + predict/transform."""
    rng = np.random.RandomState(seed)
    estimator = KMedoids(random_state=rng)

    first_labels = estimator.fit_predict(X)
    assert len(first_labels) == 100
    assert_array_equal(first_labels, estimator.labels_)
    assert_array_equal(first_labels, estimator.predict(X))

    first_transform = estimator.fit_transform(X)
    assert_array_equal(first_transform.shape, (100, estimator.n_clusters))
    assert_array_equal(first_transform, estimator.transform(X))
def vae_plot(self):
    """Collect VAE latent codes for all generator batches, cluster them with
    K-medoids, scatter-plot the clusters, and draw the pose behind each medoid.

    NOTE(review): torch.Tensor(1, 2) allocates an *uninitialised* first row;
    the random sampling below uses low=1 so that dummy row is never picked,
    and poses are later indexed with ``indices[i] - 1`` to compensate —
    confirm this offset convention is intended.
    """
    z_list = torch.Tensor(1, 2)
    poses = []
    # Encode every batch; keep the raw inputs so medoids can be mapped back
    # to the pose that produced them.
    for input, output in self.generator:
        for inp in input:
            poses.append(inp)
        mu, logvar = self.model.encode(input)
        z = self.model.reparameterize(mu, logvar)
        # Keep only the latent code of the last time step of each sequence.
        z2 = z[:, -1, :]
        z_list = torch.cat((z_list.double(), z2.double()), 0)
    # Sample 1000 latent points at random (low=1 skips the dummy first row).
    indices = np.random.randint(low=1, high=z_list.shape[0], size=1000)
    coords = np.array([z_list[ind, :].detach().numpy() for ind in indices])
    # # k-means clustering for coloring
    # kmeans = KMeans(n_clusters=5).fit(coords)
    # y_kmeans = kmeans.predict(coords)
    # plt.scatter(coords[:,0], coords[:,1], c=y_kmeans, cmap='viridis')
    # plt.show()
    #
    # # draw each mean pose
    # centers = kmeans.cluster_centers_
    # recons = [self.model.decode(torch.from_numpy(center)).detach().numpy().reshape(19,2) for center in centers]
    # k-medoids clustering for coloring
    kmedoids = KMedoids(n_clusters=5).fit(coords)
    y_kmedoids = kmedoids.predict(coords)
    plt.scatter(coords[:, 0], coords[:, 1], c=y_kmedoids, cmap='viridis')
    plt.show()
    # K-medoids centres are actual data points, so match each centre back to
    # its sampled coordinate to recover the corresponding input pose.
    recons = []
    for center in kmedoids.cluster_centers_:
        c = np.array(center)
        for i in range(len(coords)):
            if np.array_equal(c, coords[i]):
                # indices[i] - 1 undoes the dummy first row of z_list;
                # poses are assumed to flatten to 19 x 2 joint coordinates.
                recons.append(poses[indices[i] - 1].detach().numpy().reshape(19, 2))
    self.draw_poses(np.array(recons))
def test_precomputed():
    """Test the 'precomputed' distance metric."""
    rng = np.random.RandomState(seed)
    train_points = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
    dist_train = euclidean_distances(train_points)
    dist_test = euclidean_distances([[1.1, 0.0], [0.0, 0.9]], train_points)

    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
    kmedoids.fit(dist_train)

    assert_allclose(kmedoids.inertia_, 0.2)
    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
    # With a precomputed metric there are no coordinates to expose as centers.
    assert kmedoids.cluster_centers_ is None

    med_1, med_2 = tuple(kmedoids.medoid_indices_)
    assert_array_equal(kmedoids.predict(dist_test), [med_1 // 2, med_2 // 2])
    assert_array_equal(kmedoids.transform(dist_test),
                       dist_test[:, kmedoids.medoid_indices_])
# https://scikit-learn-extra.readthedocs.io/en/latest/generated/sklearn_extra.cluster.KMedoids.html from sklearn_extra.cluster import KMedoids import numpy as np X = np.asarray([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]]) kmedoids = KMedoids(n_clusters=2, random_state=0).fit(X) print("labels") print(kmedoids.labels_) # Predict the closest cluster for each sample in the list of arguments print("predictions") print(kmedoids.predict([[0, 0], [4, 4]])) # Displays the cluster centers, here called medoids, which are points from the original dataset print("Cluster centers") print(kmedoids.cluster_centers_) print("inertia") print(kmedoids.inertia_)
# train_indexes.append(151) # train_indexes.append(152) test_indexes = [x for x in list(range(0, 150)) if x not in list(train_indexes)] X_train = X[train_indexes, :] X_test = X[test_indexes, :] y = y[test_indexes] #k-medoid, own implementation model = k_medoids(k=3, max_iter=200) model.fit(X_train) labels_own_kmedoid = model.predict(X_test) #k-medoid, sklearn model_sklearn = KMedoids(n_clusters=3, random_state=12412).fit(X_train) labels_sklearn_kmedoid = model_sklearn.predict(X_test) #kmeans, sklearn model_kmeans = KMeans(n_clusters=3).fit(X_train) labels_sklearn_kmeans = model_kmeans.predict(X_test) # In[147]: #Plot the identified clusters and compare with our result fig, axes = plt.subplots(1, 4, figsize=(20, 7), dpi=200) axes[0].scatter(X_test[:, 2], X_test[:, 3], c=y, cmap='Pastel1', edgecolor='k') axes[1].scatter(X_test[:, 2], X_test[:, 3], c=labels_own_kmedoid, cmap='Accent',
class ArgumentClusterer:
    """PCA + K-medoids clusterer for discussion arguments, one per language.

    Class attributes hold the most recently fitted clusterers so the static
    methods can share them.

    NOTE(review): the static methods below access ``__clusterer`` /
    ``__medoid_texts`` on *other* instances; this only works because Python
    name-mangling is applied inside the class body — do not move this code
    outside the class.
    """

    # Most recently fitted per-language clusterers (set by fit_clusterers).
    english_clusterer = None
    greek_clusterer = None

    def __init__(self, n_components=2):
        # PCA is (re)built in fit() sized to the data, so the commented-out
        # fixed-size version is intentionally disabled.
        # NOTE(review): calling predict() before fit() raises AttributeError
        # because self.__pca is only assigned in fit().
        #self.__pca = PCA(n_components=n_components, random_state=0)
        self.__clusterer = None
        self.__medoid_texts = None

    def fit(self, x, output_filename_suffix='output.pdf'):
        """Fit PCA on the embeddings, pick k via the elbow method, then fit
        K-medoids on the reduced data.

        ``output_filename_suffix`` is currently unused inside this method.
        """
        x = np.array(x)
        num_samples, num_features = x.shape[0], x.shape[1]
        # PCA cannot use more components than min(samples, features).
        self.__pca = PCA(n_components=min(num_samples, num_features),
                         random_state=0)
        x_transformed = self.__pca.fit_transform(x)
        # Sweep k from 1 to num_samples and let the visualizer locate the elbow.
        visualizer = KElbowVisualizer(KMedoids(random_state=0),
                                      k=(1, num_samples),
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(x_transformed)
        # Fall back to a single cluster when no elbow could be located.
        best_n_clusters = visualizer.elbow_value_ if visualizer.elbow_value_ is not None else 1
        self.__clusterer = KMedoids(n_clusters=best_n_clusters, random_state=0)
        self.__clusterer.fit(x_transformed)

    def predict(self, x):
        """Project new embeddings through the fitted PCA and assign clusters."""
        x_transformed = self.__pca.transform(x)
        return self.__clusterer.predict(x_transformed)

    def get_medoid_indices(self):
        """Return the medoid row indices of the fitted clusterer as a list."""
        return self.__clusterer.medoid_indices_.tolist()

    # Sort different arguments into similar clusters.
    @staticmethod
    @counter
    def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):
        """Cluster every non-Issue/Solution discussion text by language and
        summarise each cluster with textrank.

        Returns a dict with 'greek_clusters' and 'english_clusters', each
        mapping a cluster label (str) to nodes/texts/summary/medoid_text.
        """
        # The workspace doesn't have enough discussions, early exit.
        if len(discussions) < 3:
            return {'greek_clusters': {}, 'english_clusters': {}}
        # Fit all clusterers for all discussions of a single workspace.
        ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp, el_nlp)
        # Pre-create one empty bucket per cluster label seen during fitting.
        english_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.english_clusterer is not None else {}
        greek_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.greek_clusterer is not None else {}
        for discussion in discussions:
            # Issues and Solutions are not arguments, skip them.
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                if ArgumentClusterer.english_clusterer is None:
                    continue
                # Embed the text and assign it to the nearest cluster.
                predicted = str(
                    ArgumentClusterer.english_clusterer.predict(
                        [en_nlp.tokenizer(text).vector])[0])
                english_clusters[predicted]['nodes'].append(discussion['id'])
                english_clusters[predicted]['texts'].append(text)
                english_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.english_clusterer.__medoid_texts[
                        predicted]
            elif language == 'greek':
                if ArgumentClusterer.greek_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.greek_clusterer.predict(
                        [el_nlp.tokenizer(text).vector])[0])
                greek_clusters[predicted]['nodes'].append(discussion['id'])
                greek_clusters[predicted]['texts'].append(text)
                greek_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.greek_clusterer.__medoid_texts[
                        predicted]
        # Run textrank on non-empty aggregated text from each cluster for each language.
        for en_cluster in english_clusters.keys():
            en_text = '. '.join(english_clusters[en_cluster]['texts'])
            if en_text != '':
                en_doc = run_textrank(en_text, en_nlp)
                english_clusters[en_cluster][
                    'summary'] = text_summarization(
                        en_doc, en_nlp, config.top_n, config.top_sent)
        for el_cluster in greek_clusters.keys():
            el_text = '. '.join(greek_clusters[el_cluster]['texts'])
            if el_text != '':
                el_doc = run_textrank(el_text, el_nlp)
                greek_clusters[el_cluster]['summary'] = text_summarization(
                    el_doc, el_nlp, config.top_n, config.top_sent)
        return {
            'greek_clusters': greek_clusters,
            'english_clusters': english_clusters
        }

    @staticmethod
    @counter
    def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
        """Split discussion texts by language, fit one clusterer per language
        (when at least 3 texts exist), and publish them as class attributes.
        """
        english_clusterer = None
        greek_clusterer = None
        english_texts, greek_texts = [], []
        for discussion in discussions:
            # Unlike suggest_clusters, only 'Issue' is skipped here —
            # NOTE(review): confirm 'Solution' should be included for fitting.
            if discussion['Position'] in ['Issue']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
            elif language == 'greek':
                greek_texts.append(text)
        if len(english_texts) > 2:
            # Initialize the English Clusterer.
            english_clusterer = ArgumentClusterer()
            # Calculate the embeddings for each text of this discussion.
            english_embeddings = [
                en_nlp.tokenizer(text).vector for text in english_texts
            ]
            # Fit the clusterer using the textual embeddings of this discussion.
            english_clusterer.fit(english_embeddings, 'english.pdf')
            # Find the medoids of each cluster from each language.
            english_clusterer.__medoid_texts = {
                str(english_clusterer.__clusterer.labels_[i]): english_texts[i]
                for i in english_clusterer.__clusterer.medoid_indices_
            }
        if len(greek_texts) > 2:
            # Initialize the Greek Clusterer.
            greek_clusterer = ArgumentClusterer()
            # Calculate the embeddings for each text of this discussion.
            greek_embeddings = [
                el_nlp.tokenizer(text).vector for text in greek_texts
            ]
            # Fit the clusterer using the textual embeddings of this discussion.
            greek_clusterer.fit(greek_embeddings, 'greek.pdf')
            # Find the medoids of each cluster from each language.
            greek_clusterer.__medoid_texts = {
                str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
                for i in greek_clusterer.__clusterer.medoid_indices_
            }
        # Publish (possibly None) clusterers for suggest_clusters to use.
        ArgumentClusterer.english_clusterer = english_clusterer
        ArgumentClusterer.greek_clusterer = greek_clusterer
class DFKMedoids(BaseEstimator, ClusterMixin):
    """DataFrame-friendly wrapper around sklearn_extra's KMedoids.

    Two modes, selected at fit time:
    - evaluation mode (any eval_* flag set): refits KMedoids for every
      k in 1..n_clusters and records the chosen metrics in ``eval_df``;
    - train mode: fits a single model and builds ``centroid_df``.

    NOTE(review): in evaluation mode no final model is trained, so
    predict()/predict_proba() rely on a separate train-mode fit.
    """

    def __init__(self, cluster_name='KMedoids', columns=None, eval_inertia=False, eval_silhouette=False, eval_chi=False, eval_dbi=False, eval_sample_size=None, **kwargs):
        # Name used for the output cluster column / probability columns.
        self.cluster_name = cluster_name
        # Subset of columns to cluster on; defaults to all columns at fit time.
        self.columns = columns
        # Extra kwargs (e.g. n_clusters, random_state) go straight to KMedoids.
        self.model = KMedoids(**kwargs)
        self.eval_inertia = eval_inertia
        self.eval_silhouette = eval_silhouette
        self.eval_chi = eval_chi
        self.eval_dbi = eval_dbi
        # Optional sub-sampling size for silhouette_score.
        self.eval_sample_size = eval_sample_size
        self.transform_cols = None
        self.eval_df = None
        self.centroid_df = None

    def fit(self, X, y=None):
        """Fit in evaluation mode or train mode depending on the eval flags."""
        self.columns = X.columns if self.columns is None else self.columns
        # Preserve X's column order while restricting to the chosen columns.
        self.transform_cols = [x for x in X.columns if x in self.columns]

        # Evaluation
        if any([self.eval_inertia, self.eval_silhouette, self.eval_chi, self.eval_dbi]):
            inertias = []
            silhouettes = []
            chis = []
            dbis = []

            self.eval_df = pd.DataFrame({
                'n_cluster': [x+1 for x in range(self.model.n_clusters)],
            })
            # Placeholder empty list per row so .at[] can later store arrays.
            self.eval_df['centroid'] = self.eval_df['n_cluster'].apply(lambda x: [])

            tmp_X = X[self.transform_cols].copy()
            index = 0
            for n_cluster in tqdm(self.eval_df['n_cluster'].values):
                # Deep-copy so the configured base model is never mutated.
                model = copy.deepcopy(self.model)
                model.n_clusters = n_cluster
                model.fit(tmp_X)

                # Cluster centroid
                self.eval_df.at[index, 'centroid'] = model.cluster_centers_

                # Reference: https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f
                if self.eval_inertia:
                    inertias.append(model.inertia_)

                # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
                # Metrics below are undefined for a single cluster, hence NaN.
                if self.eval_silhouette:
                    silhouettes.append(np.nan if n_cluster <= 1 else silhouette_score(tmp_X, model.labels_, sample_size=self.eval_sample_size, metric='euclidean', random_state=model.random_state))

                # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
                if self.eval_chi:
                    chis.append(np.nan if n_cluster <= 1 else calinski_harabasz_score(tmp_X, model.labels_))

                # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
                if self.eval_dbi:
                    dbis.append(np.nan if n_cluster <= 1 else davies_bouldin_score(tmp_X, model.labels_))

                index += 1

            if self.eval_inertia:
                self.eval_df['inertia'] = inertias
            if self.eval_silhouette:
                self.eval_df['silhouette'] = silhouettes
            if self.eval_chi:
                self.eval_df['calinski_harabasz'] = chis
            if self.eval_dbi:
                self.eval_df['davies_bouldin'] = dbis

        # Train
        else:
            self.model.fit(X[self.transform_cols])
            self.centroid_df = pd.DataFrame(
                self.model.cluster_centers_,
                columns=self.transform_cols
            )
            self.centroid_df['Cluster'] = [f'Cluster {x}' for x in np.unique(self.model.labels_)]
            self.centroid_df.set_index('Cluster', inplace=True)
            self.centroid_df.index.name = None

        return self

    def predict(self, X):
        """Return a copy of X with a '<cluster_name>' column of 'Cluster N' labels."""
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.cluster_name] = self.model.predict(X[self.transform_cols])
        new_X[self.cluster_name] = 'Cluster ' + new_X[self.cluster_name].astype(str)

        return new_X

    def fit_predict(self, X, y=None):
        """Convenience: fit then predict on the same frame."""
        return self.fit(X).predict(X)

    def predict_proba(self, X):
        """Return X plus pseudo-probability columns derived from centroid distances.

        NOTE(review): '1 - normalized distance' is a heuristic score, not a
        calibrated probability; rows sum to (n_clusters - 1), not 1.
        """
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        # Measure distance to centroid
        prob_df = pd.DataFrame(
            DistanceMetric.get_metric('euclidean').pairwise(X[self.transform_cols], self.centroid_df),
            columns=[f'{self.cluster_name} Cluster {x}' for x in range(len(self.centroid_df))]
        )

        # Convert to probability
        prob_df = prob_df.divide(prob_df.sum(axis=1), axis=0)
        prob_df = 1 - prob_df

        new_X = pd.concat([X, prob_df], axis=1)

        return new_X
# Experiment: compare LDA classification against cluster-centroid-based
# prediction (k-means / k-medoids / GMM) on a growing test horizon.
# Relies on module-level X, y, recent_cluster_num and the imported estimators.
XC = X[-recent_cluster_num:]
yC = y[-recent_cluster_num:]
num_test = 60
a_class, a_kmeans, a_kmedians, a_gmm, a_spectral, test_count = [], [], [], [], [], []
for m in range(1, num_test + 1):
    classification_pred, clustering_pred_kmeans, clustering_pred_kmedians, clustering_pred_gmm, clustering_pred_spectral, real = [], [], [], [], [], []
    # Reset the clustering window for every horizon length m.
    XC = X[-recent_cluster_num:]
    yC = y[-recent_cluster_num:]
    for i in range(1, m + 1):
        # Refit all three cluster models on the current window.
        # NOTE(review): KMedoids is named 'kmedians' throughout — it computes
        # k-medoids, not k-medians.
        kmeans = KMeans(n_clusters=5, random_state=0).fit(XC)
        kmedians = KMedoids(n_clusters=5, random_state=0).fit(XC)
        gmm = GaussianMixture(n_components=5, random_state=0).fit(XC)
        # Next unseen sample; assumes 27 features per row — TODO confirm.
        XC_p = X[recent_cluster_num + i, :].reshape(1, 27)
        label_kmeans = kmeans.predict(XC_p)
        label_kmedians = kmedians.predict(XC_p)
        label_gmm = gmm.predict(XC_p)
        # Represent the sample by the centre of its assigned cluster.
        centroid_kmeans = kmeans.cluster_centers_[label_kmeans]
        centroid_kmedians = kmedians.cluster_centers_[label_kmedians]
        centroid_gmm = gmm.means_[label_gmm]
        clf = LinearDiscriminantAnalysis()
        # Baseline: classify the raw sample using an LDA fit on all data.
        class_p = clf.fit(X, y).predict(XC_p)
        # Cluster-based: classify the representative centroid using an LDA
        # fit only on the recent window (clf is refit each time).
        clus_p_kmeans = clf.fit(XC, yC).predict(centroid_kmeans)
        clus_p_kmedians = clf.fit(XC, yC).predict(centroid_kmedians)
        clus_p_gmm = clf.fit(XC, yC).predict(centroid_gmm)
        classification_pred.append(class_p)
        clustering_pred_kmeans.append(clus_p_kmeans)
        clustering_pred_kmedians.append(clus_p_kmedians)
        clustering_pred_gmm.append(clus_p_gmm)
        real.append(y[recent_cluster_num + i])
def kmedoids_clustering(country, dbData, thermalFields, clusters, floor_area_outlier_borders, energy_consumption_outlier_borders):
    """Cluster dwellings of one country by floor area vs. thermal energy use.

    Normalises the raw JSON records, standardises the two features, drops
    pre-identified outliers, fits K-medoids with ``clusters`` clusters, saves
    a scatter plot, and prints a rating-level vs. cluster cross-tabulation.

    Parameters:
        country: country name, used in prints, plot title and filename.
        dbData: list/dict of raw JSON records (normalised via json_normalize).
        thermalFields: column name of the thermal consumption feature.
        clusters: number of K-medoids clusters.
        floor_area_outlier_borders / energy_consumption_outlier_borders:
            scaled-space thresholds identified visually beforehand.

    Returns the string 'the END'.
    """
    # https://hackersandslackers.com/json-into-pandas-dataframes/
    # json_normalize has as default separator '.', since we have float numbers in our data, we set the column separator for the normalization to '_'
    data_df = pd.json_normalize(dbData, sep="_")
    print(data_df.describe().transpose())
    # https://datatofish.com/k-means-clustering-python/
    slim_data_df = pd.DataFrame(
        data_df,
        columns=[
            'ratedDwelling_spatialData_totalFloorArea_value', thermalFields
        ])
    # NOTE(review): .head without () prints the bound method, not the rows.
    print(slim_data_df.head)
    # fitting the data is quite important, the clusters are now more like circles; the non-fitted data was more like strapes.
    slim_fitted_df = StandardScaler().fit_transform(slim_data_df)
    # remove the outliers which we detected already visually by running a kmeans plot before
    print("=== Outliers")
    # however, first convert the data back into a dataframe
    slim_data_df_optimised = pd.DataFrame(
        slim_fitted_df,
        columns=[
            'ratedDwelling_spatialData_totalFloorArea_value', thermalFields
        ])
    print(slim_data_df_optimised.shape)
    print(slim_data_df_optimised)
    print(country + ' outlier borders floor_area' + str(floor_area_outlier_borders))
    print(country + ' outlier borders energy_consumption' + str(energy_consumption_outlier_borders))
    print('Outliers')
    # Rows at or beyond the borders are treated as outliers (scaled space).
    slim_data_df_optimised_floor_area = slim_data_df_optimised[
        slim_data_df_optimised['ratedDwelling_spatialData_totalFloorArea_value'] >= floor_area_outlier_borders[0]]
    slim_data_df_optimised_energy_consumption = slim_data_df_optimised[
        slim_data_df_optimised[thermalFields] > energy_consumption_outlier_borders[0]]
    print(slim_data_df_optimised_floor_area)
    if country == 'England':
        # England additionally has a lower consumption border.
        slim_data_df_optimised_energy_consumption_lower = slim_data_df_optimised[
            slim_data_df_optimised[thermalFields] < energy_consumption_outlier_borders[1]]
        print(slim_data_df_optimised_energy_consumption_lower)
        frames = [
            slim_data_df_optimised_energy_consumption,
            slim_data_df_optimised_energy_consumption_lower
        ]
        slim_data_df_optimised_energy_consumption = pd.concat(frames, sort=False)
        print(slim_data_df_optimised_energy_consumption)
    print("Outliers indexes")
    index_outliers_floorArea = slim_data_df_optimised_floor_area.index.values
    index_outliers_thermal = slim_data_df_optimised_energy_consumption.index.values
    print("index_outliers_floorArea")
    print(index_outliers_floorArea)
    print("index_outliers_thermal")
    print(index_outliers_thermal)
    # Removing the Outliers
    slim_data_df_optimised = slim_data_df_optimised.drop(
        index=index_outliers_floorArea)
    slim_data_df_optimised = slim_data_df_optimised.drop(
        index=index_outliers_thermal)
    print("scaled data after droping the outliers")
    print(slim_data_df_optimised.shape)
    print("rechecking")
    # NOTE(review): indexing a DataFrame with another DataFrame of dropped
    # rows looks wrong and will likely raise/misbehave — probably meant to
    # re-run the boolean filters instead. Verify before relying on this.
    print(slim_data_df_optimised[slim_data_df_optimised_floor_area])
    print(slim_data_df_optimised[slim_data_df_optimised_energy_consumption])
    print("transforming the optimised dataset back to an array")
    slim_data_df_optimised_as_array = slim_data_df_optimised.to_numpy()
    print(slim_data_df_optimised_as_array.shape)
    print(type(slim_data_df_optimised_as_array))
    # remove the same index rows in the original data
    data_df = data_df.drop(index=index_outliers_floorArea)
    data_df = data_df.drop(index=index_outliers_thermal)
    print("original data after droping the outliers")
    print(data_df.shape)
    print("=== Outliers END")
    # preset the number of clusters
    kmedoids = KMedoids(n_clusters=clusters, random_state=0).fit(slim_data_df_optimised_as_array)
    print("labels")
    print(kmedoids.labels_)
    # Displays the cluster centers, here called medoids, which are points from the original dataset
    print("Cluster centers")
    print(kmedoids.cluster_centers_)
    print("inertia")
    print(kmedoids.inertia_)
    print("predictions")
    # Sample predictions for two hand-picked points in scaled space.
    print(kmedoids.predict([[-0.98965, -0.3211], [0.6, -0.300]]))
    scatter = plt.scatter(slim_data_df_optimised_as_array[:, 0],
                          slim_data_df_optimised_as_array[:, 1],
                          c=kmedoids.labels_.astype(float),
                          s=50,
                          alpha=0.5)
    centroids = kmedoids.cluster_centers_
    plt.scatter(centroids[:, 0], centroids[:, 1], c='red', s=50)
    plt.xlabel("floor area")
    plt.ylabel("energy consumption")
    # NOTE(review): len(...) + 1 overstates the dwelling count by one —
    # confirm whether the +1 is intentional.
    plt.title(country + ": " + str(len(slim_data_df_optimised_as_array) + 1) + " dwellings")
    plt.legend(*scatter.legend_elements(), loc='upper right', title="Clusters")
    here = os.path.dirname(os.path.abspath(__file__))
    filename = os.path.join(
        Path(here).parent, 'plots', 'kmedoid_plots',
        country + '_kmedoid_plot.png')
    plt.savefig(filename)
    # function that creates a dataframe for cluster centers with a column for cluster number
    P = pd_centers(
        ['ratedDwelling_spatialData_totalFloorArea_value', thermalFields],
        centroids)
    print('Centroids (x,y,label) for the fitted data')
    print(P)
    # we have the array with labels (cluster numbers)
    # so we add a new column with cluster numbers to the original data
    labels = kmedoids.labels_
    data_df['cluster_number'] = labels
    print('original data and the corresponfing cluster numbers')
    print(data_df)
    # display the rating level from the original data and the cluster number form the fitted data
    rating_vs_cluster_data_df = pd.DataFrame(
        data_df, columns=['awardedRating_ratingLevel', 'cluster_number'])
    print(rating_vs_cluster_data_df)
    # group by rating level
    print(
        rating_vs_cluster_data_df.groupby(
            ["awardedRating_ratingLevel",
             "cluster_number"])['cluster_number'].count())
    # => there is no clear relation btw. rating_level and cluster number
    return 'the END'
def kmedoids(self, score_df, col_name):
    """Cluster the single column ``col_name`` of ``score_df`` with K-medoids.

    Uses the instance's ``clust_num`` and ``random_state`` and returns the
    predicted cluster label for every row.
    """
    feature = score_df[[col_name]]
    model = KMedoids(n_clusters=self.clust_num,
                     random_state=self.random_state)
    model.fit(feature)
    return model.predict(feature)
# Compare k-means (fitted above this chunk) with k-medoids and GMM:
# scatter-plot the assignments, print inertia/purity/homogeneity, and draw
# elbow curves. Relies on module-level p1, p2, X, y, k, purity_score, hs.
plt.scatter(p1, p2, c=y_kmeans, s=50)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200)
plt.show()
print("sum of sq dist = ", kmeans.inertia_)
print("purity : ", purity_score(y, y_kmeans))
print("homogenousity score: ", hs(y, y_kmeans))
# Elbow curve for k-means over the candidate cluster counts in k.
vals = []
for i in k:
    km = KMeans(n_clusters=i, random_state=0).fit(X)
    vals.append(km.inertia_)
plt.plot(k, vals)
plt.show()
# Same analysis with k-medoids, 4 clusters.
kmedoids = KMedoids(n_clusters=4, random_state=0).fit(X)
centers = kmedoids.cluster_centers_
labels = kmedoids.labels_
y_kmed = kmedoids.predict(X)
# NOTE(review): y_kmeans is reassigned to the identical k-medoids prediction
# here — the two calls are redundant and the name is now misleading.
y_kmeans = kmedoids.predict(X)
plt.scatter(p1, p2, c=y_kmeans, s=50)
plt.scatter(centers[:, 0], centers[:, 1], c='black', s=200)
plt.show()
print("sum of sq dist = ", kmedoids.inertia_)
print("purity : ", purity_score(y, y_kmed))
print("homegenousity score : ", hs(y, y_kmed))
# Elbow curve for k-medoids.
vals = []
for i in k:
    km = KMedoids(n_clusters=i, random_state=0).fit(X)
    vals.append(km.inertia_)
plt.plot(k, vals)
plt.show()
# Gaussian mixture with 4 components for the next comparison section.
gmm = GMM(n_components=4).fit(X)