def test_seuclidean():
    """KMedoids with the 'seuclidean' metric must fit, predict and
    transform without emitting any warning.

    NOTE: the original used ``pytest.warns(None)``, which is deprecated
    and raises an error in pytest >= 7.  Collect warnings explicitly
    instead and assert that none were recorded.
    """
    import warnings

    # Single 1-D dataset reused for all three estimator calls.
    X = np.array([0, 0, 0, 1]).reshape((4, 1))
    with warnings.catch_warnings(record=True) as record:
        warnings.simplefilter("always")
        km = KMedoids(2, metric="seuclidean", method="pam")
        km.fit(X)
        km.predict(X)
        km.transform(X)
    assert len(record) == 0
def test_kmedoids_iris():
    """Test kmedoids on the Iris dataset"""
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    # K-Means serves as the quality baseline for the Euclidean case.
    ref_model = KMeans(n_clusters=3).fit(X_iris)

    # Mean distance of each sample to its closest K-Means centroid.
    avg_dist_to_closest_centroid = (ref_model.transform(X_iris).min(
        axis=1).mean())

    for init in ["random", "heuristic", "k-medoids++"]:
        distance_metric = "euclidean"
        model = KMedoids(n_clusters=3,
                         metric=distance_metric,
                         init=init,
                         random_state=rng)
        model.fit(X_iris)

        # test convergence in reasonable number of steps
        assert model.n_iter_ < (len(X_iris) // 10)

        distances = PAIRWISE_DISTANCE_FUNCTIONS[distance_metric](X_iris)
        avg_dist_to_random_medoid = np.mean(distances.ravel())
        # inertia_ is the summed distance to closest medoid; divide by
        # n_samples to make it comparable to the averages above.
        avg_dist_to_closest_medoid = model.inertia_ / X_iris.shape[0]
        # We want distance-to-closest-medoid to be reduced from average
        # distance by more than 50%
        assert avg_dist_to_random_medoid > 2 * avg_dist_to_closest_medoid
        # When K-Medoids is using Euclidean distance,
        # we can compare its performance to
        # K-Means. We want the average distance to cluster centers
        # to be similar between K-Means and K-Medoids
        assert_allclose(avg_dist_to_closest_medoid,
                        avg_dist_to_closest_centroid,
                        rtol=0.1)
def test_kmedoids_empty_clusters():
    """Asking for more clusters than distinct points must warn that a
    cluster ended up empty."""
    rng = np.random.RandomState(seed)
    # Three identical points cannot populate two clusters.
    degenerate_sample = [[1], [1], [1]]
    estimator = KMedoids(n_clusters=2, random_state=rng)
    with pytest.warns(UserWarning, match="Cluster 1 is empty!"):
        estimator.fit(degenerate_sample)
def test_kmedoid_results(method, init):
    """Two-cluster fit on X_cc should recover the generating labels."""
    expected = np.hstack([np.zeros(50), np.ones(50)])
    km = KMedoids(n_clusters=2, init=init, method=method, random_state=rng)
    km.fit(X_cc)
    # The data are not perfectly separable, so accuracy is around 0.85
    # rather than 1.0; accept either permutation of the two labels.
    agreement = np.mean(km.labels_ == expected)
    assert agreement > 0.8 or (1 - agreement) > 0.8
def sklearn_kmedoids(ds, numClusters, numSamples):
    """Cluster the first ``numSamples`` rows of ds.df's x1/x2 columns.

    Parameters:
        ds: object exposing a pandas DataFrame attribute ``df`` with
            columns "x1" and "x2" (assumed — confirm against caller).
        numClusters: number of medoids to fit.
        numSamples: number of leading rows to cluster.

    Returns:
        pd.DataFrame with a single "cluster" column of labels.
    """
    km = KMedoids(n_clusters=numClusters, random_state=0)
    # Select the feature columns and truncate once; the original
    # redundantly re-selected ["x1", "x2"] a second time before fitting.
    df = ds.df[["x1", "x2"]][:numSamples]
    km.fit(df.to_numpy())
    return pd.DataFrame(km.labels_, columns=["cluster"])
def run_KMedoids(n_clusters, pca_components, data, components):
    """Fit KMedoids on PCA components and attach labels to the data.

    BUG FIX: the original hard-coded ``KMedoids(n_clusters=7, ...)``,
    silently ignoring the ``n_clusters`` argument; it is now honoured.

    Parameters:
        n_clusters: number of medoids to fit.
        pca_components: array of PCA-projected features.
        data: original pandas DataFrame the labels are joined back onto.
        components: how many trailing columns to rename "Component i".

    Returns:
        DataFrame of ``data`` + component columns + a 'Cluster' column.
    """
    clustering = KMedoids(n_clusters=n_clusters, random_state=0)
    clustering.fit(pca_components)
    # Join original rows with their PCA projection side by side.
    df_seg_pca_kmedoids = pd.concat(
        [data.reset_index(drop=True), pd.DataFrame(pca_components)], axis=1)
    # Give the appended (unnamed) component columns readable names.
    df_seg_pca_kmedoids.columns.values[(-1 * components):] = [
        "Component " + str(i + 1) for i in range(components)
    ]
    df_seg_pca_kmedoids['Cluster'] = clustering.labels_
    return df_seg_pca_kmedoids
def test_outlier_robustness():
    """K-Medoids must resist an extreme outlier that drags K-Means."""
    rng = np.random.RandomState(seed)
    kmeans = KMeans(n_clusters=2, random_state=rng)
    kmedoids = KMedoids(n_clusters=2, random_state=rng)

    # Two tight groups plus a single extreme outlier at x = 1000.
    X = [[-11, 0], [-10, 0], [-9, 0], [0, 0], [1, 0], [2, 0], [1000, 0]]
    for estimator in (kmeans, kmedoids):
        estimator.fit(X)

    # K-Means isolates the outlier; K-Medoids splits the two real groups.
    assert_array_equal(kmeans.labels_, [0, 0, 0, 0, 0, 0, 1])
    assert_array_equal(kmedoids.labels_, [0, 0, 0, 1, 1, 1, 1])
def test_max_iter():
    """Test that warning message is thrown when max_iter is reached."""
    rng = np.random.RandomState(seed)
    iris_data = load_iris()["data"]
    # A single iteration is not enough for 10 clusters to converge.
    model = KMedoids(n_clusters=10, init="random", random_state=rng,
                     max_iter=1)
    with pytest.warns(UserWarning,
                      match="Maximum number of iteration reached before"):
        model.fit(iris_data)
def test_kmedoid_nclusters(method, init):
    """Even after a single iteration all requested medoids are distinct."""
    requested = 50
    estimator = KMedoids(
        n_clusters=requested,
        init=init,
        method=method,
        max_iter=1,
        random_state=rng,
    )
    estimator.fit(X_cc)
    # No two clusters may share a medoid index.
    assert len(np.unique(estimator.medoid_indices_)) == requested
def test_build():
    """init="build" alone (max_iter=0) must already yield a good fit."""
    X, y = fetch_20newsgroups_vectorized(return_X_y=True)
    # Restrict to the first 500 samples to keep the test fast.
    X, y = X[:500], y[:500]
    # Pairwise cosine distances, consumed as a precomputed metric.
    dissimilarities = cosine_distances(X)
    model = KMedoids(20, "precomputed", init="build", max_iter=0)
    model.fit(dissimilarities)
    assert model.inertia_ <= 230
    assert len(np.unique(model.labels_)) == 20
def find_optimal_clusters_and_display(pca_components):
    """Pick the optimal cluster count (1..20) via the elbow method and
    display it with streamlit."""
    max_clusters = 21
    candidate_ks = list(range(1, max_clusters))
    wcss = []
    for k in candidate_ks:
        model = KMedoids(n_clusters=k, random_state=0)
        model.fit(pca_components)
        wcss.append(model.inertia_)
    # KneeLocator finds the elbow of the decreasing-inertia curve.
    n_clusters = KneeLocator(candidate_ks, wcss, curve='convex',
                             direction='decreasing').knee
    st.write("Optimal number of clusters", n_clusters)
    return n_clusters
class Clustering:
    """Partition states into three groups with 1-D k-medoids, ordered by
    ascending cluster center."""

    def __init__(self, data):
        # data: mapping state -> scalar value (assumed — confirm caller).
        self.states = data.keys()
        self.kmeans = KMedoids(n_clusters=3)
        values = np.array(tuple(data.values())).reshape(-1, 1)
        self.kmeans.fit(values)
        # Rank cluster labels by their center so group 0 is the lowest.
        self.mapping = list(
            np.argsort(np.squeeze(self.kmeans.cluster_centers_)))

    def cluster(self):
        """Return three lists of states, ordered low/mid/high center."""
        groups = [[], [], []]
        for state, label in zip(self.states, self.kmeans.labels_):
            groups[self.mapping.index(label)].append(state)
        return groups
def find_cluster_centres(text, num_clusters):
    """Return the sentences of ``text`` that serve as cluster medoids.

    Parameters:
        text: raw text, split into sentences with nltk.
        num_clusters: number of medoid sentences to extract.

    Returns:
        list of unique medoid sentences, in medoid order.
    """
    corpus = nltk.sent_tokenize(text)
    corpus_embeddings = embedder.encode(corpus)
    clustering_model = KMedoids(n_clusters=num_clusters, random_state=0,
                                metric="cosine")
    clustering_model.fit(corpus_embeddings)
    # KMedoids exposes the medoid row indices directly.  The original
    # re-discovered them with an O(k * n) exact float comparison of
    # embeddings (np.array_equal), which is both slow and fragile;
    # medoid_indices_ gives the same sentences in the same order.
    cluster_centers = []
    for index in clustering_model.medoid_indices_:
        if corpus[index] not in cluster_centers:
            cluster_centers.append(corpus[index])
    return cluster_centers
def test_array_like_init():
    """An array-like init both seeds and caps the number of clusters."""
    centroids = np.array([X_cc[0], X_cc[50]])
    expected = np.hstack([np.zeros(50), np.ones(50)])

    km = KMedoids(n_clusters=len(centroids), init=centroids)
    km.fit(X_cc)
    # Data are not perfectly separable, so accuracy is ~0.85 rather than
    # 1.0; accept either permutation of the two labels.
    agreement = np.mean(km.labels_ == expected)
    assert agreement > 0.8 or (1 - agreement) > 0.8

    # n_clusters is overridden when an array-like init is supplied.
    km = KMedoids(n_clusters=len(centroids) + 2, init=centroids)
    km.fit(X_cc)
    assert len(km.cluster_centers_) == len(centroids)
def test_clara_consistency_iris():
    # CLARA sampling the full dataset must reduce to plain PAM/KMedoids.
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    clara = CLARA(
        n_clusters=3,
        n_sampling_iter=1,
        n_sampling=len(X_iris),
        random_state=rng,
    )
    pam = KMedoids(n_clusters=3, init="build", random_state=rng)

    pam.fit(X_iris)
    clara.fit(X_iris)
    # Every single label must agree between the two estimators.
    assert np.array_equal(pam.labels_, clara.labels_)
def test_kmedoids_fit_naive():
    """Three one-hot points must each become their own zero-cost medoid."""
    n_clusters = 3
    model = KMedoids(n_clusters=n_clusters, metric="euclidean")

    one_hot = np.asarray([[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    model.fit(one_hot)

    assert_array_equal(model.cluster_centers_,
                       [[1, 0, 0], [0, 1, 0], [0, 0, 1]])
    assert_array_equal(model.labels_, [0, 1, 2])
    assert model.inertia_ == 0.0

    # diagonal must be zero, off-diagonals must be positive
    X_new = model.transform(one_hot)
    for row in range(n_clusters):
        for col in range(n_clusters):
            if row == col:
                assert X_new[row, col] == 0
            else:
                assert X_new[row, col] > 0
def compare_k_med(k_list, X):
    """Run KMedoids for each k in ``k_list`` on the precomputed distance
    matrix ``X`` and report silhouette/inertia metrics.

    Parameters:
        k_list: candidate cluster counts.
        X: precomputed pairwise distance matrix.

    Returns:
        (best_k, silhouette_list, inertia_list) where best_k maximizes
        the silhouette score.
    """
    silhouette_list = []
    inertia_list = []
    for p in k_list:
        print("Calculating silhouette score for k =", p)
        clusters = KMedoids(n_clusters=p, metric='precomputed',
                            random_state=2248, init='k-medoids++')
        clusters.fit(X)
        # The higher (up to 1) the better
        s = round(silhouette_score(X, clusters.labels_,
                                   metric="precomputed"), 4)
        silhouette_list.append(s)
        inertia_list.append(clusters.inertia_)
    # Pick the k whose silhouette score is the highest; plain indexing
    # replaces the original k_list.__getitem__(key) dunder call.
    key = silhouette_list.index(max(silhouette_list))
    k = k_list[key]
    print("Kmed best silhouette =", max(silhouette_list), " for k =", k)
    return k, silhouette_list, inertia_list
def test_precomputed():
    """Test the 'precomputed' distance metric."""
    rng = np.random.RandomState(seed)
    # Two tight pairs: (1.0,0)/(1.1,0) and (0,1.0)/(0,1.1).
    X_1 = [[1.0, 0.0], [1.1, 0.0], [0.0, 1.0], [0.0, 1.1]]
    D_1 = euclidean_distances(X_1)
    # Two query points, one near each pair.
    X_2 = [[1.1, 0.0], [0.0, 0.9]]
    D_2 = euclidean_distances(X_2, X_1)

    kmedoids = KMedoids(metric="precomputed", n_clusters=2, random_state=rng)
    kmedoids.fit(D_1)

    # Each medoid is 0.1 away from its one cluster mate: inertia 0.2.
    assert_allclose(kmedoids.inertia_, 0.2)
    assert_array_equal(kmedoids.medoid_indices_, [2, 0])
    assert_array_equal(kmedoids.labels_, [1, 1, 0, 0])
    # With a precomputed metric there are no coordinates to expose.
    assert kmedoids.cluster_centers_ is None

    med_1, med_2 = tuple(kmedoids.medoid_indices_)
    predictions = kmedoids.predict(D_2)
    # medoid indices are [2, 0], so the expected labels [1, 0] happen to
    # equal med_1 // 2 and med_2 // 2 — a compact way to tie the
    # expectation to the fitted medoids.
    assert_array_equal(predictions, [med_1 // 2, med_2 // 2])

    transformed = kmedoids.transform(D_2)
    # transform selects the medoid columns of the query distance matrix.
    assert_array_equal(transformed, D_2[:, kmedoids.medoid_indices_])
def test_medoids_indices():
    """medoid_indices_ must point at exactly the rows exposed as
    cluster_centers_ for every KMedoids flavour."""
    rng = np.random.RandomState(seed)
    X_iris = load_iris()["data"]

    clara = CLARA(
        n_clusters=3,
        n_sampling_iter=1,
        n_sampling=len(X_iris),
        random_state=rng,
    )
    model = KMedoids(n_clusters=3, init="build", random_state=rng)
    centroids = np.array([X_iris[0], X_iris[50]])
    array_like_model = KMedoids(n_clusters=len(centroids), init=centroids,
                                max_iter=0)

    for estimator in (model, clara, array_like_model):
        estimator.fit(X_iris)

    assert_array_equal(X_iris[model.medoid_indices_], model.cluster_centers_)
    assert_array_equal(X_iris[clara.medoid_indices_], clara.cluster_centers_)
    # With max_iter=0 the supplied centroids are kept verbatim.
    assert_array_equal(centroids, array_like_model.cluster_centers_)
def define_k(answers):
    """Choose the number of clusters for ``answers`` via the elbow method.

    Cleanups vs. the original:
      * removed the unused ``kmeans_kwargs`` dict (it was never passed);
      * removed the wasted first fit on raw ``answers`` — it was
        immediately overwritten by the fit on ``scaled_answers``;
      * removed the no-op bare ``kl.elbow`` expression before the return.

    NOTE(review): ``answers`` itself is now unused because the SSE loop
    fits on the module-level ``scaled_answers`` (as the original
    effectively did) — confirm that is the intent.

    Returns:
        The elbow point of the SSE curve (may be None if no knee found).
    """
    sse = []
    for k in range(1, 11):
        kmedoids = KMedoids(n_clusters=k, metric=distance_function,
                            random_state=None)
        kmedoids.fit(scaled_answers)
        sse.append(kmedoids.inertia_)

    plt.style.use("fivethirtyeight")
    plt.plot(range(1, 11), sse)
    plt.xticks(range(1, 11))
    plt.xlabel("Number of Clusters")
    plt.ylabel("SSE")
    plt.show()

    kl = KneeLocator(range(1, 11), sse, curve="convex",
                     direction="decreasing")
    return kl.elbow
def kmedoids_dm(input_data, cluster_no):
    """Cluster a CSV dataset with KMedoids and score the result.

    Cleanups vs. the original:
      * ``error_bad_lines=False`` was deprecated in pandas 1.3 and
        removed in 2.0; ``on_bad_lines='warn'`` keeps the old behaviour
        (warn about and skip malformed rows);
      * removed the ``TfidfVectorizer`` that was constructed but never
        used.

    Returns:
        (dbi, si): Davies-Bouldin index and silhouette score.
    """
    start = time.time()
    dataset = pd.read_csv(input_data,
                          sep=',',
                          on_bad_lines='warn',
                          index_col=False,
                          dtype='unicode')
    kmed = KMedoids(n_clusters=cluster_no, random_state=0)
    kmed.fit(dataset)
    labels = kmed.labels_
    dbi = davies_bouldin_score(dataset, labels)
    si = silhouette_score(dataset, labels)
    print("Runtime: ")
    print(time.time() - start)
    return dbi, si
def models(self): """ Get sentences embeddings and generate cluster according to the number of cluster previously defined. An UMAP dimension reduction and a Kmenoid with cosine distance are performed for this task. Returns: sklearn.model: fitted umap model with attribute 'sentence_embeddings' Returns: sklearn.model: fitted kmenoid model from umaped 'sentence_embeddings' """ umap_model = umap.UMAP(n_neighbors=15, n_components=self.n_umap, metric="cosine") umap_model = umap_model.fit(self.sentence_embeddings) umap_embeddings = umap_model.transform(self.sentence_embeddings) kmenoid_model = KMedoids(n_clusters=self.n_clusters, metric="cosine", init="random", random_state=15) cluster = kmenoid_model.fit(umap_embeddings) return cluster, umap_model
    def clusterization(self, cols=None, method="k_means", visualize=True,
                       n_clusters=None):
        # Explore cluster counts (2..10) for the selected columns with one
        # of three strategies, plotting SSE/silhouette curves and logging
        # the elbow-method suggestion.
        # NOTE(review): ``n_clusters`` is accepted but never used — confirm
        # whether it was meant to force a final model fit.
        if not self.is_standardize:
            raise ValueError("You should standardize your columns first.")
        if method == "k_means":
            logger.info("=" * 27)
            logger.info("Clustering using K-Means")
            logger.info("=" * 27)
            kmeans_kwargs = {
                "init": "random",
                "n_init": 10,
                "max_iter": 300,
                "random_state": 42,
            }
            sse = []
            kmeans_silhouette_coefficients = []
            # Sweep k = 2..10, recording inertia and silhouette for each.
            for k in range(2, 11):
                kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
                kmeans.fit(self.dataset[cols])
                sse.append(kmeans.inertia_)
                score = silhouette_score(self.dataset[cols], kmeans.labels_)
                kmeans_silhouette_coefficients.append(score)
            if visualize:
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), sse)
                plt.xticks(range(2, 11))
                plt.title("K-Means")
                plt.xlabel("Number of Clusters")
                plt.ylabel("SSE")
                plt.show()
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), kmeans_silhouette_coefficients)
                plt.xticks(range(2, 11))
                plt.title("K-Means")
                plt.xlabel("Number of Clusters")
                plt.ylabel("Silhouette Coefficient")
                plt.show()
            kl = KneeLocator(range(2, 11), sse, curve="convex",
                             direction="decreasing")
            number_clusters_best = kl.elbow
            logger.info(
                f"Best number of clusters using elbow method: {number_clusters_best}"
            )
            logger.info("")
            logger.info(
                f"See the graph Silhouette coefficient vs number of clusters to define \
the best amount of clusters in your case. \
(Silhouette coefficient goes from -1 to 1, near to 1 is better)"
            )
            logger.info("")
        elif method == "k_medoids":
            logger.info("=" * 27)
            logger.info("Clustering using K-Medoids")
            logger.info("=" * 27)
            kmedoids_kwargs = {
                "metric": "euclidean",
            }
            sse = []
            kmedoids_silhouette_coefficients = []
            # Same 2..10 sweep as the K-Means branch, with KMedoids.
            for k in range(2, 11):
                kmedoids = KMedoids(n_clusters=k, **kmedoids_kwargs)
                kmedoids.fit(self.dataset[cols])
                sse.append(kmedoids.inertia_)
                score = silhouette_score(self.dataset[cols], kmedoids.labels_)
                kmedoids_silhouette_coefficients.append(score)
            if visualize:
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), sse)
                plt.xticks(range(2, 11))
                plt.title("K-Medoids")
                plt.xlabel("Number of Clusters")
                plt.ylabel("SSE")
                plt.show()
                # plt.style.use("fivethirtyeight")
                plt.plot(range(2, 11), kmedoids_silhouette_coefficients)
                plt.xticks(range(2, 11))
                plt.title("K-Medoids")
                plt.xlabel("Number of Clusters")
                plt.ylabel("Silhouette Coefficient")
                plt.show()
            kl = KneeLocator(range(2, 11), sse, curve="convex",
                             direction="decreasing")
            number_clusters_best = kl.elbow
            logger.info(
                f"Best number of clusters using elbow method: {number_clusters_best}"
            )
            logger.info("")
            logger.info(
                f"See the graph Silhouette coefficient vs number of clusters to define \
the best amount of clusters in your case. \
(Silhouette coefficient goes from -1 to 1, near to 1 is better)"
            )
            logger.info("")
        elif method == "dbscan":
            logger.info("=" * 27)
            logger.info("Clustering using DBScan")
            logger.info("=" * 27)
            # Maps silhouette score -> (eps, n_clusters).  NOTE(review):
            # keying by score means ties overwrite earlier entries.
            silhouette_eps_ncluster = {}
            for eps in np.linspace(0.1, 4, 10):
                dbscan = DBSCAN(eps=eps)
                dbscan.fit(self.dataset[cols])
                if len(set(dbscan.labels_)) > 1:
                    # Silhouette score requires at least 2 clusters to be calculated.
                    # Rows marked with dbscan.labels_=-1 don"t belong to a real cluster
                    # but are considered noise.
                    score = round(
                        silhouette_score(self.dataset[cols], dbscan.labels_),
                        4)
                    nclusters = len(set(dbscan.labels_))
                    silhouette_eps_ncluster[score] = ((eps, nclusters))
            if visualize:
                y, tup = zip(*silhouette_eps_ncluster.items())
                x = [eps for eps, nclusters in tup]
                # plt.style.use("fivethirtyeight")
                plt.plot(x, y)
                plt.xticks(np.linspace(0.1, 4, 10))
                plt.title("DBScan")
                plt.xlabel("eps")
                plt.ylabel("Silhouette Coefficient")
                plt.show()
            # The max key always exists here, so the -1 default is never
            # actually used.
            nclusters_best = silhouette_eps_ncluster.get(
                max(silhouette_eps_ncluster.keys()), -1)[1]
            logger.info(
                f"Best number of clusters using Silhouette over multiple eps: {nclusters_best}"
            )
            logger.info("")
            # TODO: Add column with the id of the cluster each row belongs to
            # TODO: Implement scatter plot of clusters.
        else:
            raise ValueError("Clustering method not implemented.")
def test_elbow(X, dtw_value, seed):
    # Elbow/silhouette exploration over k = 2..14 on a precomputed DTW
    # distance matrix.  With seed == -1 every seed in 0..20 is evaluated
    # and the grids are written to CSV; otherwise the grids are loaded
    # from those CSVs and the row for ``seed`` is plotted.
    print(len(X))
    distortions = []
    silhouette_value = []
    dists = dtw_value
    print(dists)
    if seed == -1:
        for seed in range(0, 21):
            # Each row starts with the seed, followed by one value per k.
            cur_silhouette = [seed]
            cur_distortions = [seed]
            for i in range(2, 15):
                print(i)
                km = KMedoids(n_clusters=i, random_state=seed,
                              metric="precomputed", init='k-medoids++',
                              max_iter=30000)
                km.fit(dists)
                # Record the inertia (sum of distances to the medoids).
                cur_distortions.append(km.inertia_)
                y_pred = km.fit_predict(dists)
                # Silhouette on a precomputed metric requires a zero
                # diagonal; this mutates ``dists`` in place.
                np.fill_diagonal(dists, 0)
                score = silhouette_score(dists, y_pred,
                                         metric="precomputed")
                cur_silhouette.append(score)
            distortions.append(cur_distortions)
            silhouette_value.append(cur_silhouette)
        with open(r".//res//grid_distortions_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in distortions:
                writer.writerow(row)
                print(row)
        with open(r".//res//grid_silhouette_destination.csv", "w",
                  encoding='UTF-8', newline='') as csvfile:
            writer = csv.writer(csvfile)
            for row in silhouette_value:
                writer.writerow(row)
                print(row)
    else:
        csv_reader = csv.reader(
            open(".//res//grid_distortions_destination.csv",
                 encoding='UTF-8'))
        for row in csv_reader:
            distortions.append([float(item) for item in row])
        csv_reader = csv.reader(
            open(".//res//grid_silhouette_destination.csv",
                 encoding='UTF-8'))
        for row in csv_reader:
            silhouette_value.append([float(item) for item in row])
    # NOTE(review): in the seed == -1 branch the loop variable leaves
    # ``seed`` at 20, so the last grid row is plotted — confirm intended.
    # [1:] drops the leading seed column.
    chosen_distortions = distortions[seed][1:]
    chosen_silhouette = silhouette_value[seed][1:]
    plt.figure(1)
    plt.plot(range(2, 15), chosen_distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.savefig(r'.//res//grid_distortions_destination.png')
    plt.close()
    plt.figure(1)
    plt.bar(range(2, 15), chosen_silhouette, color='grey')
    plt.xlabel('Number of clusters')
    plt.ylabel('Silhouette score')
    plt.savefig(r'.//res//grid_silhouette_destination.png')
features = pd.read_csv('./../Data/specPowerDatamartTransform.csv') #PCA #dicha funcion scale lo que hace es centrar y escalar los datos scaled_data = preprocessing.scale(features) #Planteamos los datos como la relacion lineal de solamente dos componentes pca = PCA(n_components=2) dataset_questions_pca = pca.fit_transform(scaled_data) #Analizamos la cantidad de cluster a partir de la informacion obtenida de la relacion lineal del pca #Aplico el metodo del codo sobre el conjunto de datos wcss = [] for i in range(1, 7): kmedoids = KMedoids(n_clusters=i, random_state=0) kmedoids.fit(dataset_questions_pca) sse = kmedoids.inertia_ print("Clusters", i, "SSE", sse) wcss.append(sse) plt.plot(range(1, 7), wcss) plt.title('The Elbow Method') plt.xlabel('Number of clusters') plt.ylabel('WCSS') plt.show() #Aplico k-means sobre el conjunto brindado por pca kmedoids = KMedoids(n_clusters=3, random_state=0) y_kmedoids = kmedoids.fit_predict(dataset_questions_pca) initial_centroids = kmedoids.cluster_centers_ print("Centroides iniciales")
class ArgumentClusterer:
    # Class-level singletons holding the most recently fitted per-language
    # clusterers (set by fit_clusterers).
    english_clusterer = None
    greek_clusterer = None

    def __init__(self, n_components=2):
        #self.__pca = PCA(n_components=n_components, random_state=0)
        self.__clusterer = None
        self.__medoid_texts = None

    def fit(self, x, output_filename_suffix='output.pdf'):
        # Fit a PCA + KMedoids pipeline on embedding rows ``x``; the
        # cluster count is chosen with an elbow search.
        # NOTE(review): ``output_filename_suffix`` is never used here.
        x = np.array(x)
        num_samples, num_features = x.shape[0], x.shape[1]
        # PCA cannot keep more components than min(samples, features).
        self.__pca = PCA(n_components=min(num_samples, num_features),
                         random_state=0)
        x_transformed = self.__pca.fit_transform(x)
        visualizer = KElbowVisualizer(KMedoids(random_state=0),
                                      k=(1, num_samples),
                                      timings=False,
                                      locate_elbow=True)
        visualizer.fit(x_transformed)
        # Fall back to a single cluster when no elbow is detected.
        best_n_clusters = visualizer.elbow_value_ if visualizer.elbow_value_ is not None else 1
        self.__clusterer = KMedoids(n_clusters=best_n_clusters,
                                    random_state=0)
        self.__clusterer.fit(x_transformed)

    def predict(self, x):
        # Project through the fitted PCA before predicting cluster labels.
        x_transformed = self.__pca.transform(x)
        return self.__clusterer.predict(x_transformed)

    def get_medoid_indices(self):
        return self.__clusterer.medoid_indices_.tolist()

    # Sort different arguments into similar clusters.
    @staticmethod
    @counter
    def suggest_clusters(discussions, lang_det, en_nlp, el_nlp):
        # The workspace doesn't have enough discussions, early exit.
        if len(discussions) < 3:
            return {'greek_clusters': {}, 'english_clusters': {}}
        # Fit all clusterers for all discussions of a single workspace.
        ArgumentClusterer.fit_clusterers(discussions, lang_det, en_nlp,
                                         el_nlp)
        # One empty bucket per cluster label for each language.  The
        # double-underscore access works because this code is inside the
        # class, so name mangling resolves consistently.
        english_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.english_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.english_clusterer is not None else {}
        greek_clusters = {
            label: {
                'nodes': [],
                'texts': [],
                'summary': '',
                'medoid_text': ''
            }
            for label in map(
                str, ArgumentClusterer.greek_clusterer.__clusterer.labels_)
        } if ArgumentClusterer.greek_clusterer is not None else {}
        for discussion in discussions:
            # Issues and Solutions are not arguments; skip them.
            if discussion['Position'] in ['Issue', 'Solution']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                if ArgumentClusterer.english_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.english_clusterer.predict(
                        [en_nlp.tokenizer(text).vector])[0])
                english_clusters[predicted]['nodes'].append(discussion['id'])
                english_clusters[predicted]['texts'].append(text)
                english_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.english_clusterer.__medoid_texts[
                        predicted]
            elif language == 'greek':
                if ArgumentClusterer.greek_clusterer is None:
                    continue
                predicted = str(
                    ArgumentClusterer.greek_clusterer.predict(
                        [el_nlp.tokenizer(text).vector])[0])
                greek_clusters[predicted]['nodes'].append(discussion['id'])
                greek_clusters[predicted]['texts'].append(text)
                greek_clusters[predicted][
                    'medoid_text'] = ArgumentClusterer.greek_clusterer.__medoid_texts[
                        predicted]
        # Run textrank on non-empty aggregated text from each cluster for each language.
        for en_cluster in english_clusters.keys():
            en_text = '. '.join(english_clusters[en_cluster]['texts'])
            if en_text != '':
                en_doc = run_textrank(en_text, en_nlp)
                english_clusters[en_cluster]['summary'] = text_summarization(
                    en_doc, en_nlp, config.top_n, config.top_sent)
        for el_cluster in greek_clusters.keys():
            el_text = '. '.join(greek_clusters[el_cluster]['texts'])
            if el_text != '':
                el_doc = run_textrank(el_text, el_nlp)
                greek_clusters[el_cluster]['summary'] = text_summarization(
                    el_doc, el_nlp, config.top_n, config.top_sent)
        return {
            'greek_clusters': greek_clusters,
            'english_clusters': english_clusters
        }

    @staticmethod
    @counter
    def fit_clusterers(discussions, lang_det, en_nlp, el_nlp):
        # Split discussion texts by detected language, then fit one
        # clusterer per language that has more than two texts.
        english_clusterer = None
        greek_clusterer = None
        english_texts, greek_texts = [], []
        for discussion in discussions:
            if discussion['Position'] in ['Issue']:
                continue
            text = discussion['DiscussionText']
            language = detect_language(lang_det, text)
            text = remove_punctuation_and_whitespace(text)
            if language == 'english':
                english_texts.append(text)
            elif language == 'greek':
                greek_texts.append(text)
        if len(english_texts) > 2:
            # Initialize the English Clusterer.
            english_clusterer = ArgumentClusterer()
            # Calculate the embeddings for each text of this discussion.
            english_embeddings = [
                en_nlp.tokenizer(text).vector for text in english_texts
            ]
            # Fit the clusterer using the textual embeddings of this discussion.
            english_clusterer.fit(english_embeddings, 'english.pdf')
            # Find the medoids of each cluster from each language.
            english_clusterer.__medoid_texts = {
                str(english_clusterer.__clusterer.labels_[i]):
                english_texts[i]
                for i in english_clusterer.__clusterer.medoid_indices_
            }
        if len(greek_texts) > 2:
            # Initialize the Greek Clusterer.
            greek_clusterer = ArgumentClusterer()
            # Calculate the embeddings for each text of this discussion.
            greek_embeddings = [
                el_nlp.tokenizer(text).vector for text in greek_texts
            ]
            # Fit the clusterer using the textual embeddings of this discussion.
            greek_clusterer.fit(greek_embeddings, 'greek.pdf')
            # Find the medoids of each cluster from each language.
            greek_clusterer.__medoid_texts = {
                str(greek_clusterer.__clusterer.labels_[i]): greek_texts[i]
                for i in greek_clusterer.__clusterer.medoid_indices_
            }
        ArgumentClusterer.english_clusterer = english_clusterer
        ArgumentClusterer.greek_clusterer = greek_clusterer
def clustering(self, clusterNo): km = KMedoids(clusterNo, metric='precomputed', init='k-medoids++') km.fit(self.metric) self.labels_ = km.labels_[:len(self.classes)]
alpha=0.9).run() scores = orig_scores.copy() cos_dists = squareform(pdist(X, metric='cosine')) np.fill_diagonal(scores, np.inf) # >> k = int(0.1 * n_train) print('-' * 50) # KMedoids kmed = KMedoids(n_clusters=k, metric='precomputed', max_iter=1000) train = kmed.fit(orig_scores.max() - orig_scores).medoid_indices_ pred_idx = orig_scores[:, train].argmax(axis=-1) print('kmedoid (diff)', metric_fn(y, y[train][pred_idx])) # Heuristic train = scores.mean(axis=0).argsort()[-k:] pred_idx = orig_scores[:, train].argmax(axis=-1) print('heuristic (diff)', metric_fn(y, y[train][pred_idx])) # Random train = np.random.choice(X.shape[0], k, replace=False) pred_idx = orig_scores[:, train].argmax(axis=-1) print('random (diff)', metric_fn(y, y[train][pred_idx])) print('-' * 50)
# Silhouette curve for the K-Means sweep computed earlier in the file.
plt.style.use("fivethirtyeight")
plt.plot(range(2, 11), kmeans_silhouette_coefficients)
plt.xticks(range(2, 11))
plt.title("K-Means")
plt.xlabel("Number of Clusters")
plt.ylabel("Silhouette Coefficient")
plt.show()

# Clustering - Kmedoids (initial approach with 3 clusters)
kmedoids = KMedoids(
    metric="euclidean",
    n_clusters=3,
)
kmedoids.fit(standardized_features)

# The following bare attribute expressions are notebook-style no-ops in
# a plain script; they only display values in an interactive session.
# The lowest Sum of Squared Error (SSE) value
kmedoids.inertia_
# Final locations of the centroid
kmedoids.cluster_centers_
# The number of iterations required to converge
kmedoids.n_iter_

# How many clusters should be calculated?
# Using elbow method
print("="*27)
print("Clustering using K-Medoids")
print("="*27)
class DFKMedoids(BaseEstimator, ClusterMixin):
    # DataFrame-friendly wrapper around KMedoids: restricts fitting to a
    # column subset, optionally sweeps 1..n_clusters to collect
    # evaluation metrics, and returns labelled/probability DataFrames.
    def __init__(self, cluster_name='KMedoids', columns=None,
                 eval_inertia=False, eval_silhouette=False, eval_chi=False,
                 eval_dbi=False, eval_sample_size=None, **kwargs):
        self.cluster_name = cluster_name
        self.columns = columns
        self.model = KMedoids(**kwargs)
        self.eval_inertia = eval_inertia
        self.eval_silhouette = eval_silhouette
        self.eval_chi = eval_chi
        self.eval_dbi = eval_dbi
        self.eval_sample_size = eval_sample_size
        # Set by fit(): columns actually used, evaluation results,
        # and the centroid DataFrame.
        self.transform_cols = None
        self.eval_df = None
        self.centroid_df = None

    def fit(self, X, y=None):
        # If any eval flag is set, sweep cluster counts and record
        # metrics instead of training the final model.
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]

        # Evaluation
        if any([self.eval_inertia, self.eval_silhouette, self.eval_chi,
                self.eval_dbi]):
            inertias = []
            silhouettes = []
            chis = []
            dbis = []

            self.eval_df = pd.DataFrame({
                'n_cluster': [x+1 for x in range(self.model.n_clusters)],
            })
            self.eval_df['centroid'] = self.eval_df['n_cluster'].apply(lambda x: [])

            tmp_X = X[self.transform_cols].copy()
            index = 0
            for n_cluster in tqdm(self.eval_df['n_cluster'].values):
                # Clone the configured model so each sweep step is
                # independent, then override only the cluster count.
                model = copy.deepcopy(self.model)
                model.n_clusters = n_cluster
                model.fit(tmp_X)

                # Cluster centroid
                self.eval_df.at[index, 'centroid'] = model.cluster_centers_

                # Reference: https://blog.cambridgespark.com/how-to-determine-the-optimal-number-of-clusters-for-k-means-clustering-14f27070048f
                if self.eval_inertia:
                    inertias.append(model.inertia_)

                # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
                # Silhouette/CHI/DBI are undefined for a single cluster.
                if self.eval_silhouette:
                    silhouettes.append(np.nan if n_cluster <= 1 else silhouette_score(tmp_X, model.labels_, sample_size=self.eval_sample_size, metric='euclidean', random_state=model.random_state))

                # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
                if self.eval_chi:
                    chis.append(np.nan if n_cluster <= 1 else calinski_harabasz_score(tmp_X, model.labels_))

                # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
                if self.eval_dbi:
                    dbis.append(np.nan if n_cluster <= 1 else davies_bouldin_score(tmp_X, model.labels_))

                index += 1

            if self.eval_inertia:
                self.eval_df['inertia'] = inertias

            if self.eval_silhouette:
                self.eval_df['silhouette'] = silhouettes

            if self.eval_chi:
                self.eval_df['calinski_harabasz'] = chis

            if self.eval_dbi:
                self.eval_df['davies_bouldin'] = dbis

        # Train
        else:
            self.model.fit(X[self.transform_cols])
            # Expose centroids as a DataFrame indexed "Cluster i".
            self.centroid_df = pd.DataFrame(
                self.model.cluster_centers_,
                columns=self.transform_cols
            )
            self.centroid_df['Cluster'] = [f'Cluster {x}' for x in np.unique(self.model.labels_)]
            self.centroid_df.set_index('Cluster', inplace=True)
            self.centroid_df.index.name = None

        return self

    def predict(self, X):
        # Returns a copy of X with a "Cluster i" label column appended.
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        new_X = X.copy()
        new_X[self.cluster_name] = self.model.predict(X[self.transform_cols])
        new_X[self.cluster_name] = 'Cluster ' + new_X[self.cluster_name].astype(str)

        return new_X

    def fit_predict(self, X, y=None):
        return self.fit(X).predict(X)

    def predict_proba(self, X):
        # Pseudo-probabilities: inverse share of each sample's Euclidean
        # distance to every centroid (closer centroid -> higher value).
        if self.transform_cols is None:
            raise NotFittedError(f"This {self.__class__.__name__} instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.")

        # Measure distance to centroid
        prob_df = pd.DataFrame(
            DistanceMetric.get_metric('euclidean').pairwise(X[self.transform_cols], self.centroid_df),
            columns=[f'{self.cluster_name} Cluster {x}' for x in range(len(self.centroid_df))]
        )

        # Convert to probability
        prob_df = prob_df.divide(prob_df.sum(axis=1), axis=0)
        prob_df = 1 - prob_df

        new_X = pd.concat([X, prob_df], axis=1)

        return new_X