def fit(self, data, norm=False):
    """Cluster samples with spherical k-means and select a "main" cluster.

    Args:
        data (numpy.ndarray): [m, n] array — m samples, each of dimension n.
        norm (bool): False (default) centers the data and L2-normalizes it
            before clustering; True assumes ``data`` is already preprocessed.

    Returns:
        tuple: (cluster centers, per-sample cluster ids, main cluster id).
    """
    if not norm:
        # Center the samples, then project them onto the unit sphere.
        self.mean = np.mean(data, axis=0)
        centered = data - self.mean
        process_data = self.l2norm(centered)
    else:
        process_data = data

    skm = SphericalKMeans(n_clusters=self.ncluster, verbose=0)
    skm.fit(process_data)
    self.clusters = skm.cluster_centers_
    self.clusterid = skm.labels_
    self.loss = skm.inertia_

    # Mean cosine similarity of each cluster's members to its own center
    # (data and centers are unit-norm, so the dot product is the cosine).
    scores = []
    for i in range(self.ncluster):
        idxs = np.where(self.clusterid == i)[0].tolist()
        cluster_data = process_data[idxs, :]
        confs = np.dot(cluster_data, self.clusters[i, :].T)
        scores.append(np.mean(confs))

    # NOTE(review): argmin picks the *least* coherent cluster as "main" —
    # confirm this is intentional (argmax would pick the tightest cluster).
    self.main_id = np.argmin(scores)
    return self.clusters, self.clusterid, self.main_id
def calc_logit_regress_stats(inputs, outputs, plot_name, K_SIZE):
    """Cluster `inputs` into K_SIZE spherical k-means clusters and save a
    stacked bar chart of how the `outputs` labels distribute over clusters.

    Args:
        inputs: sample vectors to cluster.
        outputs: one label (e.g. genre) per sample, aligned with `inputs`.
        plot_name: path the figure is written to.
        K_SIZE: number of clusters.
    """
    skm = SphericalKMeans(n_clusters=K_SIZE)
    skm.fit(inputs)
    input_labels = skm.labels_

    # Map each distinct output label to a row index of the count matrix.
    out_keys = list(set(outputs))
    out_idx_mapping = {out: idx for idx, out in enumerate(out_keys)}
    print(out_idx_mapping)

    # k_center_labels[label_row][cluster] = number of samples with that
    # label assigned to that cluster.
    k_center_labels = [[0] * K_SIZE for x in range(len(out_keys))]
    for k_c, lab in zip(input_labels, outputs):
        out_idx = out_idx_mapping[lab]
        k_center_labels[out_idx][k_c] += 1
    k_center_labels = np.asarray(k_center_labels)

    ind = np.arange(K_SIZE)
    plots = []
    bottom = np.zeros(K_SIZE)
    # BUG FIX: draw on a fresh figure and close it afterwards — otherwise
    # repeated calls keep stacking bars onto the same implicit figure.
    plt.figure()
    for x in range(len(out_keys)):
        plots.append(plt.bar(ind, k_center_labels[x], bottom=bottom))
        bottom += k_center_labels[x]
    plt.title('Song genres in spherical k-means clusters')
    plt.xticks(ind, ["K" + str(i + 1) for i in range(K_SIZE)])
    plt.legend(plots, out_keys)
    plt.savefig(plot_name)
    plt.close()
def perform_clustering(seed, m_data, labels, n_clusters):
    """Cluster two data views with single- and multi-view spherical k-means
    and report NMI against the true labels.

    Args:
        seed: random state for reproducibility.
        m_data: sequence of two view matrices (samples aligned across views).
        labels: ground-truth class labels.
        n_clusters: number of clusters.

    Returns:
        The multiview cluster assignments.
    """
    # --- Singleview clustering: each view alone, then both concatenated ---
    single_km = SphericalKMeans(n_clusters=n_clusters, random_state=seed, n_init=100)
    view1_clusters = single_km.fit_predict(m_data[0])
    view2_clusters = single_km.fit_predict(m_data[1])
    concat_clusters = single_km.fit_predict(np.hstack(m_data))

    print('Singleview View 1 NMI Score: {0:.3f}\n'.format(nmi_score(labels, view1_clusters)))
    print('Singleview View 2 NMI Score: {0:.3f}\n'.format(nmi_score(labels, view2_clusters)))
    print('Singleview Concatenated NMI Score: {0:.3f}\n'.format(nmi_score(labels, concat_clusters)))

    # --- Multiview clustering over both views jointly ---
    multi_km = MultiviewSphericalKMeans(n_clusters=n_clusters, n_init=100, random_state=seed)
    m_clusters = multi_km.fit_predict(m_data)
    print('Multiview NMI Score: {0:.3f}\n'.format(nmi_score(labels, m_clusters)))
    return m_clusters
def testSpericalKMeans():
    """Smoke test: fit spherical k-means (3 clusters) on the module-level
    data matrix X (n_examples x n_features) and print the labels."""
    model = SphericalKMeans(n_clusters=3)
    model.fit(X)
    print(model.labels_)
def term_clustering(terms: List[str], wv: Dict[str, np.ndarray], n_clusters: int) -> Tuple[List[int], List[str]]:
    """Use spherical k-means to cluster word vectors.

    Args:
        terms: A list of terms to cluster.
        wv: A dictionary of word to their vectors.
        n_clusters: Number of output clusters.

    Returns:
        labels: A list of clustering assignment for each word.
        terms: A list of words, aligned with labels.
    """
    X = []
    X_terms = []
    n_out_of_vocab = 0
    logger.debug(f"#wv {len(wv)}")
    logger.debug(terms[:20])
    for term in terms:
        # Skip (and count) terms with no embedding instead of failing.
        try:
            emb = wv[term]
            X.append(emb)
            X_terms.append(term)
        except KeyError:
            n_out_of_vocab += 1
    logger.warning(f"{n_out_of_vocab} / {len(terms)} words out of vocab")
    logger.info(f"Clustering {len(X)} words")
    clus = SphericalKMeans(n_clusters=n_clusters)
    clus.fit(X)
    logger.info(f"Clustering complete")
    return clus.labels_, X_terms
def __init__(self, data, n_cluster):
    """Set up a spherical k-means clusterer over `data` with `n_cluster` groups."""
    self.data = data
    self.n_cluster = n_cluster
    self.clus = SphericalKMeans(n_cluster)
    # Filled in by fitting: cluster id -> list of member indices.
    self.clusters = defaultdict(list)
    # Per-point cluster assignment, same order as `data`.
    self.membership = None
    # Ids of the data points chosen as cluster centers.
    self.center_ids = None
    self.inertia_scores = None
def doc_clustering(model, cluster_num):
    """Cluster all document vectors of a doc2vec `model` into `cluster_num`
    groups with spherical k-means and return the fitted clusterer."""
    n_docs = len(model.docvecs.doctags.keys())
    # Document tags follow the 'a_<1-based index>' naming scheme.
    vectors = np.array(
        [model.docvecs['a_' + str(i + 1)] for i in range(n_docs)])
    clusterer = SphericalKMeans(cluster_num)
    print('Start clustering...')
    clusterer.fit(vectors)
    print('Done.')
    return clusterer
def __init__(self, data, n_cluster):
    """Set up a spherical k-means clusterer plus bookkeeping containers."""
    self.data = data
    self.n_cluster = n_cluster
    self.clus = SphericalKMeans(n_cluster)
    # Filled in by fitting: cluster id -> list of member indices.
    self.clusters = defaultdict(list)
    # Per-point cluster assignment, same order as `data`.
    self.membership = None
    # Ids of the data points chosen as cluster centers.
    self.center_ids = None
    self.inertia_scores = None
    # Optional remapping of cluster ids (old id -> new id).
    self.old2new_clusterid = None
def kmeans_codebook(patches, k=30):
    """Learn a codebook of `k` centroid patches via spherical k-means.

    `patches` is a stack of equally-shaped 2-D patches; the centroids are
    reshaped back to the patch shape before being returned.
    """
    rows, cols = patches[0].shape
    flat = patches.reshape(-1, rows * cols)
    model = SphericalKMeans(k)
    model.fit(flat)
    return model.cluster_centers_.reshape(-1, rows, cols)
def _init_match(self):
    """Cluster question embeddings and attach a cluster label to each qid.

    Rows whose embedding is the zero vector are excluded from fitting and
    receive the sentinel label -1.
    """
    skm = SphericalKMeans(n_clusters=self.config['cluster_nums'],
                          init='k-means++', n_init=20)
    frame = self.data
    # Keep only rows whose embedding has a strictly positive norm.
    frame = frame[frame['qs_embed'].apply(lambda v: np.linalg.norm(v) > 0)]
    skm.fit(frame['qs_embed'].tolist())
    frame['skm_label'] = skm.labels_
    frame = frame[['qid', 'skm_label']]
    self.data = pd.merge(self.data, frame, how='left', on=['qid'])
    # Filtered-out rows get no label from the merge — fill with -1.
    self.data['skm_label'] = self.data['skm_label'].fillna(-1)
    self._cluster_centers = skm.cluster_centers_
def cluster(self, docs, k):
    """Embed each doc via its extracted keywords, then cluster into k groups.

    Returns:
        (labels, words): per-doc cluster labels and per-doc keyword lists.
    """
    vecs = []
    words = []
    processed = 0
    for doc in docs:
        processed += 1
        keywords = self.extract_keywords(doc)
        words.append(keywords)
        vecs.append(self.sent2vec(keywords))
    print('processing doc {} over.'.format(processed))
    skm = SphericalKMeans(n_clusters=k)
    fitted = skm.fit(np.array(vecs))
    return fitted.labels_, words
def SphericalkMeansCluster(X, nfclusters):
    """Cluster rows of X with spherical k-means.

    Args:
        X: data matrix (n_examples x n_features), one example per row.
        nfclusters: number of clusters.

    Returns:
        Per-example cluster labels.
    """
    skm = SphericalKMeans(nfclusters)
    skm.fit(X)
    return skm.labels_
class Clusterer:
    """Spherical k-means wrapper that tracks each cluster's members and the
    member closest to each centroid (the cluster's "medoid")."""

    def __init__(self, data, n_cluster):
        self.data = data
        self.n_cluster = n_cluster
        self.clus = SphericalKMeans(n_cluster)
        self.clusters = defaultdict(list)  # cluster id -> member indices
        self.membership = None  # per-point cluster assignments
        self.center_ids = None  # list of (cluster_id, medoid index) pairs
        self.inertia_scores = None

    def fit(self):
        """Run clustering and populate membership, medoids and inertia."""
        # BUG FIX: removed leftover debug print("bbbbbbb") statements.
        self.clus.fit(self.data)
        labels = self.clus.labels_
        for idx, label in enumerate(labels):
            self.clusters[label].append(idx)
        self.membership = labels
        self.center_ids = self.gen_center_idx()
        self.inertia_scores = self.clus.inertia_
        print('Clustering concentration score:', self.inertia_scores)

    def gen_center_idx(self):
        """Return (cluster_id, index of the member closest to its centroid)."""
        ret = []
        for cluster_id in range(self.n_cluster):
            center_idx = self.find_center_idx_for_one_cluster(cluster_id)
            ret.append((cluster_id, center_idx))
        return ret

    def find_center_idx_for_one_cluster(self, cluster_id):
        """Find the member of `cluster_id` most cosine-similar to its center.

        Returns -1 if the cluster has no members.
        """
        query_vec = self.clus.cluster_centers_[cluster_id]
        members = self.clusters[cluster_id]
        best_similarity, ret = -1, -1
        for member_idx in members:
            member_vec = self.data[member_idx]
            cosine_sim = self.calc_cosine(query_vec, member_vec)
            if cosine_sim > best_similarity:
                best_similarity = cosine_sim
                ret = member_idx
        return ret

    def calc_cosine(self, vec_a, vec_b):
        """Cosine similarity between two vectors (1 - cosine distance)."""
        return 1 - cosine(vec_a, vec_b)
def cluster(self, rounds=20, clust_range=None, num_cores=1, threshold=0.1, embeddings=None):
    """Cluster speaker embeddings: pick the cluster count with the Top-2
    silhouette heuristic, then assign labels with spherical k-means.

    Args:
        rounds: number of Top2S rounds.
        clust_range: [low, high] range of cluster counts to try (default [2, 12]).
        num_cores: worker processes for Top2S.
        threshold: Top2S decision threshold.
        embeddings: optional embeddings; falls back to self.embeddings_.

    Raises:
        RuntimeError: if no embeddings are available at all.
    """
    # BUG FIX: replaced mutable default arguments ([2, 12] and []) with
    # None sentinels; behavior is unchanged for all existing callers.
    if clust_range is None:
        clust_range = [2, 12]
    if embeddings is None:
        embeddings = []

    if (len(self.embeddings_) == 0) and (len(embeddings) == 0):
        raise RuntimeError("No speaker embeddings available.")
    # Prefer explicitly supplied embeddings; otherwise use the stored ones.
    if len(embeddings) == 0:
        embeddings = self.embeddings_
    else:
        self.embeddings_ = embeddings

    # Top Two Silhouettes: choose the optimal number of centers.
    opt_center_num, center_dict = Top2S(embeddings, clust_range=clust_range,
                                        rounds=rounds, num_cores=num_cores,
                                        threshold=threshold)
    self.centers_ = center_dict
    self.opt_speaker_num_ = opt_center_num

    # One spherical k-means pass seeded with the chosen centers just to
    # obtain per-embedding labels (max_iter=1: centers are already final).
    spkmeans = SphericalKMeans(n_clusters=len(center_dict[opt_center_num]),
                               init=center_dict[opt_center_num],
                               max_iter=1, n_init=1, n_jobs=1).fit(embeddings)
    # Speaker labels are 1-based.
    self.speaker_labels_ = spkmeans.labels_ + 1
def fit(
    self, data: List[Iterator[float]], find_n: bool = False
) -> Dict[str, Union[List[int], Union[float, None]]]:
    """Cluster the input data into n clusters.

    Args:
        data: A list of vectors.
        find_n: If True, don't use self.n_cluster but find n using elbow
            analysis instead

    Return:
        A list of integers as class labels. The order of the list
        corresponds to the order of the input data.
    """
    if find_n:
        # NOTE(review): hard-coded 5 is a placeholder for the commented-out
        # elbow analysis (self._get_n()) — confirm before relying on find_n.
        self.n_clusters = 5  # self._get_n()
    if self.clus_type == 'kmeans':
        # NOTE(review): sklearn's `k_means` is a function, not an estimator
        # class — if that is what's imported here, the later
        # self.cluster.fit(...) call would fail; verify the 'kmeans' path.
        self.cluster = k_means(n_clusters=self.n_clusters)
    elif self.clus_type == 'sphericalkmeans':
        self.cluster = SphericalKMeans(n_clusters=self.n_clusters)
    elif self.clus_type == 'agglomerative':
        self.cluster = AgglomerativeClustering(n_clusters=self.n_clusters,
                                               affinity=self.affinity,
                                               linkage=self.linkage)
    self.cluster.fit(data)
    # Density/compactness is computed as a side effect and returned
    # alongside the labels.
    self._calc_density()
    return {'labels': self.cluster.labels_, 'density': self.compactness}
def create_clustering_methods(ngroups, g_matrix, n_init):
    """Build the benchmark's name -> (estimator, distance function) mapping.

    Every method shares the same spherical distance measure parameterized
    by `g_matrix`.
    """
    sphere_dist = partial(dist_on_sphere, g_matrix=g_matrix)
    clustering_methods = {
        "kmsim": (KMeansSim(n_clusters=ngroups, g_matrix=g_matrix,
                            n_init=n_init), sphere_dist),
        "krbsim": (RepeatedBisectionSim(n_clusters=ngroups, g_matrix=g_matrix,
                                        n_init=n_init, bm='agg'), sphere_dist),
        "skm": (SphericalKMeans(n_clusters=ngroups, n_init=n_init),
                sphere_dist),
        "km": (KMeans(n_clusters=ngroups, n_init=n_init), sphere_dist),
        "lgr": (sklearn.linear_model.LogisticRegression(
            random_state=0, solver='lbfgs', multi_class='multinomial',
            max_iter=500), sphere_dist),
    }
    return clustering_methods
def __init__(self, data, n_cluster, method="soft-movMF", init="random-class", n_init=10, n_jobs=1):
    """Set up a directional clustering model.

    Args:
        data: the vectors to cluster.
        n_cluster: number of clusters.
        method: "spk" (spherical k-means), "hard-movMF" or "soft-movMF"
            (von Mises-Fisher mixtures).
        init, n_init, n_jobs: passed through to VonMisesFisherMixture.

    Raises:
        ValueError: if `method` is not one of the supported names.
    """
    self.data = data
    self.n_cluster = n_cluster
    self.method = method
    if method == "spk":
        self.clus = SphericalKMeans(n_clusters=n_cluster)
    elif method == "hard-movMF":
        self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                          posterior_type='hard', init=init,
                                          n_init=n_init, n_jobs=n_jobs)
    elif method == "soft-movMF":
        self.clus = VonMisesFisherMixture(n_clusters=n_cluster,
                                          posterior_type='soft', init=init,
                                          n_init=n_init, n_jobs=n_jobs)
    else:
        # BUG FIX: fail fast instead of silently leaving self.clus unset,
        # which previously surfaced later as a confusing AttributeError.
        raise ValueError(f"Unknown clustering method: {method!r}")
    self.clusters = {
    }  # cluster id -> dict(element_id: distance to center)
    self.clusters_phrase = {}  # cluster id -> representative words
    self.membership = None  # a list contain the membership of the data points
    self.center_ids = None  # a list contain the ids of the cluster centers
    self.inertia_scores = None
def visualize(self, indices=None, center_num=0, ref_labels=None, use_colors=True):
    """Scatter-plot a 2-D t-SNE projection of the embeddings, one color per
    speaker.

    Args:
        indices: subset of embeddings to show (default: all of them).
        center_num: which stored center configuration to use (default: the
            optimal speaker count).
        ref_labels: optional reference labels to plot instead of predictions.
        use_colors: color points by speaker; otherwise draw them all black.

    Raises:
        RuntimeError: if clustering has not been performed yet.
    """
    # BUG FIX: mutable default arguments replaced with None sentinels;
    # behavior is identical for callers passing lists (including []).
    if indices is None:
        indices = []
    if ref_labels is None:
        ref_labels = []

    if len(indices) == 0:
        indices = np.arange(len(self.embeddings_))
    if center_num == 0:
        center_num = self.opt_speaker_num_

    if len(ref_labels) != 0:
        speaker_labels = ref_labels
    else:
        # Re-derive labels for the requested center configuration
        # (max_iter=1: centers are already final).
        spkmeans = SphericalKMeans(n_clusters=len(self.centers_[center_num]),
                                   init=self.centers_[center_num],
                                   max_iter=1, n_init=1,
                                   n_jobs=1).fit(self.embeddings_[indices])
        speaker_labels = spkmeans.labels_ + 1

    if len(self.speaker_labels_) == 0:
        raise RuntimeError("Clustering not performed.")

    # Compute TSNE only once; it is cached on the instance.
    if len(self.emb_2d_) == 0:
        print("Computing TSNE transform...")
        tsne = TSNE(n_jobs=4)
        self.emb_2d_ = tsne.fit_transform(self.embeddings_)

    emb_2d = self.emb_2d_[indices]
    # BUG FIX: np.int was removed from NumPy (deprecated since 1.20); also
    # go through np.asarray so list-valued ref_labels don't crash .astype.
    speaker_labels = np.asarray(speaker_labels).astype(int)
    speakers = np.unique(speaker_labels)
    colors = cm.rainbow(np.linspace(0, 1, len(speakers)))
    plt.figure(figsize=(7, 7))
    for speaker in speakers:
        speak_ind = np.where(speaker_labels == speaker)[0]
        x, y = np.transpose(emb_2d[speak_ind])
        if use_colors:
            # Assumes speaker labels are 1-based, hence the -1 color
            # index — TODO confirm for externally supplied ref_labels.
            plt.scatter(x, y, c="k", edgecolors=colors[speaker - 1], s=2,
                        label=speaker)
        else:
            plt.scatter(x, y, c="k", edgecolors="k", s=2, label=speaker)
    plt.legend(title="Speakers", prop={'size': 10})
    if len(ref_labels) == 0:
        plt.title("Predicted speaker clusters")
    else:
        plt.title("Reference speaker clusters")
    plt.show()
def cluster_doc(doc_emb, K, method):
    """Cluster document embeddings into K groups.

    method: "kmeans" for standard k-means, "skmeans" for spherical k-means.
    Any other value returns an empty label list.
    """
    y_pred = []
    if method == "kmeans":
        print("Clustering using K-Means")
        from sklearn.cluster import KMeans
        model = KMeans(n_clusters=K, n_init=1)
        model.fit(doc_emb)
        y_pred = model.labels_
    elif method == "skmeans":
        print("Clustering using Spherical K-Means")
        from spherecluster import SphericalKMeans
        model = SphericalKMeans(n_clusters=K, n_init=1)
        model.fit(doc_emb)
        y_pred = model.labels_
    return y_pred
def silh_score(emb, guess, mode=0):
    """Fit spherical k-means with `guess` clusters and score the result.

    mode 0 (default) returns (silhouette, labels, centers); any other mode
    returns only the cosine silhouette score.
    """
    model = SphericalKMeans(n_clusters=guess, max_iter=300,
                            n_init=1, n_jobs=1).fit(emb)
    labels = model.labels_
    score = silhouette_score(emb, labels, metric="cosine")
    if mode == 0:
        return score, labels, model.cluster_centers_
    return score
def SphericalKMeans_model(vocab_embeddings, vocab, topics, rerank, rand, weights):
    """Cluster vocabulary embeddings into `topics` spherical k-means topics
    and collect the top words per topic (100 when reranking, else 10).

    Returns:
        (cluster assignments, list of per-topic top-word lists).
    """
    spkmeans = SphericalKMeans(n_clusters=topics, random_state=rand).fit(
        vocab_embeddings, sample_weight=weights)
    m_clusters = spkmeans.predict(vocab_embeddings, sample_weight=weights)
    centers = np.array(spkmeans.cluster_centers_)
    # Reranking mode keeps a larger candidate pool per topic.
    pool_size = 100 if rerank else 10
    indices = []
    for topic_id in range(topics):
        ranked = sort_closest_cossine_center(centers[topic_id], m_clusters,
                                             vocab_embeddings, topic_id)
        indices.append(find_top_k_words(pool_size, ranked, vocab))
    return m_clusters, indices
def initialize(self):
    """Initialize the factor matrices: reuse G1/G2 from NMTF1, seed G3 via
    spherical k-means on R23, then derive the S12 and S23 core matrices."""
    # Keep only the training entries of R12 (M is presumably a binary
    # train mask — TODO confirm against where M is built).
    self.R12_train = np.multiply(NMTF2.R12, self.M)
    """spherical k-means"""
    skm3 = SphericalKMeans(n_clusters=self.K[2])
    skm3.fit(NMTF2.R23)
    # Reload matrices that have already been used before
    self.G1 = NMTF1.G1
    self.G2 = NMTF1.G2
    # Transposed cluster centers serve as the initial G3 factor.
    self.G3 = skm3.cluster_centers_.transpose()
    self.S12 = np.linalg.multi_dot(
        [self.G1.transpose(), self.R12_train, self.G2])
    self.S23 = np.linalg.multi_dot(
        [self.G2.transpose(), NMTF2.R23, self.G3])
    # Save G3 for the next models
    NMTF2.G3 = self.G3
def semantic_sim_driver(self, time_mapping, log_filename="yao_test1.txt"):
    """Evaluate temporal word embeddings on the Yao test set.

    Clusters the test words' embeddings with spherical k-means for several
    cluster counts, scores the clusterings against the test labels, and
    writes the tab-separated scores to `log_filename`.
    """
    df = pd.read_csv("eval/yao/testset_1.csv")
    # BUG FIX: use real column assignment (df['col'] = ...) — assigning to
    # a DataFrame attribute (df.real_year = ...) does NOT create a column.
    try:
        df['real_year'] = df.year.apply(lambda x: int(time_mapping[str(x)]))
    except Exception as e:
        # Best-effort fallback: decade-style keys such as "1990s".
        print(e)
        print(time_mapping.keys())
        print(df.year.unique())
        df['real_year'] = df.year.apply(
            lambda x: int(time_mapping[str(x // 10 * 10) + "s"]))
    labels = set(df.label.unique())
    labels_mapping = {label: index for index, label in enumerate(labels)}
    df['label_id'] = df.label.apply(lambda x: labels_mapping[x])
    embeddings, known_index = self.get_embedding_in_a_year(
        df.word, df['real_year'].tolist(), return_known_index=True)
    from spherecluster import SphericalKMeans
    scores = []
    for n in [10, 15, 20]:
        skm = SphericalKMeans(n_clusters=n)
        skm.fit(embeddings)
        # Only words whose embedding exists contribute to the score.
        scores.append(get_score(skm.labels_, df['label_id'][known_index]))
        scores.append(get_score1(skm.labels_, df['label_id'][known_index]))
    print(scores)
    with open(log_filename, "w", encoding="utf-8") as f:
        line = "\t".join(["{0:.4f}".format(s) for s in scores]) + "\n"
        print(line)
        f.write(line)
    return None
def get_topic_vecs(model, n_topics=20):
    """Return the topic vectors of a doc2vec model.

    The topic vectors are the centroids found by spherical k-means over all
    document vectors — "virtual" documents averaging groups of similar ones.

    Arguments:
        - (gensim.models.doc2vec.Doc2Vec) model: A doc2vec model
        - (<float>) n_topics: The number of topics that should be found,
            defaults to 20.

    Returns:
        - (numpy.ndarray) topics: The topic vectors of the model
    """
    from spherecluster import SphericalKMeans
    clusterer = SphericalKMeans(n_clusters=n_topics)
    # All document vectors of the model as a numpy array.
    doc_vectors = model.docvecs.vectors_docs
    clusterer.fit(doc_vectors)
    # The group centroids are the topic vectors.
    return clusterer.cluster_centers_
def Silhouette(X, seguradora):
    """Sweep cluster counts with spherical k-means, plot the average
    silhouette per count, and return the best-scoring count.

    Args:
        X: data matrix to cluster.
        seguradora: insurance account identifier used for labeling output.

    Returns:
        The cluster count with the highest average silhouette score.
    """
    insurance_label = dbm.GetAccountLabel(seguradora)
    # Try between 2 and min(len(X), 11) - 1 clusters.
    maxx = min(len(X), 11)
    range_of_clusters = list(range(2, maxx))
    clusters_silhouette = dict()
    for n_clusters in range_of_clusters:
        # Fixed random seed for reproducibility.
        clusterer = SKMeans(n_clusters=n_clusters, random_state=0)
        cluster_labels = clusterer.fit_predict(X)
        # Average silhouette: density and separation of the clustering.
        silhouette_avg = silhouette_score(X, cluster_labels)
        clusters_silhouette[n_clusters] = silhouette_avg
        # Per-sample silhouette values (computed but not used further).
        sample_silhouette_values = silhouette_samples(X, cluster_labels)
    plt.title('Silhueta media de %s' % insurance_label)
    plt.xlabel('Numero de clusters', fontsize=16)
    plt.ylabel("Silhueta media", fontsize=16)
    # Materialize the dict views so matplotlib receives plain sequences.
    plt.plot(list(clusters_silhouette.keys()),
             list(clusters_silhouette.values()))
    plt.savefig("../analytics/%s/%s_silhuette.png"
                % (insurance_label, insurance_label))
    plt.close()
    # BUG FIX: dict.iteritems() is Python 2 only and crashes on Python 3;
    # select the argmax key directly instead.
    return max(clusters_silhouette, key=clusters_silhouette.get)
def initialize(self):
    """Initialize the factors of the 5-matrix model: reuse G1..G4 from the
    earlier NMTF models, seed G5 via spherical k-means on R25, then derive
    the S12/S23/S34/S25 core matrices."""
    # Keep only the training entries of R12 (M is presumably a binary
    # train mask — TODO confirm against where M is built).
    self.R12_train = np.multiply(NMTF5.R12, self.M)
    """spherical k-means"""
    skm5 = SphericalKMeans(n_clusters=self.K[4])
    skm5.fit(NMTF5.R25)
    # Factors already computed by the previous NMTF models.
    self.G1 = NMTF1.G1
    self.G2 = NMTF1.G2
    self.G3 = NMTF2.G3
    self.G4 = NMTF3.G4
    # Transposed cluster centers serve as the initial G5 factor.
    self.G5 = skm5.cluster_centers_.transpose()
    self.S12 = np.linalg.multi_dot(
        [self.G1.transpose(), self.R12_train, self.G2])
    self.S23 = np.linalg.multi_dot(
        [self.G2.transpose(), NMTF5.R23, self.G3])
    self.S34 = np.linalg.multi_dot(
        [self.G3.transpose(), NMTF5.R34, self.G4])
    self.S25 = np.linalg.multi_dot(
        [self.G2.transpose(), NMTF5.R25, self.G5])
def score_embeddings(self, min_length, max_num_speakers):
    """ Score embeddings.

        Args:
            min_length (int): minimal length of segment used for clustering in miliseconds
            max_num_speakers (int): maximal number of speakers

        Returns:
            dict: dictionary with scores for each file
    """
    scores_dict = {}
    logger.info('Scoring using `{}`.'.format('PLDA' if self.plda is not None else 'cosine distance'))
    for embedding_set in self.embeddings:
        name = os.path.normpath(embedding_set.name)
        embeddings_all = embedding_set.get_all_embeddings()
        # Only segments at least `min_length` ms long are trusted for clustering.
        embeddings_long = embedding_set.get_longer_embeddings(min_length)
        if len(embeddings_long) == 0:
            logger.warning(
                'No embeddings found longer than {} for embedding set `{}`.'.format(min_length, name))
            continue
        size = len(embedding_set)
        if size > 0:
            logger.info('Clustering `{}` using {} long embeddings.'.format(name, len(embeddings_long)))
            if embedding_set.num_speakers is not None:
                # Known speaker count: k-means directly (spherical variant
                # when L2-normalized embeddings are in use).
                num_speakers = embedding_set.num_speakers
                if self.use_l2_norm:
                    kmeans_clustering = SphericalKMeans(
                        n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                else:
                    kmeans_clustering = sklearnKMeans(
                        n_clusters=num_speakers, n_init=1000, n_jobs=1).fit(embeddings_long)
                if self.plda is None:
                    centroids = kmeans_clustering.cluster_centers_
                else:
                    # Refine the k-means centroids with PLDA-based k-means.
                    centroids = PLDAKMeans(
                        kmeans_clustering.cluster_centers_, num_speakers, self.plda).fit(embeddings_long)
            else:
                # Unknown speaker count: estimate it with x-means (bounded
                # by max_num_speakers), then re-cluster with k-means.
                xm = xmeans(embeddings_long, kmax=max_num_speakers)
                xm.process()
                num_speakers = len(xm.get_clusters())
                kmeans_clustering = sklearnKMeans(
                    n_clusters=num_speakers, n_init=100, n_jobs=1).fit(embeddings_long)
                centroids = kmeans_clustering.cluster_centers_
            # Score every embedding against the centroids: s-norm if a
            # normalizer is configured, otherwise PLDA or plain cosine.
            if self.norm is None:
                if self.plda is None:
                    scores_dict[name] = cosine_similarity(embeddings_all, centroids).T
                else:
                    scores_dict[name] = self.plda.score(embeddings_all, centroids)
            else:
                scores_dict[name] = self.norm.s_norm(embeddings_all, centroids)
        else:
            logger.warning('No embeddings to score in `{}`.'.format(embedding_set.name))
    return scores_dict
def cluster_test(self, test_file, clusters=10):
    """Evaluate the temporal embeddings on a labeled word/section test set.

    Args:
        test_file: CSV with (word, section, year) rows.
        clusters: a single cluster count or an iterable of counts to try.

    Returns:
        dict mapping 'NMI(K)' and 'F_beta-score(K)' to the scores per K.
    """
    df_test1 = pd.read_csv(test_file)
    # BUG FIX: the default clusters=10 is an int, but the loop below
    # iterates over `clusters` — normalize an int to a one-element list.
    if isinstance(clusters, int):
        clusters = [clusters]
    output = {}
    for K in clusters:
        vectors = list()
        y_true = list()
        sections = dict()
        idx = 0
        for word, section, y in df_test1.values:
            sliceIdx = self.yearDict[str(y)]
            # Only words present in that time slice's vocabulary count.
            if word in self.vocabularies[sliceIdx]:
                if section not in sections:
                    sections[section] = idx
                    idx += 1
                y_true.append(sections[section])
                vectors.append(self.matrices_norm[sliceIdx][
                    self.vocabularies[sliceIdx][word]])
        skm = SphericalKMeans(n_clusters=K, max_iter=100000)
        skm.fit(np.array(vectors))
        # Predict once and reuse for both metrics.
        y_pred = skm.predict(np.array(vectors))
        metric = normalized_mutual_info_score(
            y_pred, y_true, average_method='arithmetic')
        # Pairwise same-cluster indicators for the F-beta metric.
        y_true_bool = [(triplet1 == triplet2)
                       for triplet2 in y_true for triplet1 in y_true]
        y_pred_bool = [(triplet1 == triplet2)
                       for triplet2 in y_pred for triplet1 in y_pred]
        metric2 = fbeta_score(y_true_bool, y_pred_bool, beta=5)
        output[f'NMI({K})'] = metric
        output[f'F_beta-score({K})'] = metric2
    return output
def sphericalKMeans(num):
    """Run spherical k-means with `num` clusters on a fixed 2-D toy dataset
    and append the labels and centers to Kmeans.txt.

    BUG FIX: the original immediately overwrote `num` with 4, silently
    ignoring the caller's argument.
    """
    points = np.array([[-4, -2], [-3, -2], [-2, -2], [-1, -2], [1, -1],
                       [1, 1], [2, 3], [3, 2], [3, 4], [4, 3]])
    kmeans = SphericalKMeans(n_clusters=num).fit(points)
    # Context manager guarantees the file is closed even if a write fails.
    with open("Kmeans.txt", "a+") as out:
        out.write("Spherical K means output for cluster size : " + str(num) + "\n")
        out.write("Clusters index of points" + "\n")
        out.write(str(kmeans.labels_) + "\n")
        out.write("Center of Clusters\n")
        out.write(str(kmeans.cluster_centers_) + "\n")
def analyse(methode, preproc, true_label, nb_clusters=3, normalizer=True, scikit=True):
    """Fit spherical and standard k-means on (optionally transformed and
    L2-normalized) data, plot elbow curves, and print clustering scores.

    Args:
        methode: a scikit-learn transformer applied when `scikit` is True.
        preproc: the preprocessed input data.
        true_label: ground-truth labels for scoring.
        nb_clusters: number of clusters for both models.
        normalizer: L2-normalize the data before clustering.
        scikit: whether to run `methode.fit_transform` on `preproc`.

    Returns:
        (methode, skmeans, kmeans, data).
    """
    if scikit:
        data = methode.fit_transform(preproc)
    else:
        data = preproc
    if normalizer:
        data = Normalizer(norm='l2', copy=False).fit_transform(data)
    skplt.cluster.plot_elbow_curve(
        SphericalKMeans(random_state=42, n_jobs=-1), data,
        title="Elbow Curve avec Spherical K-means", cluster_ranges=range(1, 15))
    skplt.cluster.plot_elbow_curve(
        KMeans(random_state=42, n_jobs=-1, precompute_distances=True), data,
        title="Elbow Curve avec K-means", cluster_ranges=range(1, 15))
    # BUG FIX: these status messages were bare tuple expressions (no-ops);
    # they were clearly meant to be printed.
    # NOTE(review): the second message also says "Spherical" in the original
    # (looks copy-pasted) — kept verbatim to avoid changing output text.
    print("Fitting For Spherical K-means for ", nb_clusters, "...")
    skmeans = SphericalKMeans(n_clusters=nb_clusters, random_state=42,
                              n_jobs=-1).fit(data)
    print("Fitting For Spherical K-means for ", nb_clusters, "...")
    kmeans = KMeans(n_clusters=nb_clusters, random_state=42, n_jobs=-1,
                    precompute_distances=True).fit(data)
    y_pred_skmeans = skmeans.predict(data)
    y_pred_kmeans = kmeans.predict(data)
    print("Results from Spherical K-means")
    scoring_cluster(skmeans, true_label, y_pred_skmeans)
    print("Results from K-means")
    scoring_cluster(kmeans, true_label, y_pred_kmeans)
    return methode, skmeans, kmeans, data