def elbow_manual(n_clusters, X):
    """Plot elbow curves for the manual and the scikit-learn k-means.

    For every cluster count ``i`` in ``1 .. n_clusters - 1`` the data is
    clustered twice: once with ``ss_kmeans_pp`` (fed with the output of
    ``labeling``) and once with ``kmean_sklearn``.  For each result the sum,
    over all samples, of the Euclidean distance to the nearest centre is
    recorded.  Both curves are drawn on the current matplotlib figure, and a
    dashed vertical line marks the knee that ``KneeLocator`` detects on the
    scikit-learn curve (when one is found).

    Parameters
    ----------
    n_clusters : int
        Exclusive upper bound of the cluster counts that are tried.
    X : DataFrame-like
        Data set; it is passed through ``DataFrameImputer().fit_transform``
        before clustering.

    Returns
    -------
    None
        The function only draws and shows a figure.
    """
    e = 10**(-10)  # passed to ss_kmeans_pp; presumably its tolerance -- TODO confirm
    X = DataFrameImputer().fit_transform(X)

    cost_manual = []
    cost_sklearn = []
    for i in range(1, n_clusters):
        # 0.6 is forwarded to labeling() unchanged; its meaning is defined there.
        Y = labeling(i, X, 0.6)
        centers, _ = ss_kmeans_pp(Y, i, e)
        centers_sk, _ = kmean_sklearn(i, X)
        # Manual implementation
        cost_manual.append(_nearest_center_cost(X, centers))
        # scikit-learn implementation
        cost_sklearn.append(_nearest_center_cost(X, centers_sk))

    K = np.arange(1, n_clusters)
    plt.plot(K, cost_manual, label='méthode manuel', color='blue')
    plt.plot(K, cost_sklearn, label='méthode sklearn', color='orange')
    plt.xticks(K)

    kn = KneeLocator(K, cost_sklearn, curve='convex', direction='decreasing')
    # KneeLocator leaves ``knee`` as None when no knee is detected; skip the
    # marker in that case instead of handing None to vlines.
    if kn.knee is not None:
        y_low, y_high = plt.ylim()
        plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')

    # The legend has to be attached before show(); once show() has displayed
    # the figure, a later legend() call no longer affects what was drawn.
    plt.legend()
    plt.show()


def _nearest_center_cost(X, centers):
    """Return the summed Euclidean distance from each row of X to its nearest centre."""
    return np.sum(np.min(cdist(X, centers, 'euclidean'), axis=1))
distance += SSE(k, X, centers) return distance / N # ============================================================================= # Main # ============================================================================= if __name__ == "__main__": # Uploading ML dataset base = pd.read_csv('BigML_Dataset.csv', sep=',') X = base.iloc[:, 1:] # Imputing based on mean for numeric, and most frequent for strings X = DataFrameImputer().fit_transform(X) X.fillna(X.mean()) quanti_X = X.drop(['shops_used'], axis='columns') k, e = 5, 10**(-10) #kmeans++ t0 = time() centers, clusters = kmeans_pp(quanti_X, k, e) t1 = time() print('En utilisant kmeans++ : %f' % (t1 - t0)) #comparaison avec kmeans de sklearn t2 = time() resultat = kmean_sklearn(k, quanti_X) t3 = time()
return centers, clusters # ============================================================================= # Main # ============================================================================= if __name__ == "__main__": # Uploading ML dataset base = pd.read_csv('BigML_Dataset.csv', sep=',') X = base.iloc[:, 1:] # Imputing based on mean for numeric, and most frequent for strings X = DataFrameImputer().fit_transform(X) X.fillna(X.mean()) k, e = 5, 10**(-10) #kmeans++ t0 = time() centers, clusters = kmeans_pp(X, k, e) t1 = time() print('En utilisant kmeans++ : %f' % (t1 - t0)) #comparaison avec kmeans de sklearn t2 = time() resultat = kmean_sklearn(k, X) t3 = time() print('En utilisant kmeans++ de sklearn : %f' % (t3 - t2))