def inertie_intra(k, X, e, N, perc):
    """Return the SSE of the semi-supervised k-means++ averaged over N runs.

    Every run re-labels ``X`` with ``labeling(k, X, perc)``, clusters the
    labelled data with ``ss_kmeans_pp`` and scores the resulting centers
    with ``SSE``; the scores are summed and divided by ``N``.

    Args:
        k: Number of clusters, forwarded to all three helpers.
        X: Input data, forwarded to ``labeling``.
        e: Third argument of ``ss_kmeans_pp`` (presumably a convergence
            tolerance -- TODO confirm against that helper).
        N: Number of repetitions to average over.
        perc: Third argument of ``labeling`` (presumably the fraction of
            unlabeled samples -- TODO confirm against that helper).

    Returns:
        The accumulated ``SSE`` value divided by ``N``.

    Raises:
        ZeroDivisionError: If ``N`` is 0, since no run is performed and
            the final division has a zero denominator.
    """
    total = 0
    for _ in range(N):
        labelled = labeling(k, X, perc)
        # Only the centers are scored; the cluster assignment is unused.
        run_centers, _assignment = ss_kmeans_pp(labelled, k, e)
        total += SSE(k, labelled, run_centers)
    return total / N
def elbow_manual(n_clusters, X):
    """Plot elbow curves for the hand-written and sklearn k-means variants.

    For every cluster count ``i`` in ``1 .. n_clusters - 1`` (the upper bound
    is excluded), the imputed data is clustered twice: once with
    ``ss_kmeans_pp`` on data labelled by ``labeling(i, X, 0.6)`` and once
    with ``kmean_sklearn``. The cost recorded for each variant is the sum,
    over all rows of ``X``, of the (unsquared) Euclidean distance to the
    nearest center. Both curves are drawn on the current matplotlib figure,
    and a dashed vertical line marks the knee that ``KneeLocator`` finds on
    the sklearn curve.

    Args:
        n_clusters: Exclusive upper bound of the cluster counts to try.
        X: Input data; it is passed through ``DataFrameImputer`` before use.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    tolerance = 10 ** (-10)
    X = DataFrameImputer().fit_transform(X)

    # Named *_costs rather than SSE so the sibling SSE helper is not shadowed.
    manual_costs = []
    sklearn_costs = []
    for i in range(1, n_clusters):
        Y = labeling(i, X, 0.6)
        centers, _ = ss_kmeans_pp(Y, i, tolerance)
        centers_sk, _ = kmean_sklearn(i, X)
        # Cost of the hand-written (Lloyd) implementation.
        manual_costs.append(
            np.sum(np.min(cdist(X, centers, 'euclidean'), axis=1)))
        # Cost of the sklearn implementation.
        sklearn_costs.append(
            np.sum(np.min(cdist(X, centers_sk, 'euclidean'), axis=1)))

    K = np.arange(1, n_clusters)
    plt.plot(K, manual_costs, label='méthode manuel', color='blue')
    plt.plot(K, sklearn_costs, label='méthode sklearn', color='orange')
    plt.xticks(np.arange(1, n_clusters, 1))

    kn = KneeLocator(K, sklearn_costs, curve='convex', direction='decreasing')
    # KneeLocator reports no knee as None; only draw the marker when one exists.
    if kn.knee is not None:
        y_low, y_high = plt.ylim()
        plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')

    # The legend must be attached before show(); calling it afterwards (as the
    # original did) leaves the displayed figure without a legend.
    plt.legend()
    plt.show()
# Load the ML dataset and keep every column except the first one.
base = pd.read_csv('BigML_Dataset.csv', sep=',')
X = base.iloc[:, 1:]

# Impute based on the mean for numeric columns and the most frequent value
# for string columns.
X = DataFrameImputer().fit_transform(X)
# fillna returns a new frame, so the result has to be assigned (the original
# call discarded it). numeric_only avoids averaging the string columns.
X = X.fillna(X.mean(numeric_only=True))

# k: number of clusters; e: third argument of ss_kmeans_pp
# (presumably a convergence tolerance -- TODO confirm).
k, e = 5, 10**(-10)

# Chosen percentage of unlabeled data.
fraction = 0.4

# Label the data.
Y = labeling(k, X, fraction)

# Time the semi-supervised k-means++.
t0 = time()
centers, clusters = ss_kmeans_pp(Y, k, e)
t1 = time()
print('En utilisant kmeans ++ semi supevisée : %f' % (t1 - t0))
# Recorded over 4 runs: max 0.185409, min 0.117940.

# Time sklearn's k-means++ for comparison.
t2 = time()
resultat = kmean_sklearn(k, X)
t3 = time()
print('En utilisant kmeans ++ de sklearn : %f' % (t3 - t2))
# Recorded over 4 runs: max 5.153863, min 4.261078.
# Start again from the raw dataset, keeping every column except the first.
X = base.iloc[:, 1:]

# Impute based on the mean for numeric columns and the most frequent value
# for string columns.
X = DataFrameImputer().fit_transform(X)
# fillna returns a new frame, so the result has to be assigned (the original
# call discarded it). numeric_only avoids averaging the string columns.
X = X.fillna(X.mean(numeric_only=True))

# Drop the 'shops_used' column (described below as the qualitative variable)
# so that only the remaining columns are clustered.
quanti_X = X.drop(['shops_used'], axis='columns')

# k: number of clusters; e: third argument of ss_kmeans_pp
# (presumably a convergence tolerance -- TODO confirm).
k, e = 5, 10**(-10)

# Chosen percentage of unlabeled data.
fraction = 0.60

# Label the data.
Y = labeling(k, quanti_X, fraction)

# Time the semi-supervised k-means++.
t0 = time()
centers, clusters = ss_kmeans_pp(Y, k, e)
t1 = time()
print('En utilisant kmeans ++ semi supevisée : %f' % (t1 - t0))
# Recorded over 4 runs: max 0.185409, min 0.117940.

# Section marker for the cluster-comparison plot; the bare string below is an
# expression statement with no runtime effect.
""" Graphique comparaison des clusters"""

# Display the qualitative variable: shops_used
# sns.lmplot('min_distance_to_shops', 'products_purchased', data=X, hue='shops_used',fit_reg=False,scatter_kws={"s": 10})
# plt.show()
# plt.title('Distribution de la variable qualitative: shops_used')