def inertie_intra(k,X,e,N,perc):
    """Return the mean SSE over ``N`` independent labeling/clustering runs.

    Each run labels ``X`` with ``labeling(k, X, perc)``, clusters the labeled
    data with ``ss_kmeans_pp(..., k, e)`` and scores the resulting centers
    with ``SSE(k, labeled, centers)``. The scores are summed and divided
    by ``N``.

    Args:
        k: Number of clusters, forwarded to every helper.
        X: Input data, forwarded to ``labeling``.
        e: Third argument of ``ss_kmeans_pp`` (presumably a convergence
            tolerance -- confirm against that helper).
        N: Number of repetitions; must be non-zero, because the total is
            divided by it.
        perc: Second tuning argument of ``labeling`` (presumably the
            unlabeled fraction -- confirm against that helper).

    Returns:
        The accumulated SSE divided by ``N``.
    """
    total = 0
    for _ in range(N):
        labeled = labeling(k, X, perc)
        # Only the centers are needed for scoring; the cluster assignment
        # returned alongside them is ignored.
        centers, _assignment = ss_kmeans_pp(labeled, k, e)
        total += SSE(k, labeled, centers)
    return total / N
def elbow_manual(n_clusters,X):
    """Plot elbow curves for the manual and the scikit-learn k-means.

    ``X`` is imputed once with ``DataFrameImputer``. Then, for every
    ``k`` in ``1 .. n_clusters - 1``:

    * the data is labeled with ``labeling(k, X, 0.6)`` and clustered with
      ``ss_kmeans_pp``;
    * the same ``k`` is clustered with ``kmean_sklearn``;
    * for each set of centers, the cost recorded is the sum over all rows
      of ``X`` of the Euclidean distance to the nearest center (plain
      distances from ``cdist``, not squared ones).

    Both cost curves are plotted against ``k``. A dashed vertical line marks
    the knee that ``KneeLocator`` finds on the scikit-learn curve, when one
    is found.

    Args:
        n_clusters: Exclusive upper bound of the tested cluster counts.
        X: Data to cluster; it is passed through
            ``DataFrameImputer().fit_transform`` first.

    Returns:
        None. The function only displays a matplotlib figure.
    """
    # Third argument of ss_kmeans_pp (presumably a convergence tolerance).
    e = 10 ** (-10)

    X = DataFrameImputer().fit_transform(X)

    # Named *_costs rather than SSE so the module-level SSE function is not
    # shadowed inside this function.
    manual_costs = []
    sklearn_costs = []
    for k in range(1, n_clusters):
        Y = labeling(k, X, 0.6)
        centers, _ = ss_kmeans_pp(Y, k, e)
        centers_sk, _ = kmean_sklearn(k, X)

        # Cost of the manual (Lloyd-based) clustering.
        manual_costs.append(
            np.sum(np.min(cdist(X, centers, 'euclidean'), axis=1)))
        # Cost of the scikit-learn clustering.
        sklearn_costs.append(
            np.sum(np.min(cdist(X, centers_sk, 'euclidean'), axis=1)))

    K = np.arange(1, n_clusters)
    plt.plot(K, manual_costs, label='méthode manuel', color='blue')
    plt.plot(K, sklearn_costs, label='méthode sklearn', color='orange')
    plt.xticks(np.arange(1, n_clusters, 1))

    kn = KneeLocator(K, sklearn_costs, curve='convex', direction='decreasing')
    # KneeLocator reports ``knee`` as None when no knee is detected; only
    # draw the dashed marker when there is an x position to draw it at.
    if kn.knee is not None:
        y_low, y_high = plt.ylim()
        plt.vlines(kn.knee, y_low, y_high, linestyles='dashed')

    # The legend has to be attached before show(): once show() has displayed
    # the figure, a later legend() call no longer affects what was shown.
    plt.legend()
    plt.show()
コード例 #3
0
    # Uploading ML dataset
    base = pd.read_csv('BigML_Dataset.csv', sep=',')

    X = base.iloc[:, 1:]

    # Imputing based on mean for numeric, and most frequent for strings
    X = DataFrameImputer().fit_transform(X)
    X.fillna(X.mean())

    k, e = 5, 10**(-10)

    # On choisi le pourcentage unlabeled
    fraction = 0.4

    #labelisation des données
    Y = labeling(k, X, fraction)

    #kmeans++ semi supervisés enfin
    t0 = time()
    centers, clusters = ss_kmeans_pp(Y, k, e)
    t1 = time()
    print('En utilisant kmeans ++ semi supevisée : %f' %
          (t1 - t0))  #max : 0.185409, min : 0.117940, 4 essais

    #comparaison avec kmeans++ sklearn
    t2 = time()
    resultat = kmean_sklearn(k, X)
    t3 = time()
    print('En utilisant kmeans ++ de sklearn : %f' %
          (t3 - t2))  #max : 5.153863, min : 4.261078, 4 essais
    
    X=base.iloc[:,1:]
    
    
    
    # Imputing based on mean for numeric, and most frequent for strings
    X = DataFrameImputer().fit_transform(X)
    X.fillna(X.mean())
    quanti_X=X.drop(['shops_used'], axis='columns')
    k,e=5,10**(-10)
    
    # On choisi le pourcentage unlabeled
    fraction=0.60
    
    #labelisation des données
    Y=labeling(k,quanti_X,fraction)
    
    #kmeans++ semi supervisés 
    t0=time()
    centers,clusters = ss_kmeans_pp(Y,k,e)
    t1=time()
    print('En utilisant kmeans ++ semi supevisée : %f' %(t1-t0)) #max : 0.185409, min : 0.117940, 4 essais
    
 
    
    """ Graphique comparaison des clusters"""
    # Afficher la variable qualitative: shops_used
    
#    sns.lmplot('min_distance_to_shops', 'products_purchased', data=X, hue='shops_used',fit_reg=False,scatter_kws={"s": 10})
#    plt.show()
#    plt.title('Distribution de la variable qualitative: shops_used')