Example #1
def plot_cluster_distances(estimator, dataset, version):
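    # InterclusterDistance embeds the fitted cluster centers in 2D (MDS by default)
    # and sizes each circle by cluster membership; it only needs the feature matrix.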
    visualizer = InterclusterDistance(estimator)
    visualizer.fit(data.DATA[dataset][version]['x_train'])
    visualizer.show(
        f'{PLOTS_FOLDER}/{dataset}/{version}/{dataset}_{version}_{estimator.__class__.__name__}_cluster_distances_k{estimator.n_clusters}.png'
    )
    plt.clf()
Example #2
def intercluster_distance(ax=None):
    X, y = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)

    viz = InterclusterDistance(KMeans(9), ax=ax)
    viz.fit(X)
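    # finalize() adds titles and legends without calling plt.show(), so the
    # visualizer can be returned and rendered (or saved) by the caller.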
    viz.finalize()

    return viz
Example #3
def cluster_distances(model, X, graph):
    visualizer = InterclusterDistance(
        model,
        legend=True,
        legend_loc='upper left',
        title=" KMeans Intercluster Distance Map for " + graph)
    visualizer.fit(X)
    visualizer.show()
Example #4
def ica(training_set, test_set, y_train):
    # https://www.ritchieng.com/machine-learning-dimensionality-reduction-feature-transform/
    ica_avg_kurtosis_curve(training_set)

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10), K=2")
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=5,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10), K=5")

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    ica = FastICA(n_components=10, random_state=RAND, max_iter=1000)

    X_train = ica.fit_transform(training_set)

    plot_silhouette(km, X_train, title="ICA(10)" ", K=3")

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    km.fit(X_train)

    hs = metrics.homogeneity_score(y_train, km.labels_)
    print("homogenatity score for K=2:", hs)

    y_train_inverse = (~y_train.astype(bool)).astype(int)

    hs = metrics.homogeneity_score(y_train_inverse, km.labels_)
    print("homogenatity score for K=2: (inverse)", hs)
Example #5
def cluster_distance_map(text, model, cv):
    path = 'models/{}'.format(model)
    pipe = load(path)
    kmeans = pipe.named_steps['kmeans']
    svd = pipe.named_steps['truncatedsvd']
    X = svd.fit_transform(cv)
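    # embedding='mds' projects the cluster centers to 2D with multidimensional
    # scaling (the default); yellowbrick also supports embedding='tsne'.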
    visualizer = InterclusterDistance(
        kmeans,
        embedding='mds',
    )
    visualizer.fit(X)
    visualizer.show(outpath="plots/ClusterMap.png")
    plt.close()
Example #6
def distance_yellowbrick(
    X,
    y,
    features,
):
    plt.switch_backend('agg')
    plt.clf()
    X_train, X_test, y_train, y_test = train_test_split(X[features],
                                                        y,
                                                        stratify=y,
                                                        test_size=0.01)
    X = pd.DataFrame(X_test, columns=features)
    y = pd.Series(y_test)
    n_clusters = y.nunique()
    model = MiniBatchKMeans(n_clusters)
    visualizer_dist = InterclusterDistance(model)
    visualizer_dist.fit(X)
    visualizer_dist.finalize()

    return plt
Example #7
def kMeans():
    # citation: https://realpython.com/k-means-clustering-python/
    digits = load_digits()

    # features
    digits_features = digits.data[:, 0:-1]
    # label
    label = digits.target  # class labels (0-9), rather than the last pixel column

    scaler = StandardScaler()
    scaled_features = scaler.fit_transform(digits_features)

    # citation: hands on machine learning
    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(1, 21):
        gm = GaussianMixture(covariance_type='spherical',
                             n_components=i,
                             n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    # x_centered = digits_features - digits_features.mean(axis=0)
    # U, s, Vt = np.linalg.svd(x_centered)
    # c1 = Vt.T[:, 0]
    # c2 = Vt.T[:, 1]

    # W2 = Vt.T[:, :2]
    # X2D = x_centered.dot(W2)

    # pca = PCA()
    # pca.fit(scaled_features)
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # pca = PCA(n_components=0.95)
    # X_reduced = pca.fit_transform(scaled_features)

    # cumulative explained variance across all principal components
    pca = PCA()
    pca.fit(scaled_features)
    cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    digits_trainingX, digits_testingX, digits_trainingY, digits_testingY = train_test_split(
        digits_features, label)

    # pca reconstruction error
    # citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn

    error = []

    for i in range(1, 50):
        pca = PCA(n_components=i)
        pca.fit(digits_trainingX)
        U, S, VT = np.linalg.svd(digits_trainingX - digits_trainingX.mean(0))
        x_train_pca = pca.transform(digits_trainingX)
        x_train_pca2 = (digits_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((digits_trainingX - x_projected)**2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 50), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)
    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))

    k_acc = []
    k_gm = []
    time_arr = []
    for k in range(1, 15):
        kmeans = KMeans(n_clusters=k)
        X_train = kmeans.fit_transform(digits_trainingX)
        X_test = kmeans.transform(digits_testingX)
        start_time = time.time()
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        total_time = time.time() - start_time
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        k_acc.append(score)
        time_arr.append(total_time)

    plt.plot(k_acc, label="K-Means")
    plt.plot(time_arr, label="Computation Time")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("k # of clusters")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()

    acc = []
    acc_ica = []
    acc_rca = []
    for i in range(1, 40):
        pca = PCA(n_components=i)
        X_train = pca.fit_transform(digits_trainingX)
        X_test = pca.transform(digits_testingX)
        clf = MLPClassifier(alpha=0.001,
                            hidden_layer_sizes=(8, ),
                            random_state=1,
                            solver='lbfgs')
        clf.fit(X_train, digits_trainingY)
        y_pred = clf.predict(X_test)
        score = accuracy_score(digits_testingY, y_pred)
        acc.append(score)

        ica = FastICA(n_components=i)
        x_train_i = ica.fit_transform(digits_trainingX)
        x_test_i = ica.transform(digits_testingX)
        clf.fit(x_train_i, digits_trainingY)
        y_pred_i = clf.predict(x_test_i)
        score_i = accuracy_score(digits_testingY, y_pred_i)
        acc_ica.append(score_i)

        rca = GaussianRandomProjection(n_components=i)
        x_train_r = rca.fit_transform(digits_trainingX)
        x_test_r = rca.transform(digits_testingX)
        clf.fit(x_train_r, digits_trainingY)
        y_pred_r = clf.predict(x_test_r)
        score_r = accuracy_score(digits_testingY, y_pred_r)
        acc_rca.append(score_r)

    plt.plot(acc, label="PCA")
    plt.plot(acc_ica, label="ICA")
    plt.plot(acc_rca, label="RCA")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Components")
    plt.ylabel("NN Accuracy")
    plt.legend()
    plt.show()
    # cumsum = np.cumsum(pca.explained_variance_ratio_)
    # d = np.argmax(cumsum >= 0.95) + 1

    # randomized projections
    rnd_pca = PCA(n_components=50, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(digits_features.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=50)
    k_selected.fit(scaled_features_norm, label)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45,
            scores,
            width=.2,
            label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()
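
    # NOTE (assumption): X_reduced_inc is not defined earlier in this example; the sketch
    # below mirrors the IncrementalPCA reduction used in the Twitter example so the
    # remaining calls have something to operate on.
    inc_pca = IncrementalPCA(n_components=10)
    for X_batch in np.array_split(scaled_features, 10):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)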

    gm = GaussianMixture(covariance_type='spherical',
                         n_components=8,
                         n_init=10)
    gm.fit(X_reduced_inc)
    print("GM Converged - PCA Inc", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(X_reduced_inc)
    gm.predict_proba(X_reduced_inc)
    gm.score_samples(X_reduced_inc)

    kmeans = KMeans(init="random",
                    n_clusters=63,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(scaled_features)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 63):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 63), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    clf = MLPClassifier(alpha=0.001,
                        hidden_layer_sizes=(8, ),
                        random_state=1,
                        solver='lbfgs')
    clf.fit(digits_trainingX, digits_trainingY)
    y_pred = clf.predict(digits_testingX)

    model = KMeans(n_clusters=5)
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(digits_testingX)

    print("Accuracy Score Normal", accuracy_score(digits_testingY, y_pred))
    print("Accuracy Score K-Means", accuracy_score(digits_testingY, labels))

    elbow_visualizer = KElbowVisualizer(model, k=(2, 63))
    elbow_visualizer.fit(digits_features)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(digits_features)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(digits_features)
    ic_visualizer.show()

    # gmm = GaussianMixture(n_components=7).fit(digits_features)
    # labels = gmm.predict(digits_features)
    # plt.scatter(digits_features[:, 0], digits_features[:, 1], c=labels, s=40, cmap='viridis')
    # plt.show()

    # digits_features_pd = pd.DataFrame(data=digits_features[1:, 1:],
    # index=digits_features[1:,0],
    # columns=digits_features[0,1:])

    # pd.plotting.scatter_matrix(digits_features_pd)

    # probs = GaussianMixture.predict_proba(digits_features)
    # print(probs[:5].round(3))

    kmeans = KMeans(init="random",
                    n_clusters=18,
                    n_init=10,
                    max_iter=300,
                    random_state=42)
    kmeans.fit(X_reduced_inc)

    # the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    # final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    # num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    # labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init": "random",
        "n_init": 10,
        "max_iter": 300,
        "random_state": 42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    kl = KneeLocator(range(1, 18), sse, curve="convex", direction="decreasing")

    # optimal k (number of clusters) for this dataset
    print("Elbow", kl.elbow)

    model = KMeans()
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(X_reduced_inc)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(X_reduced_inc)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(X_reduced_inc)
    ic_visualizer.show()
Example #8
def icdm():
    X, _ = make_blobs(centers=12, n_samples=1000, n_features=16, shuffle=True)
    oz = InterclusterDistance(KMeans(9), ax=newfig())
    oz.fit(X)
    savefig(oz, "icdm")
Example #9
model.fit(X_scaled)
print("Predicted labels ----")
model.predict(X_scaled)
df['cluster'] = model.predict(X_scaled)
 
plt.figure(figsize=(12,9))
 
model=MiniBatchKMeans(n_clusters=2).fit(X_scaled)
 
visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
visualizer.fit(X_scaled)      
visualizer.show()
 
plt.figure(figsize=(12,9))
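# min_size sets the area, in points, of the smallest cluster circle drawn by the visualizer.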
 
visualizer = InterclusterDistance(model, min_size=10000)
visualizer.fit(X_scaled)
visualizer.show()
 
df = pd.concat([df,X_scaled], axis=1)

"""
k-prototypes clustering algorithm
"""

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
%matplotlib inline
import pandas as pd
import numpy as np
from sklearn import preprocessing
Example #10
        st.text(classification_report(data_target, pred))

        #Confusion matrix
        plot_confusion_matrix(data_target, pred, figsize=(7, 5), cmap="PuBuGn")
        bottom, top = plt.ylim()
        plt.ylim(bottom + 0.5, top - 0.5)
        st.pyplot()

        # Elbow Method
        visualizer = KElbowVisualizer(KmeansClus, k=(1, 10))
        visualizer.fit(data_feature)
        visualizer.show()
        st.pyplot()

        # Inter Cluster Distances
        visualizer_inter = InterclusterDistance(KmeansClus)
        visualizer_inter.fit(data_feature)
        visualizer_inter.show()
        st.pyplot()
    except:
        st.write("Fill all parameters.")

########################################
# Mini-Batch k-means
########################################
if ML_option == "Mini-Batch k-means":
    try:
        # Mini Batch parameters
        Nk = st.number_input("Number of clusters: ", min_value=1, step=1)
        MBatchClus = MiniBatchKMeans(n_clusters=Nk)
        MBatchClus.fit(data_feature)
Example #11
def kMeans():

    twitterX, twitterY, twitter_dataset, scaled_features = preprocess()

    gm = GaussianMixture(covariance_type='tied', n_components=18, n_init=10)
    gm.fit(scaled_features)
    print("GM Converged", gm.converged_)
    print("GM Convergence Iterations", gm.n_iter_)
    print("GM weights", gm.weights_)

    gm.predict(scaled_features)
    gm.predict_proba(scaled_features)
    gm.score_samples(scaled_features)

    aic = []
    bic = []

    for i in range(1, 10):
        gm = GaussianMixture(covariance_type='spherical', n_components=i, n_init=10)
        gm.fit(scaled_features)
        aic.append(gm.aic(scaled_features))
        bic.append(gm.bic(scaled_features))

    plt.plot(aic, label="AIC")
    plt.plot(bic, label="BIC")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Clusters")
    plt.ylabel("Information Criterion")
    plt.legend()
    plt.show()

    twitter_trainingX, twitter_testingX, twitter_trainingY, twitter_testingY = train_test_split(twitterX, twitterY)

    error = []

    #citation: https://stackoverflow.com/questions/36566844/pca-projection-and-reconstruction-in-scikit-learn


    for i in range(1, 8):
        pca = FastICA(n_components=i)
        pca.fit(twitter_trainingX)
        U, S, VT = np.linalg.svd(twitter_trainingX - twitter_trainingX.mean(0))
        x_train_pca = pca.transform(twitter_trainingX)
        x_train_pca2 = (twitter_trainingX - pca.mean_).dot(pca.components_.T)
        x_projected = pca.inverse_transform(x_train_pca)
        x_projected2 = x_train_pca.dot(pca.components_) + pca.mean_
        loss = ((twitter_trainingX - x_projected) ** 2).mean()
        error.append(loss)

    plt.clf()
    plt.figure(figsize=(15, 15))
    plt.title("reconstruction error")
    plt.plot(error, 'r')
    plt.xticks(range(len(error)), range(1, 8), rotation='vertical')
    plt.xlim([-1, len(error)])
    plt.show()


    clf = MLPClassifier(alpha=0.001, hidden_layer_sizes=(8,), random_state=1,
                        solver='lbfgs')
    clf.fit(twitter_trainingX, twitter_trainingY)
    y_pred = clf.predict(twitter_testingX)

    print("Accuracy Score Normal", accuracy_score(twitter_testingY, y_pred))

    kmeans = KMeans(
        init="random",
        n_clusters=3,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_testingX)

    print("Accuracy Score K-Means", accuracy_score(twitter_testingY, labels))

    pca = PCA()
    pca.fit(scaled_features)
    cumsum = np.cumsum(pca.explained_variance_ratio_)

    plt.plot(cumsum, label="Explained Variance Ratio")
    # plt.xticks(np.arange(min(x), max(x) + 1, 1.0))
    # plt.xticks(range(1,18))
    plt.xlabel("Number of Dimensions")
    plt.ylabel("Explained Variance Ratio")
    plt.legend()
    plt.show()

    # incremental PCA
    num_batches = 100
    inc_pca = IncrementalPCA(n_components=5)
    for X_batch in np.array_split(scaled_features, num_batches):
        inc_pca.partial_fit(X_batch)
    X_reduced_inc = inc_pca.transform(scaled_features)

    # randomized projections
    rnd_pca = PCA(n_components=5, svd_solver="randomized")
    X_reduced_rand = rnd_pca.fit_transform(scaled_features)

    # citation: https://scikit-learn.org/stable/auto_examples/feature_selection/plot_feature_selection.html#sphx-glr-auto-examples-feature-selection-plot-feature-selection-py
    # k best
    scaler = MinMaxScaler()
    digits_indices = np.arange(twitterX.shape[-1])
    scaled_features_norm = scaler.fit_transform(scaled_features)
    k_selected = SelectKBest(f_classif, k=8)
    k_selected.fit(scaled_features_norm, twitterY)
    scores = -np.log10(k_selected.pvalues_)
    plt.bar(digits_indices - .45, scores, width=.2, label=r'Univariate score ($-Log(p_{value})$)')
    plt.xlabel("Features")
    plt.ylabel("F-Score")
    plt.show()


    kmeans = KMeans(
        init="random",
        n_clusters=5,
        n_init=10,
        max_iter=300,
        random_state=42
    )
    kmeans.fit(scaled_features)
    labels = kmeans.fit_predict(twitter_dataset)

    #the lowest SSE value
    print("KMeans Inertia", kmeans.inertia_)

    #final locations of the centroid
    print("KMeans Cluster Centers", kmeans.cluster_centers_)

    #num of iterations required to converge
    print("KMeans Iterations Required To Converge", kmeans.n_iter_)

    #labels
    print("KMeans Labels", kmeans.labels_[:5])

    kmeans_kwargs = {
        "init":"random",
        "n_init":10,
        "max_iter":300,
        "random_state":42,
    }

    sse = []
    for k in range(1, 18):
        kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
        kmeans.fit(scaled_features)
        sse.append(kmeans.inertia_)

    model = KMeans(n_clusters=9)
    elbow_visualizer = KElbowVisualizer(model, k=(2, 18))
    elbow_visualizer.fit(twitterX)
    elbow_visualizer.show()

    silhouette_visualizer = SilhouetteVisualizer(model, colors='yellowbrick')
    silhouette_visualizer.fit(twitterX)
    silhouette_visualizer.show()

    ic_visualizer = InterclusterDistance(model)
    ic_visualizer.fit(twitterX)
    ic_visualizer.show()

    # left unfinished: the column indices and scatter arguments were never specified
    # X = twitter_dataset[:, []]
    # plt.scatter()
Example #12
def intercluster(X):
    model = KMeans(3)
    visualizer = InterclusterDistance(model)

    visualizer.fit(X)
    visualizer.show()
Example #13
def cluster_metrics(i_patches, a_patches, g_patches, city_names, K, save_path,
                    g_indices):
    # intra-cluster distances: ssd of samples to the nearest cluster centre
    sum_of_squared_distances = []
    silhouette_scores = []
    calinski_harabasz_scores = []
    davies_bouldin_scores = []
    k_mean_list = []
    for k in K:
        model, k_means, A = get_kmeans144_result(a_patches, k)
        k_mean_list.append(k_means)
        sum_of_squared_distances.append(k_means.inertia_)

        labels = k_means.labels_
        score = metrics.silhouette_score(A, labels, metric='euclidean')
        silhouette_scores.append(score)

        score = metrics.calinski_harabasz_score(A, labels)
        calinski_harabasz_scores.append(score)

        score = metrics.davies_bouldin_score(A, labels)
        davies_bouldin_scores.append(score)

        mydict = dict_cluster(i_patches, a_patches, g_patches, city_names,
                              k_means)
        save_path_k = '{}_{}'.format(save_path, k)
        gt_ratio = gt_metric(mydict, save_path_k)

    plot_figure(K, sum_of_squared_distances, save_path,
                'sum_of_squared_distances')
    plot_figure(K, silhouette_scores, save_path, 'silhouette_scores')
    plot_figure(K, calinski_harabasz_scores, save_path,
                'calinski_harabasz_scores')
    plot_figure(K, davies_bouldin_scores, save_path, 'davies_bouldin_score')

    ssd_best_index = sum_of_squared_distances.index(
        max(sum_of_squared_distances))
    sil_best_index = silhouette_scores.index(max(silhouette_scores))
    ch_best_index = calinski_harabasz_scores.index(
        max(calinski_harabasz_scores))
    # lower Davies-Bouldin indicates better clustering, so take the minimum
    db_best_index = davies_bouldin_scores.index(min(davies_bouldin_scores))
    #gtr_best_index = gt_ratio.index(max(gt_ratio))

    all_indices = [
        ssd_best_index, sil_best_index, ch_best_index, db_best_index
    ]  #, gtr_best_index] #, axis=None)
    best_k = np.array(K)[np.unique(all_indices)]

    for ind in range(len(K)):  #best_k:
        # Visualize output clusters of K means in 2D
        k_means = k_mean_list[ind]
        visualizer = InterclusterDistance(k_means)
        visualizer.fit(A)  # Fit the data to the visualizer
        #visualizer.show()  # Finalize and render the figure
        visualizer.show(
            outpath='{}_{}_InterclusterDistance.png'.format(save_path, ind))
        visualizer.poof()

        # Visualize through TSNE
    A_embedded = TSNE().fit_transform(A)
    plt.figure()
    palette = sns.color_palette("bright", 2)
    y_ = np.asarray(g_indices)
    y = y_.astype(np.float32)
    sns.scatterplot(A_embedded[:, 0],
                    A_embedded[:, 1],
                    hue=y,
                    legend='full',
                    palette=palette)
    plt.savefig('{}_tsne.png'.format(save_path))

    return
Example #14
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import InterclusterDistance

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs, n_samples=N_SAMPLES, n_features=N_FEATURES, shuffle=SHUFFLE)


if __name__ == '__main__':
    # Make a 12-blob dataset
    X, y = make_blobs(centers=12)

    # Instantiate the clustering model and visualizer
    visualizer = InterclusterDistance(KMeans(9))

    visualizer.fit(X) # Fit the training data to the visualizer
    visualizer.poof(outpath="images/icdm.png") # Draw/show/poof the data
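    # poof() is the pre-1.0 yellowbrick name for show(); newer releases emit a deprecation warning.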
Example #15
)

# https://www.scikit-yb.org/en/latest/api/cluster/elbow.html
visualizer = KElbowVisualizer(model, k=(1, 20))

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.KElbowVisualizer.png")
visualizer.poof()

model = KMeans(
    n_clusters=4,
    random_state=0,
    n_jobs=-1,
)
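# Note: n_jobs was deprecated for KMeans in scikit-learn 0.23 and removed in 1.0.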
visualizer = InterclusterDistance(model)

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.InterclusterDistance.png")
visualizer.poof()

model = KMeans(n_clusters=4, random_state=0)
visualizer = SilhouetteVisualizer(model)

visualizer.fit(results)  # Fit the data to the visualizer
# Finalize and render the figure
visualizer.show(outpath="charts/income.k-means.PCA.SilhouetteVisualizer.png")

lowest_bic = np.infty
bic = []
Example #16
                visualizerRadViz = RadViz(classes=classes,
                                          features=features,
                                          title=' ')
                visualizerRadViz.fit(X, y)  # Fit the data to the visualizer
                visualizerRadViz.transform(X)  # Transform the data
                locationFileNameRVZ = os.path.join('/home/ak/Documents/Research/Papers/figures',str(symbols[symbolIdx]) \
                                                   +'_idx_'+str(idx)+'_label_'+str(labelsIdx)+'_date_'+str(dateIdx)+'_radviz.png')
                visualizerRadViz.show(outpath=locationFileNameRVZ)
                plt.show()

                ## MDS

                # Instantiate the clustering model and visualizer
                model = KMeans(6)
                plt.figure()
                plt.xlabel('features', fontsize=12)
                plt.ylabel('features', fontsize=12)

                plt.xticks(fontsize=14)
                plt.yticks(fontsize=12)
                visualizerID = InterclusterDistance(model)
                visualizerID.fit(X)  # Fit the data to the visualizer

                locationFileNameID = os.path.join(
                    '/home/ak/Documents/Research/Papers/figures',
                    str(symbols[symbolIdx]) + '_idx_' + str(idx) +
                    '_KMeans_MDS.png')
                visualizerID.show(outpath=locationFileNameID
                                  )  # Finalize and render the figure
                plt.show()
Example #17
def pca(training_set, test_set):
    pca = PCA()

    pca.fit_transform(training_set)
    pca.transform(test_set)

    explained_variance = pca.explained_variance_ratio_
    components = 16
    print("for " + str(components) + " components")
    top_n = explained_variance[:components]
    print(top_n)
    print("captures ")
    print(np.sum(top_n))
    print("percent")

    pca_cum_variance(pca)

    pca = PCA(n_components=16)
    X_train = pca.fit_transform(training_set)
    X_test = pca.transform(test_set)

    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)

    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.title("Distortion vs # Clusters PCA-20")

    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=3")

    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=2")

    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    km = KMeans(n_clusters=4,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=4")

    km = KMeans(n_clusters=5,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)

    plot_silhouette(km, X_train, title="PCA20, K=5")
Example #18
def plain_clustering():
    distortions = []
    for i in range(1, 11):
        km = KMeans(n_clusters=i,
                    init='k-means++',
                    n_init=10,
                    max_iter=300,
                    random_state=RAND)
        km.fit(X_train)
        distortions.append(km.inertia_)

    plt.plot(range(1, 11), distortions, marker='o')
    plt.xlabel('Number of clusters')
    plt.ylabel('Distortion')
    plt.tight_layout()
    plt.show()

    km = KMeans(n_clusters=3,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    y_km = km.fit_predict(X_train)

    visualizer = InterclusterDistance(km)

    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    # cluster_labels = np.unique(y_km)
    # n_clusters = cluster_labels.shape[0]
    # silhouette_vals = silhouette_samples(X, y_km, metric='euclidean')
    # y_ax_lower, y_ax_upper = 0, 0
    # yticks = []
    # for i, c in enumerate(cluster_labels):
    #     c_silhouette_vals = silhouette_vals[y_km == c]
    #     c_silhouette_vals.sort()
    #     y_ax_upper += len(c_silhouette_vals)
    #     color = cm.jet(float(i) / n_clusters)
    #     plt.barh(range(y_ax_lower, y_ax_upper), c_silhouette_vals, height=1.0,
    #              edgecolor='none', color=color)
    #
    #     yticks.append((y_ax_lower + y_ax_upper) / 2.)
    #     y_ax_lower += len(c_silhouette_vals)
    #
    # silhouette_avg = np.mean(silhouette_vals)
    # plt.axvline(silhouette_avg, color="red", linestyle="--")
    #
    # plt.yticks(yticks, cluster_labels + 1)
    # plt.ylabel('Cluster')
    # plt.xlabel('Silhouette coefficient')
    #
    # plt.tight_layout()
    # # plt.savefig('images/11_04.png', dpi=300)
    # plt.show()

    km = KMeans(n_clusters=2,
                init='k-means++',
                n_init=10,
                max_iter=300,
                random_state=RAND)
    y_km = km.fit_predict(X_train)
    visualizer = InterclusterDistance(km)
    visualizer.fit(X_train)  # Fit the data to the visualizer
    visualizer.show()  # Finalize and render the figure

    cluster_labels = np.unique(y_km)
    n_clusters = cluster_labels.shape[0]
    silhouette_vals = silhouette_samples(X_train, y_km, metric='euclidean')
    y_ax_lower, y_ax_upper = 0, 0
    yticks = []
    for i, c in enumerate(cluster_labels):
        c_silhouette_vals = silhouette_vals[y_km == c]
        c_silhouette_vals.sort()
        y_ax_upper += len(c_silhouette_vals)
        color = cm.jet(float(i) / n_clusters)
        plt.barh(range(y_ax_lower, y_ax_upper),
                 c_silhouette_vals,
                 height=1.0,
                 edgecolor='none',
                 color=color)

        yticks.append((y_ax_lower + y_ax_upper) / 2.)
        y_ax_lower += len(c_silhouette_vals)

    silhouette_avg = np.mean(silhouette_vals)
    plt.axvline(silhouette_avg, color="red", linestyle="--")

    plt.yticks(yticks, cluster_labels + 1)
    plt.ylabel('Cluster')
    plt.xlabel('Silhouette coefficient')

    plt.tight_layout()
    # plt.savefig('images/11_04.png', dpi=300)
    plt.show()
Example #19
# Clustering Evaluation Imports
from functools import partial

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs as sk_make_blobs

from yellowbrick.cluster import InterclusterDistance

# Helpers for easy dataset creation
N_SAMPLES = 1000
N_FEATURES = 12
SHUFFLE = True

# Make blobs partial
make_blobs = partial(sk_make_blobs,
                     n_samples=N_SAMPLES,
                     n_features=N_FEATURES,
                     shuffle=SHUFFLE)

if __name__ == '__main__':
    # Make a 12-blob dataset
    X, y = make_blobs(centers=12)

    # Instantiate the clustering model and visualizer
    visualizer = InterclusterDistance(KMeans(9))

    visualizer.fit(X)  # Fit the training data to the visualizer
    visualizer.poof(outpath="images/icdm.png")  # Draw/show/poof the data
Example #20
plt.title("K-Means (Dot Size = Silhouette Distance)", fontsize=20)
plt.xlabel('Annual Income (K)', fontsize=22)
plt.ylabel('Spending Score', fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)

# plt.savefig('out/mall-kmeans-5-silhouette-size.png');

visualizer = SilhouetteVisualizer(k_means)
visualizer.fit(X)
visualizer.poof()
fig = visualizer.ax.get_figure()
# fig.savefig('out/mall-kmeans-5-silhouette.png', transparent=False);

# Instantiate the clustering model and visualizer
visualizer = InterclusterDistance(k_means)
visualizer.fit(X)  # Fit the training data to the visualizer
visualizer.poof()  # Draw/show/poof the data
# plt.savefig('out/mall-kmeans-5-tsne.png', transparent=False);

# Elbow Method (Manual)
inertias = {}
silhouettes = {}
for k in range(2, 11):
    kmeans = KMeans(init='k-means++',
                    n_init=10,
                    n_clusters=k,
                    max_iter=1000,
                    random_state=42).fit(X)
    inertias[k] = kmeans.inertia_  # Inertia: sum of squared distances of samples to their closest cluster center
Example #21
plt.xlabel('Number of clusters')
plt.ylabel('WCSS')
plt.show()
kmeans = KMeans(n_clusters=3,
                init='k-means++',
                max_iter=300,
                n_init=10,
                random_state=0)
pred_y_train = kmeans.fit_predict(X_train)
print("K Cluster Train Accuracy")
homo_score = metrics.homogeneity_score(Y_train_encoded, pred_y_train)  # (labels_true, labels_pred)
print("Homogeneity Score")
print(homo_score)
print((accuracy_score(pred_y_train, Y_train_encoded)))

visualizer = InterclusterDistance(kmeans)
#visualizer.fit(X_train)
#visualizer.show()

pred_y_test = kmeans.fit_predict(X_validation)
print("K Cluster Test Accuracy")
homo_score_test = metrics.homogeneity_score(Y_test_encoded, pred_y_test)  # (labels_true, labels_pred)
print("Homogeneity Score")
print(homo_score_test)

print((accuracy_score(pred_y_test, Y_test_encoded)))

visualizer.fit(X_validation)
visualizer.show()

# Using k-means clusters as features