# Ejemplo n.º 1
# 0
def example_2():
    """Run reval on the sklearn digits dataset with a UMAP embedding.

    Loads the digits data, projects stratified train/test splits to 2-D
    with UMAP (fit on the training split only), selects the best number
    of clusters via cross-validated stability, evaluates on the held-out
    test set, and plots true vs. predicted labelings.
    """
    dataset = load_digits()
    features = dataset['data']
    labels = dataset['target']

    X_tr, X_ts, y_tr, y_ts = train_test_split(features,
                                              labels,
                                              test_size=0.40,
                                              random_state=42,
                                              stratify=labels)

    # Fit the 2-D embedding on the training split only; project the test set.
    embedder = UMAP(n_components=2,
                    random_state=42,
                    n_neighbors=30,
                    min_dist=0.0)
    X_tr = embedder.fit_transform(X_tr)
    X_ts = embedder.transform(X_ts)

    knn = KNeighborsClassifier(n_neighbors=30)
    kmeans = KMeans()

    reval = FindBestClustCV(s=knn, c=kmeans, nfold=5,
                            nclust_range=[2, 15], nrand=100)

    metrics, nclustbest, _ = reval.best_nclust(X_tr,
                                               iter_cv=10,
                                               strat_vect=y_tr)

    plot_metrics(metrics, title='Reval performance digits dataset')

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
    # Align cluster labels with the true labels before computing accuracy.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nclustbest}")
    print(f"Test set prediction ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nclustbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    # Same scatter twice: once colored by truth, once by clustering output.
    for colors, heading in ((y_ts, "Test set true labels (digits dataset)"),
                            (perm_lab,
                             "Test set clustering labels (digits dataset)")):
        plt.figure(figsize=(6, 4))
        plt.scatter(X_ts[:, 0], X_ts[:, 1], c=colors, cmap='rainbow_r')
        plt.title(heading)
        plt.show()
# Ejemplo n.º 2
# 0
def example_1():
    """Run reval on a synthetic 5-blob dataset and plot the outcome.

    Generates 1000 2-D samples in five blobs, performs cross-validated
    selection of the number of clusters with KNN + KMeans, then reports
    external metrics on a held-out test split and visualizes results.
    """
    blobs, blob_labels = make_blobs(1000, 2, 5, center_box=(-20, 20),
                                    random_state=42)
    plt.figure(figsize=(6, 4))
    plt.scatter(blobs[:, 0], blobs[:, 1], c=blob_labels, cmap='rainbow_r')
    plt.title("Blobs dataset (N=1000)")
    plt.show()

    X_tr, X_ts, y_tr, y_ts = train_test_split(blobs,
                                              blob_labels,
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=blob_labels)

    knn = KNeighborsClassifier(n_neighbors=5)
    kmeans = KMeans()

    findbestclust = FindBestClustCV(nfold=10,
                                    nclust_range=[2, 7],
                                    s=knn,
                                    c=kmeans,
                                    nrand=100)
    metrics, nbest, _ = findbestclust.best_nclust(X_tr,
                                                  iter_cv=10,
                                                  strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nbest)

    # Permute cluster labels to best match the ground truth.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nbest}")
    print(f"Test set prediction ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    plot_metrics(metrics,
                 title="Reval performance blobs dataset",
                 legend_loc=2)

    # Same scatter twice: once colored by truth, once by clustering output.
    for colors, heading in ((y_ts, "Test set true labels (blobs dataset)"),
                            (perm_lab,
                             "Test set clustering labels (blobs dataset)")):
        plt.figure(figsize=(6, 4))
        plt.scatter(X_ts[:, 0], X_ts[:, 1], c=colors, cmap='rainbow_r')
        plt.title(heading)
        plt.show()
# Ejemplo n.º 3
# 0
# NOTE(review): this chunk opens mid-statement — the `train_test_split(...)`
# line that begins this call (and the definition of `data1`) is missing from
# the excerpt; truncated paste.
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data1[1])
# Apply relative clustering validation with KNN and Hierarchical clustering
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

# Cross-validated selection of the number of clusters (2..6), with 100
# random-label repetitions used to normalize the stability score.
findbestclust = FindBestClustCV(nfold=10,
                                nclust_range=list(range(2, 7)),
                                s=classifier,
                                c=clustering,
                                nrand=100)
metrics, nbest = findbestclust.best_nclust(data=X_tr, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)

plot_metrics(metrics, title="Reval performance for synthetic dataset with 10 features")

# Second synthetic dataset: 20 features, 5 blobs, large within-cluster spread.
data2 = make_blobs(1000, 20, centers=5, cluster_std=5, random_state=42)

# Only the first two of the 20 features are plotted.
plt.scatter(data2[0][:, 0], data2[0][:, 1],
            c=data2[1], cmap='rainbow_r')
plt.title('True labels for 20-feature dataset')

X_tr, X_ts, y_tr, y_ts = train_test_split(data2[0],
                                          data2[1],
                                          test_size=0.30, random_state=42,
                                          stratify=data2[1])

# Re-run selection on the 20-feature dataset with the same pipeline.
findbestclust = FindBestClustCV(nfold=10, nclust_range=list(range(2, 7)),
                                s=classifier, c=clustering, nrand=100)
metrics, nbest = findbestclust.best_nclust(data=X_tr, strat_vect=y_tr)
# NOTE(review): from here the excerpt switches to an MNIST example.
# `mnist_tr`, `mnist_ts`, `label_tr`, and `label_ts` are defined outside
# this chunk (presumably a UMAP-embedded MNIST split) — verify in context.
clustering = AgglomerativeClustering()

# Lighter settings for the large MNIST data: 2 folds, 10 random repetitions.
findbestclust = FindBestClustCV(nfold=2,
                                nclust_range=list(range(2, 12)),
                                s=classifier,
                                c=clustering,
                                nrand=10,
                                n_jobs=1)

metrics, nbest = findbestclust.best_nclust(mnist_tr,
                                           iter_cv=10,
                                           strat_vect=label_tr)
out = findbestclust.evaluate(mnist_tr, mnist_ts, nbest)

plot_metrics(
    metrics,
    title="Relative clustering validation performance on MNIST dataset")

# Align predicted test labels with true digit labels for external metrics.
perm_lab = kuhn_munkres_algorithm(label_ts.astype(int), out.test_cllab)

plt.scatter(mnist_ts[:, 0],
            mnist_ts[:, 1],
            c=perm_lab,
            s=0.1,
            cmap='rainbow_r')
plt.title("Predicted labels for MNIST test set")

print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: "
      f"{1 - zero_one_loss(label_ts.astype(int), perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(label_ts.astype(int), perm_lab)}')
# NOTE(review): truncated paste — these two lines are the tail of a
# `FindBestClustCV(...)` call whose opening is missing from this excerpt.
                                c=clustering,
                                nrand=100)
metrics, nbest, _ = findbestclust.best_nclust(X_tr, y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)

# NOTE(review): `_kuhn_munkres_algorithm` (leading underscore) is used here,
# unlike `kuhn_munkres_algorithm` elsewhere in the file — presumably imported
# from a different/older reval module; verify against the version in use.
perm_lab = _kuhn_munkres_algorithm(y_ts, out.test_cllab)

print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(y_ts, out.test_cllab)}')
print(
    f"Validation set normalized stability (misclassification): {metrics['val'][nbest]}"
)
print(f'Test set ACC = {out.test_acc}')

plot_metrics(metrics, title="Reval performance")

plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
plt.title("True labels for test set")

plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
plt.title("Clustering labels for test set")

# Create a noisy dataset with 5 clusters
# ----------------------------------------
data_noisy = make_blobs(1000, 10, 5, random_state=42, cluster_std=3)
plt.scatter(data_noisy[0][:, 0],
            data_noisy[0][:, 1],
            c=data_noisy[1],
            cmap='rainbow_r')
def example1():
    """reval on a synthetic blobs dataset, compared with internal indices.

    Generates a 5-blob 2-D dataset, runs FindBestClustCV (KNN + KMeans),
    logs external metrics on the held-out test set, then repeats model
    selection with silhouette and Davies-Bouldin scores for comparison.
    """
    # Generate dataset
    data = make_blobs(1000, 2, centers=5,
                      center_box=(-20, 20),
                      random_state=42)

    # Visualize dataset
    plt.figure(figsize=(6, 4))
    for i in range(5):
        # NOTE(review): `cmap` has no effect without a `c` argument; matplotlib
        # falls back to its default per-call color cycle here.
        plt.scatter(data[0][data[1] == i][:, 0],
                    data[0][data[1] == i][:, 1],
                    label=i, cmap='tab20')
    plt.title("Blobs dataset")
    # plt.savefig('./blobs.png', format='png')
    plt.show()

    # Create training and test sets
    X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                              data[1],
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=data[1])

    # Initialize clustering and classifier
    classifier = KNeighborsClassifier(n_neighbors=15)
    clustering = KMeans()

    # Run relative validation (repeated CV and testing)
    findbestclust = FindBestClustCV(nfold=2,
                                    nclust_range=list(range(2, 7, 1)),
                                    s=classifier,
                                    c=clustering,
                                    nrand=10,
                                    n_jobs=N_JOBS)
    # NOTE(review): unpacks two values here, while other examples in this file
    # unpack three from best_nclust — likely a different reval API version.
    metrics, nbest = findbestclust.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nclust=nbest)

    # Plot CV metrics
    plot_metrics(metrics, prob_lines=False)
    logging.info(f"Validation stability: {metrics['val'][nbest]}")
    # Align clustering labels with ground truth before comparison.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    logging.info(f"Best number of clusters: {nbest}")
    logging.info(f'AMI (true labels vs predicted labels) for test set = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    # Compute metrics (only F1 and MCC are logged below)
    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab, perm=False)
    for k, val in class_scores.items():
        if k in ['F1', 'MCC']:
            logging.info(f"{k}, {val}")
    logging.info("\n\n")

    # Internal measures
    # SILHOUETTE (higher is better, hence select='max')
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, clustering, silhouette_score,
                                                          select='max',
                                                          nclust_range=list(range(2, 7, 1)))
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, clustering, silhouette_score,
                                                          select='max',
                                                          nclust_range=list(range(2, 7, 1)))

    # Re-evaluate on the test set with the cluster count chosen on training.
    sil_eval = evaluate_best(X_ts, clustering, silhouette_score, sil_best_tr)

    logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
                 f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f"Test set evaluation {sil_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN (lower is better, hence select='min')
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, clustering, davies_bouldin_score,
                                                       select='min', nclust_range=list(range(2, 7, 1)))
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, clustering, davies_bouldin_score,
                                                       select='min', nclust_range=list(range(2, 7, 1)))

    db_eval = evaluate_best(X_ts, clustering, davies_bouldin_score, db_best_tr)

    logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
                 f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f"Test set evaluation {db_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Plot true vs predicted labels for test sets
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[y_ts == i][:, 0],
                    X_ts[y_ts == i][:, 1],
                    label=str(i),
                    cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set true labels")
    # plt.savefig('./blobs_true.png', format='png')
    plt.show()

    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[perm_lab == i][:, 0],
                    X_ts[perm_lab == i][:, 1],
                    label=str(i),
                    cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set clustering labels")
    # plt.savefig('./blobs_clustering.png', format='png')
    plt.show()
def example2():
    """reval on MNIST: UMAP embedding + HDBSCAN, compared to internal indices.

    Embeds the canonical 60k/10k MNIST split to 2-D with UMAP (fit on the
    training portion only), runs FindBestClustCV with HDBSCAN, evaluates on
    the test set, then repeats model selection with silhouette and
    Davies-Bouldin scores. Metrics are logged; embeddings are plotted.
    """
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(int)

    # Canonical MNIST split: first 60,000 train, remaining 10,000 test.
    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    # Fit the 2-D embedding on training data only; project the test set.
    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    # Density-based clustering chooses its own cluster count, so no
    # nclust_range is passed to FindBestClustCV below.
    c = hdbscan.HDBSCAN(min_samples=10,
                        min_cluster_size=200)

    reval = FindBestClustCV(s=s,
                            c=c,
                            nfold=2,
                            nrand=10,
                            n_jobs=N_JOBS)

    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

    plot_metrics(metrics)

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    # Align predicted cluster labels with the true digit labels.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")

    logging.info(f"Best number of clusters during CV: {nclustbest}")
    # HDBSCAN marks noise as -1; count only genuine clusters (labels >= 0).
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')

    # Visualization: train/test embeddings colored by true vs predicted labels.
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=y_tr, cmap='rainbow_r',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=kuhn_munkres_algorithm(y_tr, tr_lab),
                         cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set predicted labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=y_ts, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=perm_lab, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()

    # Internal measures
    # SILHOUETTE (higher is better, hence select='max')
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN (lower is better, hence select='min')
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, c, davies_bouldin_score,
                                                       select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, c, davies_bouldin_score,
                                                       select='min')

    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Visualization of the internal-index labelings.
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=sil_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=sil_label_ts, cmap='tab20',
                         s=0.1)
    # BUGFIX: the original built and added this legend artist twice,
    # duplicating the legend on the axes.
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=db_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set Davies-Bouldin labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=db_label_ts, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set Davies-Bouldin labels (digits dataset)")
    plt.show()