def example_2():
    """Relative clustering validation on the sklearn digits dataset.

    Embeds the digits data to 2-D with UMAP, selects the best number of
    clusters via cross-validated stability (reval), then reports accuracy
    and AMI against the true labels and plots the test-set embedding.

    Requires module-level imports: load_digits, train_test_split, UMAP,
    KNeighborsClassifier, KMeans, FindBestClustCV, plot_metrics,
    kuhn_munkres_algorithm, zero_one_loss, adjusted_mutual_info_score, plt.
    """
    digits_dataset = load_digits()
    digits_data = digits_dataset['data']
    digits_target = digits_dataset['target']
    X_tr, X_ts, y_tr, y_ts = train_test_split(digits_data,
                                              digits_target,
                                              test_size=0.40,
                                              random_state=42,
                                              stratify=digits_target)
    # Reduce to 2-D before clustering; the same fitted transform is applied
    # to the test set so train/test live in one embedding space.
    transform = UMAP(n_components=2, random_state=42, n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)
    s = KNeighborsClassifier(n_neighbors=30)
    c = KMeans()
    # FIX: nclust_range is the explicit list of cluster numbers to try
    # (as in the other examples in this file); the previous [2, 15] made
    # reval test only k=2 and k=15 instead of the whole 2..14 range.
    reval = FindBestClustCV(s=s, c=c, nfold=5,
                            nclust_range=list(range(2, 15)), nrand=100)
    metrics, nclustbest, _ = reval.best_nclust(X_tr, iter_cv=10,
                                               strat_vect=y_tr)
    plot_metrics(metrics, title='Reval performance digits dataset')
    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
    # Permute cluster labels to best match the true labels before scoring.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    print(f"Best number of clusters: {nclustbest}")
    print(f"Test set prediction ACC: "
          f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nclustbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')
    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (digits dataset)")
    plt.show()
    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()
def example_1():
    """Relative clustering validation on a synthetic 5-blob dataset.

    Generates 1000 2-D points in five blobs, selects the best number of
    clusters via cross-validated stability (reval), then reports accuracy
    and AMI against the true labels and plots the test set.

    Requires module-level imports: make_blobs, train_test_split,
    KNeighborsClassifier, KMeans, FindBestClustCV, plot_metrics,
    kuhn_munkres_algorithm, zero_one_loss, adjusted_mutual_info_score, plt.
    """
    data = make_blobs(1000, 2, 5, center_box=(-20, 20), random_state=42)
    plt.figure(figsize=(6, 4))
    plt.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='rainbow_r')
    plt.title("Blobs dataset (N=1000)")
    plt.show()
    X_tr, X_ts, y_tr, y_ts = train_test_split(data[0], data[1],
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=data[1])
    classifier = KNeighborsClassifier(n_neighbors=5)
    clustering = KMeans()
    # FIX: nclust_range is the explicit list of cluster numbers to try
    # (as in the other examples in this file); the previous [2, 7] made
    # reval test only k=2 and k=7 instead of the whole 2..6 range.
    findbestclust = FindBestClustCV(nfold=10,
                                    nclust_range=list(range(2, 7)),
                                    s=classifier, c=clustering, nrand=100)
    metrics, nbest, _ = findbestclust.best_nclust(X_tr, iter_cv=10,
                                                  strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nbest)
    # Permute cluster labels to best match the true labels before scoring.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    print(f"Best number of clusters: {nbest}")
    print(f"Test set prediction ACC: "
          f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')
    plot_metrics(metrics, title="Reval performance blobs dataset",
                 legend_loc=2)
    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (blobs dataset)")
    plt.show()
    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (blobs dataset)")
    plt.show()
# (fragment) Trailing kwargs of a train_test_split(data1[0], data1[1], ...)
# call whose head lies before this chunk — presumably splitting a first
# synthetic dataset `data1`; verify against the preceding lines.
test_size=0.30, random_state=42, stratify=data1[1])
# Apply relative clustering validation with KNN and Hierarchical clustering
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()
findbestclust = FindBestClustCV(nfold=10, nclust_range=list(range(2, 7)),
                                s=classifier, c=clustering, nrand=100)
# NOTE(review): best_nclust is unpacked into two values here, but other
# examples in this file unpack three (metrics, nbest, _) — confirm which
# reval version this script targets.
metrics, nbest = findbestclust.best_nclust(data=X_tr, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)
plot_metrics(metrics,
             title="Reval performance for synthetic dataset with 10 features")
# Second synthetic dataset: 20 features, 5 blobs, high within-cluster spread.
data2 = make_blobs(1000, 20, centers=5, cluster_std=5, random_state=42)
# Only the first two of the 20 features are plotted.
plt.scatter(data2[0][:, 0], data2[0][:, 1], c=data2[1], cmap='rainbow_r')
plt.title('True labels for 20-feature dataset')
X_tr, X_ts, y_tr, y_ts = train_test_split(data2[0], data2[1],
                                          test_size=0.30, random_state=42,
                                          stratify=data2[1])
# Re-run the same validation procedure on the 20-feature data,
# reusing the classifier/clustering instances from above.
findbestclust = FindBestClustCV(nfold=10, nclust_range=list(range(2, 7)),
                                s=classifier, c=clustering, nrand=100)
metrics, nbest = findbestclust.best_nclust(data=X_tr, strat_vect=y_tr)
# (fragment) Body of an MNIST example whose function header and the
# definitions of `classifier`, `mnist_tr`, `mnist_ts`, `label_tr`,
# `label_ts` lie before this chunk — verify against the preceding lines.
clustering = AgglomerativeClustering()
# nfold=2 and nrand=10 keep the run cheap; n_jobs=1 disables parallelism.
findbestclust = FindBestClustCV(nfold=2, nclust_range=list(range(2, 12)),
                                s=classifier, c=clustering, nrand=10,
                                n_jobs=1)
metrics, nbest = findbestclust.best_nclust(mnist_tr, iter_cv=10,
                                           strat_vect=label_tr)
out = findbestclust.evaluate(mnist_tr, mnist_ts, nbest)
plot_metrics(
    metrics,
    title="Relative clustering validation performance on MNIST dataset")
# Permute predicted cluster labels to best match the true digit labels.
perm_lab = kuhn_munkres_algorithm(label_ts.astype(int), out.test_cllab)
plt.scatter(mnist_ts[:, 0], mnist_ts[:, 1], c=perm_lab, s=0.1,
            cmap='rainbow_r')
plt.title("Predicted labels for MNIST test set")
print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: "
      f"{1 - zero_one_loss(label_ts.astype(int), perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(label_ts.astype(int), perm_lab)}')
# (fragment) Trailing kwargs of a FindBestClustCV(...) call whose head lies
# before this chunk — verify against the preceding lines.
c=clustering, nrand=100)
# NOTE(review): y_tr is passed positionally here; in other examples the
# second argument of best_nclust is iter_cv, not the stratification vector —
# confirm the intended signature (strat_vect=y_tr?).
metrics, nbest, _ = findbestclust.best_nclust(X_tr, y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)
# NOTE(review): private _kuhn_munkres_algorithm used here while the other
# examples call the public kuhn_munkres_algorithm — confirm the import.
perm_lab = _kuhn_munkres_algorithm(y_ts, out.test_cllab)
print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: "
      f"{1 - zero_one_loss(y_ts, perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(y_ts, out.test_cllab)}')
print(
    f"Validation set normalized stability (misclassification): {metrics['val'][nbest]}"
)
print(f'Test set ACC = {out.test_acc}')
plot_metrics(metrics, title="Reval performance")
# Two scatters drawn into the same axes: the second (clustering labels)
# overwrites the first (true labels) unless figures are created between them.
plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
plt.title("True labels for test set")
plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
plt.title("Clustering labels for test set")
# Create a noisy dataset with 5 clusters
# ----------------------------------------
data_noisy = make_blobs(1000, 10, 5, random_state=42, cluster_std=3)
# Only the first two of the 10 features are plotted.
plt.scatter(data_noisy[0][:, 0], data_noisy[0][:, 1], c=data_noisy[1],
            cmap='rainbow_r')
def example1():
    """Compare reval's stability-based model selection with internal indices.

    On a 5-blob synthetic dataset: runs FindBestClustCV (KNN + KMeans),
    logs accuracy/AMI against the true labels, then repeats selection with
    silhouette and Davies-Bouldin scores for comparison, and finally plots
    true vs. predicted test-set labels.

    Requires module-level imports: make_blobs, train_test_split,
    KNeighborsClassifier, KMeans, FindBestClustCV, plot_metrics,
    kuhn_munkres_algorithm, compute_metrics, select_best, evaluate_best,
    silhouette_score, davies_bouldin_score, adjusted_mutual_info_score,
    logging, plt, and the constant N_JOBS.
    """
    # Generate dataset
    data = make_blobs(1000, 2, centers=5, center_box=(-20, 20),
                      random_state=42)
    # Visualize dataset (one scatter call per class so each gets a legend label)
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(data[0][data[1] == i][:, 0],
                    data[0][data[1] == i][:, 1],
                    label=i, cmap='tab20')
    plt.title("Blobs dataset")
    # plt.savefig('./blobs.png', format='png')
    plt.show()
    # Create training and test sets
    X_tr, X_ts, y_tr, y_ts = train_test_split(data[0], data[1],
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=data[1])
    # Initialize clustering and classifier
    classifier = KNeighborsClassifier(n_neighbors=15)
    clustering = KMeans()
    # Run relative validation (repeated CV and testing)
    findbestclust = FindBestClustCV(nfold=2,
                                    nclust_range=list(range(2, 7, 1)),
                                    s=classifier, c=clustering, nrand=10,
                                    n_jobs=N_JOBS)
    metrics, nbest = findbestclust.best_nclust(X_tr, iter_cv=10,
                                               strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nclust=nbest)
    # Plot CV metrics
    plot_metrics(metrics, prob_lines=False)
    logging.info(f"Validation stability: {metrics['val'][nbest]}")
    # Permute cluster labels to best match the true labels before scoring.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Best number of clusters: {nbest}")
    logging.info(f'AMI (true labels vs predicted labels) for test set = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')
    # Compute classification-style metrics; only F1 and MCC are reported.
    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab, perm=False)
    for k, val in class_scores.items():
        if k in ['F1', 'MCC']:
            logging.info(f"{k}, {val}")
    logging.info("\n\n")
    # Internal measures
    # SILHOUETTE (higher is better, hence select='max')
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(
        X_tr, clustering, silhouette_score, select='max',
        nclust_range=list(range(2, 7, 1)))
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(
        X_ts, clustering, silhouette_score, select='max',
        nclust_range=list(range(2, 7, 1)))
    sil_eval = evaluate_best(X_ts, clustering, silhouette_score, sil_best_tr)
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f"Test set evaluation {sil_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')
    # DAVIES-BOULDIN (lower is better, hence select='min')
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(
        X_tr, clustering, davies_bouldin_score, select='min',
        nclust_range=list(range(2, 7, 1)))
    db_score_ts, db_best_ts, db_label_ts = select_best(
        X_ts, clustering, davies_bouldin_score, select='min',
        nclust_range=list(range(2, 7, 1)))
    db_eval = evaluate_best(X_ts, clustering, davies_bouldin_score,
                            db_best_tr)
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f"Test set evaluation {db_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')
    # Plot true vs predicted labels for test sets
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[y_ts == i][:, 0], X_ts[y_ts == i][:, 1],
                    label=str(i), cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set true labels")
    # plt.savefig('./blobs_true.png', format='png')
    plt.show()
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[perm_lab == i][:, 0], X_ts[perm_lab == i][:, 1],
                    label=str(i), cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set clustering labels")
    # plt.savefig('./blobs_clustering.png', format='png')
    plt.show()
def _plot_embedding(X, colors, cmap, title):
    """Scatter a 2-D embedding colored by *colors*, with a class legend."""
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X[:, 0], X[:, 1], c=colors, cmap=cmap, s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title(title)
    plt.show()


def example2():
    """Relative clustering validation on MNIST with UMAP + HDBSCAN.

    Embeds MNIST to 2-D with UMAP, selects clusters via reval
    (KNN + HDBSCAN), logs AMI/classification metrics against the true
    labels, compares with silhouette and Davies-Bouldin selection, and
    plots the embeddings.

    FIX vs. original: a copy-pasted duplicate
    ``legend = ax.legend(...); ax.add_artist(legend)`` pair in the test-set
    silhouette figure drew the legend twice; the repeated figure stanzas are
    now factored into ``_plot_embedding``.

    Requires module-level imports: fetch_openml, UMAP,
    KNeighborsClassifier, hdbscan, FindBestClustCV, plot_metrics,
    kuhn_munkres_algorithm, compute_metrics, select_best,
    silhouette_score, davies_bouldin_score, adjusted_mutual_info_score,
    logging, np, plt, and the constant N_JOBS.
    """
    mnist = fetch_openml('mnist_784', version=1)
    # OpenML returns string targets; cast once up front.
    mnist.target = mnist.target.astype(int)
    # Standard MNIST split: first 60k train, last 10k test.
    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    transform = UMAP(n_components=2, random_state=42, n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)
    s = KNeighborsClassifier(n_neighbors=30)
    # HDBSCAN chooses the number of clusters itself, so no nclust_range.
    c = hdbscan.HDBSCAN(min_samples=10, min_cluster_size=200)
    reval = FindBestClustCV(s=s, c=c, nfold=2, nrand=10, n_jobs=N_JOBS)
    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10,
                                                    strat_vect=y_tr)
    plot_metrics(metrics)
    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    # Permute cluster labels to best match the true labels before scoring.
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")
    logging.info(f"Best number of clusters during CV: {nclustbest}")
    # Negative labels are HDBSCAN noise points; exclude them from the count.
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')
    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')
    # Visualization
    # NOTE(review): the titles say "digits dataset" although the data is
    # MNIST — kept byte-identical; confirm the intended wording.
    _plot_embedding(X_tr, y_tr, 'rainbow_r',
                    "Train set true labels (digits dataset)")
    _plot_embedding(X_tr, kuhn_munkres_algorithm(y_tr, tr_lab), 'tab20',
                    "Train set predicted labels (digits dataset)")
    _plot_embedding(X_ts, y_ts, 'tab20',
                    "Test set true labels (digits dataset)")
    _plot_embedding(X_ts, perm_lab, 'tab20',
                    "Test set clustering labels (digits dataset)")
    # Internal measures
    # SILHOUETTE (higher is better, hence select='max')
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(
        X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(
        X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')
    # DAVIES-BOULDIN (lower is better, hence select='min')
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(
        X_tr, c, davies_bouldin_score, select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(
        X_ts, c, davies_bouldin_score, select='min')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')
    # Visualization of internal-index label assignments
    _plot_embedding(X_tr, sil_label_tr, 'tab20',
                    "Train set silhouette labels (digits dataset)")
    # (duplicate legend creation removed here)
    _plot_embedding(X_ts, sil_label_ts, 'tab20',
                    "Test set silhouette labels (digits dataset)")
    _plot_embedding(X_tr, db_label_tr, 'tab20',
                    "Train set Davies-Bouldin labels (digits dataset)")
    _plot_embedding(X_ts, db_label_ts, 'tab20',
                    "Test set Davies-Bouldin labels (digits dataset)")