Example 1
# Plot the UMAP-embedded test subsample coloured by its true digit labels
plt.scatter(mnist_ts[:, 0],
            mnist_ts[:, 1],
            c=label_ts,
            s=0.1,
            cmap='rainbow_r')
plt.title('UMAP-transformed test subsample of MNIST dataset (N=7,000)')

# Run relative clustering validation
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=10,
                                nclust_range=list(range(2, 12, 1)),
                                s=classifier,
                                c=clustering,
                                nrand=100)

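# Stability-based cross-validation selects the best number of clusters on the
# training set; evaluate() then fits that solution and returns the cluster
# labels predicted for the held-out test set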
metrics, nbest, _ = findbestclust.best_nclust(mnist_tr, label_tr)
out = findbestclust.evaluate(mnist_tr, mnist_ts, nbest)

plot_metrics(metrics,
             "Relative clustering validation performance on MNIST dataset")

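# Match the predicted test cluster labels to the true digit classes with the
# Kuhn-Munkres (Hungarian) algorithm before computing external accuracy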
perm_lab = _kuhn_munkres_algorithm(label_ts.astype(int), out.test_cllab)

plt.scatter(mnist_ts[:, 0],
            mnist_ts[:, 1],
            c=perm_lab,
            s=0.1,
            cmap='rainbow_r')
plt.title("Predicted labels for MNIST test set")

print(f"Best number of clusters: {nbest}")
# External accuracy of the permuted cluster labels against the true test classes
# (zero_one_loss is assumed to be imported from sklearn.metrics)
print(f"Test set external ACC: "
      f"{1 - zero_one_loss(label_ts.astype(int), perm_lab)}")
Example 2
# Imports for this standalone benchmark (module paths assumed from the reval
# documentation, scikit-learn, and the hdbscan package)
import itertools
import pickle as pkl
import time

from hdbscan import HDBSCAN
from sklearn.cluster import AgglomerativeClustering, KMeans, SpectralClustering
from sklearn.datasets import make_blobs
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from reval.best_nclust_cv import FindBestClustCV


def time_cmplx(n_jobs=1):
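    # Toy data set: 100 samples, 10 features, two Gaussian blobs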
    data = make_blobs(100, 10, centers=2)
    data_tr, data_ts, y_tr, y_ts = train_test_split(data[0],
                                                    data[1],
                                                    test_size=0.5,
                                                    stratify=data[1],
                                                    random_state=42)
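    # Candidate classifiers (s) and clustering algorithms (c); every pairing is benchmarked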
    s = [
        KNeighborsClassifier(),
        SVC(),
        LogisticRegression(),
        RandomForestClassifier()
    ]
    c = [HDBSCAN(), AgglomerativeClustering(), KMeans(), SpectralClustering()]
    param = itertools.product(s, c)

    labels = ['KNN'] * 4 + ['SVM'] * 4 + ['LR'] * 4 + ['RF'] * 4
    time_cv = {'LR': [], 'KNN': [], 'RF': [], 'SVM': []}
    time_ev = {'LR': [], 'KNN': [], 'RF': [], 'SVM': []}
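    # For each classifier/clustering pair, time model selection (best_nclust)
    # and evaluation (evaluate) separately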
    for idx, mod in enumerate(param):
        classifier, clustering = mod[0], mod[1]
        findbest = FindBestClustCV(s=classifier,
                                   c=clustering,
                                   nrand=10,
                                   nfold=2,
                                   n_jobs=n_jobs,
                                   nclust_range=list(range(2, 7, 1)))
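        # HDBSCAN determines the number of clusters itself, so best_nclust also
        # returns the training labels, which are then passed on to evaluate()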
        if isinstance(clustering, HDBSCAN):
            start = time.time()
            _, _, tr_lab = findbest.best_nclust(data_tr,
                                                iter_cv=10,
                                                strat_vect=y_tr)
            time_cv[labels[idx]].append(time.time() - start)

            start = time.time()
            findbest.evaluate(data_tr, data_ts, nclust=2, tr_lab=tr_lab)
            time_ev[labels[idx]].append(time.time() - start)
        else:
            start = time.time()
            _, _ = findbest.best_nclust(data_tr, iter_cv=10, strat_vect=y_tr)
            time_cv[labels[idx]].append(time.time() - start)

            start = time.time()
            findbest.evaluate(data_tr, data_ts, nclust=2)
            time_ev[labels[idx]].append(time.time() - start)

    pkl.dump(time_cv, open(f'time_cv_njobs{n_jobs}.pkl', 'wb'))
    pkl.dump(time_ev, open(f'time_ev_njobs{n_jobs}.pkl', 'wb'))

    clustering = KMeans()
    classifier = KNeighborsClassifier()
    time_knnkmeans = {10: [], 100: [], 1000: []}
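    # Time a full best_nclust + evaluate run while scaling the number of
    # samples and features, storing results by number of features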
    for nsamples, nfeatures in itertools.product(
        [100, 500, 1000, 1500, 2000, 2500, 3000], [10, 100, 1000]):
        data = make_blobs(nsamples, nfeatures, centers=2)
        data_tr, data_ts, y_tr, y_ts = train_test_split(data[0],
                                                        data[1],
                                                        test_size=0.5,
                                                        stratify=data[1],
                                                        random_state=42)
        findbest = FindBestClustCV(s=classifier,
                                   c=clustering,
                                   nrand=10,
                                   nfold=2,
                                   n_jobs=n_jobs,
                                   nclust_range=list(range(2, 7, 1)))
        start = time.time()
        _, _ = findbest.best_nclust(data_tr, iter_cv=10, strat_vect=y_tr)
        findbest.evaluate(data_tr, data_ts, nclust=2)
        time_knnkmeans[nfeatures].append(time.time() - start)

    pkl.dump(time_knnkmeans, open(f'time_knnkmeans{n_jobs}.pkl', 'wb'))