Example 1
    def _run_gridsearchcv(self, data, param_s, param_c):
        """
        Private method that initializes the classifier/clustering with different
        parameter combinations and runs :class:`reval.best_nclust_cv.FindBestClustCV`.

        :param data: training dataset.
        :type data: numpy array
        :param param_s: dictionary of classifier parameters.
        :type param_s: dict
        :param param_c: dictionary of clustering parameters.
        :type param_c: dict
        :return: performance list.
        :rtype: list
        """
        self.class_method.set_params(**param_s)
        self.clust_method.set_params(**param_c)
        findclust = FindBestClustCV(nfold=self.cv,
                                    s=self.class_method,
                                    c=self.clust_method,
                                    nrand=self.nrand,
                                    n_jobs=1,
                                    nclust_range=self.clust_range)
        if self.clust_range is not None:
            metric, nclbest = findclust.best_nclust(data,
                                                    iter_cv=self.iter_cv,
                                                    strat_vect=self.strat)
            tr_lab = None
        else:
            try:
                metric, nclbest, tr_lab = findclust.best_nclust(
                    data, iter_cv=self.iter_cv, strat_vect=self.strat)
            except TypeError:
                perf = [(key, val) for key, val in param_s.items()] + \
                       [(key, val) for key, val in param_c.items()] + \
                       [('best_nclust', None),
                        ('mean_train_score', None),
                        ('sd_train_score', None),
                        ('mean_val_score', None),
                        ('sd_val_score', None),
                        ('validation_meanerror', None),
                        ('tr_label', None)]
                return perf

        perf = [(key, val) for key, val in param_s.items()] + \
               [(key, val) for key, val in param_c.items()] + \
               [('best_nclust', nclbest),
                ('mean_train_score', np.mean(
                    findclust.cv_results_.loc[findclust.cv_results_.ncl == nclbest]['ms_tr'])),
                ('sd_train_score', np.std(
                    findclust.cv_results_.loc[findclust.cv_results_.ncl == nclbest]['ms_tr'])),
                ('mean_val_score', np.mean(
                    findclust.cv_results_.loc[findclust.cv_results_.ncl == nclbest]['ms_val'])),
                ('sd_val_score', np.std(
                    findclust.cv_results_.loc[findclust.cv_results_.ncl == nclbest]['ms_val'])),
                ('validation_meanerror', metric['val'][nclbest]),
                ('tr_label', tr_lab)]
        return perf
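
# A minimal sketch (not part of the original class) of how the (key, value)
# performance lists returned above can be aggregated over a parameter grid.
# ParameterGrid and the toy perf list are illustrative assumptions; inside
# the class the call would be self._run_gridsearchcv(data, ps, pc).
from sklearn.model_selection import ParameterGrid
import pandas as pd

rows = []
for ps in ParameterGrid({'n_neighbors': [5, 15]}):
    for pc in ParameterGrid({'linkage': ['ward', 'complete']}):
        perf = list(ps.items()) + list(pc.items()) + [('best_nclust', None)]
        rows.append(dict(perf))
print(pd.DataFrame(rows))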
Example 2
 def setUp(cls):
     cls.s = CLASSIFIER
     cls.c = NEW_CLUSTERING
     cls.c_work = hdbscan.HDBSCAN()
     cls.nrand = RNDLABELS_ITER
     cls.nfold = NFOLD
     cls.nclust_range = NEW_NCLUST_RANGE
     cls.n_jobs = N_JOBS
     cls.findbest = FindBestClustCV(cls.s, cls.c, cls.nrand, cls.nfold,
                                    cls.n_jobs, cls.nclust_range)
     cls.findbest_bis = FindBestClustCV(cls.s, cls.c_work, cls.nrand, cls.nfold,
                                        cls.n_jobs, cls.nclust_range)
def example_2():
    digits_dataset = load_digits()

    digits_data = digits_dataset['data']
    digits_target = digits_dataset['target']

    X_tr, X_ts, y_tr, y_ts = train_test_split(digits_data,
                                              digits_target,
                                              test_size=0.40,
                                              random_state=42,
                                              stratify=digits_target)

    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = KMeans()

    reval = FindBestClustCV(s=s, c=c, nfold=5, nclust_range=[2, 15], nrand=100)

    metrics, nclustbest, _ = reval.best_nclust(X_tr,
                                               iter_cv=10,
                                               strat_vect=y_tr)

    plot_metrics(metrics, title='Reval performance digits dataset')

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nclustbest}")
    print(f"Test set prediction ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nclustbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()
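
# What kuhn_munkres_algorithm does above: cluster labels are arbitrary, so the
# predicted labels are permuted to best match the true ones before computing
# accuracy. A minimal sketch with scipy's Hungarian solver (an assumption;
# reval ships its own implementation) for labels coded 0..k-1:
import numpy as np
from scipy.optimize import linear_sum_assignment

def align_labels(true_lab, pred_lab):
    k = max(true_lab.max(), pred_lab.max()) + 1
    cm = np.zeros((k, k), dtype=int)          # confusion-matrix counts
    for t, p in zip(true_lab, pred_lab):
        cm[t, p] += 1
    rows, cols = linear_sum_assignment(-cm)   # maximize total agreement
    mapping = dict(zip(cols, rows))           # predicted label -> true label
    return np.array([mapping[p] for p in pred_lab])

print(align_labels(np.array([0, 0, 1, 1, 2, 2]),
                   np.array([2, 2, 0, 0, 1, 1])))  # -> [0 0 1 1 2 2]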
def example_1():
    data = make_blobs(1000, 2, 5, center_box=(-20, 20), random_state=42)
    plt.figure(figsize=(6, 4))
    plt.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='rainbow_r')
    plt.title("Blobs dataset (N=1000)")
    plt.show()

    X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                              data[1],
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=data[1])

    classifier = KNeighborsClassifier(n_neighbors=5)
    clustering = KMeans()

    findbestclust = FindBestClustCV(nfold=10,
                                    nclust_range=[2, 7],
                                    s=classifier,
                                    c=clustering,
                                    nrand=100)
    metrics, nbest, _ = findbestclust.best_nclust(X_tr,
                                                  iter_cv=10,
                                                  strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nbest)

    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    print(f"Best number of clusters: {nbest}")
    print(f"Test set prediction ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
    print(f'AMI (true labels vs predicted labels) = '
          f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    print(f"Validation set normalized stability (misclassification):"
          f"{metrics['val'][nbest]}")
    print(f'Test set ACC = {out.test_acc} '
          f'(true labels vs predicted labels)')

    plot_metrics(metrics,
                 title="Reval performance blobs dataset",
                 legend_loc=2)

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=y_ts, cmap='rainbow_r')
    plt.title("Test set true labels (blobs dataset)")
    plt.show()

    plt.figure(figsize=(6, 4))
    plt.scatter(X_ts[:, 0], X_ts[:, 1], c=perm_lab, cmap='rainbow_r')
    plt.title("Test set clustering labels (blobs dataset)")
    plt.show()
Example 5
    def _run_gridsearchcv(self, data, sc):
        """
        Private method that runs :class:`reval.best_nclust_cv.FindBestClustCV`
        with different classifier/clustering initializations.

        :param data: input dataset.
        :type data: numpy array
        :param sc: classifier/clustering of the form {'s':, 'c':}.
        :type sc: dict
        :return: performance list.
        :rtype: list
        """
        findclust = FindBestClustCV(s=sc['s'],
                                    c=sc['c'],
                                    nfold=self.cv,
                                    nrand=self.nrand,
                                    n_jobs=1,
                                    nclust_range=self.clust_range)

        if 'n_clusters' in sc['c'].get_params().keys():
            metric, nclbest = findclust.best_nclust(data,
                                                    iter_cv=self.iter_cv,
                                                    strat_vect=self.strat)
            sc['c'].n_clusters = nclbest
            tr_lab = None
        else:
            try:
                metric, nclbest, tr_lab = findclust.best_nclust(
                    data, iter_cv=self.iter_cv, strat_vect=self.strat)
            except TypeError:
                perf = [('s', sc['s']), ('c', sc['c']), ('best_nclust', None),
                        ('mean_train_score', None), ('sd_train_score', None),
                        ('mean_val_score', None), ('sd_val_score', None),
                        ('validation_meanerror', None), ('tr_label', None)]
                return perf

        cv_scores = findclust.cv_results_
        perf = [('s', sc['s']), ('c', sc['c']), ('best_nclust', nclbest),
                ('mean_train_score',
                 np.mean(cv_scores.loc[cv_scores.ncl == nclbest]['ms_tr'])),
                ('sd_train_score',
                 np.std(cv_scores.loc[cv_scores.ncl == nclbest]['ms_tr'])),
                ('mean_val_score',
                 np.mean(cv_scores.loc[cv_scores.ncl == nclbest]['ms_val'])),
                ('sd_val_score',
                 np.std(cv_scores.loc[cv_scores.ncl == nclbest]['ms_val'])),
                ('validation_meanerror', metric['val'][nclbest]),
                ('tr_label', tr_lab)]
        return perf
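
# Why the branch above works: clustering estimators that require a preset
# number of clusters expose an 'n_clusters' parameter via get_params(), while
# density-based ones such as HDBSCAN do not. A quick check (the hdbscan
# import is an assumption):
import hdbscan
from sklearn.cluster import KMeans, AgglomerativeClustering

for est in (KMeans(), AgglomerativeClustering(), hdbscan.HDBSCAN()):
    print(type(est).__name__, 'n_clusters' in est.get_params())
# KMeans True / AgglomerativeClustering True / HDBSCAN False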
Example 6
 def setUp(cls):
     cls.s = CLASSIFIER
     cls.c = CLUSTERING
     cls.nrand = RNDLABELS_ITER
     cls.nfold = NFOLD
     cls.nclust_range = NCLUST_RANGE
     cls.findbest = FindBestClustCV(cls.nfold, cls.nclust_range, cls.s, cls.c, cls.nrand)
 def setUp(cls):
     cls.s = KNeighborsClassifier(n_neighbors=5)
     cls.c = AgglomerativeClustering()
     cls.nrand = 10
     cls.nfold = 2
     cls.nclust_range = [2, 4]
     cls.findbest = FindBestClustCV(cls.nfold, cls.nclust_range, cls.s,
                                    cls.c, cls.nrand)
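
# These fixtures are typically wrapped in a unittest.TestCase. A minimal,
# self-contained skeleton (class name and assertion are assumptions; the
# positional argument order follows the fixtures above):
import unittest
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import AgglomerativeClustering
from reval.best_nclust_cv import FindBestClustCV

class TestFindBestClustCV(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.s = KNeighborsClassifier(n_neighbors=5)
        cls.c = AgglomerativeClustering()
        cls.findbest = FindBestClustCV(2, [2, 4], cls.s, cls.c, 10)

    def test_init(self):
        self.assertIsInstance(self.findbest, FindBestClustCV)

if __name__ == '__main__':
    unittest.main()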
Example 8
import cProfile
import pstats
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering
from hdbscan import HDBSCAN
from sklearn.neighbors import KNeighborsClassifier
from reval.best_nclust_cv import FindBestClustCV

s = KNeighborsClassifier()
# c = KMeans()
c = AgglomerativeClustering()

findbest = FindBestClustCV(s=s,
                           c=c,
                           nrand=10,
                           nfold=2,
                           n_jobs=1,
                           nclust_range=list(range(2, 7, 1)))

data = make_blobs(100, 10, centers=2)
data_tr, data_ts, y_tr, y_ts = train_test_split(data[0],
                                                data[1],
                                                test_size=0.5,
                                                stratify=data[1],
                                                random_state=42)

print("Profiling of algorithm that finds the best number of clusters.")
profiler = cProfile.Profile()
profiler.enable()
findbest.best_nclust(data_tr, iter_cv=10, strat_vect=y_tr)
profiler.disable()
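# Report the collected profile; pstats is imported above for this purpose.
stats = pstats.Stats(profiler)
stats.sort_stats('cumulative').print_stats(10)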

# ----------------------------------
# Separate snippet. matplotlib, the metric helpers, and data1 (a 10-feature
# blobs dataset) are assumed here; the reval import paths below are
# assumptions based on the releases these examples target.
import matplotlib.pyplot as plt
from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score
from reval.visualization import plot_metrics
from reval.best_nclust_cv import _kuhn_munkres_algorithm  # private helper in older releases

data1 = make_blobs(1000, 10, centers=5, random_state=42)

# Plot synthetic dataset
plt.scatter(data1[0][:, 0], data1[0][:, 1], c=data1[1], cmap='rainbow_r')
plt.title('True labels for 10-feature dataset')

X_tr, X_ts, y_tr, y_ts = train_test_split(data1[0],
                                          data1[1],
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data1[1])
# Apply relative clustering validation with KNN and Hierarchical clustering
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=10,
                                nclust_range=[2, 7],
                                s=classifier,
                                c=clustering,
                                nrand=100)
metrics, nbest, _ = findbestclust.best_nclust(X_tr, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)

plot_metrics(metrics,
             "Reval performance for synthetic dataset with 10 features")

plt.scatter(X_ts[:, 0], X_ts[:, 1], c=out.test_cllab, cmap='rainbow_r')
plt.title("Predicted labels for 10-feature dataset")

# Compare Reval solution to true labels
print(f'AMI test set = {adjusted_mutual_info_score(y_ts, out.test_cllab)}')
relabeling = _kuhn_munkres_algorithm(y_ts, out.test_cllab)
print(f'ACC test set = {1 - zero_one_loss(y_ts, relabeling)}')
Example 10
def example_3():
    # Classifiers
    knn = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    svm = SVC(C=1, random_state=42)
    logreg = LogisticRegression(solver='liblinear', random_state=42)

    classifiers = [knn, logreg, svm, rf]

    # Clustering
    hc = AgglomerativeClustering()
    km = KMeans(random_state=42)
    sc = SpectralClustering(random_state=42)

    clustering = [hc, km, sc]

    # scaler = StandardScaler()
    transform = UMAP(n_neighbors=30, min_dist=0.0, random_state=42)

    # Import benchmark datasets
    uci_data = build_ucidatasets()
    # Run ensemble learning algorithm
    for data, name in zip(uci_data, uci_data._fields):
        nclass = len(np.unique(data['target']))
        logging.info(f"Processing dataset {name}")
        logging.info(f"Number of classes: {nclass}\n")
        X_tr, X_ts, y_tr, y_ts = train_test_split(data['data'],
                                                  data['target'],
                                                  test_size=0.40,
                                                  random_state=42,
                                                  stratify=data['target'])
        X_tr = transform.fit_transform(X_tr)
        X_ts = transform.transform(X_ts)
        for s in classifiers:
            if isinstance(s, SVC):
                s.gamma = 1 / data['data'].shape[0]
            for c in clustering:
                logging.info(
                    f"Clustering algorithm: {c} -- Classification algorithm {s}"
                )
                reval = FindBestClustCV(s=s,
                                        c=c,
                                        nfold=5,
                                        nclust_range=[2, nclass + 3],
                                        nrand=100)
                metrics, nclustbest, _ = reval.best_nclust(X_tr,
                                                           strat_vect=y_tr)

                out = reval.evaluate(X_tr, X_ts, nclust=nclustbest)
                perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

                logging.info(f"Best number of clusters: {nclustbest}")
                logging.info(f"Test set prediction ACC: "
                             f"{1 - zero_one_loss(y_ts, perm_lab)}")
                logging.info(
                    f'AMI (true labels vs predicted labels) = '
                    f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
                logging.info(
                    f"Validation set normalized stability (misclassification):"
                    f"{metrics['val'][nclustbest]}")
                logging.info(f'Test set ACC = {out.test_acc} '
                             f'(true labels vs predicted labels)\n')
        logging.info('*' * 100)
        logging.info('\n\n')
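
# Note: example_3 reports through logging; to see its output, configure the
# root logger first (standard-library call, shown here as a usage hint):
import logging
logging.basicConfig(level=logging.INFO, format='%(message)s')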
Example 11
def run_validation(data_tr, data_ts, raw_ts, n_neighbors, period, hierarchy, cl_range=(2, 11)):
    """Function that performs the relative clustering validation.
    :param data_tr: training dataset
    :type data_tr: dataframe
    :param data_ts: test dataset
    :type data_ts: dataframe
    :param raw_ts: original test dataset before imputation
    :type raw_ts: dataframe
    :param n_neighbors: number of neighbors to consider for UMAP preprocessing step
    :type n_neighbors: int
    :param cl_range: range of number of clusters to consider, default (2, 11)
    :type cl_range: tuple
    :param period: interview period
    :type period: str
    :param hierarchy: hierarchical feature level
    :type hierarchy: str
    """
    logging.info(f'Processing Vineland feature level {hierarchy} at period {period}...')
    transform = umap.UMAP(random_state=42, n_neighbors=n_neighbors, min_dist=0.0)
    X_tr = transform.fit_transform(data_tr)
    X_ts = transform.transform(data_ts)

    # Initialize classes
    knn = KNeighborsClassifier(n_neighbors=10)
    clust = AgglomerativeClustering(affinity='euclidean', linkage='ward')

    relval = FindBestClustCV(s=knn, c=clust, nfold=10, nclust_range=cl_range,
                             nrand=100)  # This runs a 10-fold cross validation with number of clusters from 2 to 10
    # Run the model
    metric, ncl, cv_scores = relval.best_nclust(X_tr)  # the strat_vect parameter can be used to perform a stratified CV
    logging.info(f"Best number of clusters: {ncl}")
    out = relval.evaluate(X_tr, X_ts, ncl)
    logging.info(f"Training ACC: {out.train_acc}, Test ACC: {out.test_acc}")
    plot_metrics(metric)
    unique, counts = np.unique(out.train_cllab, return_counts=True)
    logging.info(f'Training set (N = {X_tr.shape[0]})\n')
    for a, b in zip(unique, counts):
        logging.info(f'N subjects in cluster {a}: {b}')

    unique, counts = np.unique(out.test_cllab, return_counts=True)
    logging.info(f'\n\nTest set (N = {X_ts.shape[0]})\n')
    for a, b in zip(unique, counts):
        logging.info(f'N subjects in cluster {a}: {b}')

    umap_tr = X_tr
    umap_ts = X_ts
    #     umap_tr = transform.fit_transform(X_tr)
    #     umap_ts = transform.fit_transform(X_ts)

    flatui = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2",
              "#7f7f7f", "#bcbd22", "#17becf", "#8c564b", "#a55194"]
    _scatter_plot(umap_tr,
                  [(gui, cl) for gui, cl in zip(data_tr.index, out.train_cllab)],
                  flatui,
                  10, 20, {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in sorted(np.unique(out.train_cllab))},
                  title=f'Subgroups of UMAP preprocessed Vineland TRAINING '
                        f'dataset (period: {period} -- level: {hierarchy})')

    _scatter_plot(umap_ts,
                  [(gui, cl) for gui, cl in zip(data_ts.index, out.test_cllab)],
                  flatui,
                  10, 20, {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in sorted(np.unique(out.test_cllab))},
                  title=f'Subgroups of UMAP preprocessed Vineland TEST '
                        f'dataset (period: {period} -- level: {hierarchy})')

    # Plot heatmap
    raw_ts = raw_ts.loc[data_ts.index]
    raw_ts['cluster'] = out.test_cllab
    mis_perc = {}
    for lab in np.unique(out.test_cllab):
        ts_rid = raw_ts.loc[raw_ts.cluster == lab].copy()
        mis_perc[lab] = ts_rid.isna().sum() / ts_rid.shape[0] * 100
    # Save missingness percentage dataset
    mis_count_df = raw_ts.drop(columns='cluster').isna().astype(int)
    mis_count_df['cluster'] = raw_ts.cluster
    cl_labels = np.repeat(sorted(raw_ts.cluster.unique().astype(str)), raw_ts.shape[1])
    feat = np.array(raw_ts.columns)
    values = np.array(mis_perc[0])
    for lab in range(1, len(raw_ts.cluster.unique())):
        feat = np.append(feat, np.array(raw_ts.columns))
        values = np.append(values, np.array(mis_perc[lab]))
    plot_miss_heat(raw_ts, cl_labels, feat, values, period=period, hierarchy=hierarchy)
    return out, mis_count_df
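
# Usage sketch for run_validation (argument values are assumptions): it
# expects imputed train/test dataframes indexed by subject plus the raw,
# pre-imputation test dataframe.
# out, mis_count_df = run_validation(data_tr, data_ts, raw_ts,
#                                    n_neighbors=30, period='P1',
#                                    hierarchy='subdomain')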

# ----------------------------------
plt.scatter(mnist_tr[:, 0],
            mnist_tr[:, 1],
            c=label_tr.astype(int),
            s=0.1,
            cmap='rainbow_r')
plt.title('UMAP-transformed training subsample of MNIST dataset (N=7,000)')

plt.scatter(mnist_ts[:, 0],
            mnist_ts[:, 1],
            c=label_ts.astype(int),
            s=0.1,
            cmap='rainbow_r')
plt.title('UMAP-transformed test subsample of MNIST dataset (N=7,000)')

classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=2,
                                nclust_range=list(range(2, 12)),
                                s=classifier,
                                c=clustering,
                                nrand=10,
                                n_jobs=1)

metrics, nbest = findbestclust.best_nclust(mnist_tr,
                                           iter_cv=10,
                                           strat_vect=label_tr)
out = findbestclust.evaluate(mnist_tr, mnist_ts, nbest)

plot_metrics(
    metrics,
    title="Relative clustering validation performance on MNIST dataset")

perm_lab = kuhn_munkres_algorithm(label_ts.astype(int), out.test_cllab)

plt.scatter(mnist_ts[:, 0], mnist_ts[:, 1],
            c=perm_lab, s=0.1, cmap='rainbow_r')
plt.title("Predicted labels for MNIST test set")

# ----------------------------------
plt.scatter(mnist_tr[:, 0],
            mnist_tr[:, 1],
            c=label_tr.astype(int),
            s=0.1,
            cmap='rainbow_r')
plt.title('UMAP-transformed training subsample of MNIST dataset (N=7,000)')

plt.scatter(mnist_ts[:, 0], mnist_ts[:, 1],
            c=label_ts.astype(int), s=0.1, cmap='rainbow_r')
plt.title('UMAP-transformed test subsample of MNIST dataset (N=7,000)')

# Run relative clustering validation
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=10, nclust_range=[2, 12],
                                s=classifier, c=clustering, nrand=100)

metrics, nbest, _ = findbestclust.best_nclust(mnist_tr, strat_vect=label_tr)
out = findbestclust.evaluate(mnist_tr, mnist_ts, nbest)

plot_metrics(metrics, "Relative clustering validation performance on MNIST dataset")

perm_lab = _kuhn_munkres_algorithm(label_ts.astype(int), out.test_cllab)

plt.scatter(mnist_ts[:, 0], mnist_ts[:, 1],
            c=perm_lab, s=0.1, cmap='rainbow_r')
plt.title("Predicted labels for MNIST test set")

print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: "
      f"{1 - zero_one_loss(label_ts.astype(int), perm_lab)}")
Example 14
# ----------------------------------
data = make_blobs(1000, 10, centers=5, random_state=42)
plt.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='rainbow_r')

classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                          data[1],
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data[1])

findbestclust = FindBestClustCV(nfold=2,
                                nclust_range=list(range(2, 7)),
                                s=classifier,
                                c=clustering,
                                nrand=10)
metrics, nbest = findbestclust.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)
plot_metrics(metrics, title="Reval performance")

perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(y_ts, out.test_cllab)}')
print(
    f"Validation set normalized stability (misclassification): {metrics['val'][nbest]}"
)
print(f'Test set ACC = {out.test_acc}')
Example 15
    def gridsearch_cv(self, df, n_neigh, na_perc, cl_range, cv_fold, save=None):
        """
        This function can be performed to decide which percentage of missing information
        to allow, and what's the best number of neighbors to consider both for the KNNImputer
        and the KNNClassifier. It takes as input the dataset as output from the function
        `create_dataset:dataset`.

        :param df: dataframe of merged instrument versions (longitudinal entries)
        :type df: pandas dataframe
        :param n_neigh: number of neighbors for imputation and classification
        :type n_neigh: tuple
        :param na_perc: max percentage of missing information allowed
        :type na_perc: tuple
        :param cl_range: range of minimum and maximum number of clusters to look for
        :type cl_range: tuple
        :param cv_fold: number of cross validation loop for RCV
        :type cv_fold: tuple
        :param save: whether to save the performance score table, defaults None
            Name of the file required
        :type save: str
        :return: best number of n_neigh, na_perc and cv_fold, and a summary of all performances
            The best performance is the one that has highest mean acc scores in both validation
                and test and the minimum mean amplitude of CIs
        :rtype: dict, pandas dataframe
        """
        logging.disable(logging.CRITICAL)
        subdomain_feat = [c for c in df.columns if re.search('vscore', c) and not re.search('written', c)]
        domain_feat = [c for c in df.columns if re.search('totalb', c) and not re.search('composite', c)]
        if save is not None:
            with open(os.path.join(ut.out_folder, f'{save}.csv'), 'w') as f:
                wr = csv.writer(f, delimiter=';', lineterminator='\n')
                wr.writerow(['cv_fold', 'na_perc_thrs', 'n_neigh', 'period', 'feat_lev',
                             'N', 'nclust', 'val_acc', 'val_ci', 'test_acc'])
        transformer = UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)
        scores = {}
        for k in cv_fold:
            for nap in na_perc:
                dict_tr, dict_ts = self.prepare_cs_dataset(prepare_imputation(df, nap))
                for n in n_neigh:
                    scores.setdefault('cv', list()).append(k)
                    scores.setdefault('na_perc', list()).append(nap)
                    scores.setdefault('n_neigh', list()).append(n)
                    impute = KNNImputer(n_neighbors=n)
                    dict_imp = {p: _impute(dict_tr[p], dict_ts[p], impute)
                                for p in self.include_age}
                    knn = KNeighborsClassifier(n_neighbors=n)
                    clust = AgglomerativeClustering(affinity='euclidean', linkage='ward')

                    relval = FindBestClustCV(s=knn, c=clust, nfold=k, nclust_range=cl_range,
                                             nrand=100)
                    # Run the model
                    val_misclass = []
                    test_misclass = []
                    conf_width = []
                    for p, tup in dict_imp.items():
                        X_tr = transformer.fit_transform(tup[0][subdomain_feat])
                        X_ts = transformer.transform(tup[1][subdomain_feat])
                        metric, ncl, cv_scores = relval.best_nclust(X_tr)
                        out = relval.evaluate(X_tr, X_ts, ncl)
                        ci = (1 - (metric['val'][ncl][1][0] + metric['val'][ncl][1][1]),
                              1 - (metric['val'][ncl][1][0] - metric['val'][ncl][1][1]))
                        val_misclass.append(metric['val'][ncl][0])
                        test_misclass.append(1 - out.test_acc)
                        conf_width.append(ci[1] - ci[0])
                        if save is not None:
                            with open(os.path.join(ut.out_folder, f'{save}.csv'), 'a') as f:
                                wr = csv.writer(f, delimiter=';', lineterminator='\n')
                                wr.writerow([k, nap, n, p, 'subdomain', (X_tr.shape[0], X_ts.shape[0]),
                                             ncl, 1 - metric['val'][ncl][0],
                                             ci, out.test_acc])
                        X_tr = transformer.fit_transform(tup[0][domain_feat])
                        X_ts = transformer.transform(tup[1][domain_feat])
                        metric, ncl, cv_scores = relval.best_nclust(X_tr)
                        out = relval.evaluate(X_tr, X_ts, ncl)
                        ci = (1 - (metric['val'][ncl][1][0] + metric['val'][ncl][1][1]),
                              1 - (metric['val'][ncl][1][0] - metric['val'][ncl][1][1]))
                        val_misclass.append(metric['val'][ncl][0])
                        test_misclass.append(1 - out.test_acc)
                        conf_width.append(ci[1] - ci[0])
                        if save is not None:
                            with open(os.path.join(ut.out_folder, f'{save}.csv'), 'a') as f:
                                wr = csv.writer(f, delimiter=';', lineterminator='\n')
                                wr.writerow([k, nap, n, p, 'domain', (X_tr.shape[0], X_ts.shape[0]),
                                             ncl, 1 - metric['val'][ncl][0],
                                             ci, out.test_acc])
                    scores.setdefault('avg_val_ms', list()).append(np.mean(val_misclass))
                    scores.setdefault('avg_test_ms', list()).append(np.mean(test_misclass))
                    scores.setdefault('avg_conf_width', list()).append(np.mean(conf_width))
        scores = pd.DataFrame(scores)
        minval = scores[['avg_val_ms', 'avg_test_ms', 'avg_conf_width']].sum(axis=1).min()
        best_param = scores.loc[scores[['avg_val_ms', 'avg_test_ms', 'avg_conf_width']].sum(axis=1) == minval]
        logging.disable(logging.NOTSET)
        logging.info(f"Best parameters selected: {best_param[['cv', 'na_perc', 'n_neigh']].to_dict('records')[0]} "
                     f"-- Scores {best_param[['avg_val_ms', 'avg_test_ms', 'avg_conf_width']].to_dict('records')[0]}")
        return best_param[['cv', 'na_perc', 'n_neigh']].to_dict('records')[0], scores
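
# The CI arithmetic used above converts a misclassification interval into an
# accuracy interval: assuming metric['val'][ncl][1] = (m, w) holds the mean
# misclassification m and the half-width w, accuracy lies in
# (1 - (m + w), 1 - (m - w)). Toy check with made-up numbers:
m, w = 0.12, 0.03
ci = (1 - (m + w), 1 - (m - w))
print(ci)             # (0.85, 0.91)
print(ci[1] - ci[0])  # ~0.06, i.e. 2 * w: the width fed into 'avg_conf_width'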
Example 16
    def run_rcv(self, df, demo_info, na_perc, n_neigh, cv_fold, cl_range, scatter=False, heatmap=False):
        """
        This function performs RCV method with fixed percentage of missing information,
        number of neighbors, number of cross validation. The range of clusters to consider still has to vary
        as desired. It is possible to flag :param scatter: and :param heatmap: to enable the
        visualization of UMAP scatterplots and the percentage fo missing information per feature for each cluster.
        The input dataset is a dataframe as returned by `strat:create_dataset:dataset` function (hence
        also longitudinal datasets). Finally, distance matrices for replication analysis are stored in the
        output folder. Also imputed datasets are saved to csv files.

        :param df: dataset obtained by merging different versions of the same instrument
        :type df: pandas dataframe
        :param demo_info: demographic information
        :type demo_info: dict
        :param na_perc: percentage of missing information
        :type na_perc: float
        :param n_neigh: number of neighbors for imputation and classification
        :type n_neigh: int
        :param cv_fold: number of cross validation iterations
        :type cv_fold: int
        :param cl_range: min/max number of clusters to consider
        :type cl_range: tuple
        :param scatter: flag for UMAP scatterplot (for training and test), defaults to False
        :type scatter: bool
        :param heatmap: flag for heatmap displaying percentage of missing information
            per feature for each cluster identified by the RCV method. Defaults to False.
        :type heatmap: bool
        :return: imputed datasets with clustering labels
        :rtype: dict
        """
        flatui = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2",
                  "#7f7f7f", "#bcbd22", "#17becf", "#8c564b", "#a55194"]

        subdomain_feat = [c for c in df.columns if re.search('vscore', c) and not re.search('written', c)]
        domain_feat = [c for c in df.columns if re.search('totalb', c) and not re.search('composite', c)]

        dict_tr, dict_ts = self.prepare_cs_dataset(prepare_imputation(df, na_perc))
        transformer = UMAP(n_neighbors=30, min_dist=0.0, n_components=2, random_state=42)
        impute = KNNImputer(n_neighbors=n_neigh)
        dict_imp = {p: _impute(dict_tr[p], dict_ts[p], impute)
                    for p in self.include_age}
        knn = KNeighborsClassifier(n_neighbors=n_neigh)
        clust = AgglomerativeClustering(affinity='euclidean', linkage='ward')

        relval = FindBestClustCV(s=knn, c=clust, nfold=cv_fold, nclust_range=cl_range,
                                 nrand=100)
        # Run the model
        for p, tup in dict_imp.items():
            dict_imp[p][1]['sex'] = [demo_info[gui].sex for gui in dict_imp[p][1].index]
            dict_imp[p][1]['phenotype'] = [demo_info[gui].phenotype.replace("'", "") for gui in dict_imp[p][1].index]
            dict_imp[p][1]['race'] = [demo_info[gui].race for gui in dict_imp[p][1].index]
            dict_imp[p][1]['collection_id'] = [dict_ts[p].loc[gui].collection_id for gui in dict_imp[p][1].index]
            dict_imp[p][1]['interview_age'] = [dict_ts[p].loc[gui].interview_age for gui in dict_imp[p][1].index]

            X_tr = transformer.fit_transform(tup[0][subdomain_feat])
            X_ts = transformer.transform(tup[1][subdomain_feat])
            metric, ncl, cv_scores = relval.best_nclust(X_tr)
            out = relval.evaluate(X_tr, X_ts, ncl)
            logging.info(f"Best number of clusters: {ncl}")
            logging.info(f"Training ACC: {out.train_acc}, Test ACC: {out.test_acc}")
            dict_imp[p][0]['cluster_subdomain'], dict_imp[p][1][
                'cluster_subdomain'] = out.train_cllab + 1, out.test_cllab + 1
            _, subj_mis = _check_na_perc(dict_ts[p][subdomain_feat])
            dict_imp[p][1]['missing_subdomain'] = list(subj_mis.values())
            plot_metrics(metric,
                         f'UMAP preprocessed dataset, RCV misclassification performance at {p}, level subdomain')
            if scatter:
                _scatter_plot(X_tr,
                              [(gui, cl) for gui, cl in zip(dict_imp[p][0].index, out.train_cllab + 1)],
                              flatui,
                              10, 15,
                              {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in
                               sorted(np.unique(out.train_cllab + 1))},
                              title=f'Subgroups of UMAP preprocessed Vineland TRAINING '
                                    f'dataset (period: {p} -- level: subdomain)')

                _scatter_plot(X_ts,
                              [(gui, cl) for gui, cl in zip(dict_imp[p][1].index, out.test_cllab + 1)],
                              flatui,
                              10, 15, {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in
                                       sorted(np.unique(out.test_cllab + 1))},
                              title=f'Subgroups of UMAP preprocessed Vineland TEST '
                                    f'dataset (period: {p} -- level: subdomain)')
            if heatmap:
                dict_ts[p]['cluster'] = out.test_cllab + 1
                feat = []
                values = []
                cl_labels = np.repeat(sorted(dict_ts[p].cluster.unique().astype(str)), len(subdomain_feat))
                for lab in np.unique(sorted(out.test_cllab + 1)):
                    na_feat, _ = _check_na_perc(dict_ts[p].loc[dict_ts[p].cluster == lab][subdomain_feat])
                    feat.extend(list(na_feat.keys()))
                    values.extend(list(na_feat.values()))
                plot_miss_heat(dict_ts[p], cl_labels, feat, values, period=p, hierarchy='subdomain')

            X_tr = transformer.fit_transform(tup[0][domain_feat])
            X_ts = transformer.transform(tup[1][domain_feat])
            metric, ncl, cv_scores = relval.best_nclust(X_tr)
            plot_metrics(metric,
                         f'UMAP preprocessed dataset, RCV misclassification performance at {p}, level domain')
            out = relval.evaluate(X_tr, X_ts, ncl)
            logging.info(f"Best number of clusters: {ncl}")
            logging.info(f"Training ACC: {out.train_acc}, Test ACC: {out.test_acc}")
            dict_imp[p][0]['cluster_domain'], dict_imp[p][1]['cluster_domain'] = out.train_cllab + 1, out.test_cllab + 1
            _, subj_mis = _check_na_perc(dict_ts[p][domain_feat])
            dict_imp[p][1]['missing_domain'] = list(subj_mis.values())
            if scatter:
                _scatter_plot(X_tr,
                              [(gui, cl) for gui, cl in zip(dict_imp[p][0].index, out.train_cllab + 1)],
                              flatui,
                              10, 15,
                              {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in
                               sorted(np.unique(out.train_cllab + 1))},
                              title=f'Subgroups of UMAP preprocessed Vineland TRAINING '
                                    f'dataset (period: {p} -- level: domain)')

                _scatter_plot(X_ts,
                              [(gui, cl) for gui, cl in zip(dict_imp[p][1].index, out.test_cllab + 1)],
                              flatui,
                              10, 15, {str(ncl): '-'.join(['cluster', str(ncl)]) for ncl in
                                       sorted(np.unique(out.test_cllab + 1))},
                              title=f'Subgroups of UMAP preprocessed Vineland TEST '
                                    f'dataset (period: {p} -- level: domain)')
            if heatmap:
                dict_ts[p]['cluster'] = out.test_cllab + 1
                feat = []
                values = []
                cl_labels = np.repeat(sorted(dict_ts[p].cluster.unique().astype(str)), len(domain_feat))
                for lab in np.unique(sorted(out.test_cllab + 1)):
                    na_feat, _ = _check_na_perc(dict_ts[p].loc[dict_ts[p].cluster == lab][domain_feat])
                    feat.extend(list(na_feat.keys()))
                    values.extend(list(na_feat.values()))
                plot_miss_heat(dict_ts[p], cl_labels, feat, values, period=p, hierarchy='domain')

        logging.info("Saving train/test datasets with new cluster")
        new_dict_imp = relabel(dict_imp, plot_scatter=False)
        for p in new_dict_imp.keys():
            new_dict_imp[p][0].to_csv(os.path.join(ut.out_folder, f'imputed_data_{p}_tr.csv'),
                                      index_label='subjectkey')
            new_dict_imp[p][1].to_csv(os.path.join(ut.out_folder, f'imputed_data_{p}.csv'),
                                      index_label='subjectkey')

        logging.info("Building distance matrix...")
        _build_distmat(imp_dict=dict_imp)

        return dict_imp
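
# ----------------------------------
# Separate snippets: reval on blobs (example1) and MNIST (example2), compared
# against internal validation indices (silhouette, Davies-Bouldin).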
def example1():
    # Generate dataset
    data = make_blobs(1000, 2, centers=5,
                      center_box=(-20, 20),
                      random_state=42)

    # Visualize dataset
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(data[0][data[1] == i][:, 0],
                    data[0][data[1] == i][:, 1],
                    label=i, cmap='tab20')
    plt.title("Blobs dataset")
    # plt.savefig('./blobs.png', format='png')
    plt.show()

    # Create training and test sets
    X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                              data[1],
                                              test_size=0.30,
                                              random_state=42,
                                              stratify=data[1])

    # Initialize clustering and classifier
    classifier = KNeighborsClassifier(n_neighbors=15)
    clustering = KMeans()

    # Run relative validation (repeated CV and testing)
    findbestclust = FindBestClustCV(nfold=2,
                                    nclust_range=list(range(2, 7, 1)),
                                    s=classifier,
                                    c=clustering,
                                    nrand=10,
                                    n_jobs=N_JOBS)
    metrics, nbest = findbestclust.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)
    out = findbestclust.evaluate(X_tr, X_ts, nclust=nbest)

    # Plot CV metrics
    plot_metrics(metrics, prob_lines=False)
    logging.info(f"Validation stability: {metrics['val'][nbest]}")
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)

    logging.info(f"Best number of clusters: {nbest}")
    logging.info(f'AMI (true labels vs predicted labels) for test set = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    # Compute metrics
    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab, perm=False)
    for k, val in class_scores.items():
        if k in ['F1', 'MCC']:
            logging.info(f"{k}, {val}")
    logging.info("\n\n")

    # Internal measures
    # SILHOUETTE
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, clustering, silhouette_score,
                                                          select='max',
                                                          nclust_range=list(range(2, 7, 1)))
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, clustering, silhouette_score,
                                                          select='max',
                                                          nclust_range=list(range(2, 7, 1)))

    sil_eval = evaluate_best(X_ts, clustering, silhouette_score, sil_best_tr)

    logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
                 f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f"Test set evaluation {sil_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, clustering, davies_bouldin_score,
                                                       select='min', nclust_range=list(range(2, 7, 1)))
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, clustering, davies_bouldin_score,
                                                       select='min', nclust_range=list(range(2, 7, 1)))

    db_eval = evaluate_best(X_ts, clustering, davies_bouldin_score, db_best_tr)

    logging.info(f"Best number of clusters (and scores) for tr/ts independent runs: "
                 f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f"Test set evaluation {db_eval}")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Plot true vs predicted labels for test sets
    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[y_ts == i][:, 0],
                    X_ts[y_ts == i][:, 1],
                    label=str(i),
                    cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set true labels")
    # plt.savefig('./blobs_true.png', format='png')
    plt.show()

    plt.figure(figsize=(6, 4))
    for i in range(5):
        plt.scatter(X_ts[perm_lab == i][:, 0],
                    X_ts[perm_lab == i][:, 1],
                    label=str(i),
                    cmap='tab20')
    plt.legend(loc=3)
    plt.title("Test set clustering labels")
    # plt.savefig('./blobs_clustering.png', format='png')
    plt.show()
def example2():
    mnist = fetch_openml('mnist_784', version=1)
    mnist.target = mnist.target.astype(int)

    X_tr, y_tr = mnist['data'][:60000], mnist.target[:60000]
    X_ts, y_ts = mnist['data'][60000::], mnist.target[60000::]
    transform = UMAP(n_components=2,
                     random_state=42,
                     n_neighbors=30,
                     min_dist=0.0)
    X_tr = transform.fit_transform(X_tr)
    X_ts = transform.transform(X_ts)

    s = KNeighborsClassifier(n_neighbors=30)
    c = hdbscan.HDBSCAN(min_samples=10,
                        min_cluster_size=200)

    reval = FindBestClustCV(s=s,
                            c=c,
                            nfold=2,
                            nrand=10,
                            n_jobs=N_JOBS)

    metrics, nclustbest, tr_lab = reval.best_nclust(X_tr, iter_cv=10, strat_vect=y_tr)

    plot_metrics(metrics)

    out = reval.evaluate(X_tr, X_ts, nclust=nclustbest, tr_lab=tr_lab)
    perm_lab = kuhn_munkres_algorithm(y_ts, out.test_cllab)
    logging.info(f"Validation stability: {metrics['val'][nclustbest]}")

    logging.info(f"Best number of clusters during CV: {nclustbest}")
    logging.info(f"Best number of clusters on test set: "
                 f"{len([lab for lab in np.unique(out.test_cllab) if lab >= 0])}")
    logging.info(f'AMI (true labels vs predicted labels) = '
                 f'{adjusted_mutual_info_score(y_ts, out.test_cllab)}')
    logging.info('\n\n')

    logging.info("Metrics from true label comparisons on test set:")
    class_scores = compute_metrics(y_ts, perm_lab)
    for k, val in class_scores.items():
        logging.info(f'{k}, {val}')
    logging.info('\n\n')

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=y_tr, cmap='rainbow_r',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=kuhn_munkres_algorithm(y_tr, tr_lab),
                         cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set predicted labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=y_ts, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set true labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=perm_lab, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set clustering labels (digits dataset)")
    plt.show()

    # Internal measures
    # SILHOUETTE
    logging.info("Silhouette score based selection")
    sil_score_tr, sil_best_tr, sil_label_tr = select_best(X_tr, c, silhouette_score, select='max')
    sil_score_ts, sil_best_ts, sil_label_ts = select_best(X_ts, c, silhouette_score, select='max')
    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{sil_best_tr}({sil_score_tr})/{sil_best_ts}({sil_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, sil_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, sil_label_ts))}')
    logging.info('\n\n')

    # DAVIES-BOULDIN
    logging.info("Davies-Bouldin score based selection")
    db_score_tr, db_best_tr, db_label_tr = select_best(X_tr, c, davies_bouldin_score,
                                                       select='min')
    db_score_ts, db_best_ts, db_label_ts = select_best(X_ts, c, davies_bouldin_score,
                                                       select='min')

    logging.info(
        f"Best number of clusters (and scores) for tr/ts independent runs: "
        f"{db_best_tr}({db_score_tr})/{db_best_ts}({db_score_ts})")
    logging.info(f'AMI (true labels vs clustering labels) training = '
                 f'{adjusted_mutual_info_score(y_tr, kuhn_munkres_algorithm(y_tr, db_label_tr))}')
    logging.info(f'AMI (true labels vs clustering labels) test = '
                 f'{adjusted_mutual_info_score(y_ts, kuhn_munkres_algorithm(y_ts, db_label_ts))}')
    logging.info('\n\n')

    # Visualization
    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=sil_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         c=sil_label_ts, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set silhouette labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_tr[:, 0],
                         X_tr[:, 1],
                         c=db_label_tr, cmap='tab20',
                         s=0.1)
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Train set Davies-Bouldin labels (digits dataset)")
    plt.show()

    fig, ax = plt.subplots(figsize=(10, 8))
    scatter = ax.scatter(X_ts[:, 0],
                         X_ts[:, 1],
                         s=0.1,
                         c=db_label_ts, cmap='tab20')
    legend = ax.legend(*scatter.legend_elements())
    ax.add_artist(legend)
    plt.title("Test set Davies-Bouldin labels (digits dataset)")
    plt.show()
Example 19
plt.scatter(data1[0][:, 0], data1[0][:, 1],
            c=data1[1], cmap='rainbow_r')
plt.title('True labels for 10-feature dataset')

X_tr, X_ts, y_tr, y_ts = train_test_split(data1[0],
                                          data1[1],
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data1[1])
# Apply relative clustering validation with KNN and Hierarchical clustering
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=10,
                                nclust_range=list(range(2, 7)),
                                s=classifier,
                                c=clustering,
                                nrand=100)
metrics, nbest = findbestclust.best_nclust(data=X_tr, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)

plot_metrics(metrics, title="Reval performance for synthetic dataset with 10 features")

data2 = make_blobs(1000, 20, centers=5, cluster_std=5, random_state=42)

plt.scatter(data2[0][:, 0], data2[0][:, 1],
            c=data2[1], cmap='rainbow_r')
plt.title('True labels for 20-feature dataset')

X_tr, X_ts, y_tr, y_ts = train_test_split(data2[0],
                                          data2[1],
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data2[1])
Example 20

import time
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.cluster import KMeans
from reval.best_nclust_cv import FindBestClustCV


def blobs_performance():
    """
    Function performing multiple iterations of reval on simulated 5-blob datasets
    with varying number of samples and features and 10 repetitions of 10-fold CVs.
    The function plots the performance (in seconds) of the algorithm for each
    parameter configuration.
    """
    feat = [10, 100, 500, 1000]
    samples = [100, 500, 1000, 1500, 2000]

    perftime = []
    for s in samples:
        perf = []
        for f in feat:
            start = time.time()
            data = make_blobs(s, f, 5, center_box=(-20, 20), random_state=42)

            X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                                      data[1],
                                                      test_size=0.30,
                                                      random_state=42,
                                                      stratify=data[1])

            classifier = KNeighborsClassifier(n_neighbors=5)
            clustering = KMeans()

            findbestclust = FindBestClustCV(nfold=10,
                                            nclust_range=[2, 7],
                                            s=classifier,
                                            c=clustering,
                                            nrand=100)
            metrics, nbest, _ = findbestclust.best_nclust(X_tr,
                                                          iter_cv=10,
                                                          strat_vect=y_tr)
            tmp_time = time.time() - start
            perf.append(tmp_time)
            print(
                f'Feat {f}, samples {s}: N cluster {nbest}, time: {tmp_time}')
        perftime.append(perf)

    perftime = np.array(perftime)
    fig, ax = plt.subplots()
    ax.plot(samples,
            perftime[:, 0],
            label='10 features',
            linestyle='--',
            color='black')
    ax.plot(samples, perftime[:, 1], label='100 features', color='black')
    ax.plot(samples,
            perftime[:, 2],
            label='500 features',
            linestyle='-.',
            color='black')
    ax.plot(samples,
            perftime[:, 3],
            label='1000 features',
            linestyle=':',
            color='black')
    ax.set_xlabel('Number of samples')
    ax.set_ylabel('Execution time (s)')
    ax.set_title("")
    ax.legend()
    plt.savefig('./performance_blobs.png', dpi=300)
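
# ----------------------------------
# Separate snippet. AgglomerativeClustering, the metric helpers, and 'data'
# (a blobs dataset as in the examples above) are assumed here; the reval
# import path for the private helper is an assumption.
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import zero_one_loss, adjusted_mutual_info_score
from reval.best_nclust_cv import _kuhn_munkres_algorithm

data = make_blobs(1000, 2, centers=5, center_box=(-20, 20), random_state=42)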
plt.scatter(data[0][:, 0], data[0][:, 1], c=data[1], cmap='rainbow_r')

# Split them into training and test set (30% of data)
X_tr, X_ts, y_tr, y_ts = train_test_split(data[0],
                                          data[1],
                                          test_size=0.30,
                                          random_state=42,
                                          stratify=data[1])

# Apply relative clustering validation with KNN and Hierarchical clustering
classifier = KNeighborsClassifier()
clustering = AgglomerativeClustering()

findbestclust = FindBestClustCV(nfold=10,
                                nclust_range=[2, 7],
                                s=classifier,
                                c=clustering,
                                nrand=100)
metrics, nbest, _ = findbestclust.best_nclust(X_tr, strat_vect=y_tr)
out = findbestclust.evaluate(X_tr, X_ts, nbest)

perm_lab = _kuhn_munkres_algorithm(y_ts, out.test_cllab)

print(f"Best number of clusters: {nbest}")
print(f"Test set external ACC: " f"{1 - zero_one_loss(y_ts, perm_lab)}")
print(f'AMI = {adjusted_mutual_info_score(y_ts, out.test_cllab)}')
print(
    f"Validation set normalized stability (misclassification): {metrics['val'][nbest]}"
)
print(f'Test set ACC = {out.test_acc}')
Example 22

import itertools
import time
import pickle as pkl
from hdbscan import HDBSCAN
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans, AgglomerativeClustering, SpectralClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from reval.best_nclust_cv import FindBestClustCV


def time_cmplx(n_jobs=1):
    data = make_blobs(100, 10, centers=2)
    data_tr, data_ts, y_tr, y_ts = train_test_split(data[0],
                                                    data[1],
                                                    test_size=0.5,
                                                    stratify=data[1],
                                                    random_state=42)
    s = [
        KNeighborsClassifier(),
        SVC(),
        LogisticRegression(),
        RandomForestClassifier()
    ]
    c = [HDBSCAN(), AgglomerativeClustering(), KMeans(), SpectralClustering()]
    param = itertools.product(s, c)

    labels = ['KNN'] * 4 + ['SVM'] * 4 + ["LR"] * 4 + ['RF'] * 4
    time_cv = {'LR': [], 'KNN': [], 'RF': [], 'SVM': []}
    time_ev = {'LR': [], 'KNN': [], 'RF': [], 'SVM': []}
    for idx, mod in enumerate(param):
        classifier, clustering = mod[0], mod[1]
        findbest = FindBestClustCV(s=classifier,
                                   c=clustering,
                                   nrand=10,
                                   nfold=2,
                                   n_jobs=n_jobs,
                                   nclust_range=list(range(2, 7, 1)))
        if isinstance(clustering, HDBSCAN):
            start = time.time()
            _, _, tr_lab = findbest.best_nclust(data_tr,
                                                iter_cv=10,
                                                strat_vect=y_tr)
            time_cv[labels[idx]].append(time.time() - start)

            start = time.time()
            findbest.evaluate(data_tr, data_ts, nclust=2, tr_lab=tr_lab)
            time_ev[labels[idx]].append(time.time() - start)
        else:
            start = time.time()
            _, _ = findbest.best_nclust(data_tr, iter_cv=10, strat_vect=y_tr)
            time_cv[labels[idx]].append(time.time() - start)

            start = time.time()
            findbest.evaluate(data_tr, data_ts, nclust=2)
            time_ev[labels[idx]].append(time.time() - start)

    pkl.dump(time_cv, open(f'time_cv_njobs{n_jobs}.pkl', 'wb'))
    pkl.dump(time_ev, open(f'time_ev_njobs{n_jobs}.pkl', 'wb'))

    clustering = KMeans()
    classifier = KNeighborsClassifier()
    time_knnkmeans = {10: [], 100: [], 1000: []}
    for nsamples, nfeatures in itertools.product(
        [100, 500, 1000, 1500, 2000, 2500, 3000], [10, 100, 1000]):
        data = make_blobs(nsamples, nfeatures, centers=2)
        data_tr, data_ts, y_tr, y_ts = train_test_split(data[0],
                                                        data[1],
                                                        test_size=0.5,
                                                        stratify=data[1],
                                                        random_state=42)
        findbest = FindBestClustCV(s=classifier,
                                   c=clustering,
                                   nrand=10,
                                   nfold=2,
                                   n_jobs=n_jobs,
                                   nclust_range=list(range(2, 7, 1)))
        start = time.time()
        _, _ = findbest.best_nclust(data_tr, iter_cv=10, strat_vect=y_tr)
        findbest.evaluate(data_tr, data_ts, nclust=2)
        time_knnkmeans[nfeatures].append(time.time() - start)

    pkl.dump(time_knnkmeans, open(f'time_knnkmeans{n_jobs}.pkl', 'wb'))
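
# Usage sketch (an assumption, not part of the original script): run the
# benchmark serially and with parallel workers to compare wall-clock times.
if __name__ == '__main__':
    time_cmplx(n_jobs=1)
    time_cmplx(n_jobs=4)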