Example #1
def score_with_davies(k):
    # Cluster the global `test` matrix into k groups and return the Davies-Bouldin index
    kmeans = KMeans(n_clusters=k)
    labels = kmeans.fit_predict(test)
    return davies_bouldin_score(test, labels)
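A minimal usage sketch of the helper above, assuming `test` is a NumPy feature matrix available in scope; the imports and the make_blobs toy data are illustrative assumptions, not part of the original snippet.

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

# Hypothetical stand-in for the `test` matrix used by score_with_davies
test, _ = make_blobs(n_samples=300, centers=4, random_state=0)

# Sweep k and keep the value with the lowest Davies-Bouldin score (lower is better)
scores = {k: davies_bouldin_score(test, KMeans(n_clusters=k, random_state=0).fit_predict(test))
          for k in range(2, 8)}
best_k = min(scores, key=scores.get)
print(best_k, round(scores[best_k], 3))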
Example #2
def apply_algo_k_3scores(X, params=None, quiet=True):
    '''Adaptation of apply_algo_k_auto for the benchmark context.
    
    [IN]
        - X (np.array[N,p]): design matrix given as input to the algorithm. Each row is an observation, each column is a predictor.
        - params (dict): dict with all settings. Relevant keys: 'max_k', 'classif_score'
        - quiet (bool): if True, all prints are skipped
        
    [OUT]
        - labels (np.array[N]): vector of cluster number attribution
            BEWARE: the cluster identification numbers are arbitrary. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters found in the data
        - s_score, db_score, ch_score (float): silhouette, Davies-Bouldin and Calinski-Harabasz scores for the returned classification.
    '''

    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several number of clusters
    all_labels = []
    s_scores = []
    db_scores = []
    ch_scores = []
    for n_clusters in range(2, params['max_k'] + 1):

        labels = apply_algo(X, n_clusters, params=params)
        all_labels.append(labels)

        if len(np.unique(labels)) > 1:
            with np.errstate(
                    divide='ignore', invalid='ignore'
            ):  # suppress spurious "RuntimeWarning: divide by zero encountered in true_divide" warnings
                db_scores.append(davies_bouldin_score(X, labels))
            s_scores.append(silhouette_score(X, labels))
            ch_scores.append(calinski_harabaz_score(X, labels))
        else:
            db_scores.append(np.nan)
            s_scores.append(np.nan)
            ch_scores.append(np.nan)

    # Choose the best number of clusters
    valid = True
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.nanargmax(s_scores)
        if s_scores[k_best] < 0.6:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      s_scores[k_best], "). BLH is thus NaN")
            valid = False
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.nanargmin(db_scores)
        if db_scores[k_best] > 0.4:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      db_scores[k_best], "). BLH is thus NaN")
            valid = False
    else:
        k_best = np.nanargmax(ch_scores)
        if ch_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    ch_scores[k_best], "). BLH is thus NaN")
            valid = False

    if all(np.isnan(db_scores)):
        valid = False

    # Return the results
    if valid:
        result = (all_labels[k_best], k_best + 2,
                  s_scores[k_best], db_scores[k_best], ch_scores[k_best])
    else:
        result = (None, np.nan,
                  s_scores[k_best], db_scores[k_best], ch_scores[k_best])

    return result
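The selection rule above computes the three scores for k = 2..max_k, takes nanargmax (silhouette, Calinski-Harabasz) or nanargmin (Davies-Bouldin), and then rejects the classification if the best score misses a fixed threshold (0.6, 200 and 0.4 respectively). A standalone sketch of the same rule, with a plain KMeans standing in for apply_algo and make_blobs toy data as assumptions for illustration:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

X, _ = make_blobs(n_samples=500, centers=3, random_state=0)  # toy data

db_scores = []
for n_clusters in range(2, 7):  # k = 2..max_k with max_k = 6
    labels = KMeans(n_clusters=n_clusters, random_state=0).fit_predict(X)
    db_scores.append(davies_bouldin_score(X, labels))

k_best = np.nanargmin(db_scores)   # Davies-Bouldin: lower is better
valid = db_scores[k_best] <= 0.4   # same threshold as above
print("optimal k:", k_best + 2, "| DB score:", db_scores[k_best], "| valid:", valid)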
Example #3
    time_start = time.process_time()  # measure CPU time

    dataset = arff.loadarff(open(path + list_dataset[i], 'r'))
    data = [[x[0], x[1]] for x in dataset[0]]

    dbscan = cluster.DBSCAN(eps=esp[i], min_samples=min_samples[i])
    y_pred = dbscan.fit_predict(data)

    labels = dbscan.labels_

    plt.scatter((dataset[0])['x'], (dataset[0])['y'], c=y_pred, cmap="tab20")
    plt.show()
    if len(np.unique(labels)) > 1:
        print("Indice de Davies-Bouldin : " +
              str(metrics.davies_bouldin_score(data, labels)))
        print("Coefficient de silhouette : " +
              str(metrics.silhouette_score(data, labels, metric='euclidean')))

    time_stop = time.process_time()  # measure CPU time
    print("Computation time: " + str(time_stop - time_start))
print(
    "\n#=================== esp and min_samples values NOT provided ===================#"
)

print("\n#--------------------- Variation de epsilon -----------------------#")
min_samples = [5, 34, 2, 19, 11, 2]
for i in range(len(list_dataset)):
    print("\n#----------------- " + list_dataset[i] + " --------------------#")
    time_list = []
    davies_bouldin_list = []
    print("\n************"+datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')+"************\n")
    text_file.close()
    
    ### Number of clusters
#    ranged = [2, 3, 4, 12]

    for n_clusters in range(2,31):

        ## linkage{"ward","complete","average","single"}, optional (default="ward")
        model = AgglomerativeClustering(n_clusters=n_clusters, affinity='euclidean', linkage=mode)
        predict = pd.DataFrame(model.fit_predict(df))
        predict.columns=['predict']

        # concatenate labels to df as a new column
        r = pd.concat([df,predict],axis=1)

        r.to_csv("csv/"+datestamp+"_Linkage_"+mode+"_3_"+str(n_clusters)+".csv")

        #print(r.sample(10))
        # clusters
        silhouette_avg = silhouette_score(df.values, predict.values.ravel())
        DBI_avg = davies_bouldin_score(df.values, predict.values.ravel())
        text_file = open("data/"+datestamp+"_3_"+mode+"Linkage_score.txt", "a+")
        text_file.write("\n\nn_clusters ="+str(n_clusters)+"The average silhouette_score is :"+str(silhouette_avg))
        text_file.write("\nn_clusters ="+str(n_clusters)+"The average DBI_score is :"+str(DBI_avg))
        text_file.close()
        
        print("For n_clusters =", n_clusters, "The average silhouette_score is :", silhouette_avg)
        print("For n_clusters =", n_clusters, "The average DBI_score is :", DBI_avg)

Example #5
def daviesBouldinScore(X, y_pred):
    # Map the (unbounded, lower-is-better) Davies-Bouldin index into [0, 1) with tanh
    return np.tanh(metrics.davies_bouldin_score(X, y_pred))
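A brief, self-contained usage sketch of the wrapper above (the imports and toy data are assumptions not shown in the original fragment). Since tanh is monotonic, ranking clusterings by this squashed score is equivalent to ranking them by the raw Davies-Bouldin index, just mapped into [0, 1).

import numpy as np
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

def daviesBouldinScore(X, y_pred):
    return np.tanh(metrics.davies_bouldin_score(X, y_pred))

X, _ = make_blobs(n_samples=200, centers=3, random_state=0)
y_pred = KMeans(n_clusters=3, random_state=0).fit_predict(X)
print(daviesBouldinScore(X, y_pred))  # in [0, 1); lower still means better separated clusters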
Example #6
def methods(dataset, clustering, params, dictionary):
    cluster_plot(dataset, clustering, params)
    print(colored(f"Clustering quality for {params['name']}", 'red'))
    clustering_quality(silhouette_score(dataset, clustering.labels_), 'silhouette', dictionary)
    clustering_quality(davies_bouldin_score(dataset, clustering.labels_), 'davies_bouldin', dictionary)
    clustering_quality(calinski_harabasz_score(dataset, clustering.labels_), 'calinski_harabasz', dictionary)
def lapscore_main():

    # iterate the whole process for 10 times
    for index, subsample in enumerate(X_testset):

        # construct affinity matrix
        kwargs_W = {"metric": "euclidean", "neighbor_mode": "knn",
                    "weight_mode": "heat_kernel", "k": 5, 't': 1}
        W = construct_W.construct_W(subsample, **kwargs_W)

        # obtain the scores of the features
        idx = lap_score.lap_score(subsample, mode="rank", W=W)
        # obtain the array of variables through ranking
        X_col_list = X_test_full.columns.values.tolist()
        prepare_list['lap_ranked_Xtestset' +
                     str(index)] = get_variable_rank(idx, X_col_list)
        ranked_var_filename = 'lap_ranked_Xtestset' + str(index) + '.txt'
        f_rank = open(ranked_var_filename, 'w')
        f_rank.write(str(prepare_list['lap_ranked_Xtestset' + str(index)]))
        f_rank.close()

        # perform evaluation on clustering task
        range_num_fea = range(10, 210, 10)  # number of selected features
        range_n_clusters = [3, 4, 5, 6, 7, 8, 9, 10]  # number of clusters

        # dynamic generating dictionaries to store results
        prepare_list['lapscore_criteria' +
                     str(index)] = {'silhouette_score': [], 'ch_score': [], 'db_score': []}

        # deciding optimal value for num_cluster and the optimal number of selected features

        for n_cluster in range_n_clusters:

            for num_features in range_num_fea:
                # obtain the dataset on the selected features
                selected_features = subsample[:, idx[0:num_features]]

                # initialize the clusterer with n_clusters value and a random generator
                # seed of 10 for reproducibility
                clusterer = KMeans(
                    n_clusters=n_cluster, random_state=10)
                cluster_labels = clusterer.fit_predict(selected_features)

                # the silhouette_score gives the average value for all the samples
                # this gives a perspective into the density and separation of the formed clusters
                silhouette_avg = metrics.silhouette_score(
                    selected_features, cluster_labels, metric='euclidean')
                # write the content into the dict
                prepare_list['lapscore_criteria' +
                             str(index)]['silhouette_score'].append(silhouette_avg)
                # in normal usage, the Calinski-Harabasz index is applied to the results of a cluster analysis
                ch_idx = metrics.calinski_harabasz_score(
                    selected_features, cluster_labels)
                # write the content into the dict
                prepare_list['lapscore_criteria' + str(index)
                             ]['ch_score'].append(ch_idx)
                # in normal usage, the Davies-Bouldin index is applied to the results of a cluster analysis
                db_idx = davies_bouldin_score(
                    selected_features, cluster_labels)
                # write the content into the dict
                prepare_list['lapscore_criteria' +
                             str(index)]['db_score'].append(db_idx)

                print("subset No.", index, ","
                      "For n_clusters =", n_cluster, ","
                      "For num_features =", num_features, ","
                      "the average silhouette_score is: ", silhouette_avg, ","
                      "the Calinski-Harabasz index is: ", ch_idx, ","
                      "the Davies-Bouldin index is: ", db_idx)

    lapscore_silhouette_score = generate_criteria_tb(
        dict_name='lapscore_criteria', col_name='silhouette_score')
    lapscore_Calinski_Harabasz_index = generate_criteria_tb(
        dict_name='lapscore_criteria', col_name='ch_score')
    lapscore_Davies_Bouldin_index = generate_criteria_tb(
        dict_name='lapscore_criteria', col_name='db_score')

    lapscore_silhouette_score.to_csv(
        'lapscore_silhouette_score.csv', index=False)
    lapscore_Calinski_Harabasz_index.to_csv(
        'lapscore_Calinski_Harabasz_index.csv', index=False)
    lapscore_Davies_Bouldin_index.to_csv(
        'lapscore_Davies_Bouldin_index.csv', index=False)
### Number of clusters
for n_clusters in range(2, 31):
    model = KMeans(n_clusters=n_clusters).fit(data_points)
    predict = pd.DataFrame(model.fit_predict(df))
    predict.columns = ['predict']

    # concatenate labels to df as a new column
    r = pd.concat([df, predict], axis=1)
    #print(r.sample(1))
    r.to_csv("csv/" + datestamp + "_KMeans_3_" + str(n_clusters) + ".csv")

    # The silhouette_score gives the average value for all the samples.
    # This gives a perspective into the density and separation of the formed clusters.
    silhouette_avg = silhouette_score(data_points, predict.values.ravel())
    DBI_avg = davies_bouldin_score(data_points, predict.values.ravel())

    text_file = open("data/" + datestamp + "_3_KMeans_score.txt", "a+")
    text_file.write("\n\nn_clusters =" + str(n_clusters) +
                    "The average silhouette_score is :" + str(silhouette_avg))
    text_file.write("\nn_clusters =" + str(n_clusters) +
                    "The average DBI_score is :" + str(DBI_avg))
    text_file.close()

    print("For n_clusters =", n_clusters, "The average silhouette_score is :",
          silhouette_avg)
    print("For n_clusters =", n_clusters, "The average DBI_score is :",
          DBI_avg)
Example #9
FMI = metrics.fowlkes_mallows_score(Y, labels_pred)  # external metrics

print('SC, DB, FMI = ', SC, DB, FMI)

################## Graph metrics
sse = []
fmi = []
dbi = []
k_list = range(2, 15)  # start at 2: silhouette and Davies-Bouldin scores are undefined for a single cluster
for k in k_list:
    km = KMeans(n_clusters=k, random_state=1)
    km.fit(X_norm)
    labels_pred = km.labels_
    SC = metrics.silhouette_score(X_norm, labels_pred, metric='euclidean')
    DB = metrics.davies_bouldin_score(X_norm, labels_pred)  # inner metrics
    #print(k, SC)
    sse.append([k, SC])
    dbi.append([k, DB])
    fmi.append([k, metrics.fowlkes_mallows_score(Y, labels_pred)])

#oca_results_scale = pd.DataFrame({'Cluster': range(2,15), 'SSE': sse})
oca_results_scale = pd.DataFrame({'Cluster': range(2, 15), 'FMI': fmi})
plt.figure(figsize=(12, 6))
plt.plot(pd.DataFrame(sse)[0], pd.DataFrame(sse)[1], marker='o')
#plt.plot(pd.DataFrame(dbi)[0], pd.DataFrame(dbi)[1], marker='o')
#plt.plot(pd.DataFrame(fmi)[0], pd.DataFrame(fmi)[1], marker='o')
plt.title('Silhouette score vs. number of clusters (Scaled Data)')
plt.xlabel('Number of clusters')
plt.ylabel('silhouette_score')
plt.show()
def calcDBs(preds, k):
  db = davies_bouldin_score(df, preds)
  dbs[k].append(db)
print("The Davies-Bouldin Index is used to measure better defined clusters.")
print(
    "\nThe Davies-Bouldin score is lower when clusters more separated (e.g. better partitioned).\n"
)
print("Zero is the lowest possible Davies-Bouldin score.\n")

import warnings

warnings.filterwarnings("ignore")

range_n_clusters = [2, 3, 4, 5, 6]
for n_clusters in range_n_clusters:
    clusterer = KMeans(n_clusters=n_clusters, random_state=10)
    cluster_labels = clusterer.fit_predict(peopleMatrixPcaTransform)
    score = metrics.davies_bouldin_score(peopleMatrixPcaTransform,
                                         cluster_labels)
    print("The Davies-Bouldin score for :", n_clusters, " clusters is: ",
          score)

#Silhouette Analysis with Kmeans Clustering on the PCA transformed People Matrix
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
range_n_clusters = [2, 3, 4, 5, 6]

for n_clusters in range_n_clusters:
    # Create a subplot with 1 row and 2 columns
    fig, (ax1, ax2) = plt.subplots(1, 2)
    fig.set_size_inches(18, 7)

    # The 1st subplot is the silhouette plot
    # The silhouette coefficient can range from -1 to 1, but in this example all
    # lie within [-0.1, 1]
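As the prints above state, the Davies-Bouldin score is bounded below by zero and lower values indicate better separated clusters. A self-contained sketch of the same k sweep on synthetic data (make_blobs stands in for peopleMatrixPcaTransform, which is not shown here):

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.metrics import davies_bouldin_score

X, _ = make_blobs(n_samples=400, centers=4, random_state=10)

for n_clusters in [2, 3, 4, 5, 6]:
    cluster_labels = KMeans(n_clusters=n_clusters, random_state=10).fit_predict(X)
    score = davies_bouldin_score(X, cluster_labels)
    print("The Davies-Bouldin score for", n_clusters, "clusters is:", score)
# The lowest score is expected at the true number of blobs (here 4).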
Example #12
K = range(2, 12, 1)
distortions = []
silhouette = []
db_score = []
calinski = []
for k in K:
    print("For {} clusters".format(k))
    km = KMeans(n_clusters=k, random_state=0).fit(df)
    labels = km.labels_
    # labels = km.predict(df)
    # print(labels[0])

    print("Silhouette Score: {}".format(
        silhouette_score(df, labels, metric='euclidean')))
    print("DB Score: {}".format(davies_bouldin_score(df, labels)))
    print("Calinski-Harabasz Index: {}".format(
        metrics.calinski_harabasz_score(df, labels)))
    distortions.append(km.inertia_)
    silhouette.append(silhouette_score(df, labels, metric='euclidean'))
    db_score.append(davies_bouldin_score(df, labels))
    calinski.append(metrics.calinski_harabasz_score(df, labels))

# plt.plot(K,silhouette,K,db_score)
# plt.plot(calinski)
# plt.xlabel('k')
# plt.ylabel('Distortion')
# plt.title('The Elbow Method showing the optimal k')
# plt.legend(['Silhouette Score','DB Score'])
# plt.xticks(K)
# plt.show()
Example #13
from sklearn.metrics.pairwise import cosine_similarity
dist = cosine_similarity(feature_mat)

# Silhouette score - best if closer to 1
from sklearn.metrics import silhouette_score
silhouette_avg = silhouette_score(feature_mat, kmeans.labels_)
print("Silhouette score " + str(silhouette_avg))

# Calinski-Harabasz Index - the higher the score, the better
from sklearn.metrics import calinski_harabaz_score
print("Calinski score : " + str(calinski_harabaz_score(feature_mat, clusters)))

# Davies-Bouldin Index - the closer to zero, the better
from sklearn.metrics import davies_bouldin_score
print("Davies-Bouldin score : " +
      str(davies_bouldin_score(feature_mat, clusters)))

#Recommendation
#Testing with video id
df = df.reset_index()
df = df.drop(['index'], axis=1)
vid = 'iUdgD8kYU-E'  #Eg 1 Supreme court justice
#vid = 'tG3wqbEmb7s' #eg 2 Iran nuclear deal
#vid = 'Oms5r6_yJB8' #eg 3 Robert Mueller
feature_df['id'] = df['id']
feature_df['clusters'] = clusters
df['clusters'] = clusters
rec_obj = recommendation()
least_rel, most_rel = rec_obj.getRecommendation(vid, df, feature_df)

print("The titles of most relevent recommendation \n")
Example #14
        'cc3_miles': i
    }, trans_miles_avg[i - 1])

Std = StandardScaler()
X = Std.fit_transform(X.astype(float))

silhouette = []
davies = []
K = range(2, 11)
for k in K:
    clusterer = KMeans(n_clusters=k, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    silhouette_avg = silhouette_score(X, cluster_labels)
    silhouette.append(silhouette_avg)

L = range(2, 11)
for l in L:
    clusterer = KMeans(n_clusters=l, random_state=10)
    cluster_labels = clusterer.fit_predict(X)

    davies_bouldin = davies_bouldin_score(X, cluster_labels)
    davies.append(davies_bouldin)

plt.plot(K, silhouette, 'mo-', label='silhouette')
plt.plot(L, davies, 'bo-', label='davies')
plt.xlabel('k')
plt.ylabel('silhouette/davies')
plt.title('silhouette/davies - k', fontsize=14, fontweight='bold')
plt.legend()
plt.show()
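The two loops above refit identical KMeans models (same k range, same random_state), once for the silhouette score and once for the Davies-Bouldin score. A sketch of the same computation with a single loop and one fit per k, assuming X is the standardized matrix defined above:

from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score

silhouette, davies = [], []
for k in range(2, 11):
    cluster_labels = KMeans(n_clusters=k, random_state=10).fit_predict(X)
    silhouette.append(silhouette_score(X, cluster_labels))   # higher is better
    davies.append(davies_bouldin_score(X, cluster_labels))   # lower is better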
Example #15
def Davies_Bouldin_score(X, labels):
    return metrics.davies_bouldin_score(X, labels)
Example #16
    elif setup['algorithm'] == 2:
        algorithm = Kmeans(data, setup['clusters'], setup['class_args'])
    elif setup['algorithm'] == 3:
        algorithm = Agglomerative(data, setup['clusters'], setup['class_args'])
    elif setup['algorithm'] == 4:
        algorithm = DensityBased(data, setup['class_args'])
    elif setup['algorithm'] == 5:
        algorithm = optics(data, setup['class_args'])

    data_labels = algorithm.fit_predict()

    print('Silhouette:\t\t%0.4f' % silhouette_score(data, data_labels))
    print('Calinski-Harabasz:\t%0.1f' %
          calinski_harabasz_score(data, data_labels))
    print('Davies-Bouldin:\t\t%0.4f' %
          davies_bouldin_score(data, data_labels))

    x = data.index
    y = data.iloc[:, len(data.columns) - 1]

    xlabel = "Index"
    ylabel = data.columns[len(data.columns) - 1]

    if argument_parser.get_plot_x_axis() is not None:
        x = data.iloc[:, argument_parser.get_plot_x_axis()]
        xlabel = data.columns[argument_parser.get_plot_x_axis()]

    if argument_parser.get_plot_y_axis() is not None:
        y = data.iloc[:, argument_parser.get_plot_y_axis()]
        ylabel = data.columns[argument_parser.get_plot_y_axis()]
Example #17
    def fit(self, X, y=None):
        self.columns = X.columns if self.columns is None else self.columns
        self.transform_cols = [x for x in X.columns if x in self.columns]

        # Evaluation
        if any([
                self.eval_cluster, self.eval_silhouette, self.eval_chi,
                self.eval_dbi
        ]):
            n_clusters = []
            n_noises = []
            silhouettes1 = []
            silhouettes2 = []
            chis1 = []
            chis2 = []
            dbis1 = []
            dbis2 = []

            self.eval_df = pd.DataFrame()
            self.eval_df['eps'] = [x[0] for x in self.eps_samples_tuples]
            self.eval_df['min_samples'] = [
                x[1] for x in self.eps_samples_tuples
            ]
            self.eval_df['centroid'] = self.eval_df['eps'].apply(lambda x: [])

            tmp_X = X[self.transform_cols].copy()
            index = 0
            for eps, min_samples in tqdm(self.eps_samples_tuples):
                model = copy.deepcopy(self.model)
                model.eps = eps
                model.min_samples = min_samples
                model.fit(tmp_X)

                # Cluster centroid
                # Exclude calculating centroid of noise cluster
                tmp_df = pd.concat(
                    [tmp_X, pd.Series(model.labels_, name='Cluster')], axis=1)
                tmp_df = tmp_df[tmp_df['Cluster'] != -1].reset_index(drop=True)
                self.eval_df.at[index, 'centroid'] = self.__calc_centroids(
                    tmp_df[self.transform_cols], tmp_df['Cluster'])

                tmp_X2 = tmp_X.copy()
                tmp_X2 = pd.concat(
                    [tmp_X2, pd.Series(model.labels_, name='Cluster')], axis=1)
                labels2 = tmp_X2[tmp_X2['Cluster'] != -1]['Cluster'].values
                tmp_X2 = tmp_X2[tmp_X2['Cluster'] != -1].drop(
                    columns=['Cluster']).values

                # Reference: https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html
                n_cluster = len(np.unique(model.labels_))
                n_cluster2 = len(np.unique(labels2))
                if self.eval_cluster:
                    n_clusters.append(n_cluster)
                    n_noises.append(np.sum(np.where(model.labels_ == -1, 1,
                                                    0)))

                # Reference: https://towardsdatascience.com/clustering-metrics-better-than-the-elbow-method-6926e1f723a6
                if self.eval_silhouette:
                    kwargs = {
                        'metric': 'euclidean',
                        'sample_size': self.eval_sample_size,
                        'random_state': self.random_state
                    }
                    silhouettes1.append(
                        np.nan if n_cluster <= 1 else silhouette_score(
                            tmp_X, model.labels_, **kwargs))
                    silhouettes2.append(
                        np.nan if n_cluster2 <= 1 else silhouette_score(
                            tmp_X2, labels2, **kwargs))

                # Reference: https://stats.stackexchange.com/questions/52838/what-is-an-acceptable-value-of-the-calinski-harabasz-ch-criterion
                if self.eval_chi:
                    chis1.append(np.nan if n_cluster <= 1 else
                                 calinski_harabasz_score(tmp_X, model.labels_))
                    chis2.append(np.nan if n_cluster2 <= 1 else
                                 calinski_harabasz_score(tmp_X2, labels2))

                # Reference: https://stackoverflow.com/questions/59279056/davies-bouldin-index-higher-or-lower-score-better
                if self.eval_dbi:
                    dbis1.append(np.nan if n_cluster <= 1 else
                                 davies_bouldin_score(tmp_X, model.labels_))
                    dbis2.append(np.nan if n_cluster2 <= 1 else
                                 davies_bouldin_score(tmp_X2, labels2))

                index += 1

            if self.eval_cluster:
                self.eval_df['n_cluster'] = n_clusters
                self.eval_df['n_noise'] = n_noises

            if self.eval_silhouette:
                self.eval_df['silhouette'] = silhouettes1
                self.eval_df['silhouette_w/o_noise'] = silhouettes2

            if self.eval_chi:
                self.eval_df['calinski_harabasz'] = chis1
                self.eval_df['calinski_harabasz_w/o_noise'] = chis2

            if self.eval_dbi:
                self.eval_df['davies_bouldin'] = dbis1
                self.eval_df['davies_bouldin_w/o_noise'] = dbis2

        # Train
        else:
            self.model.fit(X[self.transform_cols])

            # Exclude calculating centroid of noise cluster
            tmp_df = pd.concat([
                X[self.transform_cols],
                pd.Series(self.model.labels_, name='Cluster')
            ],
                               axis=1)
            tmp_df = tmp_df[tmp_df['Cluster'] != -1].reset_index(drop=True)

            self.centroid_df = pd.DataFrame(self.__calc_centroids(
                tmp_df[self.transform_cols], tmp_df['Cluster']),
                                            columns=self.transform_cols)
            self.centroid_df['Cluster'] = [
                f'Cluster {x}' for x in np.unique(self.model.labels_)
                if x != -1
            ]
            self.centroid_df.set_index('Cluster', inplace=True)
            self.centroid_df.index.name = None

        return self
Example #18
plt.figure()
plt.suptitle("Each row is the next iteration")
for i in range(1, iteration):
    km_kpp = KMeans(n_clusters=9, init='k-means++', max_iter=1)
    km_frandom = KMeans(n_clusters=9, init=centers_random, max_iter=1)
    km_forgy = KMeans(n_clusters=9, init='random', max_iter=1)

    km_kpp.fit(X)
    km_frandom.fit(X)
    km_forgy.fit(X)

    y_kpp = km_kpp.predict(X)
    y_frandom = km_frandom.predict(X)
    y_forgy = km_forgy.predict(X)

    dbs_kpp.append(davies_bouldin_score(X, y_kpp))
    silcoeff_kpp.append(silhouette_score(X, y_kpp))

    dbs_frandom.append(davies_bouldin_score(X, y_frandom))
    silcoeff_frandom.append(silhouette_score(X, y_frandom))

    dbs_forgy.append(davies_bouldin_score(X, y_forgy))
    silcoeff_forgy.append(silhouette_score(X, y_forgy))

    plt.subplot(iteration - 1, 3, 3 * i - 2)
    plt.title("kmeans++")
    plt.scatter(X[:, 0], X[:, 1], marker='o', c=y_kpp, s=50, cmap='viridis')
    center_kpp = km_kpp.cluster_centers_
    plt.scatter(center_kpp[:, 0],
                center_kpp[:, 1],
                c='black',
Example #19
reduced_data = PCA(n_components=23).fit_transform(
    df)  #choosing number of principal components as 23
silhouette = []
daviesBouldin = []
distortions = []
K = range(2, 30)
for k in K:
    kmeans = KMeans(n_clusters=k).fit(reduced_data)
    distortions.append(
        sum(
            np.min(cdist(reduced_data, kmeans.cluster_centers_, 'euclidean'),
                   axis=1)) / reduced_data.shape[0])
    labels = kmeans.labels_
    sh = metrics.silhouette_score(reduced_data, labels, metric='euclidean')
    db = davies_bouldin_score(reduced_data, labels)
    silhouette.append(sh)
    daviesBouldin.append(db)

# Plot the elbow graph
plt.plot(K, distortions, 'bx-')
plt.xlabel('k')
plt.ylabel('Distortion')
plt.title('The Elbow Method showing the optimal k')
plt.figure()

#Calculate the silhouette index for each k
plt.plot(K, silhouette, 'bx-')
plt.xlabel('k-number')
plt.ylabel('Silhouette score')
plt.title('Silhouette scores for varying k')
		plt.xlabel('First Component')
		plt.ylabel('Second Component')
		plt.show()
	
	
	print('#################Kmeans#################')
	#Plot
	if(not ignore):
		silhouette_scores=[]
		davies_bouldin_scores=[]
		calinski_harabasz_scores=[]
		for k in range(2, 20):
		    kmeans = KMeans(n_clusters=k)
		    kmeans.fit(df_pca)
		    silhouette_scores.append(silhouette_score(df_pca, kmeans.labels_))
		    davies_bouldin_scores.append(davies_bouldin_score(df_pca,kmeans.labels_))
		    calinski_harabasz_scores.append(calinski_harabasz_score(df_pca,kmeans.labels_))
		
		#Plots, we want high-high-low
		fig = plt.figure(figsize=(15, 5))
		plt.plot(range(2, 20), silhouette_scores)
		plt.grid(True)
		plt.title('Get the optimal n_clusters')
		plt.xlabel('N_clusters')
		plt.ylabel('Silhouette Score')
		plt.show()
		
		fig = plt.figure(figsize=(15, 5))
		plt.plot(range(2, 20), calinski_harabasz_scores)
		plt.grid(True)
		plt.title('Get the optimal n_clusters')
# In[26]:


corrected_image, r,c = correct_image(X_IPCA[0].reshape(orig_rows,orig_cols) , 13)
vectorized = corrected_image.reshape((r*c), 1) 
kmeans = KMeans(random_state=0, init='random', n_clusters=4)
labels = kmeans.fit_predict(vectorized)


# In[27]:


#We use the davies_bouldin_score as a clustering performance metric

from sklearn.metrics import davies_bouldin_score
score = davies_bouldin_score(vectorized, labels)
print(score)


# In[28]:


# Convert these into single channel 8 bit pixels and finally merge them into the RGBA format - which represents the false colour image

fig = plt.figure(figsize=(25, 50))


im_pil0 = Image.fromarray(segmented_images[0])
im0 = im_pil0.convert('L')
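A hedged sketch of the false-colour merge described in the comment above, assuming segmented_images holds four equally sized single-channel arrays; the channel assignment below is illustrative, not the original pipeline:

from PIL import Image

# Convert each array to an 8-bit single-channel image, then merge as R, G, B, A
channels = [Image.fromarray(img).convert('L') for img in segmented_images[:4]]
false_colour = Image.merge('RGBA', channels)
false_colour.save('false_colour.png')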

for m in models:
    predicts = m['m'].fit_predict(X)
    t = m['t']
    X_PCA = PCA(n_components=2).fit_transform(X)
    plt.scatter(X_PCA[:, 0], X_PCA[:, 1], c=predicts, s=50, cmap='rainbow',alpha=0.5)
    plt.title(t)
    plt.savefig('out/'+t+'.png')
    plt.clf()

print('\n3. Validate the clusters using the Davies-Bouldin and Silhouette indices')

print('{}\t{}\t{}'.format('Model', 'Bouldin', 'Silhouette'))
for m in models:
    labels = m['m'].fit_predict(X)
    dbScore = davies_bouldin_score(X, labels)
    sScore = silhouette_score(X, labels, metric='euclidean')
    print('{}\t{}\t{}'.format(m['t'], dbScore, sScore))

print('\n4. Show the results of both indices for the generated clusterings')

print(
"""
--------------------------------------------------------------------------
Model	                            Bouldin	            Silhouette
--------------------------------------------------------------------------
AgglomerativeClustering (k = 2)	    0.7330018210488929	0.5532678504628996
KMeans (k = 2)	                    0.7133822795826191	0.5687897205830247
GaussianMixture (k = 2)	            0.8604650094596924	0.3919496047300402

AgglomerativeClustering (k = 3)	    0.6041813066360704	0.5281675826566276
    model = KMeans(n_clusters=k)

    # use the first 5 PCA components
    X_pc = PCA_components.iloc[:, :5]

    #apply kmeans cluster
    model.fit(X_pc)
    cluster_labels = model.fit_predict(X_pc)

    #ag or spectral
    if k > 1:

        cur_silhout = silhouette_score(X_pc, cluster_labels)
        silouts.append(cur_silhout)

        cur_davies_b = davies_bouldin_score(X_pc, cluster_labels)
        davies_b.append(cur_davies_b)

    # Append the inertia to the list of inertias
    print(k)
    inertias.append(model.inertia_)

#SI Figure 2
plt.figure()
plt.plot(ks, inertias, '-o', color='black')
plt.xlabel('Number of Clusters')
plt.ylabel('Within-Cluster Sum of Squares (Inertia)')
plt.xticks(ks)
plt.tight_layout()
ax = plt.axes()
ax.grid(False)
Example #24
def score_with_davies_bouldin(X: np.ndarray, labels: list) -> float:
    return metrics.davies_bouldin_score(X, labels)
Example #25
def apply_algo_k_auto(X, init_codification=None, quiet=True, params=None):
    '''Apply the machine learning algorithm for various numbers of
    clusters and choose the best one according to a given score
    
    [IN]
        - X (np.array[N,p]): design matrix given as input to the algorithm. Each row is an observation, each column is a predictor.
        - init_codification (dict): dict linking the initialisation strategy to actual algorithm inputs. See kabl.core.apply_algo
        - quiet (boolean): if True, suppress all prints
        - params (dict): dict with all settings. Relevant keys: 'max_k', 'classif_score'
        
    [OUT]
        - labels (np.array[N]): vector of cluster number attribution
            BEWARE: the cluster identification numbers are arbitrary. Only borders matter.
        - n_clusters_opt (int): optimal number of clusters found in the data
        - classif_scores (float): value of the classification score (chosen with params['classif_score']) for the returned classification.
    '''

    if params is None:
        params = utils.get_default_params()

    # Apply algorithm and compute scores for several number of clusters
    all_labels = []
    classif_scores = []
    for n_clusters in range(2, params['max_k']):

        labels = apply_algo(X,
                            n_clusters,
                            init_codification=init_codification,
                            params=params)
        all_labels.append(labels)

        if params['classif_score'] in ['silhouette', 'silh']:
            classif_scores.append(silhouette_score(X, labels))
        elif params['classif_score'] in ['davies_bouldin', 'db']:
            with np.errstate(
                    divide='ignore', invalid='ignore'
            ):  # suppress spurious "RuntimeWarning: divide by zero encountered in true_divide" warnings
                classif_scores.append(davies_bouldin_score(X, labels))
        else:  # Default because fastest
            classif_scores.append(calinski_harabaz_score(X, labels))

    # Choose the best number of clusters
    if params['classif_score'] in ['silhouette', 'silh']:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 0.5:
            if not quiet:
                print("Bad classification according to silhouette score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    elif params['classif_score'] in ['davies_bouldin', 'db']:
        k_best = np.argmin(classif_scores)
        if classif_scores[k_best] > 0.36:
            if not quiet:
                print("Bad classification according to Davies-Bouldin score (",
                      classif_scores[k_best], "). BLH is thus NaN")
            k_best = None
    else:
        k_best = np.argmax(classif_scores)
        if classif_scores[k_best] < 200:
            if not quiet:
                print(
                    "Bad classification according to Calinski-Harabasz score (",
                    classif_scores[k_best], "). BLH is thus NaN")
            k_best = None

    # Return the results
    if k_best is not None:
        result = all_labels[k_best], k_best + 2, classif_scores[k_best]
    else:
        result = None, None, None

    return result
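A hedged usage sketch of apply_algo_k_auto, assuming the KABL helpers referenced in the docstring (utils.get_default_params, apply_algo) are importable and X is a design matrix as described above; the parameter values are illustrative, not library defaults:

params = utils.get_default_params()
params['max_k'] = 6                 # try k = 2..5 (range(2, max_k) above)
params['classif_score'] = 'db'      # select k with the Davies-Bouldin score
labels, n_clusters_opt, score = apply_algo_k_auto(X, params=params)
if labels is None:
    print("Classification rejected by the score threshold; BLH will be NaN")
else:
    print("Best number of clusters:", n_clusters_opt, "with score", score)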
Example #26
def DB(points, labelsPred):
    # Davies-Bouldin index rounded to two decimal places
    return float("%0.2f" % metrics.davies_bouldin_score(points, labelsPred))
Example #27
def kabl_qualitymetrics(inputFile,
                        outputFile=None,
                        reference='None',
                        rsFile='None',
                        storeResults=True,
                        params=None):
    '''Copy of blh_estimation including the calculation and storage of scores
    
    [IN]
      - inputFile (str): path to the input file, as generated by raw2l1
      - outputFile (str): path to the output file. Default adds ".out" before ".nc"
      - reference (str): path to the reference file, if any.
      - rsFile (str): path to the radiosounding estimations, if any (so they can be stored in the same netCDF)
      - storeResults (bool): if True, the field 'blh_ababl', containing the BLH estimation, is stored in the outputFile
      - params (dict): dict of parameters. Depends on 'n_clusters'
    
    [OUT]
      - errl2_blh (float): root mean squared gap between the BLH from KABL and the reference
      - errl1_blh (float): mean absolute gap between the BLH from KABL and the reference
      - errl0_blh (float): maximum absolute gap between the BLH from KABL and the reference
      - corr_blh (float): correlation coefficient between the BLH from KABL and the reference
      - ch_score (float): Calinski-Harabasz score averaged over the whole day (the higher, the better)
      - db_scores (float): Davies-Bouldin score averaged over the whole day (the lower, the better)
      - s_scores (float): silhouette score averaged over the whole day (the higher, the better)
      - chrono (float): computation time for the full day (seconds)
      - n_invalid (int): number of BLH estimates that are NaN or Inf
    '''

    t0 = time.time()  #::::::::::::::::::::::

    if params is None:
        params = utils.get_default_params()

    # 1. Extract the data
    #---------------------
    loc, dateofday, lat, lon = utils.where_and_when(inputFile)
    t_values, z_values, rcs_1, rcs_2, blh_mnf, rr, vv, cbh = utils.extract_data(
        inputFile,
        to_extract=['rcs_1', 'rcs_2', 'pbl', 'rr', 'vv', 'b1'],
        params=params)

    blh = []
    K_values = []
    s_scores = []
    db_scores = []
    ch_scores = []

    # setup toolbar
    toolbar_width = int(len(t_values) / 10) + 1
    sys.stdout.write("KABL estimation (" + loc +
                     dateofday.strftime(', %Y/%m/%d') + "): [%s]" %
                     ("." * toolbar_width))
    sys.stdout.flush()
    sys.stdout.write("\b" *
                     (toolbar_width + 1))  # return to start of line, after '['

    # Loop on all profile of the day
    for t in range(len(t_values)):
        # toolbar
        if np.mod(t, 10) == 0:
            if any(np.isnan(blh[-11:-1])):
                sys.stdout.write("!")
            else:
                sys.stdout.write("*")
            sys.stdout.flush()

        # 2. Prepare the data
        #---------------------
        coords = {
            'time': dt.datetime.utcfromtimestamp(t_values[t]),
            'lat': lat,
            'lon': lon
        }
        t_back = max(t - params['n_profiles'] + 1, 0)
        X, Z = prepare_data(coords,
                            z_values,
                            rcs_1[t_back:t + 1, :],
                            rcs_2[t_back:t + 1, :],
                            params=params)

        # 3. Apply the machine learning algorithm
        #---------------------

        if isinstance(params['n_clusters'], int):
            n_clusters = params['n_clusters']
            labels = apply_algo(X, params['n_clusters'], params=params)

            # Compute classification score
            if len(np.unique(labels)) > 1:
                with np.errstate(
                        divide='ignore', invalid='ignore'
                ):  # suppress spurious "RuntimeWarning: divide by zero encountered in true_divide" warnings
                    db_score = davies_bouldin_score(X, labels)
                s_score = silhouette_score(X, labels)
                ch_score = calinski_harabaz_score(X, labels)
            else:
                db_score = np.nan
                s_score = np.nan
                ch_score = np.nan
        else:
            labels, n_clusters, s_score, db_score, ch_score = apply_algo_k_3scores(
                X, params=params)

        # 4. Derive and store the BLH
        #---------------------
        blh.append(blh_from_labels(labels, Z))
        K_values.append(n_clusters)
        s_scores.append(s_score)
        db_scores.append(db_score)
        ch_scores.append(ch_score)

    # end toolbar
    t1 = time.time()  #::::::::::::::::::::::
    chrono = t1 - t0
    sys.stdout.write("] (" + str(np.round(chrono, 4)) + " s)\n")

    if outputFile is None:
        fname = inputFile.split('/')[-1]
        outputFile = "DAILY_BENCHMARK_" + fname[10:-3] + ".nc"

    mask_cloud = cbh[:] <= 3000

    if os.path.isfile(reference):
        blh_ref = np.loadtxt(reference)
    else:
        blh_ref = blh_mnf[:, 0]

    if storeResults:
        BLHS = [np.array(blh), np.array(blh_mnf[:, 0])]
        BLH_NAMES = ['BLH_KABL', 'BLH_INDUS']
        if os.path.isfile(reference):
            BLHS.append(blh_ref)
            BLH_NAMES.append('BLH_REF')

        # Cloud base height is added as if it were a BLH though it's not
        BLHS.append(cbh)
        BLH_NAMES.append("CLOUD_BASE_HEIGHT")

        msg = utils.save_qualitymetrics(outputFile, t_values, BLHS, BLH_NAMES,
                                        [s_scores, db_scores, ch_scores],
                                        ['SILH', 'DB', 'CH'], [rr, vv],
                                        ['MASK_RAIN', 'MASK_FOG'], K_values,
                                        chrono, params)

        if os.path.isfile(rsFile):
            blh_rs = utils.extract_rs(rsFile, t_values[0], t_values[-1])
        else:
            blh_rs = None

        # graphics.blhs_over_data(t_values,z_values,rcs_1,BLHS,[s[4:] for s in BLH_NAMES],
        # blh_rs=blh_rs,storeImages=True,showFigure=False)
        print(msg)

    errl2_blh = np.sqrt(np.nanmean((blh - blh_ref)**2))
    errl1_blh = np.nanmean(np.abs(blh - blh_ref))
    errl0_blh = np.nanmax(np.abs(blh - blh_ref))
    corr_blh = np.corrcoef(blh, blh_ref)[0, 1]
    n_invalid = np.sum(np.isnan(blh)) + np.sum(np.isinf(blh))

    return errl2_blh, errl1_blh, errl0_blh, corr_blh, np.mean(
        ch_scores), np.mean(db_scores), np.mean(s_scores), chrono, n_invalid
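A hedged usage sketch of kabl_qualitymetrics (the input path is hypothetical; the function expects a raw2l1 netCDF file, as stated in the docstring):

scores = kabl_qualitymetrics("path/to/raw2l1_file.nc",
                             storeResults=False,
                             params=utils.get_default_params())
errl2, errl1, errl0, corr, ch, db, silh, chrono, n_invalid = scores
print("RMSE vs reference:", errl2, "| mean Davies-Bouldin:", db, "| runtime (s):", chrono)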
Example #28
kmeans = KMeans(n_clusters=5)
kmeans.fit(scaled_X)

scores = [KMeans(n_clusters=i+2).fit(scaled_X).inertia_ for i in range(20)]
sns.lineplot(np.arange(2, 22), scores)
plt.xlabel('Number of clusters')
plt.ylabel("Inertia")
plt.title("Inertia of KMeans versus number of clusters")

# Evaluation
from sklearn.metrics import silhouette_score,davies_bouldin_score
#silhouette scores
print('kmeans: {}'.format(silhouette_score(scaled_X, kmeans.labels_, metric='euclidean')))
#db index
print('kmeans: {}'.format(davies_bouldin_score(scaled_X, kmeans.labels_)))


scaled_X['cluster'] = kmeans.labels_
# Use a RandomForest to interpret the clusters

from sklearn.ensemble import RandomForestClassifier

#take the noise out of the model
scaled_X_no_noise = scaled_X[scaled_X.cluster != -1].copy()
scaled_X_no_noise.drop(columns = ['TotalCharges', 'MonthlyCharges'], inplace = True)

#build the RFC classifier to calculate MDI as a proxy for feature importance
y = scaled_X_no_noise.iloc[:,-1]
X = scaled_X_no_noise.iloc[:,:-1]
clf = RandomForestClassifier(n_estimators=100).fit(X, y)
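The fitted forest's mean decrease in impurity can then be read off as a rough proxy for which features drive cluster membership; the sorting below is a short continuation sketch, not part of the original snippet:

import pandas as pd

# Higher impurity-based importance = stronger influence on the cluster assignment
mdi = pd.Series(clf.feature_importances_, index=X.columns).sort_values(ascending=False)
print(mdi.head(10))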
Example #29

        # figname = create_path("fig", sys.argv[1], "KMeans", sys.argv[2], filename=("%d.png" % n_clusters))
        # silhouette_analysis(X, cluster_labels, n_clusters, figname)

        centers = pca.transform(clusterer.cluster_centers_)
        figname = create_path("fig", sys.argv[1], "KMeans", sys.argv[2], filename=("%d_vis.png" % n_clusters))
        visualize_cluster(X_vis, cluster_labels, n_clusters, centers, figname)

        ari = metrics.adjusted_rand_score(y, cluster_labels)
        ami = metrics.adjusted_mutual_info_score(y, cluster_labels)
        nmi = metrics.normalized_mutual_info_score(y, cluster_labels)
        fms = metrics.fowlkes_mallows_score(y, cluster_labels) 
        sil = metrics.silhouette_score(X, cluster_labels, metric='euclidean')
        chi = metrics.calinski_harabaz_score(X, cluster_labels)
        dbi = metrics.davies_bouldin_score(X, cluster_labels)

        print ("Adjusted Rand index: %.6f" % ari)
        print ("Adjusted Mutual Information: %.6f" % ami)
        print ("Normalized Mutual Information: %.6f" % nmi)
        print ("Fowlkes-Mallows score: %.6f" % fms)
        print ("Silhouette Coefficient: %.6f" % sil)
        print ("Calinski-Harabaz Index: %.6f" % chi)
        print ("Davies-Bouldin Index: %.6f" % dbi)

        ari_score.append(ari)
        ami_score.append(ami)
        nmi_score.append(nmi)
        fms_score.append(fms)
        sil_score.append(sil)
        chi_score.append(chi)
Example #30
#fit = bestfeatures.fit(X,y)
#dfscores = pd.DataFrame(fit.scores_)
#dfcolumns = pd.DataFrame(X.columns)
#featureScores = pd.concat([dfcolumns, dfscores], axis = 1)
#featureScores.columns = ['Specs','Score']
#print(featureScores)
#
#print(featureScores.nlargest(19,'Score'))

########## k means X ##########################################################
kmeans = KMeans(n_clusters=4)
kmeans.fit(X)
labels = kmeans.predict(X)
#np.savetxt("kmeans.txt", labels)

kmeans_DBI = davies_bouldin_score(X, labels)
print("KMeans DBI score :", kmeans_DBI)

kmeans_RI = rand_index_score(
    y, labels)  #calculate_rand_index(y, labels)#rand_index_score(y, labels)
print("KMeans RI score :", kmeans_RI)

##### Complete-Linkage Agglomerative nesting ##################################
clustering = AgglomerativeClustering(n_clusters=4,
                                     linkage="complete").fit_predict(X)
#np.savetxt("AgglomerativeClustering.txt", clustering)

agnest_DBI = davies_bouldin_score(X, clustering)
print("AgglomerativeClustering DBI score :", agnest_DBI)

agnest_RI = rand_index_score(