Example #1
def silhouette_samples(X, labels, metric='euclidean', **kwds):
    """Compute the Silhouette Coefficient for each sample.
        The Silhouette Coefficient is a measure of how well samples are clustered
        with samples that are similar to themselves. Clustering models with a high
        Silhouette Coefficient are said to be dense, where samples in the same
        cluster are similar to each other, and well separated, where samples in
        different clusters are not very similar to each other.
        The Silhouette Coefficient is calculated using the mean intra-cluster
        distance (``a``) and the mean nearest-cluster distance (``b``) for each
        sample.  The Silhouette Coefficient for a sample is ``(b - a) / max(a,
        b)``.
        Note that Silhouette Coefficient is only defined if number of labels
        is 2 <= n_labels <= n_samples - 1.
        This function returns the Silhouette Coefficient for each sample.
        The best value is 1 and the worst value is -1. Values near 0 indicate
        overlapping clusters.
        Read more in the :ref:`User Guide <silhouette_coefficient>`.
    Args:
        X : array [n_samples_a, n_samples_a] if metric == “precomputed”,
          or, [n_samples_a, n_features] otherwise
        Array of pairwise distances between samples, or a feature array.
        labels : array, shape = [n_samples]
        label values for each sample
        metric: string, or callable
        The metric to use when calculating distance between instances in a feature array. 
            If metric is a string, it must be one of the options allowed by sklearn.metrics.pairwise.pairwise_distances. If X is the distance array itself, use “precomputed” as the metric.
        `**kwds` : optional keyword parameters
        Any further parameters are passed directly to the distance function.
        If using a scipy.spatial.distance metric, the parameters are still metric dependent. See the scipy docs for usage examples.
    Returns:
        silhouette : array, shape = [n_samples]
            Silhouette Coefficient for each samples.
    """

    return unsupervised.silhouette_samples(X, labels, metric, **kwds)
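A minimal usage sketch for the wrapper above; the random data and the KMeans clustering step are illustrative assumptions, not part of the original example.

# Minimal sketch: per-sample silhouette values for a KMeans clustering.
import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(100, 2)
labels = KMeans(n_clusters=3, random_state=0, n_init=10).fit_predict(X)

values = silhouette_samples(X, labels, metric='euclidean')
print(values.shape)   # (100,): one coefficient per sample, in [-1, 1]
print(values.mean())  # the mean is the overall silhouette score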
Example #2
def compute_silhouette(self):
    # Per-sample silhouette values from the precomputed distance matrix self.D.
    result = unsupervised.silhouette_samples(self.D,
                                             self.labels,
                                             metric='precomputed')
    # Average the silhouette values within each cluster, skipping the noise
    # label (-1).
    mean_silhouette_labels = dict()
    for l in set(self.labels):
        if l != -1:
            label_mask = np.asarray(self.labels) != l
            masked_result = ma.masked_array(result, mask=label_mask)
            mean_silhouette_labels[l] = masked_result.mean()
    self.mean_silhouette_labels = mean_silhouette_labels
    return self.mean_silhouette_labels, result
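The method assumes self.D is a square precomputed distance matrix and that self.labels may contain a -1 noise label (as produced by, e.g., DBSCAN). A minimal sketch of that setup, assuming scikit-learn's pairwise_distances; X and clusterer are placeholders:

# Hypothetical setup for the method above.
from sklearn.metrics import pairwise_distances

D = pairwise_distances(X, metric='euclidean')  # square [n_samples, n_samples]
labels = clusterer.labels_                     # e.g. DBSCAN labels; -1 marks noise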
Example #3
def determine_anomaly_tweets_kmeans(top_n, pca_vectors, cluster_labels):
    # NOTE:
    # pca_vectors = PCA vectors of the top 2 PCA components only
    # cluster_labels = labels of 2 clusters only

    # silhouette_samples() returns the silhouette score of each individual
    # data point within its cluster.
    silhouette_values = silhouette_samples(X=pca_vectors,
                                           labels=cluster_labels,
                                           metric='cosine')

    # Two parallel lists: (index, score) keeps each tweet index with its
    # actual score; (abs(score), index) puts the absolute score first so that
    # sorting the tuples orders tweets by how close their score is to zero.
    tweet_index_silhouette_scores = []
    absolute_silhouette_scores_tweet_index = []

    for index, sh_score in enumerate(silhouette_values):
        absolute_silhouette_scores_tweet_index.append((abs(sh_score), index))
        tweet_index_silhouette_scores.append((index, sh_score))

    # sorted() works on any iterable; tuples compare element-wise, so the
    # absolute score is the primary sort key.
    sorted_scores = sorted(absolute_silhouette_scores_tweet_index)

    top_n_silhouette_scores = []   # unused below; possibly kept for plotting
    pca_vectors_anomalies = []     # unused below; possibly kept for plotting
    print('Top ', top_n, ' anomalies:')
    for i in range(top_n):
        # Take the top_n absolute scores closest to 0, i.e. the most
        # ambiguous points, treated here as anomalies.
        abs_sh_score, index = sorted_scores[i]
        _, sh_score = tweet_index_silhouette_scores[index]
        print(tweets_dict[index])  # tweets_dict: global mapping index -> tweet text
        print('PCA Vector: ', pca_vectors[index])
        print('Silhouette score: ', sh_score)
        print('.................')
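The two parallel lists above can be replaced by a single argsort over the absolute values; a minimal equivalent sketch, reusing the same silhouette_values and top_n as in the function above:

import numpy as np

# Indices of the top_n scores closest to 0, i.e. the most ambiguous points.
anomaly_indices = np.argsort(np.abs(silhouette_values))[:top_n]
for index in anomaly_indices:
    print(index, silhouette_values[index])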
Example #4
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                                     seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    
    feature_names, features = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(table)
    inputarr = features
    
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    silhouette_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)
        
        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        # silhouette_samples_list.append(samples)
    
        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)
            
            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))

            y_lower = y_upper
            
            if pca2.shape[1] == 1:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 0][predict == i], color=color)
            else:
                ax2.scatter(pca2[:, 0][predict == i], pca2[:, 1][predict == i], color=color)

        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")
        
        if pca2.shape[1] == 1:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 0], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 1st feature")
        else:
            ax2.scatter(pca2_centers[:, 0], pca2_centers[:, 1], marker='x', edgecolors=1, s=200, color=colors)
            ax2.set_xlabel("Feature space for the 1st feature")
            ax2.set_ylabel("Feature space for the 2nd feature")   
        
        plt.tight_layout()
        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)
    
    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_
    best_sse = best_model.inertia_
    
    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)
    
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhouette metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - Sum of squared errors: {best_sse}
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    out_table = table.copy()
    out_table[prediction_col] = predict
    # out_table['silhouette'] = silhouette_samples_list[best_k-2]
    # out_table = out_table.sort_values(by=['prediction','silhouette'])  
    # out_table = out_table.reset_index(drop=True)
        
    return {'out_table': out_table, 'model': model}
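Stripped of the plotting and report-building helpers (plt2MD, BrtcReprBuilder), the core of this routine is a standard choose-k-by-silhouette loop; a minimal sketch, assuming only scikit-learn:

import numpy as np
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

def best_k_by_silhouette(X, k_range=range(2, 10), seed=None):
    # Fit one KMeans per k and keep the model with the highest mean silhouette.
    models = [KMeans(n_clusters=k, random_state=seed, n_init=10).fit(X)
              for k in k_range]
    scores = [silhouette_score(X, m.labels_) for m in models]
    best = int(np.argmax(scores))
    return list(k_range)[best], models[best]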
Example #5
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    silhouette_samples_list = []
    models = []
    centers_list = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_
        centers_list.append(centersk)

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)
        silhouette_samples_list.append(samples)

        pca2_centers = pca2_model.transform(centersk)

        _, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in zip(range(k), colors):
            si = samples[predict == i]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][predict == i],
                        pca2[:, 1][predict == i],
                        color=color)

        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        plt.clf()
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_labels = best_model.labels_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhouette metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
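Unlike Example #4, this variant validates its hyperparameters up front via the validate(...) helpers. A plain-Python equivalent of those guards, as a sketch (the validation helpers themselves are assumed to come from the surrounding project):

# Equivalent early checks without the validation helpers.
if not all(k > 1 for k in n_clusters_list):
    raise ValueError('n_clusters_list: every k must be greater than 1')
if n_init < 1 or max_iter < 1:
    raise ValueError('n_init and max_iter must be at least 1')
if tol <= 0.0:
    raise ValueError('tol must be positive')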
Example #6
def silhouette_plot(X,
                    labels,
                    metric='euclidean',
                    fig_size=None,
                    cluster=None,
                    ID=None,
                    index=False,
                    **kwds):
    """Makes a silhouette bar graph.
        Calculate the silhouette of the samples and then plot a graph of the chosen cluster
        and the silhouettes of the samples or plot a graph with all the means of the
        silhouettes of the clusters
    Args:
        X : array [n_samples_a, n_samples_a] if metric == "precomputed", or, \
                 [n_samples_a, n_features] otherwise
            Array of pairwise distances between samples, or a feature array.
        labels : array, shape = [n_samples]
             Predicted labels for each sample.
        metric : string, or callable
            The metric to use when calculating distance between instances in a
            feature array. If metric is a string, it must be one of the options
            allowed by :func:`metrics.pairwise.pairwise_distances
            <sklearn.metrics.pairwise.pairwise_distances>`. If X is the distance
            array itself, use ``metric="precomputed"``.
        fig_size: number, type int
            Number for the figsize of the plot if fig_size == None then a 
            calculation is made to leave a suitable size for the quantity of samples.
        cluster: number do cluster,type int
            Cluster number to generate the silhouettes graph of its samples 
            if cluster == None will then generate a plot of the mean silhouettes of all clusters.
        y : Smple ID
            ID of the samples, used to label the bars on the y-axis
        **kwds : optional keyword parameters
            Any further parameters are passed directly to the distance function.
            If using a scipy.spatial.distance metric, the parameters are still
            metric dependent. See the scipy docs for usage examples.
    Returns:
        Void
    """

    silhouette_values = unsupervised.silhouette_samples(
        X, labels, metric, **kwds)

    df = pd.DataFrame(silhouette_values)
    df['cluster'] = labels

    if cluster is None:
        # Plot the mean silhouette of each cluster.
        cluster_means = df.groupby('cluster').mean()
        dit = dict(zip(cluster_means.index, cluster_means[0]))
        df2 = pd.DataFrame(list(dit.items()))
        df2.columns = ['Cluster', 'silhouette_mean']

        if fig_size is None:
            if len(df2) > 64:
                plt.figure(figsize=(len(df2) / 7, len(df2) / 3))
            else:
                plt.figure(figsize=(10, 8))
        else:
            plt.figure(figsize=fig_size)

        df2 = df2.sort_values(['silhouette_mean'],
                              ascending=False).reset_index(drop=True)
        ax = sns.barplot(x=df2['silhouette_mean'], y=df2.index, orient='h')
        ax.set_yticklabels(df2['Cluster'])
        plt.ylabel('Cluster')
        plt.show()

    else:
        # Plot the silhouettes of the samples in the chosen cluster.
        df['ID'] = ID

        chosen = df[df['cluster'] == cluster].copy()
        chosen.columns = ['silhouette', 'Cluster', 'ID']
        chosen = chosen.sort_values(['silhouette'],
                                    ascending=False).reset_index(drop=True)

        if fig_size is None:
            if len(chosen) > 64:
                plt.figure(figsize=(len(chosen) / 7, len(chosen) / 3))
            else:
                plt.figure(figsize=(12, 10))
        else:
            plt.figure(figsize=fig_size)

        ax = sns.barplot(x=chosen['silhouette'], y=chosen.index, orient='h')

        if index:
            ax.set_yticklabels(chosen['ID'])

        plt.ylabel('ID samples')
        plt.show()
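A hypothetical usage sketch for silhouette_plot, assuming the surrounding module's unsupervised wrapper behaves like sklearn.metrics.silhouette_samples; the data and labels are illustrative:

import numpy as np
from sklearn.cluster import KMeans

X = np.random.RandomState(0).rand(200, 4)
labels = KMeans(n_clusters=4, random_state=0, n_init=10).fit_predict(X)

silhouette_plot(X, labels)              # one bar per cluster: mean silhouettes
silhouette_plot(X, labels, cluster=2,   # bars for the samples of cluster 2,
                ID=np.arange(len(X)),   # labelled by sample ID
                index=True)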
Example #7
# Fragment from a larger clustering routine; assumes X (feature array) and
# y_pred (predicted cluster labels) are already defined.
import random
from collections import Counter
from math import isnan

import numpy
from sklearn.decomposition import PCA
from sklearn.metrics import calinski_harabasz_score, silhouette_samples

# One random hex colour per cluster.
r = lambda: random.randint(0, 255)
colours = numpy.array(['#%02X%02X%02X' % (r(), r(), r())
                       for i in range(max(y_pred) + 1)])
print(colours)
c = Counter(y_pred)
print(c.most_common())

# ixs_over_thresh = [k for (k, v) in c.items() if v > 1]
# print(ixs_over_thresh)
# sil_X = X[ixs_over_thresh, :]
# sil_y = y_pred[ixs_over_thresh]
# ss = silhouette_samples(sil_X, sil_y, metric='euclidean')
ss = silhouette_samples(X, y_pred, metric='euclidean')
ss = [s for s in ss if not isnan(s)]

print("Sil=", numpy.mean(ss))
# calinski_harabaz_score was renamed calinski_harabasz_score in scikit-learn.
print("CHS=", calinski_harabasz_score(X, y_pred))

pca = PCA()
to_plot = pca.fit_transform(X)

print(X.shape)
print(y_pred.shape)
w_clusters = numpy.column_stack((X, y_pred))
numpy.savetxt("w_clusters.csv", w_clusters, delimiter=",")