def log_silhouette_chart(model, X, experiment=None, **kwargs):
    """Log Silhouette Coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters, where ``n_clusters`` is
    read from the model after ``kwargs`` have been applied to it.

    Make sure you created an experiment by using ``neptune.create_experiment()`` before you use this method.

    Note:
        ``model`` is modified in place: ``kwargs`` are applied with
        ``set_params`` and the model is re-fitted on ``X`` once per ``j``.

    Tip:
        Check `Neptune documentation <https://docs.neptune.ai/integrations/scikit_learn.html>`_ for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        experiment (:obj:`neptune.experiments.Experiment`, optional, default is ``None``):
            | Neptune ``Experiment`` object to control to which experiment you log the data.
            | If ``None``, log to currently active, and most recent experiment.
        kwargs:
            KMeans parameters.

    Returns:
        ``None``

    Raises:
        AssertionError: If ``model`` is not a ``KMeans`` instance.

    Examples:
        .. code:: python3

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            neptune.init('my_workspace/my_project')
            neptune.create_experiment()

            log_silhouette_chart(km, X=X, n_clusters=12)
    """
    assert isinstance(model,
                      KMeans), 'Model should be sklearn KMeans instance.'
    exp = _validate_experiment(experiment)

    model.set_params(**kwargs)

    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(n_clusters=j)
        model.fit(X)

        # Charting is best-effort: a failure for one j is reported and the
        # loop continues, but the figure must be released either way.
        fig = None
        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            exp.log_image(
                'charts_sklearn',
                fig,
                image_name='Silhouette Coefficients for k={}'.format(j))
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))
        finally:
            # Close on success and on failure so figures do not pile up.
            if fig is not None:
                plt.close(fig)
# Example #2
# 0
def silhouette(ax=None):
    """Build a finalized silhouette visualizer on synthetic blob data.

    Generates 1000 samples with 16 features around 12 centers, fits a
    ``SilhouetteVisualizer`` wrapping ``KMeans(9)`` on them and finalizes
    the plot.

    Args:
        ax: Optional axes passed through to ``SilhouetteVisualizer``.

    Returns:
        The fitted and finalized ``SilhouetteVisualizer``.
    """
    # The generated labels are not needed for the silhouette plot.
    samples, _ = make_blobs(
        centers=12, n_samples=1000, n_features=16, shuffle=True)

    estimator = KMeans(9)
    visualizer = SilhouetteVisualizer(estimator, ax=ax)
    visualizer.fit(samples)
    visualizer.finalize()
    return visualizer
def create_silhouette_chart(model, X, **kwargs):
    """Create silhouette coefficients charts for KMeans clusterer.

    Charts are computed for j = 2, 3, ..., n_clusters, where ``n_clusters`` is
    read from the model after ``kwargs`` have been applied to it.

    Note:
        ``model`` is modified in place: ``kwargs`` are applied with
        ``set_params`` and the model is re-fitted on ``X`` once per ``j``.
        A chart that fails to render is reported on stdout and skipped, so the
        returned series may hold fewer than ``n_clusters - 1`` images.

    Tip:
        Check Sklearn-Neptune integration
        `documentation <https://docs-beta.neptune.ai/essentials/integrations/machine-learning-frameworks/sklearn>`_
        for the full example.

    Args:
        model (:obj:`KMeans`):
            | KMeans object.
        X (:obj:`ndarray`):
            | Training instances to cluster.
        kwargs:
            KMeans parameters.

    Returns:
        ``neptune.types.FileSeries`` object that you can assign to run's ``base_namespace``.

    Raises:
        AssertionError: If ``model`` is not a ``KMeans`` instance.

    Examples:
        .. code:: python3

            import neptune.new.integrations.sklearn as npt_utils

            km = KMeans(n_init=11, max_iter=270)
            X, y = make_blobs(n_samples=579, n_features=17, centers=7, random_state=28743)

            run = neptune.init(project='my_workspace/my_project')
            run['kmeans/silhouette'] = npt_utils.create_silhouette_chart(km, X, n_clusters=12)
    """
    assert isinstance(model,
                      KMeans), 'Model should be sklearn KMeans instance.'

    charts = []

    model.set_params(**kwargs)

    n_clusters = model.get_params()['n_clusters']

    for j in range(2, n_clusters + 1):
        model.set_params(n_clusters=j)
        model.fit(X)

        # Charting is best-effort: a failure for one j is reported and the
        # loop continues, but the figure must be released either way.
        fig = None
        try:
            fig, ax = plt.subplots()
            visualizer = SilhouetteVisualizer(model, is_fitted=True, ax=ax)
            visualizer.fit(X)
            visualizer.finalize()
            charts.append(neptune.types.File.as_image(fig))
        except Exception as e:
            print('Did not log Silhouette Coefficients chart. Error {}'.format(
                e))
        finally:
            # Close on success and on failure so figures do not pile up.
            if fig is not None:
                plt.close(fig)

    return neptune.types.FileSeries(charts)
# Example #4
# 0
 def shiloutte_score_plot(self, directory):
     """Save a silhouette plot of the best clustering step to ``directory``.

     Draws a 10x10 inch silhouette plot for the already fitted
     ``'clustering'`` step of ``self.best_estimator_`` using
     ``self.data_preprocessed`` and writes it to
     ``<directory>/shiloutte_score.png``.

     Args:
         directory: Path prefix of the output folder; the file name is
             appended with a ``/`` separator.
     """
     # NOTE(review): presumably raises when no model has been fitted yet --
     # confirm against _check_model.
     self._check_model()
     fig = plt.figure(figsize=(10, 10))
     try:
         visualizer = SilhouetteVisualizer(
             self.best_estimator_.named_steps['clustering'],
             colors='yellowbrick',
             is_fitted=True)
         visualizer.fit(self.data_preprocessed)
         # show() finalizes the plot itself before saving it to the given
         # path, so no separate finalize() call is needed afterwards.
         visualizer.show(directory + "/shiloutte_score.png")
     finally:
         # Release the figure even when fitting or saving fails.
         plt.close(fig)
def silhoutte_yellowbrick(
    X,
    y,
    features,
):
    """Draw a silhouette plot on a 1% stratified sample of the data.

    Switches matplotlib to the ``agg`` backend, clears the current figure,
    takes the held-out part of a stratified 99/1 split of ``X[features]`` and
    ``y``, and fits a ``SilhouetteVisualizer`` wrapping ``MiniBatchKMeans``
    on that sample. The number of clusters is the number of distinct labels
    in the sampled ``y``.

    Args:
        X: Data indexable by ``features`` (e.g. a DataFrame).
        y: Labels used for stratification and for counting clusters.
        features: Column names to select from ``X``.

    Returns:
        The ``matplotlib.pyplot`` module holding the drawn figure.
    """
    plt.switch_backend('agg')
    plt.clf()

    # Only the small held-out split is plotted; the large split is discarded.
    _, sample_X, _, sample_y = train_test_split(
        X[features], y, stratify=y, test_size=0.01)
    sample_frame = pd.DataFrame(sample_X, columns=features)
    sample_labels = pd.Series(sample_y)

    clusterer = MiniBatchKMeans(sample_labels.nunique())
    silhouette_viz = SilhouetteVisualizer(clusterer, colors='yellowbrick')
    silhouette_viz.fit(sample_frame)
    silhouette_viz.finalize()

    return plt
def kmeans_silhouette_plots(tfidf, num_clusters=(3, 5, 7, 9, 11)):
    """Save one KMeans silhouette plot per cluster count.

    The input is reduced once with LSA (TruncatedSVD to 50 components followed
    by normalization), then for every ``k`` in ``num_clusters`` a KMeans model
    is fitted through a ``SilhouetteVisualizer`` and the plot is written to
    ``images/silhouette_plots/kmeans_silh_plot_<k>_clusters_<rows>_docs.png``,
    where ``<rows>`` is ``tfidf.shape[0]``. The output directory is created if
    it does not exist.

    Vectorizer results are normalized, which makes KMeans behave as
    spherical k-means for better results. Since LSA/SVD results are
    not normalized, we have to redo the normalization.

    The best silhouette value is 1 and the worst value is -1. Values near 0
    indicate overlapping clusters. Negative values generally indicate that a
    sample has been assigned to the wrong cluster, as a different cluster is
    more similar.

    Args:
        tfidf: Feature matrix accepted by ``TruncatedSVD`` and exposing
            ``shape`` (presumably a TF-IDF document-term matrix).
        num_clusters: Iterable of cluster counts to plot.

    Returns:
        ``None``. File names and per-plot timings are printed to stdout; the
        reported time covers clustering and plotting only, not the one-off
        LSA reduction.
    """
    import os

    print('\nUse kmeans silhouette score to visualize Silhouette Coefficients')

    # The reduction is deterministic (random_state=0) and independent of k,
    # so it is computed once instead of once per cluster count.
    svd = TruncatedSVD(n_components=50, n_iter=10, random_state=0)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced = lsa.fit_transform(tfidf)

    # savefig does not create missing parent directories.
    os.makedirs('images/silhouette_plots', exist_ok=True)

    for k in num_clusters:
        start = datetime.now()

        model = KMeans(n_clusters=k, init='k-means++')

        # Instantiate the visualizer around the (unfitted) clustering model
        visualizer = SilhouetteVisualizer(model)

        filename = r'images/silhouette_plots/kmeans_silh_plot_' + str(
            k) + '_clusters_' + str(tfidf.shape[0]) + '_docs.png'
        try:
            # Fit the reduced data to the visualizer and draw the plot
            visualizer.fit(reduced)
            visualizer.finalize()
            plt.savefig(filename)
        finally:
            # Always release the current figure so plots for different k do
            # not overlap and figures do not accumulate on errors.
            plt.close()

        end = datetime.now()
        print('            ' + filename)
        print("            Time taken: {}".format(end - start))