Example #1
0
    def _get_optimal_number_of_clusters(self,
                                        correlation,
                                        asset_returns,
                                        linkage,
                                        num_reference_datasets=5,
                                        max_number_of_clusters=10):
        """
        Find the optimal number of clusters for hierarchical clustering using the Gap statistic.

        For every candidate cluster count ``k`` the assets are clustered twice: on distance
        matrices built from uniformly random reference returns (averaged over
        ``num_reference_datasets`` draws) and on the distance matrix built from ``correlation``.
        The gap for ``k`` is the mean reference inertia minus the observed inertia, both as
        computed by ``self._compute_cluster_inertia``; the ``k`` with the largest gap wins.

        :param correlation: (np.array) matrix of asset correlations
        :param asset_returns: (pd.DataFrame) historical asset returns
        :param linkage: (str) the type of linkage method to use for clustering
        :param num_reference_datasets: (int) the number of reference datasets to generate for calculating expected inertia
        :param max_number_of_clusters: (int) the maximum number of clusters to check for finding the optimal value;
                                       capped at the number of assets (columns of ``asset_returns``)
        :return: (int) the optimal number of clusters, between 1 and the (capped) maximum
        :raises ValueError: if there is no candidate cluster count to evaluate
        """

        # The clustered items are the assets (one row/column of the distance matrix each),
        # so asking for more clusters than assets can never succeed.
        largest_candidate = min(max_number_of_clusters, asset_returns.shape[1])
        if largest_candidate < 1:
            raise ValueError(
                "Need at least one asset and max_number_of_clusters >= 1 to pick a cluster count.")

        # NOTE(review): recent scikit-learn releases replace ``affinity`` with ``metric``;
        # confirm against the pinned version before changing this keyword.
        cluster_func = AgglomerativeClustering(affinity='precomputed',
                                               linkage=linkage)
        # Rounding guards against tiny negative values under the square root.
        original_distance_matrix = np.sqrt(2 * (1 - correlation).round(5))
        gap_values = []
        for num_clusters in range(1, largest_candidate + 1):
            cluster_func.n_clusters = num_clusters

            # Calculate expected inertia from reference datasets
            reference_inertias = []
            for _ in range(num_reference_datasets):

                # Generate reference returns from uniform distribution and calculate the distance matrix.
                reference_asset_returns = pd.DataFrame(
                    np.random.rand(*asset_returns.shape))
                reference_correlation = np.array(
                    reference_asset_returns.corr())
                reference_distance_matrix = np.sqrt(
                    2 * (1 - reference_correlation).round(5))

                reference_cluster_assignments = cluster_func.fit_predict(
                    reference_distance_matrix)
                reference_inertias.append(
                    self._compute_cluster_inertia(
                        reference_cluster_assignments,
                        reference_asset_returns.values))
            expected_inertia = np.mean(reference_inertias)

            # Calculate inertia from original data
            original_cluster_assignments = cluster_func.fit_predict(
                original_distance_matrix)
            inertia = self._compute_cluster_inertia(
                original_cluster_assignments, asset_returns.values)

            # Calculate the gap statistic
            gap_values.append(expected_inertia - inertia)

        # Candidates start at 1 while argmax is 0-based, hence the +1.
        return int(np.argmax(gap_values)) + 1
Example #2
0
def plot_agglomerative_algorithm():
    """Show agglomerative clustering merging a toy dataset step by step.

    Twelve ``make_blobs`` points are re-clustered with a shrinking cluster
    count, one subplot per step. Each cluster holding more than one point is
    outlined with a contour of a kernel density fitted to its members. The
    first panel is retitled "Initialization" and the figure is displayed
    with ``plt.show()``.
    """
    # Synthetic two-dimensional data.
    X, y = make_blobs(random_state=0, n_samples=12)
    n_points = X.shape[0]

    agg = AgglomerativeClustering(n_clusters=n_points,
                                  compute_full_tree=True).fit(X)

    fig, axes = plt.subplots(n_points // 5,
                             5,
                             subplot_kw={'xticks': (), 'yticks': ()},
                             figsize=(20, 8))

    # Pad the plotted area by half the overall standard deviation.
    margin = X.std() / 2
    x_min = X[:, 0].min() - margin
    x_max = X[:, 0].max() + margin
    y_min = X[:, 1].min() - margin
    y_max = X[:, 1].max() + margin

    # 100 x 100 evaluation grid for the density contours.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for step, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        # One cluster fewer at every step.
        agg.n_clusters = n_points - step
        agg.fit(X)

        ax.set_title("Step %d" % step)
        ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')

        labels = agg.labels_
        sizes = np.bincount(labels)
        for cluster in range(agg.n_clusters):
            if sizes[cluster] <= 1:
                continue  # singletons get no outline

            in_cluster = labels == cluster
            members = X[in_cluster]
            outsiders = X[~in_cluster]

            kde = KernelDensity(bandwidth=.5).fit(members)
            grid_scores = kde.score_samples(gridpoints)
            lowest_inside = kde.score_samples(members).min()
            highest_outside = kde.score_samples(outsiders).max()
            # The contour level is a weighted mix of the least dense member
            # score and the densest outsider score.
            level = .8 * lowest_inside + .2 * highest_outside
            ax.contour(xx,
                       yy,
                       grid_scores.reshape(xx.shape),
                       levels=[level],
                       colors='k',
                       linestyles='solid',
                       linewidths=2)

    axes[0, 0].set_title("Initialization")
    plt.show()
Example #3
0
def plot_agglomerative():
    """Overlay the merge levels of agglomerative clustering on the current axes.

    Twelve ``make_blobs`` points are scattered on ``plt.gca()`` and annotated
    with their index. The data are then re-clustered for eleven successive
    cluster counts (12 down to 2), and every cluster with more than one member
    is outlined by a kernel-density contour. The function only draws; it does
    not call ``plt.show()``.
    """
    X, y = make_blobs(random_state=0, n_samples=12)
    agg = AgglomerativeClustering(n_clusters=3)

    # Pad the plotted area by half the overall standard deviation.
    pad = X.std() / 2.
    x_min = X[:, 0].min() - pad
    x_max = X[:, 0].max() + pad
    y_min = X[:, 1].min() - pad
    y_max = X[:, 1].max() + pad

    # 100 x 100 evaluation grid for the density contours.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100),
                         np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    # Label every point with its row index, slightly to the right of the marker.
    for idx, point in enumerate(X):
        ax.text(point[0] + .1,
                point[1],
                "%d" % idx,
                horizontalalignment='left',
                verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=60, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    for step in range(11):
        agg.n_clusters = X.shape[0] - step
        agg.fit(X)

        labels = agg.labels_
        sizes = np.bincount(labels)
        # Only clusters with at least two members get an outline.
        to_outline = [c for c in range(agg.n_clusters) if sizes[c] > 1]
        for cluster in to_outline:
            members = X[labels == cluster]
            outsiders = X[labels != cluster]

            kde = KernelDensity(bandwidth=.5).fit(members)
            grid_scores = kde.score_samples(gridpoints)
            lowest_inside = kde.score_samples(members).min()
            highest_outside = kde.score_samples(outsiders).max()
            level = .8 * lowest_inside + .2 * highest_outside
            ax.contour(xx,
                       yy,
                       grid_scores.reshape(xx.shape),
                       levels=[level],
                       colors='k',
                       linestyles='solid',
                       linewidths=1)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def plot_agglomerative():
    """Overlay the merge levels of agglomerative clustering on the current axes.

    Sixteen points drawn from three blobs are scattered on ``plt.gca()`` and
    annotated with their index. The data are then re-clustered with the
    cluster count going from ``m`` down to 2, and every cluster with more
    than one member is outlined by a kernel-density contour. The function
    only draws; it neither shows nor returns the figure.
    """
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np

    m = 16  # number of sample points
    k = 3  # number of blob centres
    grid_size = 100  # resolution of the contour grid along each axis

    X, _ = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3, random_state=2255)
    # n_clusters is overwritten before every fit below.
    agg = AgglomerativeClustering(n_clusters=3)

    # Pad the plotted area by half the overall standard deviation.
    eps = X.std() / 2.

    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_size),
                         np.linspace(y_min, y_max, grid_size))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    # Label every point with its row index, slightly to the right of the marker.
    for i, x in enumerate(X):
        ax.text(x[0] + .1, x[1], "%d" % i, horizontalalignment='left', verticalalignment='center')

    ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')
    ax.set_xticks(())
    ax.set_yticks(())

    for i in range(m - 1):
        agg.n_clusters = X.shape[0] - i
        agg.fit(X)

        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] <= 1:
                continue  # singletons get no outline

            points = X[agg.labels_ == cluster]
            other_points = X[agg.labels_ != cluster]

            kde = KernelDensity(bandwidth=0.9).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            # The contour level is a weighted mix of the least dense member
            # score and the densest outsider score.
            levels = .80 * score_inside + .20 * score_outside
            ax.contour(xx, yy, scores.reshape(xx.shape), levels=[levels],
                       colors='k', linestyles='solid', linewidths=0.8)

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
Example #5
0
def plot_agglomerative_algorithm():
    """Show agglomerative clustering merging a toy dataset step by step.

    Sixteen points drawn from three blobs are re-clustered with a shrinking
    cluster count, one subplot per step on a grid of ``m // 5`` rows by 5
    columns. Each cluster holding more than one point is outlined with a
    contour of a kernel density fitted to its members. The first panel is
    retitled "Initialization". The function only draws; it does not call
    ``plt.show()``.
    """
    from sklearn.datasets import make_blobs
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.neighbors import KernelDensity
    import matplotlib.pyplot as plt
    import numpy as np

    m = 16  # number of sample points
    k = 3  # number of blob centres
    grid_size = 100  # resolution of the contour grid along each axis

    X, _ = make_blobs(n_samples=m, n_features=2, centers=k, cluster_std=1.3,
                      random_state=2255)
    agg = AgglomerativeClustering(n_clusters=X.shape[0], compute_full_tree=True).fit(X)

    _, axes = plt.subplots(X.shape[0] // 5, 5,
                           subplot_kw={'xticks': (), 'yticks': ()},
                           figsize=(20, 8))

    # Pad the plotted area by a fraction of the overall standard deviation.
    eps = X.std() / 1.7

    x_min, x_max = X[:, 0].min() - eps, X[:, 0].max() + eps
    y_min, y_max = X[:, 1].min() - eps, X[:, 1].max() + eps

    xx, yy = np.meshgrid(np.linspace(x_min, x_max, grid_size),
                         np.linspace(y_min, y_max, grid_size))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    for i, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        # One cluster fewer at every step.
        agg.n_clusters = X.shape[0] - i
        agg.fit(X)
        ax.set_title("Step %d" % i)
        ax.scatter(X[:, 0], X[:, 1], s=20, c='grey')
        bins = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            if bins[cluster] <= 1:
                continue  # singletons get no outline

            points = X[agg.labels_ == cluster]
            other_points = X[agg.labels_ != cluster]

            kde = KernelDensity(bandwidth=.3).fit(points)
            scores = kde.score_samples(gridpoints)
            score_inside = np.min(kde.score_samples(points))
            score_outside = np.max(kde.score_samples(other_points))
            # The contour level is a weighted mix of the least dense member
            # score and the densest outsider score.
            levels = .745 * score_inside + .255 * score_outside
            ax.contour(xx, yy, scores.reshape(xx.shape), levels=[levels],
                       colors='k', linestyles='solid', linewidths=1)

    axes[0, 0].set_title("Initialization")
def plot_agglomerative():
    """Overlay the merge levels of agglomerative clustering on the current axes.

    Twelve ``make_blobs`` points are scattered on ``plt.gca()`` and annotated
    with their index. The data are then re-clustered for eleven successive
    cluster counts (12 down to 2), and every cluster with more than one member
    is outlined by a kernel-density contour. The function only draws; it does
    not call ``plt.show()``.
    """
    X, y = make_blobs(random_state=0, n_samples=12)
    agg = AgglomerativeClustering(n_clusters=3)

    # Pad the plotted area by half the overall standard deviation.
    pad = X.std() / 2.0
    xs, ys = X[:, 0], X[:, 1]
    x_min, x_max = xs.min() - pad, xs.max() + pad
    y_min, y_max = ys.min() - pad, ys.max() + pad

    # 100 x 100 evaluation grid for the density contours.
    x_ticks = np.linspace(x_min, x_max, 100)
    y_ticks = np.linspace(y_min, y_max, 100)
    xx, yy = np.meshgrid(x_ticks, y_ticks)
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    ax = plt.gca()
    # Label every point with its row index, slightly to the right of the marker.
    for idx, point in enumerate(X):
        ax.text(point[0] + 0.1, point[1], "%d" % idx, horizontalalignment="left", verticalalignment="center")

    ax.scatter(xs, ys, s=60, c="grey")
    ax.set_xticks(())
    ax.set_yticks(())

    for step in range(11):
        agg.n_clusters = X.shape[0] - step
        agg.fit(X)

        labels = agg.labels_
        sizes = np.bincount(labels)
        for cluster in range(agg.n_clusters):
            if sizes[cluster] <= 1:
                continue  # singletons get no outline

            in_cluster = labels == cluster
            members = X[in_cluster]
            outsiders = X[~in_cluster]

            kde = KernelDensity(bandwidth=0.5).fit(members)
            grid_scores = kde.score_samples(gridpoints)
            lowest_inside = kde.score_samples(members).min()
            highest_outside = kde.score_samples(outsiders).max()
            level = 0.8 * lowest_inside + 0.2 * highest_outside
            ax.contour(
                xx, yy, grid_scores.reshape(xx.shape), levels=[level], colors="k", linestyles="solid", linewidths=1
            )

    ax.set_xlim(x_min, x_max)
    ax.set_ylim(y_min, y_max)
def plot_agglomerative_algorithm():
    """Show agglomerative clustering merging a toy dataset step by step.

    Twelve ``make_blobs`` points are re-clustered with a shrinking cluster
    count, one subplot per step. Each cluster holding more than one point is
    outlined with a contour of a kernel density fitted to its members. The
    first panel is retitled "Initialization". The function only draws; it
    does not call ``plt.show()``.
    """
    # Synthetic two-dimensional data.
    X, y = make_blobs(random_state=0, n_samples=12)
    total = X.shape[0]

    agg = AgglomerativeClustering(n_clusters=total, compute_full_tree=True).fit(X)

    panel_kw = {"xticks": (), "yticks": ()}
    fig, axes = plt.subplots(total // 5, 5, subplot_kw=panel_kw, figsize=(20, 8))

    # Pad the plotted area by half the overall standard deviation.
    pad = X.std() / 2
    x_min, x_max = X[:, 0].min() - pad, X[:, 0].max() + pad
    y_min, y_max = X[:, 1].min() - pad, X[:, 1].max() + pad

    # 100 x 100 evaluation grid for the density contours.
    xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
    gridpoints = np.c_[xx.ravel(), yy.ravel()]

    def outline(ax, cluster):
        """Draw the density contour of one cluster of the current fit on ``ax``."""
        members = X[agg.labels_ == cluster]
        outsiders = X[agg.labels_ != cluster]

        kde = KernelDensity(bandwidth=0.5).fit(members)
        grid_scores = kde.score_samples(gridpoints)
        lowest_inside = np.min(kde.score_samples(members))
        highest_outside = np.max(kde.score_samples(outsiders))
        level = 0.8 * lowest_inside + 0.2 * highest_outside
        ax.contour(
            xx, yy, grid_scores.reshape(xx.shape), levels=[level], colors="k", linestyles="solid", linewidths=2
        )

    for step, ax in enumerate(axes.ravel()):
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)

        # One cluster fewer at every step.
        agg.n_clusters = total - step
        agg.fit(X)

        ax.set_title("Step %d" % step)
        ax.scatter(X[:, 0], X[:, 1], s=60, c="grey")

        sizes = np.bincount(agg.labels_)
        for cluster in range(agg.n_clusters):
            # Only clusters with at least two members get an outline.
            if sizes[cluster] > 1:
                outline(ax, cluster)

    axes[0, 0].set_title("Initialization")