Example #1
def test_kmeans_fit():
    dataset = datasets.load_iris()
    scaler = StandardScaler()
    x = scaler.fit_transform(dataset.data)

    clf = KMeans(3)
    clf.fit(x)
    assert len(clf.centers) == 3
    assert clf.n_clusters == 3
    assert np.array(clf.centers).shape == (3, 4)
    assert (set(clf.centers[0]) != set(clf.centers[1]) != set(clf.centers[2])
            != set(clf.centers[0]))
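The tests on this page (test_kmeans_fit here, test_kmean_predict in Example #4, test_kmeans_init in Example #6) exercise a custom KMeans class rather than sklearn.cluster.KMeans. As a point of reference, here is a minimal sketch of a class with the interface the tests assume; only the attribute names (n_clusters, centers, clusters) come from the tests, while the Lloyd's-algorithm details and random initialization are assumptions:

import numpy as np

class KMeans:
    def __init__(self, n_clusters):
        self.n_clusters = n_clusters
        self.centers = []                                 # empty until fit()
        self.clusters = [[] for _ in range(n_clusters)]   # point indices per cluster

    def fit(self, x, max_iter=300):
        # pick n_clusters distinct points as initial centers
        idx = np.random.choice(len(x), self.n_clusters, replace=False)
        self.centers = x[idx]
        for _ in range(max_iter):
            labels = self.predict(x)
            # note: empty-cluster handling is omitted in this sketch
            new_centers = np.array([x[labels == k].mean(axis=0)
                                    for k in range(self.n_clusters)])
            if np.allclose(new_centers, self.centers):
                break
            self.centers = new_centers
        labels = self.predict(x)
        self.clusters = [np.where(labels == k)[0].tolist()
                         for k in range(self.n_clusters)]
        return self

    def predict(self, x):
        # index of the nearest center for each row of x
        dists = np.linalg.norm(
            np.asarray(x)[:, None, :] - np.asarray(self.centers)[None, :, :],
            axis=2)
        return dists.argmin(axis=1)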
Example #2
def main():

    inputs = [[-14, -5], [13, 13], [20, 23], [-19, -11], [-9, -16], [21, 27],
              [-49, 15], [26, 13], [-46, 5], [-34, -1], [11, 15], [-49, 0],
              [-22, -16], [19, 28], [-12, -8], [-13, -19], [-41, 8], [-11, -6],
              [-25, -9], [-18, -3]]

    random.seed(0)
    clusterer = KMeans(3)
    clusterer.train(inputs)
    print("3-means:")
    print(clusterer.means)
    print()

    plot_squared_clustering_errors(inputs)
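plot_squared_clustering_errors is not defined in this example. A hedged sketch of what such a helper typically does, the "elbow" plot of total squared error against k; it assumes the same train()/means interface used above, plus a classify() method returning the index of the nearest mean (an assumption):

import matplotlib.pyplot as plt

def plot_squared_clustering_errors(inputs, max_k=10):
    ks = range(1, max_k + 1)
    errors = []
    for k in ks:
        clusterer = KMeans(k)
        clusterer.train(inputs)
        # total squared distance from each point to its assigned mean
        errors.append(sum(
            sum((p_i - m_i) ** 2
                for p_i, m_i in zip(point,
                                    clusterer.means[clusterer.classify(point)]))
            for point in inputs))
    plt.plot(list(ks), errors)
    plt.xticks(list(ks))
    plt.xlabel("k")
    plt.ylabel("total squared error")
    plt.show()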
Example #3
def run_kmeans(data_path):
    img = imread(data_path)
    img2 = img.reshape(img.shape[0] * img.shape[1], img.shape[2])

    # Cluster for k = 2, 3, ..., 16 and save selected images to see the differences
    for cluster_count in range(2, 17):
        km = KMeans(cluster_count)
        V, cmap = km.fit(img2)

        if cluster_count in [2, 4, 6, 8, 16]:
            save_image(
                V,
                cmap,
                img.shape,
                "result/kmeans_{0}_clusters.png".format(cluster_count),
            )
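save_image is likewise not shown. A plausible sketch, assuming km.fit returns per-pixel cluster indices V and a cluster-center colour map cmap (the names, shapes, and the 0-255 colour range are all assumptions):

import numpy as np
from matplotlib.pyplot import imsave

def save_image(V, cmap, shape, path):
    # replace each pixel's cluster index with its cluster-center colour,
    # restore the original (height, width, channels) shape, and write to disk
    quantized = np.asarray(cmap)[np.asarray(V, dtype=int)].reshape(shape)
    imsave(path, quantized.astype(np.uint8))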
Example #4
def test_kmean_predict():
    np.random.seed(42)
    dataset = datasets.load_iris()
    scaler = StandardScaler()
    x = scaler.fit_transform(dataset.data)
    y = dataset.target
    clf = KMeans(3)
    clf.fit(x)
    centers = clf.centers
    y2_pred = clf.predict(x[20:30])
    assert len(x[20:30]) == len(y2_pred)
    y_pred = clf.predict(x)
    assert set(y_pred) == set(y)
    assert clf.n_clusters == 3
    assert np.array_equiv(clf.centers, centers)

    for cluster in range(clf.n_clusters):
        for point_n in np.random.choice(clf.clusters[cluster],
                                        size=min(20,
                                                 len(clf.clusters[cluster])),
                                        replace=False):
            point = x[point_n]
            dist_internal = np.linalg.norm(clf.centers[cluster] - point)
            dist_external = min(
                np.linalg.norm(clf.centers[(cluster + 1) % 3] - point),
                np.linalg.norm(clf.centers[(cluster + 2) % 3] - point))
            assert dist_internal <= dist_external
Example #5
def main():
	df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data', header=None)
	y = df.iloc[:, 4].values
	y[y == "Iris-setosa"] = 0
	y[y == "Iris-versicolor"] = 1
	y[y == "Iris-virginica"] = 2
	x = df.iloc[:, [0,2]].values

	x = np.array(x)
	y = np.array(y)

	x[:, 0] = (x[:, 0] - x[:, 0].min())/(x[:, 0].max() - x[:, 0].min())
	x[:, 1] = (x[:, 1] - x[:, 1].min())/(x[:, 1].max() - x[:, 1].min())

	clf = KMeans.KMeans(n_clusters=3)
	clf.fit(x)
	
	x1_min, x1_max = x[:, 0].min(), x[:, 0].max()
	x2_min, x2_max = x[:, 1].min(), x[:, 1].max()

	x1_range = np.arange(x1_min, x1_max, 0.02)
	x2_range = np.arange(x2_min, x2_max, 0.02)
	xx, yy = np.meshgrid(x1_range, x2_range)
	x_ = np.array([xx.ravel(), yy.ravel()]).T

	z = clf.predict(x_)
	z = z.reshape(xx.shape)
	plt.contourf(xx, yy, z)

	colors = ['g', 'm', 'y']
	for idx in range(len(np.unique(y))):
		plt.scatter(x[:, 0][y == idx], x[:, 1][y == idx], c=colors[idx], s=10, edgecolors='k', label='class {}'.format(idx))

	for k in clf.centroids:
		plt.scatter(clf.centroids[k][0], clf.centroids[k][1], marker='x', s=100, c='b')
		print(clf.centroids[k])

	plt.title('K-Means')
	plt.xlabel('sepal length [normalized]')
	plt.ylabel('petal length [normalized]')
	plt.legend(loc='upper left')
	plt.show()
Example #6
def test_kmeans_init():
    clf = KMeans(3)
    assert clf.n_clusters == 3
    assert len(clf.clusters) == 3
    assert clf.centers == []
Example #7
def kmeans_comparison_credita():
    X, y = datasets.load_credita()
    results = []

    # PCA --------------------------
    # transform dataset with our PCA into 2 components
    # for better visualization in 2 dimensions
    pca = PCA(2, verbose=True)
    X_pca_2d = pca.fit_transform(X)

    # the last 2 components explain significantly less variance than the
    # rest, so they carry little information and can be truncated safely
    k = X.shape[1] - 2
    pca2 = PCA(k, verbose=True)
    X_pca = pca2.fit_transform(X)

    # run KMeans on original dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    # run KMeans on reduced dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X_pca)
    results.append(('KMeans-PCA', y, y_pred))


    # t-SNE ------------------------
    # transform dataset with t-SNE
    best_tsne = TSNE(2)
    tsne = best_tsne

    # run several times and keep the best result
    for _ in range(10):
        res = tsne.fit_transform(X)
        
        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_tsne = res

        tsne = TSNE(2)

    # run KMeans on reduced dataset
    kmeans = KMeans(k=2, n_init=50)
    y_pred = kmeans.fit_predict(X_tsne)
    results.append(('KMeans-TSNE', y, y_pred))
    
    print('\n\n')
    print_binary_metrics(X, results)


    # plot
    fig, ax = plt.subplots(2, 3, figsize=(15, 10))
    plt.subplots_adjust(bottom=.10, left=.05, top=.90, right=.95)

    # PCA --------------------------
    ax[0, 0].scatter(X_pca_2d[y == 0, 0], X_pca_2d[y == 0, 1])
    ax[0, 0].scatter(X_pca_2d[y == 1, 0], X_pca_2d[y == 1, 1])
    ax[0, 0].title.set_text('Credit-A PCA')

    y_pred = results[0][2]
    ax[0, 1].scatter(X_pca_2d[y_pred == 0, 0], X_pca_2d[y_pred == 0, 1])
    ax[0, 1].scatter(X_pca_2d[y_pred == 1, 0], X_pca_2d[y_pred == 1, 1])
    ax[0, 1].title.set_text('KMeans on original dataset')

    y_pred = 1 - results[1][2]  # we'll invert labels for visualization purposes
    ax[0, 2].scatter(X_pca_2d[y_pred == 0, 0], X_pca_2d[y_pred == 0, 1])
    ax[0, 2].scatter(X_pca_2d[y_pred == 1, 0], X_pca_2d[y_pred == 1, 1])
    ax[0, 2].title.set_text('KMeans on dataset PCA')


    # t-SNE -------------------------
    ax[1, 0].scatter(X_tsne[y == 0, 0], X_tsne[y == 0, 1])
    ax[1, 0].scatter(X_tsne[y == 1, 0], X_tsne[y == 1, 1])
    ax[1, 0].title.set_text('Credit-A t-SNE')

    y_pred = results[0][2]
    ax[1, 1].scatter(X_tsne[y_pred == 0, 0], X_tsne[y_pred == 0, 1])
    ax[1, 1].scatter(X_tsne[y_pred == 1, 0], X_tsne[y_pred == 1, 1])
    ax[1, 1].title.set_text('KMeans on original dataset')

    y_pred = results[2][2]
    ax[1, 2].scatter(X_tsne[y_pred == 0, 0], X_tsne[y_pred == 0, 1])
    ax[1, 2].scatter(X_tsne[y_pred == 1, 0], X_tsne[y_pred == 1, 1])
    ax[1, 2].title.set_text('KMeans on dataset t-SNE')

    plt.show()
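print_binary_metrics (and print_multi_metrics in the next examples) are project helpers that are not shown. A hedged sketch of what such a reporter might print, using standard scikit-learn metrics; the helper name comes from the example above, but the exact metric choice is an assumption:

from sklearn import metrics

def print_binary_metrics(X, results):
    for name, y_true, y_pred in results:
        # one external and one internal validation measure per run
        print('{}: ARI={:.3f}, silhouette={:.3f}'.format(
            name,
            metrics.adjusted_rand_score(y_true, y_pred),
            metrics.silhouette_score(X, y_pred)))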
Example #8
def kmeans_comparison_satimage():

    results = []
    X, y = datasets.load_satimage()
    y = y.values.reshape(-1)

    pca = PCA(2, verbose=True)
    X_trans = pca.fit_transform(X)

    kmeans = KMeans(k=6, n_init=20)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    kmeans = KMeans(k=6, n_init=20)
    y_pred_PCA = kmeans.fit_predict(X_trans)
    results.append(('KMeans-PCA', y, y_pred_PCA))

    # t-SNE ------------------------
    # transform dataset with t-SNE
    best_tsne = TSNE(2)
    tsne = best_tsne
    # run several times and keep the best result
    for _ in range(3):
        res = tsne.fit_transform(X)
        
        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_trans2 = res

        tsne = TSNE(2)
    # run KMeans on reduced dataset
    kmeans = KMeans(k=6, n_init=20)
    y_pred_tSNE = kmeans.fit_predict(X_trans2)
    results.append(('KMeans-TSNE', y, y_pred_tSNE))

    print(results)
    print('\n\n')
    print_multi_metrics(X, results)

    # K-means on original data (PCA view)
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means cluster histogram for SatImage')
    ax.hist(y_pred, zorder=3)
    ax.set_xlabel('Cluster', fontsize=13)
    ax.set_ylabel('Number of instances', fontsize=13)
    ax.grid(zorder=0)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on original SatImage dataset')
    for i in np.unique(y_pred):
        ax.scatter(X_trans[y_pred == i, 0], X_trans[y_pred == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()

    # K-means on PCA-reduced data
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means cluster histogram for SatImage (PCA)')
    ax.hist(y_pred_PCA, zorder=3)
    ax.set_xlabel('Cluster', fontsize=13)
    ax.set_ylabel('Number of instances', fontsize=13)
    ax.grid(zorder=0)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on PCA SatImage')
    for i in np.unique(y_pred_PCA):
        ax.scatter(X_trans[y_pred_PCA == i, 0], X_trans[y_pred_PCA == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()


    #t-SNE figures
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('K-means on t-SNE-reduced SatImage')
    for i in np.unique(y_pred_tSNE):
        ax.scatter(X_trans2[y_pred_tSNE == i, 0], X_trans2[y_pred_tSNE == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('K-means on original SatImage dataset')
    for i in np.unique(y_pred):
        ax.scatter(X_trans2[y_pred == i, 0], X_trans2[y_pred == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()

    # Ground truth figures
    fig = plt.figure(figsize=(15, 5))
    ax = fig.add_subplot(1, 2, 1)
    ax.set_title('SatImage PCA Ground Truth')
    for i in np.unique(y):
        ax.scatter(X_trans[y == i, 0], X_trans[y == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    ax = fig.add_subplot(1, 2, 2)
    ax.set_title('SatImage t-SNE Ground Truth')
    for i in np.unique(y):
        ax.scatter(X_trans2[y == i, 0], X_trans2[y == i, 1], alpha=.5, color='C' + str(i + 1), label="cluster " + str(i))
    ax.set_xlabel('X', fontsize=13)
    ax.set_ylabel('Y', fontsize=13)
    plt.show()
Example #9
def kmeans_comparison_kropt():
    X, y = datasets.load_kropt()
    results = []
    # from sklearn.metrics.cluster import davies_bouldin_score
    # print(davies_bouldin_score(X, y))

    # Apply custom PCA for dimension reduction
    pca = PCA(n_components=2, verbose=True)
    X_trans = pca.fit_transform(X)

    # Apply K-Means to original data
    kmeans = KMeans(k=18, n_init=50)
    y_pred = kmeans.fit_predict(X)
    results.append(('KMeans', y, y_pred))

    # Apply K-Means to transformed data
    kmeans = KMeans(k=18, n_init=50)
    y_pred_PCA = kmeans.fit_predict(X_trans)
    results.append(('KMeans-PCA', y, y_pred_PCA))

    # t-SNE ------------------------
    # transform dataset with t-SNE
    best_tsne = TSNE(2)
    tsne = best_tsne

    # run several times and keep the best result
    for _ in range(1):  # t-SNE takes too long on a big dataset like Kropt, so run it only once
        res = tsne.fit_transform(X)

        if tsne.kl_divergence_ <= best_tsne.kl_divergence_:
            best_tsne = tsne
            X_trans2 = res

        tsne = TSNE(2)

    # run KMeans on reduced dataset
    kmeans = KMeans(k=18, n_init=50)
    y_pred_tsne = kmeans.fit_predict(X_trans2)
    results.append(('KMeans-TSNE', y, y_pred_tsne))

    print('\n\n')
    print_multi_metrics(X, results)

    # Results Visualization

    total_colors = mcolors.cnames
    selected_colors = random.sample(list(total_colors), k=18)

    fig, ax = plt.subplots(2, 3, figsize=(30, 10))

    # PCA result visualization

    cvec = [selected_colors[label] for label in y]

    ax[0, 0].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 0].title.set_text('Kropt Ground Truth - Visual PCA')
    ax[0, 0].set_xlabel('PC1')
    ax[0, 0].set_ylabel('PC2')

    cvec = [selected_colors[label] for label in y_pred]

    ax[0, 1].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 1].title.set_text('K-Means Clustering - Visual PCA')
    ax[0, 1].set_xlabel('PC1')
    ax[0, 1].set_ylabel('PC2')

    cvec = [selected_colors[label] for label in y_pred_PCA]

    ax[0, 2].scatter(X_trans[:, 0], X_trans[:, 1], c=cvec, alpha=0.5)
    ax[0, 2].title.set_text('K-Means PCA Kropt')
    ax[0, 2].set_xlabel('PC1')
    ax[0, 2].set_ylabel('PC2')

    # t-SNE result visualization

    cvec = [selected_colors[label] for label in y]

    ax[1, 0].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 0].title.set_text('Kropt Ground Truth - Visual t-SNE')
    ax[1, 0].set_xlabel('t-SNE 1')
    ax[1, 0].set_ylabel('t-SNE 2')

    cvec = [selected_colors[label] for label in y_pred]

    ax[1, 1].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 1].title.set_text('K-Means Clustering - Visual t-SNE')
    ax[1, 1].set_xlabel('t-SNE 1')
    ax[1, 1].set_ylabel('t-SNE 2')

    cvec = [selected_colors[label] for label in y_pred_tsne]

    ax[1, 2].scatter(X_trans2[:, 0], X_trans2[:, 1], c=cvec, alpha=0.5)
    ax[1, 2].title.set_text('K-Means t-SNE Kropt')
    ax[1, 2].set_xlabel('t-SNE 1')
    ax[1, 2].set_ylabel('t-SNE 2')

    plt.show()
Example #10
    def train_model(self,
                    data_train,
                    labels_train=None,
                    data_test=None,
                    labels_test=None,
                    verbose=1,
                    clustering_loss='kld',
                    decoder_loss='mse',
                    clustering_loss_weight=0.5,
                    hardening_order=1,
                    hardening_strength=2.0,
                    compiled=False,
                    optimizer='adam',
                    lr=0.001,
                    decay=0.0):
        """Train DCE Model:

            If labels_train is not present, train the DCE model in an
        unsupervised learning process; otherwise, train it in a supervised
        learning process.

        Args:
            data_train: input training data
            labels_train: true labels of training data
            data_test: input test data
            labels_test: true labels of test data
            verbose: 0 turns off the screen prints
            clustering_loss: string, clustering layer loss function
            decoder_loss: string, decoder loss function
            clustering_loss_weight: float in [0, 1], the weight w_c
            hardening_order: odd int, the order of the hardening function
            hardening_strength: float >= 1.0, the strength of the hardening
            compiled: boolean, indicating if the model is already compiled
            optimizer: string, Keras optimizer name
            lr: learning rate
            decay: learning rate decay

        Returns:
            train_loss: training loss
            test_loss: returned as well if data_test and labels_test are
                       given in a supervised learning process
        """
        if (not compiled):
            assert clustering_loss_weight <= 1 and clustering_loss_weight >= 0

            if optimizer == 'adam':
                dce_optimizer = optimizers.Adam(lr=lr, decay=decay)
            elif optimizer == 'sgd':
                dce_optimizer = optimizers.SGD(lr=lr, decay=decay)
            else:
                raise Exception('Input optimizer was not found')

            self.model.compile(loss={
                'clustering': clustering_loss,
                'decoder_output': decoder_loss
            },
                               loss_weights=[
                                   clustering_loss_weight,
                                   1 - clustering_loss_weight
                               ],
                               optimizer=dce_optimizer)

        if (labels_train is not None):
            supervised_learning = True
            if verbose >= 1: print('Starting supervised learning')
        else:
            supervised_learning = False
            if verbose >= 1: print('Starting unsupervised learning')

        # initialize the model using sklearn KMeans as the initial guess
        kmeans_init = KMeans(n_clusters=self.n_clusters)
        kmeans_init.build_model()
        encoder  = Model(inputs=self.model.input,
                         outputs=self.model.get_layer(\
                         name='embedding_layer').output)
        kmeans_init.model.fit(encoder.predict(data_train))
        y_pred_last = kmeans_init.model.labels_
        self.model.get_layer(name='clustering').\
            set_weights([kmeans_init.model.cluster_centers_])

        # prepare training: method for the target distribution p
        if not supervised_learning:
            # Unsupervised Learning
            assert hardening_order in DCE.HARDENING_FUNCS.keys()
            assert hardening_strength >= 1.0
            h_func = DCE.HARDENING_FUNCS[hardening_order]
        else:
            # Supervised Learning
            assert len(labels_train) == len(data_train)
            assert len(np.unique(labels_train)) == self.n_clusters
            p = np.zeros(shape=(len(labels_train), self.n_clusters))
            for i in range(len(labels_train)):
                p[i][labels_train[i]] = 1.0

            if data_test is not None:
                assert len(labels_test) == len(data_test)
                assert len(np.unique(labels_test)) == self.n_clusters
                p_test = np.zeros(shape=(len(labels_test), self.n_clusters))
                for i in range(len(labels_test)):
                    p_test[i][labels_test[i]] = 1.0

                validation_loss = []

        # training start:
        loss = []

        for iteration in range(int(self.max_iteration)):

            if iteration % self.update_interval == 0:
                # updating p for unsupervised learning process
                q, _ = self.model.predict(data_train)
                if not supervised_learning:
                    p = DCE.hardening(q, h_func, hardening_strength)

                # compute the fraction of labels that changed since the last update
                y_pred = q.argmax(1)
                delta_label_i = np.sum(y_pred != y_pred_last).\
                    astype(np.float32) / y_pred.shape[0]
                y_pred_last = y_pred

                # check convergence
                if iteration > 0 and delta_label_i < self.clustering_tol:
                    print(
                        str(delta_label_i) + ' < ' + str(self.clustering_tol))
                    print('Reached tolerance threshold. Stopping training.')
                    break

            loss.append(
                self.model.train_on_batch(x=data_train, y=[p, data_train]))
            if supervised_learning and data_test is not None:
                validation_loss.append(
                    self.model.test_on_batch(x=data_test,
                                             y=[p_test, data_test]))

            if verbose > 0 and iteration % self.update_interval == 0:
                print('Epoch: ' + str(iteration))
                if verbose == 1:
                    print('  Total_loss = ' + str(loss[iteration][0]) +
                          '; Delta_label = ' + str(delta_label_i))
                    print('  Clustering_loss = ' + str(loss[iteration][1]) +
                          '; Decoder_loss = ' + str(loss[iteration][2]))

        if iteration == self.max_iteration - 1:
            print('Reached maximum iteration. Stopping training.')

        if data_test is None:
            return np.array(loss).T
        else:
            return [np.array(loss).T, np.array(validation_loss).T]
Example #11
def plot(cluster):
    x_axis = numpy.arange(cluster.borders[0][0] - 1, cluster.borders[0][1] + 1,
                          0.1)  # boundaries of x
    y_axis = numpy.arange(cluster.borders[1][0] - 1, cluster.borders[1][1] + 1,
                          0.1)  # boundaries of y
    x_axis, y_axis = numpy.meshgrid(x_axis, y_axis)

    z = numpy.array([
        cluster.predict([x, y]) for x, y in numpy.c_[x_axis.ravel(),
                                                     y_axis.ravel()]
    ])

    plt.pcolormesh(x_axis, y_axis, z.reshape(x_axis.shape), cmap=plt.cm.Paired)
    plt.show()


if __name__ == '__main__':
    data = load("Cluster.csv")

    pca = PCA(n_components=2)
    data = pca.fit_transform(data)

    results = {}
    for n in [1, 5, 10, 20]:
        kmeans = KMeans(n_clusters=n).fit(data)
        print("k:", n, "\terror:", kmeans.cluster_error_)

        results[kmeans] = kmeans.cluster_error_

    plot(min(results, key=results.get))