コード例 #1
0
ファイル: test_kmeans.py プロジェクト: jborchma/dask-ml
    def test_basic(self, Xl_blobs_easy):
        """Fit DKKMeans and SKKMeans on the same easy blobs and compare them.

        Compares the fitted attributes, inertia, cluster centers, labels,
        ``transform`` output and ``predict`` output of the two estimators.
        Cluster numbering is arbitrary, so centers and labels are aligned by
        sorting on the first coordinate of the centers before comparing.
        """
        X, _ = Xl_blobs_easy

        # make it super easy to cluster
        a = DKKMeans(n_clusters=3, random_state=0)
        b = SKKMeans(n_clusters=3, random_state=0)
        a.fit(X)
        b.fit(X)
        assert_estimator_equal(
            a,
            b,
            exclude=["n_iter_", "inertia_", "cluster_centers_", "labels_"])
        assert abs(a.inertia_ - b.inertia_) < 0.01
        # order is arbitrary, so align first
        a_order = np.argsort(a.cluster_centers_, 0)[:, 0]
        b_order = np.argsort(b.cluster_centers_, 0)[:, 0]
        a_centers = a.cluster_centers_[a_order]
        b_centers = b.cluster_centers_[b_order]
        np.testing.assert_allclose(a_centers, b_centers, rtol=1e-3)
        # b_order[r] is the b-label sitting at rank r, so its inverse
        # permutation gives the rank of every b-label. Indexing a_order with
        # that rank yields the matching a-label. (Indexing with b_order
        # itself is only correct when b_order happens to be its own inverse.)
        b_rank = np.argsort(b_order)
        b_labels = replace(b.labels_, [0, 1, 2],
                           a_order[b_rank]).astype(b.labels_.dtype)
        assert_eq(a.labels_.compute(), b_labels)
        assert a.n_iter_
        # this is hacky: overwrite the centers with the rank-sorted ones so
        # that transform/predict of both estimators use the same ordering
        b.cluster_centers_ = b_centers
        a.cluster_centers_ = a_centers
        assert_eq(a.transform(X), b.transform(X), rtol=1e-3)

        yhat_a = a.predict(X)
        yhat_b = b.predict(X)
        assert_eq(yhat_a.compute(), yhat_b)
コード例 #2
0
def test_k_means(n_clusters, n_samples, n_features):
    """Compare the local KMeans implementation against SKKMeans.

    Both models start from the same random initial centers and are fitted on
    the same random dataset; the test passes when the summed squared
    distance between their column-sorted centers is below ACCEPTABLE_ERROR.

    Args:
        n_clusters: number of clusters to fit.
        n_samples: number of rows in the generated dataset.
        n_features: number of columns in the generated dataset.
    """
    # Initialize the models
    cluster_centers = np.random.rand(n_clusters, n_features)

    model = KMeans(n_clusters, n_features)
    # Keyword arguments: recent scikit-learn releases accept only n_clusters
    # positionally (assumes SKKMeans is sklearn.cluster.KMeans - TODO confirm).
    sk_model = SKKMeans(n_clusters=n_clusters, init=cluster_centers,
                        n_init=N_INIT, max_iter=N_ITER)

    # Random dataset
    X = np.random.rand(n_samples, n_features)
    if n_clusters == 2:
        X[n_samples // 2:, :] += 0.5
    else:
        # Shift the middle third up and the last third down. The original
        # applied both shifts to the middle third, which cancelled out and
        # left the data unstructured.
        X[n_samples // 3:2 * n_samples // 3, :] += 0.5
        X[2 * n_samples // 3:, :] -= 0.5

    # Fit the models
    model.fit(X, N_ITER, N_INIT, cluster_centers)
    sk_model.fit(X)

    # get cluster-centers and sort them (each column is sorted independently)
    centers = np.sort(model.cluster_centers, axis=0)
    sk_centers = np.sort(sk_model.cluster_centers_, axis=0)

    # Compare between the centers
    assert centers.shape == sk_centers.shape
    # Sum of squared row norms == sum of all squared element differences.
    diff = np.sum((centers - sk_centers) ** 2)
    assert diff < ACCEPTABLE_ERROR
コード例 #3
0
ファイル: test_kmeans.py プロジェクト: jborchma/dask-ml
 def test_random_init(self, Xl_blobs_easy):
     """With init="random" and the same integer seed, both inertias agree."""
     dask_X, _ = Xl_blobs_easy
     seed = 0

     dask_km = DKKMeans(3, init="random", random_state=seed)
     dask_km.fit(dask_X)

     # SKKMeans is fitted on the materialised (computed) array.
     sk_km = SKKMeans(3, init="random", random_state=seed, n_init=1)
     sk_km.fit(dask_X.compute())

     inertia_gap = abs(dask_km.inertia_ - sk_km.inertia_)
     assert inertia_gap < 1e-4
     assert dask_km.init == "random"
コード例 #4
0
ファイル: test_kmeans.py プロジェクト: jborchma/dask-ml
 def test_kmeanspp_init(self, Xl_blobs_easy):
     """With init="k-means++", DKKMeans and SKKMeans reach the same inertia."""
     dask_X, _ = Xl_blobs_easy
     numpy_X = dask_X.compute()

     # A single RandomState instance is handed to both estimators, so the
     # order of the two fit() calls below is kept as-is.
     shared_rng = np.random.RandomState(0)
     dask_km = DKKMeans(3, init="k-means++", random_state=shared_rng)
     sk_km = SKKMeans(3, init="k-means++", random_state=shared_rng)

     dask_km.fit(dask_X)
     sk_km.fit(numpy_X)

     inertia_gap = abs(dask_km.inertia_ - sk_km.inertia_)
     assert inertia_gap < 1e-4
     assert dask_km.init == "k-means++"
コード例 #5
0
 def test_fit_given_init(self, X_blobs):
     """Both estimators, started from the same explicit centers, agree on inertia."""
     dense = X_blobs.compute()
     rng = np.random.RandomState(0)

     # Build the initial centers once and pass the same array to both models.
     sq_norms = k_means_.row_norms(dense, squared=True)
     start_centers = k_means_._k_init(dense, 3, sq_norms, rng)

     dask_km = DKKMeans(3, init=start_centers, random_state=rng)
     sk_km = SKKMeans(3, init=start_centers, random_state=rng, n_init=1)
     dask_km.fit(X_blobs)
     sk_km.fit(dense)

     assert_eq(dask_km.inertia_, sk_km.inertia_)
コード例 #6
0
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    """Fit SKKMeans on ``table[input_cols]`` and return predictions plus a report.

    Args:
        table: input table; it is indexed with ``input_cols`` and copied for
            the output (presumably a pandas DataFrame - TODO confirm).
        input_cols: columns used as clustering features.
        n_clusters, init, n_init, max_iter, tol, precompute_distances,
            n_jobs, algorithm: forwarded unchanged to ``SKKMeans``.
        prediction_col: name of the output column receiving the labels.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to the
            number of rows of the feature table.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` with ``prediction_col``
        set to the fitted labels) and ``'model'`` (dict holding the fitted
        estimator, ``input_cols`` and the rendered report).
    """
    inputarr = table[input_cols]
    if n_samples is None:
        n_samples = len(inputarr)
        
    # Parameter range checks; failure behaviour is defined by `validate`
    # (not visible here).
    validate(greater_than_or_equal_to(n_clusters, 1, 'n_clusters'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))
        
    # NOTE(review): `precompute_distances` and `n_jobs` are not accepted by
    # recent scikit-learn KMeans releases - confirm the pinned version.
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
             max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
             verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    
    k_means.fit(inputarr)
    
    # Parameters echoed in the report (n_samples is not included here).
    params = {'input_cols':input_cols, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm}
    
    cluster_centers = k_means.cluster_centers_
    labels = k_means.labels_
    
    # 2-component PCA projection used by the PCA plot.
    # NOTE(review): a fixed n_components=2 presumably fails when only one
    # input column is given - confirm.
    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    # The three helpers return values that are embedded in the markdown below.
    fig_centers = _kmeans_centers_plot(input_cols, cluster_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2)
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    # Work on a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
コード例 #7
0
ファイル: test_kmeans.py プロジェクト: nwut/dask-ml
 def test_fit_given_init(self):
     """Both estimators, started from the same explicit centers, agree on inertia."""
     blobs, _ = sklearn.datasets.make_blobs(
         n_samples=1000, n_features=4, random_state=1)
     dask_X = da.from_array(blobs, chunks=500)
     numpy_X = dask_X.compute()

     # Build the initial centers once and pass the same array to both models.
     sq_norms = sklearn.utils.extmath.row_norms(numpy_X, squared=True)
     start_centers = _k_init(numpy_X, 3, sq_norms, np.random.RandomState(0))

     dask_km = DKKMeans(3, init=start_centers, random_state=0)
     sk_km = SKKMeans(3, init=start_centers, random_state=0, n_init=1)
     dask_km.fit(dask_X)
     sk_km.fit(numpy_X)

     assert_eq(dask_km.inertia_, sk_km.inertia_)
コード例 #8
0
ファイル: kmeans.py プロジェクト: yemode2k/studio
def _kmeans_train_predict(table, input_cols, n_clusters=3, prediction_col='prediction', init='k-means++', n_init=10,
             max_iter=300, tol=1e-4, precompute_distances='auto', seed=None,
             n_jobs=1, algorithm='auto', n_samples=None):
    """Fit SKKMeans on the selected columns and return predictions plus a report.

    Args:
        table: input table; copied for the output (presumably a pandas
            DataFrame - TODO confirm).
        input_cols: columns passed to ``check_col_type`` to obtain the
            feature names and the feature array.
        n_clusters, init, n_init, max_iter, tol, precompute_distances,
            n_jobs, algorithm: forwarded unchanged to ``SKKMeans``.
        prediction_col: name of the output column receiving the labels.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to
            ``_kmeans_samples_plot``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to the
            length of the feature array.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` with ``prediction_col``
        set to the fitted labels) and ``'model'`` (dict holding the fitted
        estimator, ``input_cols`` and the rendered report).
    """
    feature_names, inputarr = check_col_type(table, input_cols)
    if n_samples is None:
        n_samples = len(inputarr)
        
    # NOTE(review): `precompute_distances` and `n_jobs` are not accepted by
    # recent scikit-learn KMeans releases - confirm the pinned version.
    k_means = SKKMeans(n_clusters=n_clusters, init=init, n_init=n_init,
             max_iter=max_iter, tol=tol, precompute_distances=precompute_distances,
             verbose=0, random_state=seed, copy_x=True, n_jobs=n_jobs, algorithm=algorithm)
    
    k_means.fit(inputarr)
    
    # Parameters echoed in the report; captured before n_clusters is rebound below.
    params = {'input_cols':feature_names, 'n_clusters':n_clusters, 'init':init, 'n_init':n_init, 'max_iter':max_iter, 'tol':tol,
              'precompute_distances':precompute_distances, 'seed':seed, 'n_jobs':n_jobs, 'algorithm':algorithm, 'n_samples':n_samples}
    
    cluster_centers = k_means.cluster_centers_
    # Rebind n_clusters to the number of fitted centers so the colour table
    # has one entry per center.
    n_clusters = len(cluster_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    labels = k_means.labels_
    
    # PCA projection for plotting; a single feature yields a single component.
    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)
    
    # The three helpers return values that are embedded in the markdown below.
    fig_centers = _kmeans_centers_plot(feature_names, cluster_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, cluster_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(labels, cluster_centers, pca2_model, pca2, colors)
    
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Result
    | - Number of iterations run: {n_iter_}.
    | - Sum of square error: {sse_}.
    | - Coordinates of cluster centers
    | {fig_cluster_centers} 
    | - Samples
    | {fig_pca}
    | {fig_samples}
    |
    | ### Parameters
    | {params}
    """.format(n_iter_=k_means.n_iter_, sse_=k_means.inertia_, fig_cluster_centers=fig_centers, fig_pca=fig_pca, fig_samples=fig_samples, params=dict2MD(params))))
    
    model = _model_dict('kmeans')
    model['model'] = k_means
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()
    
    # Work on a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = labels
    return {'out_table':out_table, 'model':model}
コード例 #9
0
ファイル: test_kmeans.py プロジェクト: jborchma/dask-ml
    def test_dtypes(self):
        """Fitted attributes and transform output dtypes match across estimators.

        Every combination of the original array and its "f4" cast is used as
        (fit input, transform input).
        """
        base = da.random.uniform(size=(100, 2), chunks=(50, 2))
        as_f4 = base.astype("f4")

        combos = ((base, base), (as_f4, as_f4), (base, as_f4), (as_f4, base))
        for fit_data, other_data in combos:
            dask_km = DKKMeans()
            sk_km = SKKMeans()
            dask_km.fit(fit_data)
            sk_km.fit(fit_data)

            assert dask_km.cluster_centers_.dtype == sk_km.cluster_centers_.dtype
            assert dask_km.labels_.dtype == sk_km.labels_.dtype
            for data in (fit_data, other_data):
                assert dask_km.transform(data).dtype == sk_km.transform(data).dtype
コード例 #10
0
ファイル: kmeans.py プロジェクト: yemode2k/studio
def _kmeans_silhouette_train_predict(table, input_cols, n_clusters_list=range(2, 10), prediction_col='prediction',
                                     init='k-means++', n_init=10, max_iter=300, tol=1e-4, precompute_distances='auto',
                                     seed=None, n_jobs=1, algorithm='auto', n_samples=None):
    """Fit one SKKMeans per candidate k and keep the best silhouette score.

    For every k in ``n_clusters_list`` a model is fitted and a two-panel
    figure is rendered (per-cluster silhouette profile, PCA scatter). The
    model with the highest silhouette score is used for the output
    predictions and the summary report.

    Args:
        table: input table; copied for the output (presumably a pandas
            DataFrame - TODO confirm).
        input_cols: columns passed to ``check_col_type`` to obtain the
            feature names and the feature array.
        n_clusters_list: indexable sequence of candidate cluster counts.
        prediction_col: name of the output column receiving the labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs,
            algorithm: forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state`` and to
            ``_kmeans_samples_plot``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` with ``prediction_col``
        set to the best model's predictions) and ``'model'`` (dict holding
        the best k, its centers, the fitted estimator, ``input_cols`` and
        the rendered report).
    """
    feature_names, features = check_col_type(table, input_cols)

    if n_samples is None:
        n_samples = len(table)
    inputarr = features

    pca2_model = PCA(n_components=min(2, len(feature_names))).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    # With a single PCA component the scatter plots that component against
    # itself; otherwise the second component is used on the y axis.
    single_component = pca2.shape[1] == 1
    y_axis = 0 if single_component else 1
    y_axis_label = ("Feature space for the 1st feature" if single_component
                    else "Feature space for the 2nd feature")

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k, init=init, n_init=n_init, max_iter=max_iter, tol=tol,
                           precompute_distances=precompute_distances, verbose=0, random_state=seed, copy_x=True,
                           n_jobs=n_jobs, algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in enumerate(colors):
            in_cluster = predict == i
            # Boolean indexing returns a copy, so sorting in place is safe.
            si = samples[in_cluster]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0, si,
                              facecolor=color, edgecolor=color, alpha=0.7)

            # cluster label
            ax1.text(0.9, y_lower + 0.45 * sizei, str(i))

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][in_cluster], pca2[:, y_axis][in_cluster], color=color)

        ax1.axvline(x=score, color="red")
        ax1.set_xlim(right=1.0)
        ax1.set_yticks([])
        ax1.set_xlabel("Silhouette coefficient values")
        ax1.set_ylabel("Cluster label")

        ax2.scatter(pca2_centers[:, 0], pca2_centers[:, y_axis], marker='x', edgecolors=1, s=200, color=colors)
        ax2.set_xlabel("Feature space for the 1st feature")
        ax2.set_ylabel(y_axis_label)

        plt.tight_layout()
        imagek = plt2MD(plt)
        # plt.subplots opens a new figure on every iteration; close it once
        # rendered so figures do not accumulate (plt.clf only clears them).
        plt.close(fig)
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_
    best_sse = best_model.inertia_

    n_clusters = len(best_centers)
    colors = cm.nipy_spectral(np.arange(n_clusters).astype(float) / n_clusters)
    fig_centers = _kmeans_centers_plot(feature_names, best_centers, colors)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples, best_centers, seed, colors)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2, colors)

    # Silhouette score per candidate k (x positions are list indices,
    # labelled with the actual k values).
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    plt.xlabel("Number of Clusters k")
    plt.tight_layout()
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Kmeans Silhouette Result
    | - silhoutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - Sum of square error: {best_sse}.
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette, best_k=best_k, best_sse=best_sse, fig_pca=fig_pca, fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    # Work on a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
コード例 #11
0
ファイル: kmeans.py プロジェクト: shovsj/studio
def _kmeans_silhouette_train_predict(table,
                                     input_cols,
                                     n_clusters_list=range(2, 10),
                                     prediction_col='prediction',
                                     init='k-means++',
                                     n_init=10,
                                     max_iter=300,
                                     tol=1e-4,
                                     precompute_distances='auto',
                                     seed=None,
                                     n_jobs=1,
                                     algorithm='auto',
                                     n_samples=None):
    """Fit one SKKMeans per candidate k and keep the best silhouette score.

    For every k in ``n_clusters_list`` a model is fitted and a two-panel
    figure is rendered (per-cluster silhouette profile, PCA scatter). The
    model with the highest silhouette score is used for the output
    predictions and the summary report.

    Args:
        table: input table; indexed with ``input_cols`` and copied for the
            output (presumably a pandas DataFrame - TODO confirm).
        input_cols: columns used as clustering features.
        n_clusters_list: indexable sequence of candidate cluster counts.
        prediction_col: name of the output column receiving the labels.
        init, n_init, max_iter, tol, precompute_distances, n_jobs,
            algorithm: forwarded unchanged to ``SKKMeans``.
        seed: forwarded to ``SKKMeans`` as ``random_state``.
        n_samples: forwarded to ``_kmeans_samples_plot``; defaults to
            ``len(table)``.

    Returns:
        dict with ``'out_table'`` (copy of ``table`` with ``prediction_col``
        set to the best model's predictions) and ``'model'`` (dict holding
        the best k, its centers, the fitted estimator, ``input_cols`` and
        the rendered report).
    """
    if n_samples is None:
        n_samples = len(table)
    inputarr = table[input_cols]

    # Parameter range checks; failure behaviour is defined by `validate`
    # (not visible here).
    validate(all_elements_greater_than(n_clusters_list, 1, 'n_clusters_list'),
             greater_than_or_equal_to(n_init, 1, 'n_init'),
             greater_than_or_equal_to(max_iter, 1, 'max_iter'),
             greater_than(tol, 0.0, 'tol'),
             greater_than_or_equal_to(n_jobs, 1, 'n_jobs'),
             greater_than_or_equal_to(n_samples, 0, 'n_samples'))

    pca2_model = PCA(n_components=2).fit(inputarr)
    pca2 = pca2_model.transform(inputarr)

    silhouette_list = []
    models = []
    images = []
    for k in n_clusters_list:
        k_means = SKKMeans(n_clusters=k,
                           init=init,
                           n_init=n_init,
                           max_iter=max_iter,
                           tol=tol,
                           precompute_distances=precompute_distances,
                           verbose=0,
                           random_state=seed,
                           copy_x=True,
                           n_jobs=n_jobs,
                           algorithm=algorithm)
        k_means.fit(inputarr)
        models.append(k_means)
        predict = k_means.labels_
        centersk = k_means.cluster_centers_

        score = silhouette_score(inputarr, predict)
        silhouette_list.append(score)
        # Only needed for this k's figure; not retained across iterations.
        samples = silhouette_samples(inputarr, predict)

        pca2_centers = pca2_model.transform(centersk)

        fig, (ax1, ax2) = plt.subplots(1, 2)
        colors = cm.nipy_spectral(np.arange(k).astype(float) / k)
        y_lower = 0

        for i, color in enumerate(colors):
            in_cluster = predict == i
            # Boolean indexing returns a copy, so sorting in place is safe.
            si = samples[in_cluster]
            si.sort()

            sizei = si.shape[0]
            y_upper = y_lower + sizei

            ax1.fill_betweenx(np.arange(y_lower, y_upper),
                              0,
                              si,
                              facecolor=color,
                              edgecolor=color,
                              alpha=0.7)

            y_lower = y_upper

            ax2.scatter(pca2[:, 0][in_cluster],
                        pca2[:, 1][in_cluster],
                        color=color)

        ax1.axvline(x=score, color="red")
        ax2.scatter(pca2_centers[:, 0],
                    pca2_centers[:, 1],
                    marker='x',
                    edgecolors=1,
                    s=200,
                    color=colors)

        imagek = plt2MD(plt)
        # plt.subplots opens a new figure on every iteration; close it once
        # rendered so figures do not accumulate (plt.clf only clears them).
        plt.close(fig)
        images.append(imagek)

    argmax = np.argmax(silhouette_list)
    best_k = n_clusters_list[argmax]
    best_model = models[argmax]
    predict = best_model.predict(inputarr)
    best_centers = best_model.cluster_centers_

    fig_centers = _kmeans_centers_plot(input_cols, best_centers)
    fig_samples = _kmeans_samples_plot(table, input_cols, n_samples,
                                       best_centers)
    fig_pca = _kmeans_pca_plot(predict, best_centers, pca2_model, pca2)

    # Silhouette score per candidate k (x positions are list indices,
    # labelled with the actual k values).
    x_clusters = range(len(n_clusters_list))
    plt.xticks(x_clusters, n_clusters_list)
    plt.plot(x_clusters, silhouette_list, '.-')
    fig_silhouette = plt2MD(plt)
    plt.clf()

    rb = BrtcReprBuilder()
    rb.addMD(
        strip_margin("""
    | ## Kmeans Silhouette Result
    | - silloutte metrics:
    | {fig_silhouette}
    | - best K: {best_k} 
    | - best centers:
    | {fig_pca}
    | {fig_centers}
    | {fig_samples}
    |
    """.format(fig_silhouette=fig_silhouette,
               best_k=best_k,
               fig_pca=fig_pca,
               fig_centers=fig_centers,
               fig_samples=fig_samples)))

    for k, image in zip(n_clusters_list, images):
        rb.addMD(
            strip_margin("""
        | ### k = {k}
        | {image}
        |
        """.format(k=k, image=image)))

    model = _model_dict('kmeans_silhouette')
    model['best_k'] = best_k
    model['best_centers'] = best_centers
    model['best_model'] = best_model
    model['input_cols'] = input_cols
    model['_repr_brtc_'] = rb.get()

    # Work on a copy so the caller's table is left untouched.
    out_table = table.copy()
    out_table[prediction_col] = predict

    return {'out_table': out_table, 'model': model}
コード例 #12
0
# Script fragment: reduce an image to `n_colors` colours by clustering its
# pixels with SKKMeans. `n_colors` and `img_path` are defined outside this
# excerpt.
# NOTE(review): `bw` is only assigned inside the branch below; unless it is
# initialised earlier (outside this excerpt), the later `if bw:` raises
# NameError for a numeric n_colors - confirm.
if n_colors == 'bw':  # Black and white option
    n_colors = 2
    bw = True

# NOTE(review): `assert` is stripped under `python -O`; if n_colors is user
# input, an explicit check that raises may be preferable.
assert 0 < int(n_colors) < 256

# Load the image and convert it to a numpy array
img = Image.open(img_path)
img = np.array(img)
# NOTE(review): unpacking three values assumes a 3-D array (an image with a
# channel axis); presumably the first axis is height rather than width -
# confirm. The reshape below is consistent either way.
w, h, d = img.shape
# Flatten to one row per pixel, one column per channel.
X = img.reshape((w * h, d))
print(f"Succefly uploaded image from: \"{img_path}\"")

# Initialize the K-Means model and cluster the pixel data
# NOTE(review): positional arguments after n_clusters are presumably
# init="random", n_init=1, max_iter=100; recent scikit-learn releases make
# them keyword-only - confirm against the installed version.
model = SKKMeans(int(n_colors), "random", 1, 100)
model.fit(X)
labels = model.labels_
print("Succefly created and fit K-Means model")

# Cluster centers
if bw:
    # Fixed black/white palette instead of the fitted centers.
    # NOTE(review): which cluster index ends up black is whatever the fit
    # assigned to label 0 - verify this is intended.
    centers = np.array([[0, 0, 0], [255, 255, 255]])
else:
    centers = model.cluster_centers_

# Build new image: replace every pixel by the center of its cluster
new_img = []
for label in labels:
    new_img.append(centers[int(label)])