Example 1
import random

import matplotlib.pyplot as plt

def main():
    k = 3
    # Three well-separated groups of 30 random 2-D points each.
    X =   [[random.randint(0, 20), random.randint(0, 20)] for i in range(30)]     \
        + [[random.randint(40, 60), random.randint(40, 60)] for i in range(30)]   \
        + [[random.randint(80, 100), random.randint(80, 100)] for i in range(30)]

    print(f"Cluster points:{X}")

    kmeans = KMeans(n_cluster=k, tol=3e-4)
    centroids = kmeans.fit(X)
    prediction = kmeans.predict([[0.0,0.0],[50.0,40.0],[100.0,100.0]])

    print(f"KMeans centroids: {centroids}")
    print(f"KMeans predict for [0,0],[50,40],[100,100]]: {prediction}")

    colors = ['r', 'g', 'b']
    # Plot each generated group of 30 points in its own color.
    for i in range(k):
        plt.scatter([x[0] for x in X[i * 30:(i + 1) * 30]],
                    [x[1] for x in X[i * 30:(i + 1) * 30]], s=7, c=colors[i])
    plt.scatter([x[0] for x in centroids], [x[1] for x in centroids], marker='*', s=200, c='black')
    plt.show()
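
The KMeans class used in Example 1 is not defined in the snippet; judging from its use, the constructor takes n_cluster and tol, fit returns the final centroids, and predict maps each point to a centroid index. A minimal pure-Python sketch under those assumed semantics (illustrative, not the original implementation):

import random

class KMeans:
    """Illustrative stand-in; interface inferred from Example 1."""

    def __init__(self, n_cluster, tol=1e-4, max_iter=100):
        self.n_cluster = n_cluster
        self.tol = tol
        self.max_iter = max_iter
        self.centroids = []

    @staticmethod
    def _dist2(a, b):
        # Squared Euclidean distance between two points.
        return sum((u - v) ** 2 for u, v in zip(a, b))

    def fit(self, X):
        # Lloyd's algorithm: start from k random points, then alternate
        # assignment and centroid update until the centroids barely move.
        self.centroids = random.sample(X, self.n_cluster)
        for _ in range(self.max_iter):
            clusters = [[] for _ in range(self.n_cluster)]
            for p in X:
                clusters[self.predict([p])[0]].append(p)
            new_centroids = [
                [sum(dim) / len(c) for dim in zip(*c)] if c else self.centroids[i]
                for i, c in enumerate(clusters)
            ]
            shift = max(self._dist2(a, b)
                        for a, b in zip(self.centroids, new_centroids))
            self.centroids = new_centroids
            if shift < self.tol:
                break
        return self.centroids

    def predict(self, points):
        # Index of the nearest centroid for each point.
        return [min(range(self.n_cluster),
                    key=lambda i: self._dist2(p, self.centroids[i]))
                for p in points]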
Example 2
from clustering import KMeans, GaussianMixture
from constants import *

# Utils, Preprocessor and NLP are project-local helpers; their import lines
# are not shown in the original snippet.
utils = Utils()
prep = Preprocessor()
nlp = NLP()

# Only uncomment if data crawling is needed again. Runs Julia script.
# utils.crawl_data("news_crawler.jl")

# Only uncomment to re-process the raw data, e.g. after crawling new data.
# Saves the processed documents in /data/raw; "متن" is the text column name (Persian for "text").
# prep.process_raw_data(STOPS, "متن", ECONOMICS, POLITICS, SPORTS, CULTURE)

# Creates tf-idf vectors from the documents. Change the True flag to False when changing the other arguments.
doc_vectors = nlp.doc_to_vec(250, 200, True)
err_msg = "Error occurred! Check tf_idf vectors before continuing."

k_means = KMeans(4, 15)
if doc_vectors is not None:
    # k_means.fit(doc_vectors)
    k_means.predict(TEST)
else:
    print(err_msg)

mix_model = GaussianMixture(4, 20)
if doc_vectors is not None:
    # mix_model.fit(doc_vectors, 30)
    mix_model.predict(TEST)
else:
    print(err_msg)
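
The clustering, constants, and helper modules above are project-local, so the snippet is not runnable on its own. A rough equivalent of the vectorize-then-cluster flow can be sketched with scikit-learn; the corpus, the feature cap, and the parameter mapping below are assumptions for illustration, not the project's actual data or API:

from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.mixture import GaussianMixture

# Placeholder corpus standing in for the crawled news documents (hypothetical).
docs = [
    "economy inflation market trade",
    "budget tax growth economy",
    "election parliament vote policy",
    "minister government law vote",
    "football league match goal",
    "tennis tournament final match",
    "cinema festival film award",
    "music concert album stage",
]

# Cap the vocabulary size; 250 mirrors the first doc_to_vec argument
# (an assumption about its meaning).
doc_vectors = TfidfVectorizer(max_features=250).fit_transform(docs).toarray()

# Four clusters for the four news categories; the 15/20 iteration caps echo
# the constructor arguments above (again an assumption about their meaning).
k_means = KMeans(n_clusters=4, max_iter=15, n_init=10).fit(doc_vectors)
print(k_means.predict(doc_vectors))

mix_model = GaussianMixture(n_components=4, max_iter=20).fit(doc_vectors)
print(mix_model.predict(doc_vectors))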
Example 3
def kmeans_area_example():
    from time import time
    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn import metrics
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_digits
    from sklearn.decomposition import PCA
    from sklearn.preprocessing import scale

    np.random.seed(42)

    digits = load_digits()
    data = scale(digits.data)

    n_samples, n_features = data.shape
    n_digits = len(np.unique(digits.target))
    labels = digits.target

    sample_size = 300

    print("n_digits: %d, \t n_samples %d, \t n_features %d" %
          (n_digits, n_samples, n_features))

    print(79 * '_')
    print('% 9s' % 'init'
          '    time  inertia    homo   compl  v-meas     ARI AMI  silhouette')

    def bench_k_means(estimator, name, data):
        t0 = time()
        estimator.fit(data)
        print('% 9s   %.2fs    %i   %.3f   %.3f   %.3f   %.3f   %.3f    %.3f' %
              (name, (time() - t0), estimator.inertia_,
               metrics.homogeneity_score(labels, estimator.labels_),
               metrics.completeness_score(labels, estimator.labels_),
               metrics.v_measure_score(labels, estimator.labels_),
               metrics.adjusted_rand_score(labels, estimator.labels_),
               metrics.adjusted_mutual_info_score(labels, estimator.labels_),
               metrics.silhouette_score(data,
                                        estimator.labels_,
                                        metric='euclidean',
                                        sample_size=sample_size)))

    bench_k_means(KMeans(init='k-means++', n_clusters=n_digits, n_init=10),
                  name="k-means++",
                  data=data)

    bench_k_means(KMeans(init='random', n_clusters=n_digits, n_init=10),
                  name="random",
                  data=data)

    # in this case the seeding of the centers is deterministic, hence we run the
    # kmeans algorithm only once with n_init=1
    pca = PCA(n_components=n_digits).fit(data)
    bench_k_means(KMeans(init=pca.components_, n_clusters=n_digits, n_init=1),
                  name="PCA-based",
                  data=data)
    print(79 * '_')

    reduced_data = PCA(n_components=2).fit_transform(data)
    kmeans = KMeans(init='k-means++', n_clusters=n_digits, n_init=10)
    kmeans.fit(reduced_data)

    # Step size of the mesh. Decrease to increase the quality of the VQ.
    h = .02  # point in the mesh [x_min, x_max]x[y_min, y_max].

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh.
    x_min, x_max = reduced_data[:, 0].min() - 1, reduced_data[:, 0].max() + 1
    y_min, y_max = reduced_data[:, 1].min() - 1, reduced_data[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    # Obtain labels for each point in mesh. Use last trained model.
    Z = kmeans.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.figure(1)
    plt.clf()
    # origin='lower' keeps the image aligned with the data coordinates.
    plt.imshow(Z, interpolation='nearest',
               extent=(xx.min(), xx.max(), yy.min(), yy.max()),
               cmap=plt.cm.Paired,
               aspect='auto', origin='lower')

    plt.plot(reduced_data[:, 0], reduced_data[:, 1], 'k.', markersize=2)
    # Plot the centroids as a white X
    centroids = kmeans.cluster_centers_
    plt.scatter(centroids[:, 0],
                centroids[:, 1],
                marker='x',
                s=169,
                linewidths=3,
                color='w',
                zorder=10)
    plt.title('K-means clustering on the digits dataset (PCA-reduced data)\n'
              'Centroids are marked with a white cross')
    plt.xlim(x_min, x_max)
    plt.ylim(y_min, y_max)
    plt.xticks(())
    plt.yticks(())
    plt.show()
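
The function is self-contained (all imports are local to it), so running the full benchmark table and the decision-boundary plot is a single call:

kmeans_area_example()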