def spectral(X, sigma, k, centroids):
    """
    Ng spectral clustering algorithm.
    :param X: data points, shape (n, d)
    :param sigma: bandwidth of the Gaussian affinity
    :param k: graph parameter passed through to get_L
    :param centroids: initial centroids for the k-means step
    :return: accu, the clustering accuracy
    """
    (n, d) = X.shape
    L_sym, L = get_L(X, k, sigma)
    eig, eigvec = np.linalg.eigh(L_sym)  # L_sym is symmetric; eigenvectors are columns
    # eig_index = np.argsort(eig)[1:d+1]
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]
    # Normalize each row of U to unit length (the matrix T in Ng et al.)
    T = U / np.linalg.norm(U, axis=1, keepdims=True)
    Y = T
    # visual(Y, k=k, sigma=sigma, save=1)

    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels

    # Accuracy below assumes two ground-truth classes of 100 points each.
    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' %
          (sigma, k, accu))
    return accu
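The snippet relies on a get_L helper that is not shown. The sketch below is one plausible implementation, assuming a Gaussian affinity matrix sparsified to each point's k nearest neighbours and the symmetric normalized Laplacian from Ng et al.; the project's actual helper may differ.

import numpy as np

def get_L(X, k, sigma):
    n = X.shape[0]
    # Pairwise squared Euclidean distances.
    sq = np.sum(X ** 2, axis=1)
    d2 = sq[:, None] + sq[None, :] - 2 * X @ X.T
    W = np.exp(-d2 / (2 * sigma ** 2))
    np.fill_diagonal(W, 0)
    # Assumption: keep only each point's k nearest neighbours (symmetrized).
    nn = np.argsort(d2, axis=1)[:, 1:k + 1]
    mask = np.zeros((n, n), dtype=bool)
    mask[np.arange(n)[:, None], nn] = True
    W = np.where(mask | mask.T, W, 0.0)
    deg = W.sum(axis=1)
    L = np.diag(deg) - W                       # unnormalized Laplacian
    D_inv_sqrt = np.diag(1.0 / np.sqrt(deg))   # assumes no isolated vertices
    L_sym = D_inv_sqrt @ L @ D_inv_sqrt        # symmetric normalized Laplacian
    return L_sym, L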
Example #2
 def test_predict(self):
     test_samples = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
     expected_predictions = [0, 1, 0, 1]
     k_means = KMeans(num_clusters=self.num_clusters, seed=1)
     k_means.fit(self.data)
     predictions = k_means.predict(test_samples)
     self.assertEqual(expected_predictions, predictions)
Example #3
def cluster_newsgroups():
    """ Cluster newsgroup categories. """

    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary

    categories = sorted(corpus.keys())

    N = 6
    print "\n{}-Most Common Words".format(N)
    for index, category in enumerate(categories):
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print "{:>24} {}".format(category, dictionary[nlargest])
    print

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)

    labels = km.labels_

    print "\nKMeans Label Assignment, K = {}".format(K)
    for category, label, in zip(categories, labels):
        print int(label), category

    simMatrix(newsgroups).plot().show()
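The argpartition/argsort pair above is the standard NumPy idiom for taking the N largest entries of a row without fully sorting it; a small self-contained demonstration:

import numpy as np

scores = np.array([0.1, 0.9, 0.3, 0.7, 0.5])
N = 2
top = np.argpartition(scores, -N)[-N:]    # indices of the N largest, unordered
top = top[np.argsort(scores[top])][::-1]  # reorder them largest-first
print(top)          # [1 3]
print(scores[top])  # [0.9 0.7]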
Example #4
    def _initialise_prams(self, X):

        # Get initial clusters using Kmeans
        kmeans = KMeans(k=self.k, max_iters=500)
        kmeans.fit(X)
        kmeans_preds = kmeans.predict(X)

        N, col_length = X.shape
        mixture_labels = np.unique(kmeans_preds)
        initial_mean = np.zeros((self.k, col_length))
        initial_cov = np.zeros((self.k, col_length, col_length))
        initial_pi = np.zeros(self.k)

        for index, mixture_label in enumerate(mixture_labels):
            mixture_indices = (kmeans_preds == mixture_label)
            Nk = X[mixture_indices].shape[0]

            # Initial pi
            initial_pi[index] = Nk / N

            # Initial mean
            initial_mean[index, :] = np.mean(X[mixture_indices], axis=0)

            # Initial covariance: sample covariance of the points in this cluster
            de_meaned = X[mixture_indices] - initial_mean[index, :]
            initial_cov[index] = np.dot(de_meaned.T, de_meaned) / Nk
        # Mixing weights are floats, so compare with a tolerance
        assert np.isclose(np.sum(initial_pi), 1)
        return initial_pi, initial_mean, initial_cov
Example #5
 def __init__(self,
              n_cluster: int,
              data: np.ndarray,
              use_kmeans: bool = False,
              w: float = 0.9,
              c1: float = 0.5,
              c2: float = 0.3,
              flag: int = 1,
              weights: list = None):
     index = np.random.choice(list(range(len(data))), n_cluster)
     self.centroids = data[index].copy()
     if use_kmeans:
         kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
         kmeans.fit(data)
         self.centroids = kmeans.centroid.copy()
     self.best_position = self.centroids.copy()
     self.best_score = quantization_error(self.centroids, self._predict(data), data)
     self.flag = flag
     if self.flag % 2 == 1:
         self.best_sse = calc_sse(self.centroids, self._predict(data), data)
     else:
         self.best_sse = calc_sse2(self.centroids, self._predict(data), data, weights)
     self.velocity = np.zeros_like(self.centroids)
     self._w = w
     self._c1 = c1
     self._c2 = c2
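calc_sse and quantization_error come from elsewhere in this project and are not shown. A minimal sketch of what such helpers usually compute, with signatures taken from the calls above (bodies are assumptions):

import numpy as np

def calc_sse(centroids, labels, data):
    # Within-cluster sum of squared distances to the assigned centroid.
    return sum(np.sum((data[labels == i] - c) ** 2)
               for i, c in enumerate(centroids))

def quantization_error(centroids, labels, data):
    # Mean distance to the centroid, averaged over non-empty clusters
    # (the usual PSO-clustering objective).
    errs = [np.linalg.norm(data[labels == i] - c, axis=1).mean()
            for i, c in enumerate(centroids) if np.any(labels == i)]
    return float(np.mean(errs))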
Example #6
    def test_whole(self):
        """
        Tests the score method.
        """

        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        n_features = X.shape[1]
        k = centers.shape[0]

        # run N_TRIALS, pick best model
        best_model = None
        for i in range(N_TRIALS):
            kmeans = KMeans(k, N_ITER)
            kmeans.fit(X)
            if best_model is None:
                best_model = kmeans
            elif kmeans.score(X) < best_model.score(X):
                best_model = kmeans

        # check sum squared errors
        sum_squared_errors = best_model.score(X)
        self.assertLess(sum_squared_errors / n_samples, EPS)

        # compare centers to expected centers
        smallest_distances = find_smallest_distances(
            best_model.cluster_centers, centers)
        for distance in smallest_distances:
            self.assertLess(distance, EPS)
Example #7
 def test_fit_with_different_initial_centroids(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=0)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
Example #8
def B1(pca=False):
    '''Plot WC_SSD and SC over K.'''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv', 'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    wc_ssd_val = zeros((len(fnames), len(K)))
    sc_val = zeros((len(fnames), len(K)))
    for i, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for j, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[i, j], sc_val[i, j], _ = kmeans.get_evals()
    # Plot WC_SSD
    figure()
    for i, fname in enumerate(fnames):
        plot(K, wc_ssd_val[i], label=fname)
    legend()
    title('WC_SSD vs. K')
    figure()
    for i, fname in enumerate(fnames):
        plot(K, sc_val[i], label=fname)
    legend()
    title('SC vs. K')
    show()
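kmeans.get_evals() is project code that is not shown here. Assuming it returns the usual definitions of the two plotted quantities, a rough sketch (silhouette via scikit-learn for brevity):

import numpy as np
from sklearn.metrics import silhouette_score

def wc_ssd(X, labels, centers):
    # Within-cluster sum of squared distances.
    return sum(np.sum((X[labels == k] - c) ** 2)
               for k, c in enumerate(centers))

def sc(X, labels):
    # Mean silhouette coefficient over all samples.
    return silhouette_score(X, labels)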
Example #9
def main2():
    df = pd.read_csv('credit_card_data.csv')
    df = df.fillna(df.median())
    original_data = df.iloc[:, 1:].values
    data = copy.deepcopy(original_data)

    columns = list(df.columns)[1:]  # list of column names
    print(columns)

    # min_max_data(df, columns)

    normalizacija(data)  # normalize the loaded data

    pca = PCA()
    pca.fit(data)

    # decide how many dimensions to reduce to
    plt.plot(range(1, 18), pca.explained_variance_ratio_.cumsum(), marker='x', linestyle='--')
    plt.xlabel('Components')  # features
    plt.ylabel('Variance')
    plt.show()

    components = 7  # read off from the plot above
    pca = PCA(n_components=components)
    pca.fit(data)
    scores = pca.transform(data)
    # print(scores)  # one column per retained component

    # show that the first two components explain most of the variance
    plt.bar(range(pca.n_components_), pca.explained_variance_ratio_, color='black')
    plt.xlabel('PCA components')
    plt.ylabel('Variance %')  # share of the variance each component explains
    plt.xticks(range(pca.n_components_))
    plt.show()

    # optimal k = 5 when only the first 500 rows are loaded
    # for the full data set we get 6
    # optimal_k_plot(data)

    broj_klastera = 6

    k_means = MyKMeans(n_clusters=broj_klastera, max_iter=100)
    k_means.fit(scores, normalize=False)
    klaster_indeksi = k_means.klaster_indeksi
    print(klaster_indeksi)

    lista_klastera_sa_originalnim_podacima = []  # clusters holding the original (unscaled) rows
    for i in range(broj_klastera):
        lista_klastera_sa_originalnim_podacima.append([])

    for i in range(len(original_data)):
        lista_klastera_sa_originalnim_podacima[klaster_indeksi[i]].append(original_data[i])

    # print the cluster descriptions and the decision tree
    print_descriptions(lista_klastera_sa_originalnim_podacima, columns)
    # print_decision_tree(original_data, klaster_indeksi, columns)
    print_clusters_description()

    # plot the points
    plot_2_D(k_means)
Example #11
 def test_select_initial_centroids(self):
     expected_initial_centroids = [[2, 1], [-1, -2]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=3)
     k_means.fit(self.data)
     initial_centroids = k_means._select_initial_centroids(self.data)
     self.assertEqual(expected_initial_centroids, initial_centroids)
     self.assertEqual(self.num_clusters, len(initial_centroids))
Example #12
    def __init_parameters(self):
        N = self.X.shape[0]
        n_features = self.X.shape[1]

        kmeans = KMeans(n_clusters=self.n_components, n_init=5)
        kmeans.fit(self.X)

        # mu, means for each component
        self.means_ = kmeans.cluster_centers_
        # sigma, covariances for each component
        self.covariances_ = np.zeros(
            [self.n_components, n_features, n_features])
        # pi, weights for each component
        self.weights_ = np.zeros(self.n_components)
        for k in range(self.n_components):
            logic = (kmeans.labels_ == k)
            Nk = logic.sum()

            # np.cov needs more than one sample; otherwise keep the zero matrix
            if Nk > 1:
                Xk = self.X[logic]
                self.covariances_[k] = np.cov(Xk.T)

            self.weights_[k] = Nk / N

        # gamma(Znk)
        self.gamma = np.zeros([N, self.n_components])
        # log_likelihood
        self.lower_bound_ = -np.inf

        return self
Example #13
def main():
    filepath = "./data/self_test.csv"
    #filepath = "./data/self_test_petit.csv"
    #filepath = "./data/iris.csv"

    # load the data
    data, labels = load_dataset(filepath)

    # initialize the KMeans object
    kmeans = KMeans(n_clusters=3,
                    max_iter=100,
                    early_stopping=True,
                    tol=1e-6,
                    display=True)

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    score = kmeans.score(data, labels)
    print("Purity: {}".format(score))

    input("Press any key to exit...")
Example #14
    def test06_fit_two_clusters(self):
        np.random.seed(1)
        model = KMeans(k=2, init=init.forgy_initialization)
        data = np.array([[-1.0, 0.0], [-1.001, 0.0], [-0.999, 0.0], [0.0, 1.0],
                         [0.0, 0.999], [0.0, 1.001]])

        model.fit(data)
        self.assertEqual(model.predict(data), [1, 1, 1, 0, 0, 0])
Example #15
def plot_elbow(interval, data, random_seed=None):
    inertia = []
    for n_clusters in interval:
        clf = KMeans(k=n_clusters, init='kmeans++', random_seed=random_seed)
        clf.fit(data)
        inertia.append(clf.inertia)
    plot_metrics(interval, inertia, 'Elbow method', 'Number of clusters (K)',
                 'Sum of Squared Error')
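The elbow heuristic picks the K after which inertia stops dropping sharply. A typical call, assuming data is an (n_samples, n_features) array:

plot_elbow(range(1, 11), data, random_seed=42)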
Example #16
def test_fit():
    test_model = KMeans(n_clust=2, random_seed=100)
    test_x_train = np.array([[1, 2], [1, 4.1], [1, 0], [10, 2.1], [10, 4.1],
                             [10, 0]])
    test_model.fit(test_x_train)
    expected_classes = np.array([0, 0, 0, 1, 1, 1])
    expected_centers = np.array([[1., 2.0333333], [10., 2.0666667]])
    np.testing.assert_array_equal(test_model.classes, expected_classes)
    np.testing.assert_allclose(test_model.centers, expected_centers)
Example #17
def test_kmeans():
    X = np.random.normal(size=(50, 2))
    km = KMeans(nr_clusters=2)
    km.fit(X)
    assert km.centroids.shape[0] == 2
    distances = []
    for centroid in km.centroids:
        distances.append(km.euclidean_distance_2d(centroid, X[-1:]))
    assert km.predict(X[-1:])[0] == np.argmin(distances)
Example #18
 def test_fit(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
     expected_inertia = 2.6666667
     k_means = KMeans(num_clusters=self.num_clusters, seed=1)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
     self.assertAlmostEqual(expected_inertia, k_means.inertia_)
Example #19
 def initialize(self, data):
     """
     :param data: data, numpy 2-D array
     """
     clf = KMeans(self.K)
     clf.fit(data, 10)
     self.centers = clf.get_centers()
     self.weights = np.ones(self.K) / self.K
     self.covariances = np.array(
         [1e10 * np.eye(data.shape[1]) for _ in range(self.K)])
Example #20
def main():
    (X_train, y_train), (X_test, y_test) = tf.contrib.keras.datasets.mnist.load_data()
    X_train = (X_train).reshape(-1, 28*28)
    X_test = (X_test).reshape(-1, 28*28)
    Y_train = tf.contrib.keras.utils.to_categorical(y_train)
    print("Data Loaded")

    model = KMeans(k=25, n_features=28*28, n_classes=10)
    model.fit(X_train, Y_train)
    print("final testing accuracy: %.4f" % (model.predict(X_test) == y_test).mean())
Example #21
 def test_manhattan_distance(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-2, -2], [2, 2]]
     expected_inertia = 4
     k_means = KMeans(num_clusters=self.num_clusters,
                      distance_function='manhattan',
                      seed=1)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
     self.assertAlmostEqual(expected_inertia, k_means.inertia_)
Example #22
 def test_gaussian_mixture(self):
     pos_list, ground_truth = datasets.make_blobs(n_samples=100,
                                                  centers=[[3, 3], [-3, -3], [3, -3], [-3, 3]],
                                                  cluster_std=1, random_state=0)
     kmeans = KMeans(4)
     standard_kmeans = cluster.KMeans(4, random_state=0)
     np.random.seed(2020)
     kmeans.fit(pos_list)
     standard_kmeans.fit(pos_list)
     self.assertAlmostEqual(metrics.adjusted_rand_score(kmeans.labels_, ground_truth), 1.0)
     self.assertAlmostEqual(kmeans.inertia_, standard_kmeans.inertia_)
Example #23
 def test_implementation(self):
     x = np.array([[0, 0], [0, 1], [4, 0], [4, 1]])
     kmeans = KMeans(2)
     np.random.seed(2020)
     kmeans.fit(x)
     self.assertAlmostEqual(kmeans.inertia_, 1.0)
     self.assertAlmostEqual(metrics.adjusted_rand_score(kmeans.labels_, [0, 0, 1, 1]), 1.0)
     if np.abs(kmeans.cluster_centers_[0, 0] - 4) < 1e-5:
         assert_array_almost_equal(kmeans.cluster_centers_, np.array([[4, 0.5], [0, 0.5]]))
     else:
         assert_array_almost_equal(kmeans.cluster_centers_, np.array([[0, 0.5], [4, 0.5]]))
Example #24
 def test_fit_predict(self):
     data = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
     model = KMeans(2)
     model.fit(data)
     labels = model.predict(data)
     self.assertEqual(len(labels), len(data))
     self.assertGreaterEqual(2, len(np.unique(labels)))
     for label in labels:
         self.assertTrue(
             isinstance(label, np.int64) or isinstance(label, np.int32)
             or isinstance(label, int))
Example #25
    def test05_fit_one_cluster(self):
        model = KMeans(k=1, init=init.forgy_initialization)

        data = np.array([[0.0, 0.0]])

        model.fit(data)

        self.assertEqual(model.predict(data), [0])

        np_test.assert_array_equal(model.centroids, np.array([[0.0, 0.0]]))

        test_points = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])
        self.assertEqual(model.predict(test_points), [0] * 3)
Example #26
def plot_optimal_k(data):
    plt.figure()
    sum_squared_errors = []
    for n_clusters in range(1, 20):
        k_means = KMeans(n_clusters=n_clusters, max_iter=100)
        k_means.fit(data)
        sse = k_means.sum_squared_error()
        sum_squared_errors.append(sse)
    print(sum_squared_errors)
    plt.plot(range(1, 20), sum_squared_errors)
    plt.xlabel('# of clusters')
    plt.ylabel('WCSSE')
    plt.show()
Example #27
    def initialize(self, data: np.ndarray):
        """
        Initializes cluster centers, weights, and covariances

        :param data: data, numpy 2-D array
        """
        km = KMeans(self.K)
        km.fit(data)
        _ = km.predict(data)
        self.centers = km.get_centers()
        self.weights = np.random.uniform(0, 1, (self.K, ))
        self.weights = self.weights / np.sum(self.weights)
        self.covariances = np.array([np.eye(data.shape[-1])] * self.K) * 10e8
Example #28
    def train(self, x_train):
        """Receive the input training data, then learn the model.

        Parameters
        ----------
        x_train: np.array, shape (num_samples, num_features)

        Returns
        -------
        None
        """
        self.affinity_matrix_ = self._get_affinity_matrix(x_train)
        embedding_features = self._get_embedding()
        kmeans = KMeans(n_clusters=self.n_clusters)
        kmeans.fit(embedding_features)
        self.labels_ = kmeans.labels_
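_get_affinity_matrix and _get_embedding are private helpers not shown above. For the embedding step, a plausible sketch takes the smallest eigenvectors of the symmetric normalized Laplacian built from the affinity matrix (an assumption about this class's internals):

import numpy as np

def get_embedding(affinity, n_clusters):
    d = affinity.sum(axis=1)
    d_inv_sqrt = 1.0 / np.sqrt(d)
    l_sym = np.eye(len(d)) - d_inv_sqrt[:, None] * affinity * d_inv_sqrt[None, :]
    eigvals, eigvecs = np.linalg.eigh(l_sym)  # ascending eigenvalues
    return eigvecs[:, :n_clusters]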
Example #29
def main(args):
    data = load_data(Path(args.data_csv))
    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')

    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)
    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()
Example #30
def main():
    ds = Dataset(config)
    imgs, annots = ds.open_traffic_ds(config)
    dp = DataPrepper(x_data=imgs, y_data=annots)

    dp.x_data_scaled, dp.y_data_scaled = dp.rescale_data(dp.x_data, dp.y_data)
    km = KMeans(k=args.k, dataset=dp.y_data_scaled)
    if args.fit_avg:
        km.fit_average(max_iterations=args.kmeans_iters)
        if args.save_anchors:
            km.write_anchors(km.centroids)
    else:
        km.fit()
        if args.save_anchors:
            km.write_anchors(km.centroids)
Example #31
def kmeans_toy():
    x, y = toy_dataset(4)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')
    n_cluster = 4
    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz',
             centroids=centroids,
             step=i,
             membership=membership,
             y=y)
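toy_dataset is a test fixture that is not shown; its shape can be inferred from the assertions above (four well-separated 2-D clusters, 50 points each). A hedged sketch of such a fixture:

import numpy as np

def toy_dataset(n_cluster, points_per_cluster=50, seed=0):
    rng = np.random.RandomState(seed)
    centers = rng.uniform(-10, 10, size=(n_cluster, 2))
    x = np.vstack([c + rng.randn(points_per_cluster, 2) for c in centers])
    y = np.repeat(np.arange(n_cluster), points_per_cluster)
    return x, y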
Example #32
def main(args):
    df = pd.read_csv(args.data_csv)
    data = np.array(df[['X', 'Y']])
    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')

    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)
    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()
Example #33
def calculate_em(X, n_clusters, diag=False, ridge=1e-10, verbose=False, max_iterations=100):
    """
    Returns mu, sigma and tpi
    """
    n_samples, n_features = X.shape
    # Initialise the data using kmeans
    k_means = KMeans(k=n_clusters)
    k_means_labels, _ = k_means.fit(X.copy())
    k_means_cluster_centers = k_means.centers_

    # OK, so we've got the centers and the labels. Let's now compute the EM
    # algorithm
    tau = np.zeros((n_samples, n_clusters))
    mu = np.zeros((n_clusters, n_features))
    sigma = np.zeros((n_clusters, n_features, n_features))
    p = np.zeros((n_clusters, n_samples))
    # FIXME: should be able to do the following with pure matrix arithmetic
    for i, element in enumerate(k_means_labels):
        tau[i, element] = 1

    for j in range(max_iterations):
        old_mu = mu.copy()
        for i in range(n_clusters):
            mu[i] = (tau[:, i].reshape((tau.shape[0], 1)) * X).sum(axis=0) / (tau[:, i]).sum()

        for i in range(n_clusters):
            a = 0
            for n in range(n_samples):
                b = (X[n, :] - mu[i]).reshape((n_features, 1))
                if diag:
                    a += tau[n, i] * np.dot(b.T, b)
                else:
                    a += tau[n, i] * np.dot(b, b.T)

            if diag:
                sigma[i, :] = a.mean() / tau[:, i].sum() * np.identity(mu.shape[1])
            else:
                sigma[i, :] = a / tau[:, i].sum()

        tpi = tau.sum(axis=0) / n_samples  # mixing proportions, one per cluster
        for i in range(n_clusters):
            p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])

        for i in range(n_clusters):
            tau.T[i, :] = tpi[i] * p[i, :] / (tpi * p).sum(axis=0)

        if ((old_mu - mu) ** 2).sum() < ridge:
            if verbose:
                print("break at iteration %d" % j)
            break

    return mu, sigma, tpi
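In the notation of the code (tau the responsibilities, tpi the mixing weights), the loop implements the standard EM updates for a Gaussian mixture:

\[
\tau_{ni} = \frac{\pi_i \, \mathcal{N}(x_n \mid \mu_i, \Sigma_i)}{\sum_j \pi_j \, \mathcal{N}(x_n \mid \mu_j, \Sigma_j)} \quad \text{(E-step)}
\]
\[
\mu_i = \frac{\sum_n \tau_{ni} \, x_n}{\sum_n \tau_{ni}}, \qquad
\Sigma_i = \frac{\sum_n \tau_{ni} \, (x_n - \mu_i)(x_n - \mu_i)^\top}{\sum_n \tau_{ni}}, \qquad
\pi_i = \frac{1}{N} \sum_n \tau_{ni} \quad \text{(M-step)}
\]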
Example #34
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- INITIALIZING AND APPLYING THE K-MEANS ALGORITHM --- #

# TODO 2: K-means on the Iris data set
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- DETERMINING THE OPTIMAL K --- #
Example #35
X = load_data('EMGaussienne.data')
X_test = load_data('EMGaussienne.data')

n_clusters = 4
num_init = 3

##############################################################################
# Plot result

fig = pl.figure()
colors = ['#4EACC5', '#FF9C34', '#4E9A06', '#00465F']

for ini in range(num_init):
    km = KMeans(k=n_clusters)

    k_means_labels, k_means_inertia = km.fit(X)
    k_means_cluster_centers = km.centers_
    k_means_labels_test, k_means_inertia_test = km.predict(X_test)


    # KMeans
    ax = fig.add_subplot(num_init, 2, 2 * ini + 1)
    for k, col in zip(range(n_clusters), colors):
        my_members = k_means_labels == k
        cluster_center = k_means_cluster_centers[k]
        ax.plot(X[my_members, 0], X[my_members, 1], 'w',
                markerfacecolor=col, marker='.')
        ax.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
                                        markeredgecolor='k', markersize=6)
    ax.set_title('KMeans - inertia %d' % k_means_inertia)
Example #36
    r2, theta2 = np.random.normal(5, 0.25), np.random.uniform(0, 2*np.pi)
    x2, y2 = r2 * np.cos(theta2), r2 * np.sin(theta2)
    s2[i] = (x2, y2, r2, theta2)

    plt.scatter(x1, y1)
    plt.scatter(x2, y2)

    data.append((x1, y1))
    data.append((x2, y2))

plt.show()

# TODO 5: K-means on this data
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.show()

# TODO 7: DBSCAN on this data
dbscan = DBScan(epsilon=1.2, min_points=3)
dbscan.fit(data)

colors = {0: 'red', 1: 'pink', 2: 'yellow', 3: 'cyan', 4: 'green', 5: 'blue'}
Example #37
class ClsfCRTL(object):

    def __init__(self, n_clusters, initCent, max_iter):
        self.data = np.array([])
        self.belongApp = np.array([])
        self.n_clusters = n_clusters
        self.clf = KMeans(n_clusters, initCent, max_iter)

    def genDataset(self, file_name):
        dataSet, belongApp = [], []
        f = open(file_name, "r")
        lines = f.readlines()
        for line in lines:
            line_elm = line.split("\t")
            dataSet.append([int(line_elm[0]), 0])
            belongApp.append(line_elm[1].rstrip("\n"))
        self.data = np.array(dataSet)
        self.belongApp = np.array(belongApp)
        f.close()

    def clsf(self):
        self.clf.fit(self.data)

    def show(self):
        cents = self.clf.centroids
        labels = self.clf.labels
        sse = self.clf.sse
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868', '#00FF00', '#330000',
                  '#333300', '#333333', '#CC0099', '#FFFF00', '#FF99CC', '#CCCC66', '#003333', '#66FFFF']
        for i in range(self.n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = self.data[index, 0]
            x1 = self.data[index, 1]
            y_i = self.belongApp[index]
            for j in range(len(x0)):
                plt.scatter(x0[j], x1[j], marker='o', color=colors[i])
                # plt.text(x0[j], x1[j], str(y_i[j]), color=colors[i], fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=5)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([0, 1600, -2, 2])
        plt.show()

    def showBar(self):
        n = 1600
        X = np.arange(n)
        Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1.0, n)
        rect = plt.bar(X, +Y1, facecolor='#524C90', edgecolor='white')

        for x,y in zip(X, Y1):
            plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va='bottom')

        plt.xlim(-0.5, 12.5)
        plt.ylim(-0.1, +1.25)
        plt.xlabel("xlabel")
        plt.ylabel("ylabel")
        plt.title("title")
        plt.legend((rect,), ("example",))
        plt.show()

    def genResFile(self, i):
        cents = self.clf.centroids
        sse = self.clf.sse

        f = open("Res" + "-" + str(i), "w")
        f.write(str(cents.shape[0]) + '\n')
        for cent in cents:
            f.write(str(cent[0]) + '\t' + str(cent[1]) + '\n')
        f.write(str(sse) + '\n')

        # test
        f.write("\n")
        for clu in self.clf.clusterAssment:
            f.write(str(clu[0]) + '\t' + str(clu[1]) + '\n')
        # test

        f.close()
Example #38
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- INITIALIZING AND APPLYING THE K-MEANS ALGORITHM --- #

# TODO 2: K-means on the Iris data set
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data, normalize=True)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- DETERMINING THE OPTIMAL K --- #
Example #39
from em import calculate_log_likelihood
from kmeans import KMeans


n_clusters = 4
X = utils.load_data('EMGaussienne.data')
Xtest = utils.load_data('EMGaussienne.test')

max_iterations = 150
ridge = 1e-6
verbose = True

n_samples, n_features = X.shape
# Initialise the data using kmeans
k_means = KMeans(k=n_clusters)
k_means_labels, _ = k_means.fit(X.copy())
k_means_cluster_centers = k_means.centers_

mu, sigma, tpi = calculate_em(X, n_clusters)
print('Log likelihood %d' % calculate_log_likelihood(X, mu, sigma, tpi))
print('Log likelihood %d' % calculate_log_likelihood(Xtest, mu, sigma, tpi))

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])

em_labels = p.argmax(axis=0)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(Xtest, mu[i, :], sigma[i, :])
Example #40
    def fit(self, X):
        self.X = X
        self.N = X.shape[0]
        self.ndim = X.shape[1]
        np.random.seed(self.random_seed)
        matX = np.asmatrix(X)

        # initialization schemes
        if self.init_method == 'random':
            if self.init_means is not None:
                mu = self.init_means
            else:
                mu = X[np.random.choice(range(0, len(X)), self.num_gaussians), :]  # sample from the data
            if self.init_cov is not None:
                sigma = self.init_cov
            else:
                sigma = list()
                for k in range(self.num_gaussians):
                    sigma.append(np.identity(self.ndim, dtype=np.float64))
                    sigma[k] += np.random.rand(self.ndim, self.ndim)  # purely synthetic
                    sigma[k] = np.dot(sigma[k], sigma[k].T)  # making it positive semi-definite and symmetric
                    sigma[k] /= sigma[k].sum()

                    # lowerbound = k * self.N / self.num_gaussians  # sample from data
                    # upperbound = lowerbound + 20
                    # sigma[k] = np.cov(X[lowerbound:upperbound, :].T)

            if self.init_weights is not None:
                lmbda = self.init_weights
            else:
                lmbda = np.random.rand(self.num_gaussians)
                lmbda /= lmbda.sum()

        elif self.init_method == 'kmeans':  # use means of kmeans as initial means, and calculate cov from the clusters
            model = KMeans(K=self.num_gaussians, max_iter=5)
            model.fit(X)
            labels = model.pred(X)
            mu = np.zeros((self.num_gaussians, self.ndim))
            sigma = [np.zeros((self.ndim, self.ndim))] * self.num_gaussians
            for k in range(self.num_gaussians):
                cluster = X[labels == k]
                mu[k] = cluster.mean(axis=0)
                sigma[k] = np.cov(cluster.T)
            if self.init_weights is not None:
                lmbda = self.init_weights
            else:
                lmbda = np.random.rand(self.num_gaussians)
                lmbda /= lmbda.sum()


        ######## BEGIN ACTUAL ALGORITHM ###################
        for iter in range(self.max_iter):
            phat = np.zeros((self.N, self.num_gaussians))
            N = np.zeros(self.num_gaussians)

            # E step
            for k in range(0, self.num_gaussians):
                normal_var = normal(mean=mu[k], cov=sigma[k])
                phat[:, k] = lmbda[k] * normal_var.pdf(X)
            phat /= phat.sum(axis=1)[:, None]
            # faster to do it all with numpy than use loops

            # for n in range(0, self.N):  # loop over each data point
            #     for k in range(0, self.num_gaussians):
            #         normalx = normal(mean=mu[k], cov=sigma[k]).pdf(X[n, :])
            #         phat[n, k] = lmbda[k] * normalx
            #     phat[n, :] /= phat[n, :].sum()

            # M step
            for k in range(self.num_gaussians):
                N[k] = phat[:, k].sum()
                mu[k] = np.dot(phat[:, k], X) / N[k]
                intermed = np.multiply(phat[:, k], (matX - mu[k]).T).T
                sigma[k] = np.dot(intermed.T, (matX - mu[k])) / N[k]
                lmbda[k] = N[k] / self.N

        self.mu = mu
        self.sigma = sigma
        self.lmbda = lmbda
Example #41
import pickle

import matplotlib.pyplot as plt
import numpy as np
from kmeans import KMeans, biKMeans

if __name__ == "__main__":
    # load the data
    X, y = pickle.load(open('data.pkl', 'rb'))

    # plot the result after 1, 2, 3, ... iterations in turn
    for max_iter in range(6):
        # set the parameters
        n_clusters = 10
        initCent = X[50:60]  # initialize the centroids to X[50:60]
        # train the model
        clf = KMeans(n_clusters,initCent,max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse
        # plot the clustering result, one color per cluster
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868']
        for i in range(n_clusters):
            index = np.nonzero(labels == i)[0]
            x0 = X[index, 0]
            x1 = X[index, 1]
            y_i = y[index]
            for j in range(len(x0)):
                plt.text(x0[j], x1[j], str(int(y_i[j])), color=colors[i],
                         fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i, 0], cents[i, 1], marker='x', color=colors[i], linewidths=12)
        plt.title("SSE={:.2f}".format(sse))