def kmeans_builder(centroid_func):
    samples_per_cluster = 50
    n_cluster = 9

    x, y = toy_dataset(n_cluster, samples_per_cluster)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')

    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)

    centroids, membership, i = k_means.fit(x, centroid_func)



    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (samples_per_cluster * n_cluster,), \
        'membership for toy dataset should be a vector of size {}'.format(samples_per_cluster * n_cluster)

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')
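`centroid_func` is only forwarded to `KMeans.fit` above, so its exact signature depends on that implementation. A minimal sketch, assuming the callable receives the data and the number of clusters and returns initial centroids drawn from the data:

import numpy as np

def random_centroid_func(x, n_cluster, rng=np.random):
    # hypothetical initializer: pick n_cluster distinct rows of x as starting centroids
    idx = rng.choice(len(x), size=n_cluster, replace=False)
    return x[idx].copy()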
Example #2
def kmeans_image_compression():
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255

    # convert to RGB array
    data = im.reshape(N * M, 3)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)

    print('RGB centroids computed in {} iterations'.format(i))
    new_im = transform_image(im, centroids)

    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('Mean square error per pixel is {}'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)

    np.savez('results/k_means_compression.npz',
             im=im,
             centroids=centroids,
             step=i,
             new_image=new_im,
             pixel_error=mse)
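`transform_image` is not shown in this snippet. A minimal sketch, assuming it replaces every pixel with its nearest RGB centroid:

import numpy as np

def transform_image(im, centroids):
    # map each pixel of the (H, W, 3) image to the closest centroid
    h, w = im.shape[:2]
    pixels = im.reshape(-1, 3)
    dists = np.linalg.norm(pixels[:, None, :] - centroids[None, :, :], axis=2)
    return centroids[dists.argmin(axis=1)].reshape(h, w, 3)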
Example #3
def B4(pca=False):
    '''
    Evaluate using NMI and visualize in 2D.
    '''
    fnames = [
        'digits-embedding.csv', 'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    nmi = zeros(len(fnames))
    for i, k, fname in zip([0, 1, 2], [8, 4, 2], fnames):
        raw = genfromtxt(fname, delimiter=',')
        X = raw[:, 2:]
        y = get_normalized_labels(raw[:, 1])
        kmeans = KMeans(n_clusters=k)
        ind = kmeans.fit(X, y)
        _, _, nmi[i] = kmeans.get_evals()
        figure()
        perm = permutation(X.shape[0])[:1000]
        X = X[perm]
        ind = ind[perm]
        colors = rand(k, 3)[ind, :]
        scatter(X[:, 0], X[:, 1], c=colors, alpha=0.9, s=30)
    print(fnames)
    print("NMI =", nmi)
    show()
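`get_normalized_labels` is assumed to remap the original digit labels (e.g. 2, 4, 6, 7) to consecutive integers 0..k-1; a minimal sketch:

from numpy import searchsorted, unique

def get_normalized_labels(raw_labels):
    # hypothetical helper: map each original label to its index among the sorted unique labels
    classes = unique(raw_labels)
    return searchsorted(classes, raw_labels)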
Example #4
 def test_predict(self):
     test_samples = [[-3, -3], [3, 3], [-1, -1], [1, 1]]
     expected_predictions = [0, 1, 0, 1]
     k_means = KMeans(num_clusters=self.num_clusters, seed=1)
     k_means.fit(self.data)
     predictions = k_means.predict(test_samples)
     self.assertEqual(expected_predictions, predictions)
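The fixture (`self.data`, `self.num_clusters`) is not shown in these tests. A hypothetical setUp that is consistent with the expected centroids (±1.6666667, ±1.6666667) and inertia (2.6666667) used below:

import unittest
import numpy as np

class KMeansTest(unittest.TestCase):
    def setUp(self):
        # hypothetical fixture: two symmetric clusters of three points each
        self.num_clusters = 2
        self.data = np.array([[-1, -2], [-2, -1], [-2, -2],
                              [1, 2], [2, 1], [2, 2]])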
Example #5
    def __init__(self,
                 K_max,
                 embedding_mats,
                 vec_ids_dict,
                 durations_dict,
                 landmarks_dict,
                 n_slices_min=0,
                 n_slices_max=20,
                 min_duration=0,
                 p_boundary_init=0.5,
                 init_assignments="rand",
                 wip=0):

        # Attributes from parameters
        self.n_slices_min = n_slices_min
        self.n_slices_max = n_slices_max
        self.wip = wip

        # Process embeddings into a single matrix, and vec_ids into a list (entry for each utterance)
        embeddings, vec_ids, ids_to_utterance_labels = process_embeddings(
            embedding_mats,
            vec_ids_dict  #, n_slices_min=n_slices_min
        )
        self.ids_to_utterance_labels = ids_to_utterance_labels
        N = embeddings.shape[0]

        # Initialize `utterances`
        lengths = [len(landmarks_dict[i]) for i in ids_to_utterance_labels]
        landmarks = [landmarks_dict[i] for i in ids_to_utterance_labels]
        durations = [durations_dict[i] for i in ids_to_utterance_labels]
        self.utterances = Utterances(lengths,
                                     vec_ids,
                                     durations,
                                     landmarks,
                                     p_boundary_init=p_boundary_init,
                                     n_slices_min=n_slices_min,
                                     n_slices_max=n_slices_max,
                                     min_duration=min_duration)

        # Embeddings in the initial segmentation
        init_embeds = []
        for i in range(self.utterances.D):
            init_embeds.extend(self.utterances.get_segmented_embeds_i(i))
        init_embeds = np.array(init_embeds, dtype=int)
        init_embeds = init_embeds[np.where(init_embeds != -1)]
        print("No. initial embeddings: {}".format(init_embeds.shape[0]))

        # Initialize the K-means components
        assignments = -1 * np.ones(N, dtype=int)
        if init_assignments == "rand":
            assignments[init_embeds] = np.random.randint(
                0, K_max, len(init_embeds))
        elif init_assignments == "spread":
            n_init_embeds = len(init_embeds)
            assignment_list = (
                list(range(K_max)) *
                int(np.ceil(float(n_init_embeds) / K_max)))[:n_init_embeds]
            random.shuffle(assignment_list)
            assignments[init_embeds] = np.array(assignment_list)
        self.acoustic_model = KMeans(embeddings, K_max, assignments)
def cluster_colors(img, n_clusters):
    kmeans = KMeans(n_clusters=n_clusters)
    color_vectors = cv.cvtColor(img, cv.COLOR_BGR2RGB).reshape([-1, 3])
    centroids = kmeans.fit(color_vectors)
    labels = kmeans.predict(color_vectors)
    pred = labels.reshape(img.shape[:-1])

    # Initialize img for clusters
    cluster_img = np.zeros(img.shape)
    for i in range(n_clusters):
        cluster_img[np.where(pred == i)] = centroids[i]

    cluster_img = cluster_img.astype(np.uint8)
    plt.figure(figsize=(10, 10))
    plt.imshow(cluster_img)

    colors = ["Cluster {}".format(i) for i in range(n_clusters)]
    patches = [
        mpatches.Patch(color=centroids[i] / 255, label=colors[i])
        for i in range(len(colors))
    ]
    plt.legend(handles=patches,
               bbox_to_anchor=(1.05, 1),
               loc=2,
               borderaxespad=0.)
    plt.show()

    return kmeans
Example #7
def kmeans_toy():
    x, y = toy_dataset(4)
    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=y)
    fig.savefig('plots/toy_dataset_real_labels.png')

    fig.ax.scatter(x[:, 0], x[:, 1])
    fig.savefig('plots/toy_dataset.png')
    n_cluster = 4
    k_means = KMeans(n_cluster=n_cluster, max_iter=100, e=1e-8)
    centroids, membership, i = k_means.fit(x)

    assert centroids.shape == (n_cluster, 2), \
        ('centroids for toy dataset should be numpy array of size {} X 2'
            .format(n_cluster))

    assert membership.shape == (50 * n_cluster,), \
        'membership for toy dataset should be a vector of size 200'

    assert type(i) == int and i > 0,  \
        'Number of updates for toy datasets should be integer and positive'

    print('[success] : kmeans clustering done on toy dataset')
    print('Toy dataset K means clustering converged in {} steps'.format(i))

    fig = Figure()
    fig.ax.scatter(x[:, 0], x[:, 1], c=membership)
    fig.ax.scatter(centroids[:, 0], centroids[:, 1], c='red')
    fig.savefig('plots/toy_dataset_predicted_labels.png')

    np.savez('results/k_means_toy.npz',
             centroids=centroids,
             step=i,
             membership=membership,
             y=y)
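`toy_dataset` and the `Figure` helper are not shown here. A minimal sketch of a generator with the same call signature, assuming isotropic Gaussian blobs around random 2-D centers and 50 samples per cluster by default:

import numpy as np

def toy_dataset(n_cluster, samples_per_cluster=50, spread=0.5, seed=0):
    # hypothetical generator: one Gaussian blob of samples_per_cluster points per cluster
    rng = np.random.RandomState(seed)
    centers = rng.uniform(-10, 10, size=(n_cluster, 2))
    x = np.vstack([rng.normal(c, spread, size=(samples_per_cluster, 2)) for c in centers])
    y = np.repeat(np.arange(n_cluster), samples_per_cluster)
    return x, y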
Example #8
 def test_fit_with_different_initial_centroids(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=0)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
Example #9
    def _initialise_prams(self, X):

        # Get initial clusters using Kmeans
        kmeans = KMeans(k=self.k, max_iters=500)
        kmeans.fit(X)
        kmeans_preds = kmeans.predict(X)

        N, col_length = X.shape
        mixture_labels = np.unique(kmeans_preds)
        initial_mean = np.zeros((self.k, col_length))
        initial_cov = np.zeros((self.k, col_length, col_length))
        initial_pi = np.zeros(self.k)

        for index, mixture_label in enumerate(mixture_labels):
            mixture_indices = (kmeans_preds == mixture_label)
            Nk = X[mixture_indices].shape[0]

            # Initial pi
            initial_pi[index] = Nk / N

            # Initial mean
            initial_mean[index, :] = np.mean(X[mixture_indices], axis=0)

            # Initial covariance
            de_meaned = X[mixture_indices] - initial_mean[index, :]
            initial_cov[index] = np.dot(initial_pi[index] * de_meaned.T,
                                        de_meaned) / Nk
        assert np.isclose(np.sum(initial_pi), 1)
        return initial_pi, initial_mean, initial_cov
Example #10
 def test02_non_fitted_model_raises_not_fitted_error_message(self):
     model = KMeans(k=2)
     try:
         model.predict(np.array([[1, 0], [0, 1]]))
         self.fail()
     except Exception as e:
         self.assertEqual(str(e), KMeans.NOT_FITTED_ERROR_MESSAGE)
Example #11
def cluster_newsgroups():
    """ Cluster newsgroup categories. """

    from kmeans import KMeans
    from similarity import simMatrix

    corpus, dictionary = build_dictionary(bigram=True)
    tfidf = TFIDF(dictionary)
    newsgroups = tfidf.vectorize(corpus)
    dictionary = tfidf.dictionary

    categories = sorted(corpus.keys())

    N = 6
    print "\n{}-Most Common Words".format(N)
    for index, category in enumerate(categories):
        nlargest = np.argpartition(newsgroups[index, :], -N)[-N:]
        nlargest = nlargest[np.argsort(newsgroups[index, nlargest])][::-1]
        print "{:>24} {}".format(category, dictionary[nlargest])
    print

    K = 3
    km = KMeans(n_clusters=K)
    km.fit(newsgroups)

    labels = km.labels_

    print "\nKMeans Label Assignment, K = {}".format(K)
    for category, label, in zip(categories, labels):
        print int(label), category

    simMatrix(newsgroups).plot().show()
Example #12
File: main.py Project: L4v/ori
def main():
    # NOTE(Jovan): Load data
    data = pd.read_csv("data/skincancer.csv", delimiter=',', index_col=0)
    mort = data.Mort.values
    lat = data.Lat.values
    lon = data.Long.values

    # NOTE(Jovan): Init LinearRegression and predict
    lin_reg = LinearRegression(lat, mort)
    hawaii = lin_reg.predict(20)
    print("Prediction for hawaii[lat=20]:", hawaii)

    # NOTE(Jovan): Init KMeans and add lat and long points
    k_means = KMeans()
    for i, j in zip(lat, lon):
        k_means.points.append(Point(i, j))
    k_means.split(2, 0.01)

    # NOTE(Jovan): Plot clusters
    fig = plt.figure()
    ax = fig.add_axes([0,0,1,1])
    # NOTE(Jovan): First clusters
    for p in k_means._clusters[0].points:
        ax.scatter(p.x, p.y, c="#ff0000")
    # NOTE(Jovan): Second clusters
    for p in k_means._clusters[1].points:
        ax.scatter(p.x, p.y, c="#00ff00")

    # NOTE(Jovan): Plot cluster centers
    center1 = k_means._clusters[0].center
    center2 = k_means._clusters[1].center
    ax.scatter(center1.x, center1.y, marker="P", c="#ff0000")
    ax.scatter(center2.x, center2.y, marker="P", c="#00ff00")
    plt.show()
def spectral(X, sigma, k, centroids):
    """
    Ng谱聚类算法
    :param X: 数据点
    :param sigma: 参数
    :param k: 参数
    :return: accu聚类精度
    """
    (n, d) = X.shape
    L_sym, L = get_L(X, k, sigma)
    eig, eigvec = np.linalg.eig(L_sym)  # eigenvectors are stored column-wise
    # eig_index = np.argsort(eig)[1:d+1]
    eig_index = np.argsort(eig)[:d]  # indices of the d smallest eigenvalues
    U = eigvec[:, eig_index]
    T = np.zeros(U.shape)
    for i in range(n):
        for j in range(d):
            T[i][j] = U[i][j] / np.linalg.norm(U[i])
    Y = T
    # visual(Y, k=k, sigma=sigma, save=1)

    cluster = KMeans(2, 100, centroids)
    cluster.fit(Y)
    labels = cluster.labels

    if labels[0] == 0:
        n1 = 100 - sum(labels[:100])
        n2 = sum(labels[100:])
    else:
        n1 = sum(labels[:100])
        n2 = 100 - sum(labels[100:])
    accu = (n1 + n2) / n
    print('---------------------sigma=%.2f, k=%d, accu=%.4f' %
          (sigma, k, accu))
    return accu
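`get_L` is not defined in this snippet. A minimal sketch, assuming it builds an RBF affinity graph restricted to the k nearest neighbours and returns the symmetric normalized Laplacian together with the unnormalized one:

import numpy as np

def get_L(X, k, sigma):
    # hypothetical helper: W = exp(-||xi - xj||^2 / (2 sigma^2)) kept only for k nearest neighbours,
    # then L = D - W and L_sym = D^(-1/2) L D^(-1/2)
    d2 = ((X[:, None, :] - X[None, :, :]) ** 2).sum(axis=2)
    W = np.exp(-d2 / (2 * sigma ** 2))
    np.fill_diagonal(W, 0)
    keep = np.zeros_like(W, dtype=bool)
    nearest = np.argsort(-W, axis=1)[:, :k]      # k largest affinities per row
    np.put_along_axis(keep, nearest, True, axis=1)
    W = np.where(keep | keep.T, W, 0)            # symmetrize the kNN graph
    deg = W.sum(axis=1)
    L = np.diag(deg) - W
    d_inv_sqrt = np.diag(1.0 / np.sqrt(np.maximum(deg, 1e-12)))
    L_sym = d_inv_sqrt @ L @ d_inv_sqrt
    return L_sym, L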
Example #14
 def __init__(self,
              n_cluster: int,
              data: np.ndarray,
              use_kmeans: bool = False,
              w: float = 0.9,
              c1: float = 0.5,
              c2: float = 0.3,
              flag: int = 1,
              weights: list = None):
     index = np.random.choice(list(range(len(data))), n_cluster)
     self.centroids = data[index].copy()
     if use_kmeans:
         kmeans = KMeans(n_cluster=n_cluster, init_pp=False)
         kmeans.fit(data)
         self.centroids = kmeans.centroid.copy()
     self.best_position = self.centroids.copy()
     self.best_score = quantization_error(self.centroids, self._predict(data), data)
     self.flag=flag
     if self.flag%2==1:
         self.best_sse = calc_sse(self.centroids, self._predict(data), data)
     else:
         self.best_sse = calc_sse2(self.centroids, self._predict(data), data, weights)
     self.velocity = np.zeros_like(self.centroids)
     self._w = w
     self._c1 = c1
     self._c2 = c2
Example #15
 def test_select_initial_centroids(self):
     expected_initial_centroids = [[2, 1], [-1, -2]]
     k_means = KMeans(num_clusters=self.num_clusters, seed=3)
     k_means.fit(self.data)
     initial_centroids = k_means._select_initial_centroids(self.data)
     self.assertEqual(expected_initial_centroids, initial_centroids)
     self.assertEqual(self.num_clusters, len(initial_centroids))
Example #16
def main2():
    df = pd.read_csv('credit_card_data.csv')
    df = df.fillna(df.median())
    original_data = df.iloc[:, 1:].values
    data = copy.deepcopy(original_data)

    columns = list(df.columns)[1:]  # list of column names
    print(columns)

    # min_max_data(df, columns)

    normalizacija(data)  # normalize the loaded data

    pca = PCA()
    pca.fit(data)

    # decide how many dimensions to reduce to
    plt.plot(range(1, 18), pca.explained_variance_ratio_.cumsum(), marker='x', linestyle='--')
    plt.xlabel('Components')  # features
    plt.ylabel('Variance')
    plt.show()

    components = 7  # chosen from the plot
    pca = PCA(n_components=components)
    pca.fit(data)
    scores = pca.transform(data)
    # print(scores)  # has as many components as we specified

    # show that the first two components contribute the most
    plt.bar(range(pca.n_components_), pca.explained_variance_ratio_, color='black')
    plt.xlabel('PCA components')
    plt.ylabel('Variance %')  # percentage of variance explained, roughly speaking
    plt.xticks(range(pca.n_components_))
    plt.show()

    # I get optimal k = 5 for the first 500 rows loaded
    # for all rows I get 6
    # optimal_k_plot(data)

    broj_klastera = 6

    k_means = MyKMeans(n_clusters=broj_klastera, max_iter=100)
    k_means.fit(scores, normalize=False)
    klaster_indeksi = k_means.klaster_indeksi
    print(klaster_indeksi)

    lista_klastera_sa_originalnim_podacima = []  # list of clusters holding the original data
    for i in range(broj_klastera):
        lista_klastera_sa_originalnim_podacima.append([])

    for i in range(len(original_data)):
        lista_klastera_sa_originalnim_podacima[klaster_indeksi[i]].append(original_data[i])

    # print cluster characteristics and the decision tree
    print_descriptions(lista_klastera_sa_originalnim_podacima, columns)
    # print_decision_tree(original_data, klaster_indeksi, columns)
    print_clusters_description()

    # plot the points
    plot_2_D(k_means)
Example #17
def main():
    tagged_words = brown.tagged_words()
    words_corpus = brown.words()

    word2vec = Word2Vec()
    word2vec.train(words_corpus)

    word_vecs = [word2vec.word2vec(word) for word in words_corpus]

    n_clusters = 10 # random number for now
    kmeans = KMeans(n_clusters)
    kmeans.compute(word_vecs)

    # word-cluster HMM
    p_word = {}
    p_cluster = {}

    p_cluster_given_word = None # softmax
    p_word_given_cluster = None # joint probability formula

    p_transition_cluster = None # count
    p_initial_cluster = None # count

    # cluster-tag HMM
    p_cluster_given_tag = None # softmax
    p_transition_tag = None # count from tagged data
    p_initial_tag = None # count from tagged data

    hmm_word_cluster = HMM(p_initial_cluster, p_transition_cluster, p_word_given_cluster)
    hmm_cluster_tag = HMM(p_initial_tag, p_transition_tag, p_cluster_given_tag)

    words = []
    clusters = hmm_word_cluster.viterbi(words)
    tags = hmm_cluster_tag.viterbi(clusters)
Example #18
def main():
    filepath = "./data/self_test.csv"
    #filepath = "./data/self_test_petit.csv"
    #filepath = "./data/iris.csv"

    # load the data
    data, labels = load_dataset(filepath)

    # initialize the KMeans object
    kmeans = KMeans(n_clusters=3,
                    max_iter=100,
                    early_stopping=True,
                    tol=1e-6,
                    display=True)

    # compute the clusters
    kmeans.fit(data)

    # compute the purity of our clusters
    score = kmeans.score(data, labels)
    print("Purity: {}".format(score))



    input("Press any key to exit...")
Example #19
    def test_whole(self):
        """
        Tests the score method.
        """

        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        n_features = X.shape[1]
        k = centers.shape[0]

        # run N_TRIALS, pick best model
        best_model = None
        for i in range(N_TRIALS):
            kmeans = KMeans(k, N_ITER)
            kmeans.fit(X)
            if best_model is None:
                best_model = kmeans
            elif kmeans.score(X) < best_model.score(X):
                best_model = kmeans

        # check sum squared errors
        sum_squared_errors = best_model.score(X)
        self.assertLess(sum_squared_errors / n_samples, EPS)

        # compare centers to expected centers
        smallest_distances = find_smallest_distances(
            best_model.cluster_centers, centers)
        for distance in smallest_distances:
            self.assertLess(distance, EPS)
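`find_smallest_distances` and `generate_cluster_samples` come from the test helpers and are not shown. A minimal sketch of the former, assuming it returns, for every row of the first array, the Euclidean distance to the closest row of the second:

import numpy as np

def find_smallest_distances(points, references):
    # pairwise distances (n_points, n_references), reduced to the nearest reference per point
    dists = np.linalg.norm(points[:, None, :] - references[None, :, :], axis=2)
    return dists.min(axis=1)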
Example #20
def do_KMeans_clustering(N_cluster, X, device):
	"""
	This function will use KMeans Clustering method to label training data
	according to its proximity with a cluster
	Input:
		N_cluster: number of cluster estimated by Gap Statistics
		X: Training data for the input layer
	Output:
		cluster_label: label assigned to every point
		over_coef: this will be used in the oversampling method to increase
				   number of points in the less densed cluster region
	"""

	X = X.to(device)

	#Instantiating kmeans object
	kmeans = KMeans(n_clusters=N_cluster, mode='euclidean', verbose=1)
	cluster_label = kmeans.fit_predict(X)

	#Calculating the size of cluster (number of data near the cluster centroid)
	cluster_size = torch.zeros(N_cluster, dtype=torch.int32).to(device)
	for cluster in range(N_cluster):
		cluster_size[cluster] = len(torch.where(cluster_label==cluster)[0])

	over_coef = torch.zeros(N_cluster, dtype=torch.int32).to(device)
	for cluster in range(N_cluster):
		over_coef[cluster] = torch.clone((max(cluster_size))/cluster_size[cluster]).to(device)
		if over_coef[cluster] > 10:
			over_coef[cluster] = 10

	return cluster_label.cpu(), over_coef.cpu()
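The returned `over_coef` is meant to drive oversampling of sparse clusters; a hypothetical use, repeating the samples of each cluster `over_coef[c]` times so that small clusters are represented roughly as often as the densest one:

import torch

def oversample(X, cluster_label, over_coef):
	# hypothetical helper: tile the rows of each cluster by its oversampling coefficient
	parts = []
	for c in range(len(over_coef)):
		idx = torch.where(cluster_label == c)[0]
		parts.append(X[idx].repeat(int(over_coef[c]), 1))
	return torch.cat(parts, dim=0)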
Example #21
def B1(pca=False):
    '''
    Plot WC_SSD and SC over K.
    '''
    K = [2, 4, 6, 8, 16, 32]
    fnames = [
        'digits-embedding.csv', 'digits-embedding-2467.csv',
        'digits-embedding-67.csv'
    ]
    wc_ssd_val = zeros((len(fnames), len(K)))
    sc_val = zeros((len(fnames), len(K)))
    for i, fname in enumerate(fnames):
        X = genfromtxt(fname, delimiter=',')[:, 2:]
        for j, k in enumerate(K):
            kmeans = KMeans(n_clusters=k)
            kmeans.fit(X)
            wc_ssd_val[i, j], sc_val[i, j], _ = kmeans.get_evals()
    # Plot WC_SSD
    figure()
    for i, fname in enumerate(fnames):
        plot(K, wc_ssd_val[i], label=fname)
    legend()
    title('WC_SSD v.s. K')
    figure()
    for i, fname in enumerate(fnames):
        plot(K, sc_val[i], label=fname)
    legend()
    title('SC v.s. K')
    show()
Example #22
    def __init_parameters(self):
        N = self.X.shape[0]
        n_features = self.X.shape[1]

        kmeans = KMeans(n_clusters=self.n_components, n_init=5)
        kmeans.fit(self.X)

        # mu, means for each component
        self.means_ = kmeans.cluster_centers_
        # sigma, covariances for each component
        self.covariances_ = np.zeros(
            [self.n_components, n_features, n_features])
        # pi, weights for each component
        self.weights_ = np.zeros(self.n_components)
        for k in range(self.n_components):
            logic = (kmeans.labels_ == k)
            Nk = logic.sum()

            # otherwise error
            if Nk > 1:
                Xk = self.X[logic]
                self.covariances_[k] = np.cov(Xk.T)

            self.weights_[k] = Nk / N

        # gamma(Znk)
        self.gamma = np.zeros([N, self.n_components])
        # log_likelihood
        self.lower_bound_ = -np.inf

        return self
Example #23
def kmeans_image_compression():

    print("[+] K-Means Image Compression")
    im = plt.imread('baboon.tiff')
    N, M = im.shape[:2]
    im = im / 255

    # convert to RGB array
    data = im.reshape(N * M, 3)
    # print(im)

    k_means = KMeans(n_cluster=16, max_iter=100, e=1e-6)
    centroids, _, i = k_means.fit(data)

    # print(centroids.shape)

    print('[+] RGB centroids computed in {} iterations'.format(i))
    new_im = transform_image(im, centroids)

    assert new_im.shape == im.shape, \
        'Shape of transformed image should be same as image'

    mse = np.sum((im - new_im)**2) / (N * M)
    print('[+] Mean square error per pixel is {}\n'.format(mse))
    plt.imsave('plots/compressed_baboon.png', new_im)
Example #24
    def test_cluster_points_two_cluster(self):
        test_vector = self.create_test_data_vector()

        kmeans = KMeans(test_vector, 2)

        test_point0 = datapoint.DataPoint()
        test_point0.add_dimension(1.1)
        test_point0.add_dimension(2.1)
        test_point0.add_dimension(3.1)
        test_point1 = datapoint.DataPoint()
        test_point1.add_dimension(3.1)
        test_point1.add_dimension(1.1)
        test_point1.add_dimension(2.1)
        test_cluster = datapoint.DataVector()
        test_cluster.add_point(test_point0)
        test_cluster.add_point(test_point1)

        self.assertEqual(
            [1.0, 2.0, 3.0],
            kmeans.cluster_points(test_cluster)[0].data_points[0].coordinates)
        self.assertEqual(
            [2.0, 3.0, 1.0],
            kmeans.cluster_points(test_cluster)[0].data_points[1].coordinates)
        self.assertEqual(
            [3.0, 1.0, 2.0],
            kmeans.cluster_points(test_cluster)[1].data_points[0].coordinates)
Example #25
    def test_assign_points(self):
        """
        Tests initialize methods of the KMeans class. 
        """
        X, y, centers = generate_cluster_samples()
        n_samples = X.shape[0]
        k = centers.shape[0]

        kmeans = KMeans(k, N_ITER)

        # Set cluster centers so that assignment is deterministic
        kmeans.cluster_centers = centers
        assignments, distances = kmeans.assign_points(X)

        # check assignment array shape
        self.assertEqual(assignments.ndim, 1)
        self.assertEqual(assignments.shape[0], n_samples)

        # check distances array shape
        self.assertEqual(distances.ndim, 1)
        self.assertEqual(distances.shape[0], n_samples)

        # check that assignments only include valid cluster indices (0 <= idx < k)
        self.assertTrue(
            np.all(np.logical_and(assignments < k, assignments >= 0)))

        # Check cluster assignments are correct
        self.assertTrue(np.all(assignments[:25] == 0))
        self.assertTrue(np.all(assignments[25:50] == 1))
        self.assertTrue(np.all(assignments[50:75] == 2))
        self.assertTrue(np.all(assignments[75:] == 3))
Example #26
    def test_initialize(self):
        """
        Tests initialize methods of the KMeans class. 
        """
        k = 3
        n_samples = 100
        n_features = 10

        for i in range(N_TRIALS):
            X = np.random.randn(n_samples, n_features)

            kmeans = KMeans(k, N_ITER)
            kmeans.initialize_clusters(X)

            # ensure that the cluster_centers matrix has the right shape
            self.assertEqual(kmeans.cluster_centers.ndim, 2)
            self.assertEqual(kmeans.cluster_centers.shape[0], k)
            self.assertEqual(kmeans.cluster_centers.shape[1], n_features)

            # Check that every center is one of the points in X.
            # Calculate the distances between every cluster center
            # and every point in X.  Find the closest matches.
            # Check that the distances are nearly 0.0
            distances = find_smallest_distances(X, kmeans.cluster_centers)
            for d in distances:
                self.assertAlmostEqual(d, 0.0)
Example #28
    def test_kmeans(self):

        locations = [[1, 1], [1, 2], [2, 1], [1, 3], [3, 1], [2, 2], [10, 10],
                     [10, 20], [20, 10], [10, 30], [30, 10], [20, 20]]

        clusterer = KMeans(2)
        clusterer.train(locations)
Example #29
def plot_elbow(interval, data, random_seed=None):
    inertia = []
    for n_clusters in interval:
        clf = KMeans(k=n_clusters, init='kmeans++', random_seed=random_seed)
        clf.fit(data)
        inertia.append(clf.inertia)
    plot_metrics(interval, inertia, 'Elbow method', 'Number of clusters (K)',
                 'Sum of Squared Error')
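A hypothetical call, sweeping K from 2 to 10 over a stand-in data matrix (assuming the KMeans and plot_metrics used above are importable):

import numpy as np

data = np.random.randn(500, 2)  # stand-in data for illustration
plot_elbow(range(2, 11), data, random_seed=42)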
Example #30
    def test06_fit_two_clusters(self):
        np.random.seed(1)
        model = KMeans(k=2, init=init.forgy_initialization)
        data = np.array([[-1.0, 0.0], [-1.001, 0.0], [-0.999, 0.0], [0.0, 1.0],
                         [0.0, 0.999], [0.0, 1.001]])

        model.fit(data)
        self.assertEqual(model.predict(data), [1, 1, 1, 0, 0, 0])
Example #31
 def test_fit(self):
     expected_labels = [0, 0, 0, 1, 1, 1]
     expected_centroids = [[-1.6666667, -1.6666667], [1.6666667, 1.6666667]]
     expected_inertia = 2.6666667
     k_means = KMeans(num_clusters=self.num_clusters, seed=1)
     k_means.fit(self.data)
     self.assertEqual(expected_labels, k_means.labels_)
     np.testing.assert_almost_equal(expected_centroids, k_means.centroids_)
     self.assertAlmostEqual(expected_inertia, k_means.inertia_)
def squared_clustering_errors(inputs, k):
    """finds the total squared error from k-means clustering the inputs"""
    clusterer = KMeans(k)
    clusterer.train(inputs)
    means = clusterer.means()
    assignments = map(clusterer.classify, inputs)

    return sum(squared_distance(input, means[cluster])
               for input, cluster in zip(inputs, assignments))
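`squared_distance` is assumed to be the plain component-wise squared Euclidean distance between two vectors:

def squared_distance(v, w):
    # hypothetical helper: sum of squared component differences
    return sum((v_i - w_i) ** 2 for v_i, w_i in zip(v, w))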
Example #33
def task3(dataset):
    dimensions = len(dataset.training[0].data)
    print('k-means')
    for k in [9, 10, 20]:
        kmeans = KMeans(dimensions, k)
        kmeans.train(dataset.training)
        predictions = [kmeans(x) for x in dataset.testing]
        print('k=', k, ' ', sep='', end='')
        print_error(predictions, dataset.testing)
def process_articles(input_file, num_partitions=8):
    sc = SparkContext()
    try:
        input_rdd = sc.textFile(input_file)
        vectorized_docs = CorpusVectorizer(input_rdd).vectorize_corpus()
        centroids       = KMeans(vectorized_docs).centroids
        print >> sys.stdout, centroids.take(4)
    except Exception as e:
        print >> sys.stderr, "Unable to load file"
        print >> sys.stderr, e
        sys.exit(0)
Example #35
def ClusterQueryDoc(dataset, rankerPath, feature_count, path_train_dataset, path_test_dataset, iterations, click_model, clusterData, queryDataPath, from_var, to_var):

    C = Fake(dataset, path_train_dataset, rankerPath, feature_count)
    C.Save()

    bestRankersFile = 'QueryData/' + dataset + '.data'
    KM = KMeans(from_var, to_var, bestRankersFile, dataset)
    (queryToCluster, clusterToRanker) = KM.runScript()

    g = GroupRanker(path_train_dataset, path_test_dataset, feature_count, iterations, click_model, dataset, clusterData, queryDataPath)
    g.groupRanker()
Example #36
def _compute_k_means_clusters(data, similarity_calculator, similarity_diff_threshold):
    computed_clusters = {}
    k_means = KMeans(data.persons, similarity_calculator)
    for personID in data.originalPeople:
        friends_of_person = data.persons.getPerson(personID).getFriends()
        if len(friends_of_person) > 250:
            k = 12
        else:
            k = 6
        clusters = k_means.computeClusters(friends_of_person, k, similarity_diff_threshold)
        computed_clusters[personID] = clusters
    return computed_clusters
Example #37
def calculate_em(X, n_clusters, diag=False, ridge=1e-10, verbose=False, max_iterations=100):
    """
    Returns mu, sigma and tpi
    """
    n_samples, n_features = X.shape
    # Initialise the data using kmeans
    k_means = KMeans(k=n_clusters)
    k_means_labels, _ = k_means.fit(X.copy())
    k_means_cluster_centers = k_means.centers_

    # OK, so we've got the centers and the labels. Let's now compute the EM
    # algorithm
    tau = np.zeros((n_samples, n_clusters))
    mu = np.zeros((n_clusters, n_features))
    sigma = np.zeros((n_clusters, n_features, n_features))
    p = np.zeros((n_clusters, n_samples))
    # FIXME: should be able to do the following using pure matrix arithmetic
    for i, element in enumerate(k_means_labels):
        tau[i, element] = 1

    for j in range(max_iterations):
        old_mu = mu.copy()
        for i in range(n_clusters):
            mu[i] = (tau[:, i].reshape((tau.shape[0], 1)) * X).sum(axis=0) / (tau[:, i]).sum()

        for i in range(n_clusters):
            a = 0
            for n in range(n_samples):
                b = (X[n, :] - mu[i]).reshape((n_features, 1))
                if diag:
                    a += tau[n, i] * np.dot(b.T, b)
                else:
                    a += tau[n, i] * np.dot(b, b.T)

            if diag:
                sigma[i, :] = a.mean() / tau[:, i].sum() * np.identity(mu.shape[1])
            else:
                sigma[i, :] = a / tau[:, i].sum()

        tpi = tau.sum(axis=1) / n_samples
        for i in range(n_clusters):
            p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])

        for i in range(n_clusters):
            tau.T[i, :] = tpi[i] * p[i, :] / (tpi * p).sum(axis=0)

        if ((old_mu - mu) ** 2).sum() < ridge:
            if verbose:
                print "break at iterations %d" % j
            break

    return mu, sigma, tpi
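`_calculate_normal` is used but not defined here. A minimal sketch, assuming it evaluates the multivariate normal density at every row of X:

import numpy as np

def _calculate_normal(X, mu, sigma):
    # hypothetical helper: N(x | mu, sigma) evaluated for each row of X
    d = X.shape[1]
    diff = X - mu
    inv = np.linalg.inv(sigma)
    norm_const = 1.0 / np.sqrt(((2 * np.pi) ** d) * np.linalg.det(sigma))
    return norm_const * np.exp(-0.5 * np.einsum('ij,jk,ik->i', diff, inv, diff))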
Example #38
class Reducer:
    def __init__(self):
        self.k = int(self.params.get("k", "10"))
        self.max_iterations = int(self.params.get("max_iterations", "100"))
        self.kmeans = KMeans(self.k, self.max_iterations)

    def __call__(self, key, values):
        # convert input to numpy_array and feed the vectors to KMeans instance
        for _vid, _vector_array in enumerate(values):
            _vector_array = numpy.array(_vector_array)
            self.kmeans.add_vector(_vid, _vector_array)
        self.kmeans.initialize()

        for _cluster in self.kmeans.run():
            for item in _cluster:
                yield item.cid, item
Example #39
    def apply_decluster(self):
        """
        apply window method to the whole catalog and write mainshocks on file
        """
        # get instances of classes we'll need
        catalog = Catalog()
        kmeans = KMeans() 

        # from the catalog we want, get earthquakes array on memory
        earthquake_array = catalog.get_earthquake_array('../catalogs/new_jma.txt')

        # decluster array, separating mainshocks and aftershocks
        declustered_array = kmeans.do_kmeans(earthquake_array)
        
        # record the mainshocks on a catalog
        catalog.record_mainshocks(declustered_array, file_write='../results/mainshocks.txt', file_read='../catalogs/jma.txt')
def main(args):
    df = pd.read_csv(args.data_csv)
    data = np.array(df[['X', 'Y']])
    plt.clf()
    plt.scatter(data[:, 0], data[:, 1], s=3, color='blue')

    if args.algorithm == 'gmm':
        gmm = GaussianMixtureModel(args.num_clusters)
        gmm.fit(data)
        y = gmm.predict_cluster(data)
    else:
        km = KMeans(args.num_clusters)
        km.fit(data)
        y = km.predict(data)
    plt.scatter(data[:, 0], data[:, 1], c=y)
    plt.show()
Example #41
    def apply_decluster_smaller(self):
        """
        apply window method to a smaller catalog and write mainshocks on file
        """
        # get instances of classes we'll need
        catalog = Catalog()
        kmeans = KMeans() 

        # obtain a smaller catalog, so we can run this function faster
        catalog.get_smaller_catalog(300)

        # from the catalog we want, get earthquakes array on memory
        earthquake_array = catalog.get_earthquake_array()
        
        # decluster array, separating mainshocks and aftershocks
        declustered_array = kmeans.do_kmeans(earthquake_array, 25)
        
        # record the mainshocks on a catalog
        catalog.record_mainshocks(declustered_array, file_write='../results/mainshocks.txt', file_read='../catalogs/reduced_jma.txt')
Example #42
def kmeans():
    numClusters = 4
    filename = 'data/4clusters.csv'

    kmeans = KMeans(filename, numClusters)
    clusters = kmeans.cluster()

    formatted = dict()
    for i, cluster in enumerate(clusters):
        formatted[i] = []
        for point in cluster:
            #f_cluster = dict()
            #f_cluster[point[0]] = point[1]
            #formatted[i].append(f_cluster)
            formatted[i].append(point)

    print formatted
    return {'clusters':formatted, 'k':len(formatted), 'get_url': app.get_url}
Example #43
	def train(self, X, Y, beta, nb_epochs=5, normalize=False, gradients=True, alpha=1e-3):
		
		# m = nb_examples
		# n = nb_features
		# k = nb_hidden (nb of neurons in hidden layer)
		(m, n), k = X.shape, self.nb_hidden

		# K-Means for input layer:
		self.km = KMeans(nb_cluster=k)

		# select k centroids from training set:
		self.km.train(X, nb_iters=100, init_from_train=True)
		
		# set the centroids as the weights of the first layer:
		self.weights[0] = self.km.get_centroids()


		# calculate the activation of the rbf:
		# in pythonic way:
		A = np.array([np.exp(-beta * np.sum(np.power(self.weights[0] - x, 2), axis=1)) for x in X]).reshape(m, k)
		
		# in noob way:
		# A, betas = np.zeros((m, k)), np.ones((k, 1)) * beta
		# for i in range(m):
		# 	A[i] = self.get_activation(X[i, :], betas).T

		# speeds up convergence and is necessary
		# for the pseudo-inverse method
		if normalize:
			# divide each row by its sum
			A = A / np.sum(A, axis=1)[:,None]

		# for bias
		A = add_column_with_ones(A)
		
		if gradients:
			errors = []
			while not reached_precision(errors, precision=1e-7):
				# the same process as used in linear regression, i.e.
				# use the gradient descent for minimize the loss function
				self.weights[1] = self.gradient_descent(A, Y, beta, alpha)

				# calculate the errors with the trained weights:
				errors.append(np.sum(np.power(self.predict(X, beta, normalize=normalize) - Y, 2)) / m)
		else:
			# learn the weights using pseudo-inverse function:
			self.weights[1] = pinv(A.T.dot(A)).dot(A.T.dot(Y))
		
			# calculate the errors with the trained weights:
			errors = [0, np.sum(np.power(self.predict(X, beta, normalize=normalize) - Y, 2)) / m]

		return errors
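`add_column_with_ones` is a small helper that is not shown; a minimal sketch, assuming it prepends a bias column of ones:

import numpy as np

def add_column_with_ones(A):
    # hypothetical helper: (m, k) activations -> (m, k + 1) with a leading bias column
    return np.hstack([np.ones((A.shape[0], 1)), A])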
Example #44
def main():
    dataset = _load_csv_data(CSV_PATH, CSV_COLUMN_DELIMITER)
    k = 2
    max_iter = 10

    handler = KMeans(dataset, k=k)
    handler.kmeans()
    while k < max_iter:
        handler.reinitialize(k=k)
        handler.kmeans()
        k += 1
Example #45
def compare_window_kmeans(num_entries):
    """
    receives a number of entries 
    the function applies to a catalog with that number of entries
    the window method and the kmeans to decluster the catalog
    the function returns a comparation, showing how much there is 
    of a difference between the two

    Complexity: O(n^2)
    """

    # obtain a smaller catalog 
    catalog = Catalog()
    catalog.get_smaller_catalog(num_entries)

    # get earthquake array of that catalog 
    quakes = catalog.get_earthquake_array()

    # get declustered array of that catalog, according to the window method
    window = Window()
    window_quakes = window.decluster(quakes)

    # count the mainshocks of that catalog found by the window method
    num_mainshocks = 0 
    for i in range(len(window_quakes)):
        if window_quakes[i].is_aftershock == False:
            num_mainshocks += 1
    print(num_mainshocks)

    # apply declustering using the kmeans method to the catalog
    kmeans = KMeans()
    kmeans_quakes = kmeans.do_kmeans(quakes, num_mainshocks)

    # show what are the differences between both methods
    for i in range(len(quakes)):
        if window_quakes[i].is_aftershock != kmeans_quakes[i].is_aftershock:
            print("found a difference!")
Example #46
def main():
    print "K-Means algorithm illustrated through the iris dataset"
    print "The algorithm uses random initialization and iterates until no iris"
    print "switches clusters."
    print "If matplotlib is installed, the resulting clusters are illustrated"
    print "in a graphical manner."
    print

    # import the data
    irii = import_csv()

    # create and initialize the algorithm
    kmeans = KMeans(3, irii, euclidean_similarity)

    # run the algorithm
    num_iterations = kmeans.run()
    print "K-Means ran in %d iterations" % (num_iterations)
    print

    print "SSE Values for each cluster:"
    for cluster_num, sse in enumerate(kmeans.sses()):
        num_members = len(kmeans.clusters[cluster_num])
        print "Cluster %d (%d members): %f" % (cluster_num, num_members, sse)

    print

    # create a plot
    try:
        print "Plotting sepal length vs sepal width."
        print "Colors indicate the cluster, as identified by k-means across all"
        print "attributes. The symbol indicates the 'correct' group, as"
        print "determined by the name of the IRIS. The black + symbols indicate"
        print "the centroids."
        create_plot(kmeans)
    except:
        print "There was a plotting error. Do you have matplotlib installed?"
def urf_games():
    my_kmeans = KMeans()
    results = []
    results.append({"Control: Wards placed/Wards destroyed" : my_kmeans.calculate('control')})
    results.append({'Damage: Physical/Magic/True' : my_kmeans.calculate('damage')})
    results.append({'Economy: Gold Earned/Gold Spent' : my_kmeans.calculate('economy')})
    results.append({'Kills: Kills/Deaths/Assists' : my_kmeans.calculate('kills')})
    results.append({'Multi Kills: Combo Kills/Killing Sprees/Largest Killing Spree': my_kmeans.calculate('multi_kills')})

    return render_template('results.html', data=results)
from kmeans import KMeans
from matplotlib import pyplot as plt

path_to_file = "casino1.jpg"
import matplotlib.image as mpimg
img = mpimg.imread(path_to_file)

pixels = [pixel for row in img for pixel in row]
clusterer = KMeans(5)
clusterer.train(pixels)     # this might take a while

def recolor(pixel):
    cluster = clusterer.classify(pixel)             # index of the closest cluster
    return clusterer.means[cluster]

new_img = [[recolor(pixel) for pixel in row]        # recolor this row of pixels
           for row in img]

plt.imshow(new_img)
plt.axis('off')
plt.show()
Example #49
 def __init__(self, n_clusters, initCent, max_iter):
     self.data = np.array([])
     self.belongApp = np.array([])
     self.n_clusters = n_clusters
     self.clf = KMeans(n_clusters, initCent, max_iter)
Example #50
class ClsfCRTL(object):

    def __init__(self, n_clusters, initCent, max_iter):
        self.data = np.array([])
        self.belongApp = np.array([])
        self.n_clusters = n_clusters
        self.clf = KMeans(n_clusters, initCent, max_iter)

    def genDataset(self, file_name):
        dataSet, belongApp = [], []
        f = open(file_name, "r")
        lines = f.readlines()
        for line in lines:
            line_elm = line.split("\t")
            dataSet.append([int(line_elm[0]), 0])
            belongApp.append(line_elm[1].rstrip("\n"))
        self.data = np.array(dataSet)
        self.belongApp = np.array(belongApp)
        f.close()

    def clsf(self):
        self.clf.fit(self.data)

    def show(self):
        cents = self.clf.centroids
        labels = self.clf.labels
        sse = self.clf.sse
        colors = ['b', 'g', 'r', 'k', 'c', 'm', 'y', '#e24fff', '#524C90', '#845868', '#00FF00', '#330000',
                  '#333300', '#333333', '#CC0099', '#FFFF00', '#FF99CC', '#CCCC66', '#003333', '#66FFFF']
        for i in range(self.n_clusters):
            index = np.nonzero(labels==i)[0]
            x0 = self.data[index, 0]
            x1 = self.data[index, 1]
            y_i = self.belongApp[index]
            for j in range(len(x0)):
                plt.scatter(x0[j], x1[j], marker='o', color=colors[i])
                # plt.text(x0[j],x1[j],str(y_i[j]),color=colors[i],fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i,0], cents[i,1], marker='x', color=colors[i], linewidths=5)
        plt.title("SSE={:.2f}".format(sse))
        plt.axis([0, 1600, -2, 2])
        plt.show()

    def showBar(self):
        n = 1600
        X = np.arange(n)
        Y1 = (1-X/float(n) * np.random.uniform(0.5, 1.0, n))
        rect = plt.bar(X, +Y1, facecolor='#524C90', edgecolor='white')

        for x,y in zip(X, Y1):
            plt.text(x+0.4, y+0.05, '%.2f' % y, ha='center', va='bottom')

        plt.xlim(-0.5, 12.5)
        plt.ylim(-0.1, +1.25)
        plt.xlabel("xlabel")
        plt.ylabel("ylabel")
        plt.title("title")
        plt.legend((rect,), ("example",))
        plt.show()

    def genResFile(self, i):
        cents = self.clf.centroids
        sse = self.clf.sse

        f = open("Res" + "-" + str(i), "w")
        f.write(str(cents.shape[0]) + '\n')
        for cent in cents:
            f.write(str(cent[0]) + '\t' + str(cent[1]) + '\n')
        f.write(str(sse) + '\n')

        # test
        f.write("\n")
        for clu in self.clf.clusterAssment:
            f.write(str(clu[0]) + '\t' + str(clu[1]) + '\n')
        # test

        f.close()
Example #51
iris_data = load_iris()  # load the Iris dataset
iris_data = iris_data.data[:, 1:3]  # take the second and third features (sepal width and petal length)

plt.figure()
for i in range(len(iris_data)):
    plt.scatter(iris_data[i, 0], iris_data[i, 1])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- INITIALIZATION AND APPLICATION OF THE K-MEANS ALGORITHM --- #

# TODO 2: K-means on the Iris dataset
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(iris_data, normalize=True)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the cluster centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.xlabel('Sepal width')
plt.ylabel('Petal length')
plt.show()


# --- DETERMINING THE OPTIMAL K --- #
Example #52
  def doTrain(self, feats, clusters, maxIters = 1024, epsilon = 1e-4):
    # Initialise using kmeans...
    km = KMeans()
    kmAssignment = numpy.empty(feats.shape[0],dtype=numpy.float_)
    km.train(feats,clusters,assignOut = kmAssignment)
    
    # Create the assorted data structures needed...
    mix = numpy.ones(clusters,dtype=numpy.float_)/float(clusters)
    mean = numpy.empty((clusters,feats.shape[1]),dtype=numpy.float_)
    for c in xrange(clusters): mean[c,:] = km.getCentre(c)
    sd = numpy.zeros(clusters,dtype=numpy.float_)

    tempCount = numpy.zeros(clusters,dtype=numpy.int_)
    for f in xrange(feats.shape[0]):
      c = kmAssignment[f]
      dist = ((feats[f,:] - mean[c,:])**2).sum()
      tempCount[c] += 1
      sd += (dist-sd)/float(tempCount[c])
    sd = numpy.sqrt(sd/float(feats.shape[1]))

    wv = numpy.ones((feats.shape[0],clusters),dtype=numpy.float_) # Weight vectors calculated in e-step.
    pwv = numpy.empty(clusters,dtype=numpy.float_) # For convergence detection.
    norms = numpy.empty(clusters,dtype=numpy.float_) # Normalising constants for the distributions, to save repeated calculation.

    sqrt2pi = math.sqrt(2.0*math.pi)

    # The code...
    code = """
    for (int iter=0;iter<maxIters;iter++)
    {
     // e-step - for all features calculate the weight vector (Also do convergence detection.)...
     for (int c=0;c<Nmean[0];c++)
     {
      norms[c] = pow(sqrt2pi*sd[c], Nmean[1]);
     }
     
     bool done = true;
     for (int f=0;f<Nfeats[0];f++)
     {
      float sum = 0.0;
      for (int c=0;c<Nmean[0];c++)
      {
       float distSqr = 0.0;
       for (int i=0;i<Nmean[1];i++)
       {
        float diff = FEATS2(f,i) - MEAN2(c,i);
        distSqr += diff*diff;
       }
       pwv[c] = WV2(f,c);
       float core = -0.5*distSqr / (sd[c]*sd[c]);
       WV2(f,c) = mix[c]*exp(core); // Unnormalised.
       WV2(f,c) /= norms[c]; // Normalisation
       sum += WV2(f,c);
      }
      for (int c=0;c<Nmean[0];c++)
      {
       WV2(f,c) /= sum;
       done = done && (fabs(WV2(f,c)-pwv[c])<epsilon);
      }
     }

     if (done) break;


     // Zero out mix,mean and sd, ready for filling...
     for (int c=0;c<Nmean[0];c++)
     {
      mix[c] = 0.0;
      for (int i=0;i<Nmean[1];i++) MEAN2(c,i) = 0.0;
      sd[c] = 0.0;
     }

     
     // m-step - update the mixing vector, means and sd...
     // *Calculate mean and mixing vector incrementally...
     for (int f=0;f<Nfeats[0];f++)
     {
      for (int c=0;c<Nmean[0];c++)
      {
       mix[c] += WV2(f,c);
       if (WV2(f,c)>1e-6) // Must not update if value is too low due to division in update - NaN avoidance.
       {
        for (int i=0;i<Nmean[1];i++)
        {
         MEAN2(c,i) += WV2(f,c) * (FEATS2(f,i) - MEAN2(c,i)) / mix[c];
        }
       }
      }
     }
     
     // prevent the mix of any given component getting too low - will cause the algorithm to NaN...
     for (int c=0;c<Nmean[0];c++)
     {
      if (mix[c]<1e-6) mix[c] = 1e-6;
     }

     // *Calculate the sd simply, initial calculation is sum of squared differences...
     for (int f=0;f<Nfeats[0];f++)
     {
      for (int c=0;c<Nmean[0];c++)
      {
       float distSqr = 0.0;
       for (int i=0;i<Nmean[1];i++)
       {
        float delta = FEATS2(f,i) - MEAN2(c,i);
        distSqr += delta*delta;
       }
       sd[c] += WV2(f,c) * distSqr;
      }
     }

     // *Final adjustments for the new state...
     float mixSum = 0.0;
     for (int c=0;c<Nmean[0];c++)
     {
      sd[c] = sqrt(sd[c]/(mix[c]*float(Nfeats[1])));
      mixSum += mix[c];
     }
     
     for (int c=0;c<Nmean[0];c++) mix[c] /= mixSum;
    }
    """

    # Weave it...
    weave.inline(code,['feats', 'maxIters', 'epsilon', 'mix', 'mean', 'sd', 'wv', 'pwv', 'norms', 'sqrt2pi'])

    # Store result...
    self.mix = mix
    self.mean = mean
    self.sd = sd
Example #53
    def fit(self, X):
        self.X = X
        self.N = X.shape[0]
        self.ndim = X.shape[1]
        np.random.seed(self.random_seed)
        matX = np.asmatrix(X)

        # initialization schemes
        if self.init_method == 'random':
            if self.init_means is not None:
                mu = self.init_means
            else:
                mu = X[np.random.choice(range(0, len(X)), self.num_gaussians), :]  # sample from the data
            if self.init_cov is not None:
                sigma = self.init_cov
            else:
                sigma = list()
                for k in range(self.num_gaussians):
                    sigma.append(np.identity(self.ndim, dtype=np.float64))
                    sigma[k] += np.random.rand(self.ndim, self.ndim)  # purely synthetic
                    sigma[k] = np.dot(sigma[k], sigma[k].T)  # making it positive semi-definite and symmetric
                    sigma[k] /= sigma[k].sum()

                    # lowerbound = k * self.N / self.num_gaussians  # sample from data
                    # upperbound = lowerbound + 20
                    # sigma[k] = np.cov(X[lowerbound:upperbound, :].T)

            if self.init_weights is not None:
                lmbda = self.init_weights
            else:
                lmbda = np.random.rand(self.num_gaussians)
                lmbda /= lmbda.sum()

        elif self.init_method == 'kmeans':  # use means of kmeans as initial means, and calculate cov from the clusters
            model = KMeans(K=self.num_gaussians, max_iter=5)
            model.fit(X)
            labels = model.pred(X)
            mu = np.zeros((self.num_gaussians, self.ndim))
            sigma = [np.zeros((self.ndim, self.ndim))] * self.num_gaussians
            for k in range(self.num_gaussians):
                cluster = X[labels == k]
                mu[k] = cluster.mean(axis=0)
                sigma[k] = np.cov(cluster.T)
            if self.init_weights is not None:
                lmbda = self.init_weights
            else:
                lmbda = np.random.rand(self.num_gaussians)
                lmbda /= lmbda.sum()


        ######## BEGIN ACTUAL ALGORITHM ###################
        for iter in range(self.max_iter):
            phat = np.zeros((self.N, self.num_gaussians))
            N = np.zeros(self.num_gaussians)

            # E step
            for k in range(0, self.num_gaussians):
                normal_var = normal(mean=mu[k], cov=sigma[k])
                phat[:, k] = lmbda[k] * normal_var.pdf(X)
            phat /= phat.sum(axis=1)[:, None]
            # faster to do it all with numpy than use loops

            # for n in range(0, self.N):  # loop over each data point
            #     for k in range(0, self.num_gaussians):
            #         normalx = normal(mean=mu[k], cov=sigma[k]).pdf(X[n, :])
            #         phat[n, k] = lmbda[k] * normalx
            #     phat[n, :] /= phat[n, :].sum()

            # M step
            for k in range(self.num_gaussians):
                N[k] = phat[:, k].sum()
                mu[k] = np.dot(phat[:, k], X) / N[k]
                intermed = np.multiply(phat[:, k], (matX - mu[k]).T).T
                sigma[k] = np.dot(intermed.T, (matX - mu[k])) / N[k]
                lmbda[k] = N[k] / self.N

            pass  # end of this iteration
        self.mu = mu
        self.sigma = sigma
        self.lmbda = lmbda
Example #54
dimensions = 3

for tests in range(0, 4):

    precisions = []
    times = []

    for executions in range(0, 100):

        print(str(executions) + "%")

        start_time = time.time()

        particles = []

        k_means = KMeans()

        # Particles Initialization
        for i in range(population_size):
            p = Particle()

            num_clusters = random.randint(2, 7)

            plist = [num_clusters, ]
            plist.extend([random.uniform(0, 10) for i in range(0, k_means.dimens * 7)])

            p.current_position = array(plist)
            p.best_position = p.current_position
            p.fitness = 0.0
            p.velocity = 0.0
            particles.append(p)
Example #55
import cPickle
import matplotlib.pyplot as plt
import numpy as np
from kmeans import KMeans,biKMeans
    
if __name__ == "__main__":
    # load the data
    X,y = cPickle.load(open('data.pkl','r'))

    # plot the result after 1, 2, 3, ... iterations in turn
    for max_iter in range(6):
        # set the parameters
        n_clusters = 10
        initCent = X[50:60]  # initialize the centroids to X[50:60]
        # train the model
        clf = KMeans(n_clusters,initCent,max_iter)
        clf.fit(X)
        cents = clf.centroids
        labels = clf.labels
        sse = clf.sse
        # plot the clustering result, one color per cluster
        colors = ['b','g','r','k','c','m','y','#e24fff','#524C90','#845868']
        for i in range(n_clusters):
            index = np.nonzero(labels==i)[0]
            x0 = X[index,0]
            x1 = X[index,1]
            y_i = y[index]
            for j in range(len(x0)):
                plt.text(x0[j],x1[j],str(int(y_i[j])),color=colors[i],\
                         fontdict={'weight': 'bold', 'size': 9})
            plt.scatter(cents[i,0],cents[i,1],marker='x',color=colors[i],linewidths=12)
import numpy as np
import matplotlib.pyplot as plt
from scipy.misc import *
import scipy.io as sio

from kmeans import KMeans


plt.close('all')

X = sio.loadmat('ex7data2.mat')['X']

classifier = KMeans(X)

initial_centroids = np.asarray([[3, 3], [6, 2], [8, 5]])
idx = classifier.find_closest_centroids(initial_centroids)
print("Closest centroids for the first 3 examples:")
print(np.str(idx[0:3]))
print("(the closest centroids should be 0, 2, 1 respectively)")

centroids = classifier.compute_centroids(idx)
print("Centroids computed after initial finding of closest centroids: \n")
print(np.str(centroids))
print('(the centroids should be')
print('   [ 2.428301 3.157924 ]')
print('   [ 5.813503 2.633656 ]')
print('   [ 7.119387 3.616684 ]\n')

centroids, idx = classifier.run(plot_progress=True)
plt.show()
print("K-Means Done.")
    s1[i] = (x1, y1)

    r2, theta2 = np.random.normal(5, 0.25), np.random.uniform(0, 2*np.pi)
    x2, y2 = r2 * np.cos(theta2), r2 * np.sin(theta2)
    s2[i] = (x2, y2, r2, theta2)

    plt.scatter(x1, y1)
    plt.scatter(x2, y2)

    data.append((x1, y1))
    data.append((x2, y2))

plt.show()

# TODO 5: K-means on this data
kmeans = KMeans(n_clusters=2, max_iter=100)
kmeans.fit(data)

colors = {0: 'red', 1: 'green'}
plt.figure()
for idx, cluster in enumerate(kmeans.clusters):
    plt.scatter(cluster.center[0], cluster.center[1], c=colors[idx], marker='x', s=200)  # plot the cluster centers
    for datum in cluster.data:  # plot the points
        plt.scatter(datum[0], datum[1], c=colors[idx])

plt.show()

# TODO 7: DBSCAN on this data
dbscan = DBScan(epsilon=1.2, min_points=3)
dbscan.fit(data)
Example #58
from em import calculate_em, _calculate_normal
from em import calculate_log_likelihood
from kmeans import KMeans


n_clusters = 4
X = utils.load_data('EMGaussienne.data')
Xtest = utils.load_data('EMGaussienne.test')

max_iterations = 150
ridge = 1e-6
verbose = True

n_samples, n_features = X.shape
# Initialise the data using kmeans
k_means = KMeans(k=n_clusters)
k_means_labels, _ = k_means.fit(X.copy())
k_means_cluster_centers = k_means.centers_

mu, sigma, tpi = calculate_em(X, n_clusters)
print 'Log likelihood %d' % calculate_log_likelihood(X, mu, sigma, tpi)
print 'Log likelihood %d' % calculate_log_likelihood(Xtest, mu, sigma, tpi)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
    p[i, :] = _calculate_normal(X, mu[i, :], sigma[i, :])

em_labels = p.argmax(axis=0)

p = np.zeros((n_clusters, n_samples))
for i in range(n_clusters):
	def preConfigureModel(self):
		
		configureStart = time.time() # the pre-configuration starting time
		
		# get the previous trading day
		yesterday = self.dateOfTrade
		while True:
			yesterday -= datetime.timedelta(days = 1)
			datasetPath = 'dataset/' + yesterday.strftime('%d-%h-%G')
			if isdir(datasetPath):
				break
			else:
				continue
		
		try:
			remove('data/all_features.pkl')
		except:
			pass
		
		print 'Pre-Configuration stage, on date :', yesterday
		
		# the previous day's best features and data points are saved
		featureObject = GetFeatures(datasetPath + '/corpora')
		
		extractionStart = time.time()
		featureObject.extractFeatures(parameters.ourFeatureType) # 2-word combinations feature-extraction method
		extractionEnd = time.time()
		
		print '\nFeature Extraction time : %.2f minutes' %((extractionEnd - extractionStart)/60)
		
		selectionStart = time.time()
		featureObject.selectFeatures(parameters.ourSelectionType, parameters.initialNumberOfFeatures) # BNS feature-selection method, number of features
		selectionEnd = time.time()
		
		print 'Feature Selection time : %.2f minutes' %((selectionEnd - selectionStart)/60)
		
		numberOfVectors = featureObject.representFeatures()
		print 'Document vectors formed .. '
		
		copy('data/best_features.pkl', 'data/all_best_features.pkl')
		
		#------------------------------------ K-means Running ---------------------------------------
		
		print '\nRunning K-means ..'
		
		kmeansStart = time.time()
		
		kmeansObject = KMeans()
		kmeansObject.getDataPoints()
		kmeansObject.getInitialCenters()
		
		iterationNumber = 1
		notConverged = True
		
		while notConverged == True and iterationNumber < parameters.maximumIterations :
			timeNow = time.time()
			
			if iterationNumber % 20 == 0:
				print '..Iteration Number : %3d Time Elapsed till now : %.2f minutes' %(iterationNumber, (timeNow - kmeansStart) / 60.0)
			else:
				pass
			
			kmeansObject.assignToCluster()
			notConverged = kmeansObject.recalculateCentroids()
			
			iterationNumber += 1
		
		kmeansObject.saveClusters()

		kmeansEnd = time.time()
		
		print 'Kmeans running time : %.2f minutes' %((kmeansEnd - kmeansStart)/60)
		
		#-------------------------------------------------------------------------------------------------------
		
		# cluster info
		fileReader = open('data/cluster_info.pkl', 'r')
		clusterInfo = pickle.load(fileReader)
		fileReader.close()
		
		# for projected clustering
		print '\nPreparing the initial fading clusters ..'
		projectedClusteringStart = time.time()
		
		self.projectedClusteringObject = ProjectedClustering()
		self.projectedClusteringObject.prepareFadingClusters(clusterInfo)
		
		projectedClusteringEnd = time.time()
		
		print 'Fading clusters preparation time : %.2f minutes' %((projectedClusteringEnd - projectedClusteringStart) / 60)
		
		# take the last 10 files of 'yesterday' and store in the 'last' folder
		lastFileNames = []
		
		fileReader = open(datasetPath + '/log_file.txt', 'r')
		for line in fileReader:
			lastFileNames.append(line.split(' ')[0])
		fileReader.close()
		
		fileWriter = open('last/log.txt', 'w') # write the names of files
		
		lastFileNames = lastFileNames[-10:]
		for fileName in lastFileNames:
			try:
				copy(datasetPath + '/corpora/' + fileName, 'last/' + fileName)
			except:
				pass
			fileWriter.write(fileName + '\n')
		
		fileWriter.close()
		
		# get the ann ready
		self.annObject.loadAnnWeights()
		
		print '\nANN locked and loaded ..'
		
		configureEnd = time.time()
		
		print "Total time taken to pre-configure : %.2f minutes" %((configureEnd - configureStart)/60)