Ejemplo n.º 1
0
    def test_init(self):
        """Check the attributes KPlusPlus exposes after construction.

        Three constructor forms are exercised:

        1. counts and every optional keyword given, data auto-generated;
        2. only ``N`` given, so the defaults are checked;
        3. explicit data ``X`` given.

        Scalar and list comparisons use ``assertEqual``/``assertIsNone`` so
        that a failure reports the two values involved instead of the
        uninformative "False is not true".  Comparisons that produce a
        boolean array are still reduced with ``.all()``.
        """
        # Case 1: Do not give the data, give the counts, give the number of
        # data points you want it to auto-generate.
        K = 3
        N = 10
        random_counts = np.random.random_sample((N,)) * 100
        alpha = 3
        beta = 0.9
        max_runs = 10
        label = 'Test'
        verbose = True
        mu = [[0, 0], [1, 2], [6, 4]]
        max_diff = 0.005
        w = wkm.KPlusPlus(K, N=N, c=random_counts, alpha=alpha, beta=beta,
                          max_runs=max_runs, label=label, verbose=verbose,
                          mu=mu, max_diff=max_diff)
        self.assertEqual(w.K, K)
        self.assertEqual(w.N, N)
        # Element-wise comparison against the array we passed in.
        self.assertTrue((w.counts_per_data_point == random_counts).all())
        self.assertEqual(w.alpha, alpha)
        self.assertEqual(w.beta, beta)
        self.assertEqual(w.max_runs, max_runs)
        self.assertEqual(w.label, label)
        self.assertEqual(w.verbose, verbose)
        self.assertEqual(w.mu, mu)
        self.assertEqual(w.method, 'manual')  # because we gave it mu
        self.assertIsNone(w.clusters)
        self.assertTrue((w.cluster_indices == [None for _ in w.X]).all())
        self.assertEqual(w.counts_per_cluster, [0 for _ in range(w.K)])
        self.assertIsInstance(w.X, np.ndarray)
        self.assertEqual(len(w.X.shape), 2)
        self.assertTrue((w.scaling_factor == np.ones((w.K)) / w.K).all())
        self.assertEqual(w.runs, 0)
        self.assertEqual(w.max_diff, max_diff)

        # Case 2: Do not give the data, do not give the counts, give the number
        # of datapoints you want it to generate.
        w = wkm.KPlusPlus(K, N=N)
        self.assertEqual(w.K, K)
        self.assertEqual(w.N, N)
        # Without explicit counts every data point is expected to count once.
        self.assertEqual(w.counts_per_data_point, [1 for _ in w.X])
        self.assertEqual(w.alpha, 0)
        self.assertEqual(w.beta, 0)
        self.assertIsInstance(w.X, np.ndarray)
        self.assertEqual(len(w.X.shape), 2)
        self.assertEqual(w.counts_per_cluster, [0 for _ in range(w.K)])
        self.assertEqual(w.method, 'random')

        # Case 3: Give data, do not give counts, do not give number of
        # datapoints.
        X = [[1, 2], [3, 12], [8, 200]]
        N = len(X)
        w = wkm.KPlusPlus(K, X=X)
        self.assertEqual(w.K, K)
        # N was not passed, so it must have been taken from the data.
        self.assertEqual(w.N, N)
Ejemplo n.º 2
0
 def test__init_gauss(self):
     """Check that ``_init_gauss(N)`` returns exactly ``N`` data points."""
     K = 3
     N = 10
     random_counts = np.random.random_sample((N, )) * 100
     w = wkm.KPlusPlus(K, N=N, c=random_counts)
     data = w._init_gauss(N)
     # assertEqual reports both lengths on failure, unlike assertTrue(a == b).
     self.assertEqual(len(data), N)
Ejemplo n.º 3
0
    def test__cluster_points(self):
        """Check that ``_cluster_points`` populates ``w.clusters``.

        Three scenarios are covered: auto-generated data with randomly
        sampled centres, three well-separated points with K=3 (one point per
        cluster), and the same points with K=2 and fixed centres (two points
        share a cluster).  In every scenario ``w.clusters`` must be ``None``
        before the call and contain ``K`` clusters afterwards.
        """
        K = 3
        N = 10
        random_counts = np.random.random_sample((N, )) * 100
        w = wkm.KPlusPlus(K, N=N, c=random_counts, verbose=False)
        w.mu = random.sample(list(w.X), w.K)
        self.assertIsNone(w.clusters)
        w._cluster_points()
        self.assertEqual(len(w.clusters), K)

        K = 3
        X = [[0, 0], [3, 3], [300, 300]]
        # Because the data is so simplistic, I know that clusters will be:
        clusters = [[[0, 0]], [[3, 3]], [[300, 300]]]
        w = wkm.KPlusPlus(K, X=X, verbose=False)
        # random.sample needs a real sequence; wrap w.X in list() exactly as
        # the first scenario does so this also works if w.X is an ndarray.
        w.mu = random.sample(list(w.X), w.K)
        self.assertIsNone(w.clusters)
        w._cluster_points()
        self.assertEqual(len(w.clusters), K)
        # Confirm clusters computed inside the object are identical (different
        # order allowed) to those I know:
        for c in w.clusters:
            for t, test_c in enumerate(clusters):
                if c == test_c:
                    del clusters[t]
                    # The list was just mutated, so stop enumerating it; each
                    # computed cluster consumes at most one expected cluster.
                    break
        # Assert it has been consumed:
        self.assertEqual(clusters, [])

        K = 2
        X = [[0, 0], [3, 3], [300, 300]]
        # Because the data is so simplistic, I know that clusters will be:
        clusters = [[[0, 0], [3, 3]], [[300, 300]]]
        w = wkm.KPlusPlus(K, X=X, verbose=False)
        w.mu = [[0, 0], [300, 300]]
        self.assertIsNone(w.clusters)
        w._cluster_points()
        self.assertEqual(len(w.clusters), K)

        # Confirm clusters computed inside the object are identical (different
        # order allowed) to those I know:
        for c in w.clusters:
            for t, test_c in enumerate(clusters):
                if c == test_c:
                    del clusters[t]
                    # See above: never keep iterating a list after deleting
                    # from it.
                    break
        # Assert it has been consumed:
        self.assertEqual(clusters, [])
Ejemplo n.º 4
0
 def test_plot_clusters(self):
     """Check that ``plot_clusters`` writes its PNG into the working dir.

     The expected file name is built from the object's N, K, alpha and beta
     plus a trailing index of 0.  The file is removed afterwards so the test
     leaves no artefacts behind.
     """
     K = 3
     N = 10
     random_counts = np.random.random_sample((N, )) * 100
     w = wkm.KPlusPlus(K, N=N, c=random_counts)
     w.plot_clusters()
     filename = '/kpp_N_%i_K_%i_alpha_%i_beta_%i_%i.png' % (
         w.N, w.K, w.alpha, w.beta, 0)
     path = os.getcwd() + filename
     # Assert on the file itself: asserting on the path string would always
     # pass because any non-empty string is truthy.
     self.assertTrue(os.path.isfile(path))
     os.remove(path)
Ejemplo n.º 5
0
# weighted_k_means, as would happen if you clone this repo) use the following
# to import instead:
# import weighted_k_means.wkmeans as wkm

from datetime import datetime
startTime = datetime.now()  # reference point for the run-time report below

# How many data points to generate:
N = 5000
# Each data point carries a count, because points need not be unique and may
# even be "empty" (a count of zero). Floats are used here; for integer counts
# use this instead:
# random_counts = np.random.randint(100, size=(N))  # integers for the counts
random_counts = 100 * np.random.random_sample((N,))  # floats for the counts

# Build the clusterer (first argument is K) with some default values:
wkmeans = wkm.KPlusPlus(3, N=N, c=random_counts, alpha=3, beta=0.9)

# To cluster your own data, construct it like this instead:
# wkmeans = wkm.KPlusPlus(3, X=my_data, c=my_counts, alpha=3, beta=0.9)

# Initialise the centroids using k-means++ ...
wkmeans.init_centers()
# ... then iterate to find the clusters:
wkmeans.find_centers(method='++')

# Plot the result, passing the current value of plot_clusters.calls:
plot_index = wkmeans.plot_clusters.calls
wkmeans.plot_clusters(plot_index)

# All done; report how long the whole run took:
print('The End!')
elapsed = datetime.now() - startTime
print('\tRun time: ', elapsed)