Example #1
    def test_fit(self):
        """ Tests that the fit method returns the expected centers using toy
        data.
        """
        arr = np.array([[1, 2], [2, 1], [-1, -2], [-2, -1]])
        x = ds.array(arr, block_size=(2, 2))

        km = KMeans(n_clusters=2, random_state=666, verbose=False)
        km.fit(x)

        expected_centers = np.array([[1.5, 1.5], [-1.5, -1.5]])

        self.assertTrue((km.centers == expected_centers).all())
Example #2
    def test_predict(self):
        """ Tests that labels are correctly predicted using toy data. """
        p1, p2, p3, p4 = [1, 2], [2, 1], [-1, -2], [-2, -1]

        arr1 = np.array([p1, p2, p3, p4])
        x = ds.array(arr1, block_size=(2, 2))

        km = KMeans(n_clusters=2, random_state=666)
        km.fit(x)

        p5, p6 = [10, 10], [-10, -10]

        arr2 = np.array([p1, p2, p3, p4, p5, p6])
        x_test = ds.array(arr2, block_size=(2, 2))

        labels = km.predict(x_test).collect()
        expected_labels = np.array([0, 0, 1, 1, 0, 1])

        self.assertTrue(np.array_equal(labels, expected_labels))
Example #3
    def test_fit_predict(self):
        """ Tests fit_predict."""
        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

        x_train = ds.array(x_filtered, block_size=(300, 2))

        kmeans = KMeans(n_clusters=3, random_state=170)
        labels = kmeans.fit_predict(x_train).collect()

        skmeans = SKMeans(n_clusters=3, random_state=170)
        sklabels = skmeans.fit_predict(x_filtered)

        centers = np.array([[-8.941375656533449, -5.481371322614891],
                            [-4.524023204953875, 0.06235042593214654],
                            [2.332994701667008, 0.37681003933082696]])

        self.assertTrue(np.allclose(centers, kmeans.centers))
        self.assertTrue(np.allclose(labels, sklabels))
Example #4
def main():
    n_samples = 300000000
    n_chunks = 1536
    # Rows per block, rounded up so that all samples are covered:
    # ceil(300000000 / 1536) = 195313.
    chunk_size = int(np.ceil(n_samples / n_chunks))
    n_features = 100
    n_clusters = 500

    # 300M random samples, partitioned row-wise into 1536 blocks.
    x = ds.random_array((n_samples, n_features), (chunk_size, n_features))

    # tol=0 forces all 5 iterations; arity sets the width of the reduction tree.
    km = KMeans(n_clusters=n_clusters, max_iter=5, tol=0, arity=48)
    performance.measure("KMeans", "300M", km.fit, x)
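Note: performance.measure is a helper from dislib's benchmarking scripts, not part of the public API. A minimal stand-in, with the signature inferred from the call above, in case you want to run the snippet without that module:

import time

from pycompss.api.api import compss_barrier


def measure(name, dataset, func, *args):
    # Time a COMPSs-backed call; wait at a barrier so that asynchronous
    # tasks are included in the measurement.
    start = time.time()
    func(*args)
    compss_barrier()
    print("%s (%s): %.3f s" % (name, dataset, time.time() - start))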
Example #5
    def test_init_params(self):
        """ Tests that the KMeans object correctly sets the initialization
        parameters. """
        n_clusters = 2
        max_iter = 1
        tol = 1e-4
        seed = 666
        arity = 2
        init = "random"

        km = KMeans(n_clusters=n_clusters,
                    max_iter=max_iter,
                    tol=tol,
                    arity=arity,
                    random_state=seed)

        expected = (n_clusters, init, max_iter, tol, arity)
        real = (km._n_clusters, km._init, km._max_iter, km._tol, km._arity)
        self.assertEqual(expected, real)
Example #6
    def test_init(self):
        # With dense data
        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))
        x_train = ds.array(x_filtered, block_size=(300, 2))

        init = np.random.random((5, 2))
        km = KMeans(n_clusters=5, init=init)
        km.fit(x_train)

        self.assertTrue(np.array_equal(km._init, init))
        self.assertFalse(np.array_equal(km.centers, init))

        # With sparse data
        x_sp = ds.array(csr_matrix(x_filtered), block_size=(300, 2))
        init = csr_matrix(np.random.random((5, 2)))

        km = KMeans(n_clusters=5, init=init)
        km.fit(x_sp)

        self.assertTrue(np.array_equal(km._init.toarray(), init.toarray()))
        self.assertFalse(np.array_equal(km.centers.toarray(), init.toarray()))
Example #7
    def test_sparse(self):
        """ Tests that K-means produces the same results using dense and
        sparse data structures. """
        file_ = "tests/files/libsvm/2"

        x_sp, _ = ds.load_svmlight_file(file_, (10, 300), 780, True)
        x_ds, _ = ds.load_svmlight_file(file_, (10, 300), 780, False)

        kmeans = KMeans(random_state=170)

        y_sparse = kmeans.fit_predict(x_sp).collect()
        sparse_c = kmeans.centers.toarray()

        kmeans = KMeans(random_state=170)

        y_dense = kmeans.fit_predict(x_ds).collect()
        dense_c = kmeans.centers

        self.assertTrue(np.allclose(sparse_c, dense_c))
        self.assertTrue(np.array_equal(y_sparse, y_dense))
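For reference, the positional arguments in the load_svmlight_file calls above can be spelled out with keywords; a sketch, assuming the parameter names match dislib's documented signature:

x_sp, _ = ds.load_svmlight_file("tests/files/libsvm/2",
                                block_size=(10, 300),  # rows, cols per block
                                n_features=780,
                                store_sparse=True)  # keep blocks as CSR matrices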
Example #8
    def test_kmeans(self):
        """ Tests that K-means fit_predict returns the same results with
            Hecuba-backed and regular ds-arrays. """
        config.session.execute("TRUNCATE TABLE hecuba.istorage")
        config.session.execute("DROP KEYSPACE IF EXISTS hecuba_dislib")

        x, y = make_blobs(n_samples=1500, random_state=170)
        x_filtered = np.vstack(
            (x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

        block_size = (x_filtered.shape[0] // 10, x_filtered.shape[1])

        x_train = ds.array(x_filtered, block_size=block_size)
        x_train_hecuba = ds.array(x=x_filtered, block_size=block_size)
        x_train_hecuba.make_persistent(name="hecuba_dislib.test_array")

        kmeans = KMeans(n_clusters=3, random_state=170)
        labels = kmeans.fit_predict(x_train).collect()

        kmeans2 = KMeans(n_clusters=3, random_state=170)
        h_labels = kmeans2.fit_predict(x_train_hecuba).collect()

        self.assertTrue(np.allclose(kmeans.centers, kmeans2.centers))
        self.assertTrue(np.allclose(labels, h_labels))
Example #9
def main():
    """
    Usage example copied from scikit-learn's webpage.

    """
    plt.figure(figsize=(12, 12))

    n_samples = 1500
    random_state = 170
    x, y = make_blobs(n_samples=n_samples, random_state=random_state)

    dis_x = ds.array(x, block_size=(300, 2))

    # Incorrect number of clusters
    kmeans = KMeans(n_clusters=2, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_x).collect()

    plt.subplot(221)
    plt.scatter(x[:, 0], x[:, 1], c=y_pred)
    centers = kmeans.centers
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title("Incorrect Number of Blobs")

    # Anisotropically distributed data
    transformation = [[0.60834549, -0.63667341], [-0.40887718, 0.85253229]]
    x_aniso = np.dot(x, transformation)

    dis_x_aniso = ds.array(x_aniso, block_size=(300, 2))

    kmeans = KMeans(n_clusters=3, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_x_aniso).collect()

    plt.subplot(222)
    plt.scatter(x_aniso[:, 0], x_aniso[:, 1], c=y_pred)
    centers = kmeans.centers
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title("Anisotropicly Distributed Blobs")

    # Different variance
    x_varied, y_varied = make_blobs(n_samples=n_samples,
                                    cluster_std=[1.0, 2.5, 0.5],
                                    random_state=random_state)

    dis_x_varied = ds.array(x_varied, block_size=(300, 2))

    kmeans = KMeans(n_clusters=3, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_x_varied).collect()

    plt.subplot(223)
    plt.scatter(x_varied[:, 0], x_varied[:, 1], c=y_pred)
    centers = kmeans.centers
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title("Unequal Variance")

    # Unevenly sized blobs
    x_filtered = np.vstack((x[y == 0][:500], x[y == 1][:100], x[y == 2][:10]))

    dis_x_filtered = ds.array(x_filtered, block_size=(300, 2))

    kmeans = KMeans(n_clusters=3, random_state=random_state)
    y_pred = kmeans.fit_predict(dis_x_filtered).collect()

    plt.subplot(224)
    plt.scatter(x_filtered[:, 0], x_filtered[:, 1], c=y_pred)
    centers = kmeans.centers
    plt.scatter(centers[:, 0], centers[:, 1], c="red")
    plt.title("Unevenly Sized Blobs")
    plt.show()
Example #10
def main():
    np.random.seed(0)

    # ============
    # Generate datasets. We choose a size large enough to show the scalability
    # of the algorithms, but not so large that running times become excessive.
    # ============
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples,
                                 factor=.5,
                                 noise=.05,
                                 random_state=170)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    datasets = [(noisy_circles, {
        'damping': .77,
        'preference': -240,
        'quantile': .2,
        'n_clusters': 2
    }), (noisy_moons, {
        'damping': .75,
        'preference': -220,
        'n_clusters': 2
    }), (varied, {
        'eps': .18,
        'n_neighbors': 2
    }), (aniso, {
        'eps': .15,
        'n_neighbors': 2
    }), (blobs, {}), (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster objects
        # ============
        kmeans = KMeans(n_clusters=params["n_clusters"])
        dbscan = DBSCAN(eps=params["eps"], n_regions=1)
        gm = GaussianMixture(n_components=params["n_clusters"])

        clustering_algorithms = (('K-Means', kmeans), ('DBSCAN', dbscan),
                                 ('Gaussian mixture', gm))

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore",
                                        message="the number of connected "
                                        "components of the "
                                        "connectivity matrix is ["
                                        "0-9]{1,2} > 1. Completing "
                                        "it to avoid stopping the "
                                        "tree early.",
                                        category=UserWarning)
                warnings.filterwarnings("ignore",
                                        message="Graph is not fully "
                                        "connected, "
                                        "spectral "
                                        "embedding may not "
                                        "work as "
                                        "expected.",
                                        category=UserWarning)

                data = ds.array(X, block_size=(300, 2))
                algorithm.fit(data)

            t1 = time.time()
            y_pred = algorithm.fit_predict(data).collect()

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99,
                     .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
Example #11
def initialize(alg_names, args):
    return [{
        'KMeans': lambda x: KMeans(**get_kmeans_kwargs(x)),
        'DBSCAN': lambda x: DBSCAN(**get_dbscan_kwargs(x)),
        'GaussianMixture': lambda x: GaussianMixture(**get_gm_kwargs(x))
    }[name](args) for name in alg_names]
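The get_*_kwargs helpers are defined elsewhere in the script. A hypothetical sketch of one, only to make the dispatch pattern concrete (the attribute names on args are assumptions, not the script's actual flags):

def get_kmeans_kwargs(args):
    # Hypothetical mapping of parsed CLI options onto KMeans constructor
    # keyword arguments; the attribute names are illustrative only.
    return {"n_clusters": args.n_clusters,
            "max_iter": args.max_iter,
            "random_state": args.random_state}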
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight", help="read files in SVMLight format",
                        action="store_true")
    parser.add_argument("-dt", "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-a", "--arity", metavar="CASCADE_ARITY", type=int,
                        help="default is 50", default=50)
    parser.add_argument("-c", "--centers", metavar="N_CENTERS", type=int,
                        help="default is 2", default=2)
    parser.add_argument("-b", "--block_size", metavar="BLOCK_SIZE", type=str,
                        help="two comma separated ints that represent the "
                             "size of the blocks in which to divide the input "
                             "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-i", "--iteration", metavar="MAX_ITERATIONS",
                        type=int, help="default is 5", default=5)
    parser.add_argument("-f", "--features", metavar="N_FEATURES",
                        help="number of features of the input data "
                             "(only for SVMLight files)",
                        type=int, default=None, required=False)
    parser.add_argument("--dense", help="store data in dense format (only "
                                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled", help="the last column of the input file "
                                          "represents labels (only for text "
                                          "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format", type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    kmeans = KMeans(n_clusters=args.centers, max_iter=args.iteration,
                    arity=args.arity, verbose=True)
    kmeans.fit(x)

    barrier()
    fit_time = time.time() - s_time

    out = [args.centers, args.arity, args.block_size, read_time, fit_time]

    print(out)
Example #13
    def _initialize_parameters(self, x, random_state):
        """Initialization of the Gaussian mixture parameters.

        Parameters
        ----------
        x : ds-array, shape=(n_samples, n_features)
            Data points.

        random_state : RandomState
            A random number generator instance.
        """
        if self.weights_init is not None:
            self.weights_ = self.weights_init / np.sum(self.weights_init)
        if self.means_init is not None:
            self.means_ = self.means_init
        if self.precisions_init is not None:
            if self.covariance_type == 'full':
                self.precisions_cholesky_ = np.array(
                    [linalg.cholesky(prec_init, lower=True)
                     for prec_init in self.precisions_init])
            elif self.covariance_type == 'tied':
                self.precisions_cholesky_ = linalg.cholesky(
                    self.precisions_init, lower=True)
            else:
                self.precisions_cholesky_ = self.precisions_init
        initialize_params = (self.weights_init is None or
                             self.means_init is None or
                             self.precisions_init is None)
        if initialize_params:
            n_components = self.n_components
            resp_blocks = []
            if self.init_params == 'kmeans':
                if self.verbose:
                    print("KMeans initialization start")
                seed = random_state.randint(0, int(1e8))
                kmeans = KMeans(n_clusters=n_components, random_state=seed,
                                verbose=self.verbose)
                y = kmeans.fit_predict(x)
                self.kmeans = kmeans
                for y_part in y._iterator(axis=0):
                    resp_blocks.append([_resp_subset(y_part._blocks,
                                                     n_components)])

            elif self.init_params == 'random':
                chunks = x._n_blocks[0]
                seeds = random_state.randint(np.iinfo(np.int32).max,
                                             size=chunks)
                for i, x_row in enumerate(x._iterator(axis=0)):
                    resp_blocks.append([_random_resp_subset(x_row.shape[0],
                                                            n_components,
                                                            seeds[i])])
            else:
                raise ValueError("Unimplemented initialization method '%s'"
                                 % self.init_params)
            resp = Array(blocks=resp_blocks,
                         top_left_shape=(x._top_left_shape[0], n_components),
                         reg_shape=(x._reg_shape[0], n_components),
                         shape=(x.shape[0], n_components), sparse=False)
            weights, nk, means = self._estimate_parameters(x, resp)
            if self.means_init is None:
                self.means_ = means
            if self.weights_init is None:
                self.weights_ = weights

            if self.precisions_init is None:
                cov, p_c = _estimate_covariances(x, resp, nk,
                                                 self.means_, self.reg_covar,
                                                 self.covariance_type,
                                                 self.arity)
                self.covariances_ = cov
                self.precisions_cholesky_ = p_c

            for resp_block in resp._blocks:
                compss_delete_object(resp_block)
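A minimal usage sketch that exercises the initialization path above, assuming dislib's public GaussianMixture constructor exposes the init_params and random_state options used in the snippet:

import numpy as np
import dislib as ds
from dislib.cluster import GaussianMixture

# Random toy data; _initialize_parameters runs at the start of fit.
x = ds.array(np.random.random((100, 2)), block_size=(50, 2))

# init_params='kmeans' seeds the responsibilities from a KMeans clustering;
# 'random' would draw them from a uniform distribution instead.
gm = GaussianMixture(n_components=3, init_params='kmeans', random_state=0)
gm.fit(x)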