Example No. 1
    def test_cluster_between_regions_2(self):
        """ Tests that DBSCAN can find clusters between regions. """
        x = np.array([[0, 0], [0.6, 0], [0.9, 0], [1.1, 0.2], [0.9, 0.6],
                      [1.1, 0.8], [1.4, 0.8], [2, 2]])
        ds_x = ds.array(x, block_size=(5, 2))

        dbscan = DBSCAN(n_regions=2, eps=0.5, min_samples=3)
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 1)
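These test methods are excerpts from a unittest.TestCase in dislib's test suite. A minimal harness for running one of them, assuming dislib is installed (some of the later examples additionally need sklearn.datasets and sklearn.preprocessing.StandardScaler), might look like:

import unittest

import numpy as np

import dislib as ds
from dislib.cluster import DBSCAN


class DBSCANTest(unittest.TestCase):
    # Paste any of the test methods from these examples here.
    pass


if __name__ == '__main__':
    unittest.main()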
Example No. 2
    def test_small_cluster_2(self):
        """ Tests that DBSCAN can find clusters with less than min_samples. """
        x = np.array([[0, 0], [0, 1], [1, 0], [3, 0], [5.1, 0], [6, 0], [6, 1],
                      [10, 10]])
        ds_x = ds.array(x, block_size=(5, 2))

        # n_regions=10
        dbscan2 = DBSCAN(n_regions=10, eps=2.5, min_samples=4)
        dbscan2.fit(ds_x)
        self.assertEqual(dbscan2.n_clusters, 2)
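For reference, the cluster count asserted above comes from per-sample labels that can be inspected directly; in dislib, as in scikit-learn, noise points are labeled -1. A self-contained sketch reusing this test's data:

import numpy as np

import dislib as ds
from dislib.cluster import DBSCAN

x = np.array([[0, 0], [0, 1], [1, 0], [3, 0], [5.1, 0], [6, 0], [6, 1],
              [10, 10]])
ds_x = ds.array(x, block_size=(5, 2))

dbscan = DBSCAN(n_regions=10, eps=2.5, min_samples=4)
labels = dbscan.fit_predict(ds_x).collect()
print(np.unique(labels))  # expected [-1  0  1]: noise plus the two clusters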
Example No. 3
    def test_cluster_between_regions_1(self):
        """ Tests that DBSCAN can find clusters between regions. """
        x = np.array([[0, 0], [3.9, 0], [4.1, 0], [4.1, 0.89], [4.1, 0.88],
                      [5.9, 0], [5.9, 0.89], [5.9, 0.88], [6.1, 0], [10, 10],
                      [4.6, 0], [5.4, 0]])
        ds_x = ds.array(x, block_size=(5, 2))

        dbscan = DBSCAN(n_regions=10, eps=0.9, min_samples=4)
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 1)
Example No. 4
    def test_zero_samples(self):
        """ Tests DBSCAN fit when some regions contain zero samples.
        """
        n_samples = 2
        x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
        dbscan = DBSCAN(n_regions=3, eps=.2, max_samples=100)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(2, 2))
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 0)
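With only two samples, neither point can gather min_samples neighbors (the parameter is left at its default here, 5 in dislib, mirroring scikit-learn), so no core point forms and both points end up as noise; most of the 3×3 region grid is empty, which is exactly the edge case this test exercises.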
Example No. 5
    def test_random_clusters_2(self):
        """ Tests DBSCAN on random data with multiple clusters. """
        # 2 dimensions
        np.random.seed(2)
        x = np.random.uniform(0, 10, size=(1000, 2))
        ds_x = ds.array(x, block_size=(300, 2))
        dbscan = DBSCAN(n_regions=10, max_samples=10, eps=0.5, min_samples=10)
        y = dbscan.fit_predict(ds_x).collect()

        self.assertEqual(dbscan.n_clusters, 27)
        self.assertEqual(np.count_nonzero(y == -1), 206)
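The asserted values (27 clusters, 206 noise points) are regression references valid only for the fixed seed np.random.seed(2); a different seed would require different constants.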
Example No. 6
    def test_n_clusters_moons_grid(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        setting n_regions > 1 with moon data.
        """
        n_samples = 1500
        x, y = make_moons(n_samples=n_samples, noise=.05)
        dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=600)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 2)
Example No. 7
    def test_n_clusters_blobs_grid(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        setting n_regions > 1 with blob data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, n_features=2, random_state=8)
        dbscan = DBSCAN(n_regions=4, eps=.3, max_samples=300)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 3)
Example No. 8
    def test_n_clusters_circles_max_samples(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        defining max_samples with circle data.
        """
        n_samples = 1500
        x, y = make_circles(n_samples=n_samples, factor=.5, noise=.05)
        dbscan = DBSCAN(n_regions=1, eps=.15, max_samples=500)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        dbscan.fit(ds_x)
        self.assertEqual(dbscan.n_clusters, 2)
Example No. 9
    def test_random_clusters_3(self):
        """ Tests DBSCAN on random data with multiple clusters. """
        # 3 dimensions
        np.random.seed(3)
        x = np.random.uniform(0, 10, size=(1000, 3))
        ds_x = ds.array(x, block_size=(300, 3))
        dbscan = DBSCAN(n_regions=10,
                        dimensions=[0, 1],
                        eps=0.9,
                        min_samples=4)
        y = dbscan.fit_predict(ds_x).collect()

        self.assertEqual(dbscan.n_clusters, 50)
        self.assertEqual(np.count_nonzero(y == -1), 266)
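Here dimensions=[0, 1] appears to restrict the region grid to the first two of the three features, while eps-neighborhoods are still computed on the full 3-D points; as before, the asserted 50 clusters and 266 noise points are reference values tied to the fixed seed.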
Example No. 10
def main():
    data = ds.load_txt_file("/gpfs/projects/bsc19/COMPSs_DATASETS/dislib/gaia"
                            "/dbscan/data_scaled.csv", block_size=(10000, 5))

    dbscan = DBSCAN(eps=0.19, min_samples=5, max_samples=5000, n_regions=17,
                    dimensions=[0, 1])
    performance.measure("DBSCAN", "gaia", dbscan.fit, data)
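performance.measure is a helper local to these benchmark scripts, not part of dislib's public API. A minimal sketch of what it plausibly does, timing the distributed fit between COMPSs barriers (only the call signature is taken from the example; the body is an assumption):

import time

from pycompss.api.api import compss_barrier


def measure(name, dataset_name, func, *args):
    # Flush any pending COMPSs tasks so the timer covers only func.
    compss_barrier()
    s_time = time.time()
    func(*args)
    # Wait for all distributed tasks spawned by func to finish.
    compss_barrier()
    print(name, dataset_name, time.time() - s_time)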
Example No. 11
    def test_sparse(self):
        """ Tests that DBSCAN produces the same results with sparse and
        dense data.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=1, eps=.15)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)

        dense = ds.array(x, block_size=(300, 2))
        sparse = ds.array(csr_matrix(x), block_size=(300, 2))

        y_dense = dbscan.fit_predict(dense).collect()
        y_sparse = dbscan.fit_predict(sparse).collect()

        self.assertTrue(np.array_equal(y_dense, y_sparse))
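Besides the usual imports, this snippet assumes from scipy.sparse import csr_matrix; ds.array accepts both dense NumPy arrays and CSR matrices, and the test verifies that the two backings yield identical labels.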
Example No. 12
def main():
    file = "/fefs/scratch/bsc19/bsc19029/PERFORMANCE/datasets/data_scaled.csv"
    data = ds.load_txt_file(file, block_size=(10000, 5))

    dbscan = DBSCAN(eps=0.19,
                    min_samples=5,
                    max_samples=5000,
                    n_regions=17,
                    dimensions=[0, 1])
    performance.measure("DBSCAN", "gaia", dbscan.fit, data)
Example No. 13
    def test_n_clusters_aniso_dimensions(self):
        """ Tests that DBSCAN finds the correct number of clusters when
        dimensions is not None.
        """
        n_samples = 1500
        x, y = make_blobs(n_samples=n_samples, random_state=170)
        dbscan = DBSCAN(n_regions=5, dimensions=[1], eps=.15)
        transformation = [[0.6, -0.6], [-0.4, 0.8]]
        x = np.dot(x, transformation)
        x = StandardScaler().fit_transform(x)
        ds_x = ds.array(x, block_size=(300, 2))
        y_pred = dbscan.fit_predict(ds_x).collect()
        true_sizes = {19, 496, 491, 488, 6}
        cluster_sizes = {
            y_pred[y_pred == -1].size, y_pred[y_pred == 0].size,
            y_pred[y_pred == 1].size, y_pred[y_pred == 2].size,
            y_pred[y_pred == 3].size
        }

        self.assertEqual(dbscan.n_clusters, 4)
        self.assertEqual(true_sizes, cluster_sizes)
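The sizes are compared as sets because DBSCAN assigns cluster ids in an arbitrary order; the comparison is robust to label permutations as long as all cluster sizes are distinct, which holds for these reference values.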
Example No. 14
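This script's import header is not shown; from the names it uses, it presumably needs something along these lines (dislib exposes KMeans, DBSCAN and GaussianMixture under dislib.cluster; the rest is standard):

import time
import warnings
from itertools import cycle, islice

import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs, make_circles, make_moons
from sklearn.preprocessing import StandardScaler

import dislib as ds
from dislib.cluster import DBSCAN, GaussianMixture, KMeans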
def main():
    np.random.seed(0)

    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples,
                                 factor=.5,
                                 noise=.05,
                                 random_state=170)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropically distributed data
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = make_blobs(n_samples=n_samples,
                        cluster_std=[1.0, 2.5, 0.5],
                        random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 3
    }

    datasets = [(noisy_circles, {
        'damping': .77,
        'preference': -240,
        'quantile': .2,
        'n_clusters': 2
    }), (noisy_moons, {
        'damping': .75,
        'preference': -220,
        'n_clusters': 2
    }), (varied, {
        'eps': .18,
        'n_neighbors': 2
    }), (aniso, {
        'eps': .15,
        'n_neighbors': 2
    }), (blobs, {}), (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster objects
        # ============
        kmeans = KMeans(n_clusters=params["n_clusters"])
        dbscan = DBSCAN(eps=params["eps"], n_regions=1)
        gm = GaussianMixture(n_components=params["n_clusters"])

        clustering_algorithms = (('K-Means', kmeans), ('DBSCAN', dbscan),
                                 ('Gaussian mixture', gm))

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings("ignore",
                                        message="the number of connected "
                                        "components of the "
                                        "connectivity matrix is ["
                                        "0-9]{1,2} > 1. Completing "
                                        "it to avoid stopping the "
                                        "tree early.",
                                        category=UserWarning)
                warnings.filterwarnings("ignore",
                                        message="Graph is not fully "
                                        "connected, "
                                        "spectral "
                                        "embedding may not "
                                        "work as "
                                        "expected.",
                                        category=UserWarning)

                data = ds.array(X, block_size=(300, 2))
                algorithm.fit(data)

            t1 = time.time()
            y_pred = algorithm.fit_predict(data).collect()

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99,
                     .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes,
                     size=15,
                     horizontalalignment='right')
            plot_num += 1

    plt.show()
Example No. 15
def initialize(alg_names, args):
    return [{
        'KMeans': lambda x: KMeans(**get_kmeans_kwargs(x)),
        'DBSCAN': lambda x: DBSCAN(**get_dbscan_kwargs(x)),
        'GaussianMixture': lambda x: GaussianMixture(**get_gm_kwargs(x))
    }[name](args) for name in alg_names]
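The dict of lambdas defers construction so that only the algorithms named in alg_names are instantiated; get_kmeans_kwargs, get_dbscan_kwargs and get_gm_kwargs are helpers defined elsewhere in the same script that translate parsed command-line arguments into constructor keyword arguments.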
Example No. 16
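As in the previous example, the import header is omitted; based on the names used below, it presumably reads:

import argparse
import time

import numpy as np

import dislib as ds
from dislib.cluster import DBSCAN
from pycompss.api.api import compss_barrier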
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--svmlight",
                        help="read file in SVMlLight format",
                        action="store_true")
    parser.add_argument("-dt",
                        "--detailed_times",
                        help="get detailed execution times (read and fit)",
                        action="store_true")
    parser.add_argument("-e",
                        "--epsilon",
                        metavar="EPSILON",
                        type=float,
                        help="default is 0.5",
                        default=0.5)
    parser.add_argument("-r",
                        "--regions",
                        metavar="N_REGIONS",
                        type=int,
                        help="number of regions to create",
                        default=1)
    parser.add_argument("-d",
                        "--dimensions",
                        metavar="DIMENSIONS",
                        type=str,
                        help="comma separated dimensions to use in the grid",
                        required=False)
    parser.add_argument("-x",
                        "--max_samples",
                        metavar="MAX_SAMPLES",
                        type=int,
                        help="maximum samples to process per task ("
                        "default is 1000)",
                        default=1000)
    parser.add_argument("-m",
                        "--min_samples",
                        metavar="MIN_SAMPLES",
                        type=int,
                        help="default is 5",
                        default=5)
    parser.add_argument("-b",
                        "--block_size",
                        metavar="BLOCK_SIZE",
                        type=str,
                        help="two comma separated ints that represent the "
                        "size of the blocks in which to divide the input "
                        "data (default is 100,100)",
                        default="100,100")
    parser.add_argument("-f",
                        "--features",
                        metavar="N_FEATURES",
                        help="number of features of the input data "
                        "(only for SVMLight files)",
                        type=int,
                        default=None,
                        required=False)
    parser.add_argument("--dense",
                        help="store data in dense format (only "
                        "for SVMLight files)",
                        action="store_true")
    parser.add_argument("--labeled",
                        help="the last column of the input file "
                        "represents labels (only for text "
                        "files)",
                        action="store_true")
    parser.add_argument("train_data",
                        help="input file in CSV or SVMLight format",
                        type=str)
    args = parser.parse_args()

    train_data = args.train_data

    s_time = time.time()
    read_time = 0

    sparse = not args.dense

    bsize = args.block_size.split(",")
    block_size = (int(bsize[0]), int(bsize[1]))

    if args.svmlight:
        x, y = ds.load_svmlight_file(train_data, block_size, args.features,
                                     sparse)
    else:
        x = ds.load_txt_file(train_data, block_size)

    n_features = x.shape[1]

    if args.labeled and not args.svmlight:
        x = x[:, :n_features - 1]

    if args.detailed_times:
        compss_barrier()
        read_time = time.time() - s_time
        s_time = time.time()

    dims = range(x.shape[1])

    if args.dimensions:
        dims = args.dimensions.split(",")
        dims = np.array(dims, dtype=int)

    dbscan = DBSCAN(eps=args.epsilon,
                    min_samples=args.min_samples,
                    max_samples=args.max_samples,
                    n_regions=args.regions,
                    dimensions=dims)
    dbscan.fit(x)

    compss_barrier()
    fit_time = time.time() - s_time

    out = [
        dbscan.eps, dbscan.min_samples, dbscan.max_samples, dbscan.n_regions,
        len(dims), args.block_size, dbscan.n_clusters, read_time, fit_time
    ]

    print(out)
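Assuming the script is saved as dbscan_perf.py (a hypothetical name) and launched through COMPSs, a run matching the Gaia examples above could look like:

runcompss dbscan_perf.py -e 0.19 -m 5 -x 5000 -r 17 -d 0,1 -b 10000,5 data_scaled.csv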