def test_DBSCAN():
    X = np.array([[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2],
                  [3.7, 3.5, 3.6], [3.9, 3.9, 3.5], [3.4, 3.5, 3.7],
                  [15, 15, 15]])
    eps = 0.5
    min_points = 2
    dbscanalgo = DBSCAN(eps=eps, min_points=min_points)
    dbscanalgo.run(X, "Synthetic Data")
def test_duplicate_dual_dbscan():
    data = np.array([
        [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1], [1, 1],
        [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1], [-1, -1],
    ])
    impl = DBSCAN(0.1, 5)
    impl.Fit(data)
    r_labels = np.array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1])
    i_labels = impl.GetLabels(data.shape[0])
    assert t.check_clusters(r_labels, i_labels, 2)
def test_large_dbscan():
    data, r_labels = datasets.make_blobs(n_samples=1000, centers=1)
    impl = DBSCAN(0.6, 4)
    ref = cluster.DBSCAN(0.6, 4)
    impl.Fit(data)
    ref.fit(data)
    r_labels = ref.labels_
    i_labels = impl.GetLabels(data.shape[0])
    assert t.check_clusters(r_labels, i_labels, 1)
def test_simple_single_dbscan():
    data = np.array([
        [1, 1],
    ])
    impl = DBSCAN(1, 1)
    impl.Fit(data)
    r_labels = np.array([0])
    i_labels = impl.GetLabels(data.shape[0])
    assert t.check_clusters(r_labels, i_labels, 1)
def test_dimensionality_dbscan():
    data, r_labels = datasets.make_blobs(n_samples=288, n_features=16,
                                         cluster_std=0.2, random_state=31)
    impl = DBSCAN(0.6, 4)
    impl.Fit(data)
    ref = cluster.DBSCAN(0.6, 4)
    ref.fit(data)
    assert t.check_clusters(ref.labels_, impl.GetLabels(data.shape[0]), 3)
def test_fit_with_small_eps(self):
    expected_core_sample_indices = np.array([])
    expected_components = np.array([])
    expected_labels = np.array([-1, -1, -1, -1, -1, -1, -1])
    data = self.get_two_clusters()
    dbscan = DBSCAN(eps=0.1, min_samples=3)
    dbscan.fit(data)
    np.testing.assert_equal(expected_core_sample_indices, dbscan.core_sample_indices_)
    np.testing.assert_equal(expected_components, dbscan.components_)
    np.testing.assert_equal(expected_labels, dbscan.labels_)
def test_clear_blobs_dbscan():
    centers = ((-5, -5), (5, 5))
    data, _ = datasets.make_blobs(n_samples=100, centers=centers, cluster_std=0.1)
    ref = cluster.DBSCAN(0.6, 4)
    impl = DBSCAN(0.6, 4)
    impl.Fit(data)
    ref.fit(data)
    r_labels = ref.labels_
    i_labels = impl.GetLabels(data.shape[0])
    assert t.check_clusters(r_labels, i_labels, 2)
def test_epsilon_dbscan(dbscanparams):
    data, _ = datasets.make_blobs(n_samples=512, n_features=2, random_state=73)
    impl = DBSCAN(dbscanparams[0], dbscanparams[1])
    ref = cluster.DBSCAN(dbscanparams[0], dbscanparams[1])
    impl.Fit(data)
    ref.fit(data)
    i_labels = impl.GetLabels(data.shape[0])
    r_labels = ref.labels_
    assert np.unique(i_labels).shape[0] == np.unique(r_labels).shape[0]
    assert t.check_clusters_with_allowance(
        r_labels, i_labels, np.unique(i_labels).shape[0], 0.05)
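# Note: the `t.check_clusters` / `t.check_clusters_with_allowance` helpers used
# throughout these tests are not shown in this section. The sketch below is a
# hypothetical reconstruction, not the project's actual implementation: since
# cluster ids are arbitrary, each reference cluster is mapped onto the
# implementation cluster it overlaps most, and the labelings are accepted when
# at most `allowance * n` points fall outside that mapping. The `n_clusters`
# argument is kept for interface compatibility but unused in this sketch.
import numpy as np

def check_clusters(r_labels, i_labels, n_clusters):
    """True if the labelings agree exactly, up to a renaming of cluster ids."""
    return check_clusters_with_allowance(r_labels, i_labels, n_clusters, 0.0)

def check_clusters_with_allowance(r_labels, i_labels, n_clusters, allowance):
    r_labels, i_labels = np.asarray(r_labels), np.asarray(i_labels)
    if r_labels.shape != i_labels.shape:
        return False
    mismatched = 0
    for ref_id in np.unique(r_labels):
        mask = r_labels == ref_id
        # Points of this reference cluster that fall outside its
        # best-matching implementation cluster count as mismatches.
        _, counts = np.unique(i_labels[mask], return_counts=True)
        mismatched += mask.sum() - counts.max()
    return mismatched <= allowance * r_labels.shape[0]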
class DbscanDialog(QDialog):
    def __init__(self, data):
        super().__init__()
        self.setWindowTitle("DBSCAN properties")
        self.data = data
        self.model = None
        self.run = False
        self._configure()

    def _configure(self):
        mainLayout = QVBoxLayout()
        hor1 = QHBoxLayout()
        epsilonLabel = QLabel("Epsilon: ")
        self.epsilonLineEdit = QLineEdit("0.3")
        hor1.addWidget(epsilonLabel)
        hor1.addWidget(self.epsilonLineEdit)
        hor2 = QHBoxLayout()
        minPointsLabel = QLabel("Min Points: ")
        self.minPointsLineEdit = QLineEdit("10")
        hor2.addWidget(minPointsLabel)
        hor2.addWidget(self.minPointsLineEdit)
        trainButton = QPushButton("Run model")
        trainButton.clicked.connect(self._run_model)
        mainLayout.addLayout(hor1)
        mainLayout.addLayout(hor2)
        mainLayout.addWidget(trainButton)
        self.setLayout(mainLayout)

    def _run_model(self):
        eps = float(self.epsilonLineEdit.text())
        minPoints = int(self.minPointsLineEdit.text())
        self.model = DBSCAN(epsilon=eps, minPoints=minPoints)
        self.model.fit(self.data)
        self.run = True
        self.accept()
        self.close()
def test_dbscan(self):
    """Test the DBSCAN algorithm on a small test dataset."""
    data = np.array([[1, 1.1], [1.2, 0.8], [0.8, 1],
                     [3.7, 4], [3.9, 3.9], [3.6, 4.1],
                     [10, 10]])
    clusters = DBSCAN(eps=0.5, min_pts=2).fit(data)
    self.assertEqual(clusters, [1, 1, 1, 2, 2, 2, -1])
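# The test above pins exact output for a `DBSCAN(eps, min_pts).fit(data)`
# interface: cluster ids count up from 1 and noise is -1. The class below is a
# minimal textbook sketch of that behavior, written for illustration only --
# it is not the implementation under test. With eps=0.5 and min_pts=2 it
# reproduces the labels asserted above ([1, 1, 1, 2, 2, 2, -1]).
import numpy as np

class SimpleDBSCAN:
    def __init__(self, eps, min_pts):
        self.eps = eps
        self.min_pts = min_pts

    def fit(self, data):
        data = np.asarray(data, dtype=float)
        n = data.shape[0]
        labels = [0] * n  # 0 = unvisited, -1 = noise, 1.. = cluster id
        cluster = 0
        for i in range(n):
            if labels[i] != 0:
                continue
            neighbors = self._region_query(data, i)
            if len(neighbors) < self.min_pts:
                labels[i] = -1  # provisionally noise; may later become a border point
                continue
            cluster += 1
            labels[i] = cluster
            seeds = list(neighbors)
            while seeds:
                j = seeds.pop()
                if labels[j] == -1:
                    labels[j] = cluster  # noise reached from a core point: border point
                if labels[j] != 0:
                    continue
                labels[j] = cluster
                j_neighbors = self._region_query(data, j)
                if len(j_neighbors) >= self.min_pts:
                    seeds.extend(j_neighbors)  # j is a core point: keep expanding
        return labels

    def _region_query(self, data, i):
        # Indices of all points within eps of point i (the point itself included).
        dists = np.linalg.norm(data - data[i], axis=1)
        return np.nonzero(dists <= self.eps)[0].tolist()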
def main():
    datasets = get_datasets()
    min_points = 5
    eps = [20, 17, 11, 4]
    for i, dataset in enumerate(datasets):
        # Plot the k-dist curve to help choose the eps parameter
        kdist_data = get_kdist_data(dataset, min_points)
        plot_data(kdist_data)
        # Cluster the dataset with DBSCAN
        dbscan = DBSCAN(min_points, eps[i])
        labels = dbscan.fit(dataset)
        print_labels(labels)
        plot_labeled_data(dataset, labels)
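# `get_kdist_data` and the plotting helpers are not defined in this section.
# The usual k-distance heuristic for choosing eps sorts every point's distance
# to its k-th nearest neighbor and looks for the "elbow" in the resulting
# curve; the sketch below is a hypothetical version of that helper, assuming
# `dataset` is an (n_samples, n_features) array.
import numpy as np

def get_kdist_data(dataset, k):
    """Sorted (descending) distance of each point to its k-th nearest neighbor."""
    data = np.asarray(dataset, dtype=float)
    # Full pairwise Euclidean distance matrix; fine for modest dataset sizes.
    dists = np.linalg.norm(data[:, None, :] - data[None, :, :], axis=-1)
    dists.sort(axis=1)
    # Column 0 holds each point's zero distance to itself, so column k is
    # the distance to the k-th nearest neighbor.
    return np.sort(dists[:, k])[::-1]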
def algorithm_router(choice, data):
    if choice == '1':
        # k-means
        kmeans_obj = KMeans(data=data, k=3, iteration=500)
        kmeans_obj.kmeans_main()
        print(kmeans_obj.cluster_avg)
        kmeans_obj.show_res()
    else:
        # DBSCAN
        dbscan_obj = DBSCAN(data=data, epsilon=0.9, min_pts=6)
        dbscan_obj.dbscan_main()
        dbscan_obj.show_res()
def test_double_fit_dbscan():
    data1, r_labels1 = datasets.make_blobs(n_samples=288, centers=6,
                                           cluster_std=0.2, random_state=31)
    data2, r_labels2 = datasets.make_blobs(n_samples=288, centers=6,
                                           cluster_std=0.2, random_state=31)
    impl = DBSCAN(0.6, 4)
    impl.Fit(data1)
    i_labels1 = impl.GetLabels(data1.shape[0])
    impl.Fit(data2)
    i_labels2 = impl.GetLabels(data2.shape[0])
    assert t.check_clusters_with_allowance(i_labels2, i_labels1, 6, .01)
def dbscan_visualization_test(data, eps=0.3, minPoints=10):
    test = DBSCAN(eps, minPoints)
    test.fit(data)
    plot_automation(test)
import pyspark as ps
from dbscan import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
from time import time
import numpy as np
import sys

if __name__ == '__main__':
    i = int(sys.argv[1])
    centers = [[1, 1], [-1, -1], [1, -1]]
    samples = [750, 7500, 75000, 750000, 7500000]
    eps = [0.3, 0.1, 0.03, 0.01, 0.003]
    n_part = [16, 128, 1024, 8192, 65536]
    sc = ps.SparkContext()
    X, labels_true = make_blobs(n_samples=samples[i], centers=centers,
                                cluster_std=0.4, random_state=0)
    X = StandardScaler().fit_transform(X)
    test_data = sc.parallelize(enumerate(X))
    start = time()
    dbscan = DBSCAN(eps[i], 10, max_partitions=n_part[i])
    dbscan.train(test_data)
    result = np.array(dbscan.assignments())
    run_time = time() - start
    # Append this run's row; opening with 'w' would overwrite earlier results.
    with open('benchmark.csv', 'a') as f:
        f.write('\n%i,%f,%i,%f' % (samples[i], eps[i], n_part[i], run_time))
# -*- coding: utf-8 -*-
"""
Project Code: DBSCAN v1.1
@author: Deep.I Inc. @Jongwon Kim
Revision date: 2020-12-09
Contact Info: https://deep-eye.tistory.com
              https://deep-i.net
"""

from dbscan import DBSCAN
from scipy import io

# %% Run DEMO
x = io.loadmat('./sample/sample.mat')['X']
# Initialize DBSCAN
dbscan = DBSCAN(x, 1.5, 4)
# Clustering
idx, noise = dbscan.run()
# Sorting
g_cluster, n_cluster = dbscan.sort()
# Visualization
dbscan.plot()
from data_loader import load_file
from k_means import Kmeans
from dbscan import DBSCAN
from random import shuffle
from utils import calculate_accuracy
from sklearn.cluster import KMeans
from utils import euclidean_distance
import pry

raw_data = load_file('iris.data')
classes = set([x[-1] for x in raw_data])
class_dict = {}
test_data = {}
train_data = []
for kelas in classes:
    class_dict[kelas] = list(filter(lambda x: x[-1] == kelas, raw_data))
    shuffle(class_dict[kelas])
    test_data[kelas] = [x[:-1] for x in class_dict[kelas][:10]]
    train_data += [x[:-1] for x in class_dict[kelas][10:]]

db_scan = DBSCAN(1, 0.5)
pry()  # drop into an interactive debugger before fitting
db_scan.fit(train_data[:10])
db_scan.clusters
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.metrics.cluster import normalized_mutual_info_score
from utils import *
from datasets import *
from classifiers import *
from metrics import *
from agglomerative_clustering import AgglomerativeClustering
from dbscan import DBSCAN

X, y = read_dataset(dataset='Iris')

print("--- AgglomerativeClustering ---")
model = AgglomerativeClustering(n_clusters=3, verbose=False,
                                linkage='complete', distance_metric='l1')
cluster_pred = model.fit_predict(X)
print("adjusted_rand_score", metrics.adjusted_rand_score(y, cluster_pred))
print("normalized_mutual_info_score", normalized_mutual_info_score(y, cluster_pred))

print("--- DBSCAN ---")
cluster_pred = DBSCAN(eps=1, MinPts=5).fit_predict(X)
print("adjusted_rand_score", metrics.adjusted_rand_score(y, cluster_pred))
print("normalized_mutual_info_score", normalized_mutual_info_score(y, cluster_pred))
def test_HAC():
    test = [[1, 1.1, 1], [1.2, .8, 1.1], [.8, 1, 1.2],
            [3.7, 3.5, 3.6], [3.9, 3.9, 3.5], [3.4, 3.5, 3.7],
            [15, 15, 15]]
    hac = HAC()
    for i in range(1, 4):
        hac.clusterLevel = i + 1
        hac.run(test, "Synthetic Data with Cluster Level " + str(i))

dbscan = DBSCAN()
hac = HAC()
experiment = Experiments()
experiment.runSynthetic(dbscan)
experiment.runSynthetic(hac)
ind = 500
dim = 3
# experiment.run(dbscan, True, ind, dim)
# test_HAC()
# experiment.runSynthetic(hac)
# experiment.run(hac, True, ind, dim)
import csv
import numpy as np
import pyspark as ps

if __name__ == '__main__':
    # Example of pypadis.DBSCAN
    from sklearn.datasets import make_blobs
    from sklearn.preprocessing import StandardScaler
    import matplotlib.pyplot as plt
    import matplotlib.patches as patches
    import matplotlib.cm as cm
    from time import time
    import os

    X = []
    with open("staypoints.csv", "r") as f:
        reader = csv.reader(f)
        for line in reader:
            X.append(line)
    X = np.array(X, np.float64)
    sc = ps.SparkContext()
    test_data = sc.parallelize(enumerate(X))
    start = time()
    dbscan = DBSCAN(eps=0.02, min_samples=20, metric='precomputed')
    dbscan.train(test_data)
    result = np.array(dbscan.assignments())
    # Assumes assignments() yields (point_id, cluster_label) pairs,
    # so column 1 holds the labels.
    print('clusters count: %s' % len(set(result[:, 1])))
    import pdb; pdb.set_trace()
def main():
    num_clusters = 4
    clusters = generate_data(num_clusters, seed=1)
    dbscan = DBSCAN(eps=7, min_samples=5)
    dbscan.fit(clusters)
    plot_clusters(clusters, dbscan.labels_, dbscan.components_)
import numpy as np
import matplotlib.pyplot as plt
from dbscan import DBSCAN

if __name__ == '__main__':
    epsilon = 0.5
    min_pts = 2
    # Three concentric circles of radius 1, 2, and 3
    points = np.array(
        [(np.cos(x), np.sin(x)) for x in np.linspace(0, 2 * np.pi, 100)] +
        [(2 * np.cos(x), 2 * np.sin(x)) for x in np.linspace(0, 2 * np.pi, 100)] +
        [(3 * np.cos(x), 3 * np.sin(x)) for x in np.linspace(0, 2 * np.pi, 100)]
    )

    def euclidean_distance_2d(x, y):
        return np.sqrt((x[0] - y[0]) ** 2 + (x[1] - y[1]) ** 2)

    dbscan = DBSCAN(euclidean_distance_2d, epsilon, min_pts)
    clusters = dbscan.cluster(points)
    for cluster_points in clusters.values():
        pt_cluster = np.array(cluster_points)
        plt.scatter(pt_cluster[:, 0], pt_cluster[:, 1])
    plt.show()
def fit(self):
    data = self.get_two_clusters()
    dbscan = DBSCAN(eps=self.eps, min_samples=self.min_samples)
    dbscan.fit(data)
    return dbscan
from dbscan import DBSCAN
from sklearn.datasets import make_moons

x, _ = make_moons(n_samples=300, noise=0.1)
radius = 0.2
min_points = 10
print('Radius = ' + str(radius) + ', Minpoints = ' + str(min_points))

model = DBSCAN(x, radius, min_points)
# Fit the model to the dataset
point_labels, clusters = model.fit()
print('Number of clusters: ' + str(clusters - 1))
# Plot the result
model.plot_result(x, point_labels, clusters)
cnoise = 0.1   # standard deviation of Gaussian noise added to the data
cfactor = 0.3  # scale factor between inner and outer circles

# Settings for moons
mnoise = 0.1   # standard deviation of Gaussian noise added to the data

# Blobs
# Generate points -- "blobs"
bX, _ = generate_dataset("blobs", n_samples=n_samples, centers=bcenters,
                         n_features=bn_features, cluster_std=bcluster_std,
                         random_state=random_state)
# Cluster points by DBSCAN
bdbs = DBSCAN(epsilon=1.0, min_samples=5)
bdbs.fit_predict(bX)
# Plot clustering results
plot_clusters(bdbs)

# Circles
# Generate points -- "circles"
cX, _ = generate_dataset("circles", n_samples=n_samples, noise=cnoise,
                         factor=cfactor, random_state=random_state)
# Cluster points by DBSCAN
cdbs = DBSCAN(epsilon=0.2, min_samples=5)
cdbs.fit_predict(cX)
# Plot clustering results
plot_clusters(cdbs)
if __name__ == '__main__':
    X1 = create_artificial_gaussiandata(np.array([1, 2]), np.array([[2, 1], [1, 2]]), 20)
    X2 = create_artificial_gaussiandata(np.array([10, 8]), np.array([[2, 1], [1, 2]]), 20)
    X = np.concatenate([X1, X2], 0)  # concatenate the two ndarrays

    # Visualize the data
    plotter = PlotUtility()
    plotter.scatter_plot(X1[:, 0], X1[:, 1], [1 for _ in range(len(X1))], size=5)
    plotter.scatter_plot(X2[:, 0], X2[:, 1], [2 for _ in range(len(X2))], size=5)
    plotter.show()

    # Clustering
    dbscan = DBSCAN(2, 3)
    dist_matrix = make_distance_matrix(X)
    cluster = dbscan.fit(dist_matrix)
    print(cluster)

    # Visualize the clusters
    plotter = PlotUtility()
    for i in range(int(min(cluster)), int(max(cluster)) + 1):
        c = devide(X, cluster, i)
        plotter.scatter_plot(c[:, 0], c[:, 1], [i for _ in range(len(c))], size=5)
    plotter.show()
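# `make_distance_matrix` is not shown in this section; given that its result
# is fed to a DBSCAN fitted on precomputed distances, a plausible sketch is a
# plain pairwise Euclidean distance matrix:
import numpy as np

def make_distance_matrix(X):
    """Pairwise Euclidean distance matrix for the rows of X."""
    X = np.asarray(X, dtype=float)
    return np.linalg.norm(X[:, None, :] - X[None, :, :], axis=-1)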
def test_raises_error_for_invalid_eps(self):
    with self.assertRaises(ValueError):
        DBSCAN(eps=0)
        elif linkage_type == 'average':
            agglo_accuracy_average += accuracy
        elif linkage_type == 'average-group':
            agglo_accuracy_average_group += accuracy
        print('Agglomerative - ' + str(linkage_type))
        print('Accuracy\t', accuracy)
        print('Format {Real class : cluster}')
        print('Dict\t\t', str(dict))
        print()

    # DBSCAN
    for i in range(0, len(epss)):
        eps = epss[i]
        min_pts = min_ptss[i]
        dbscan = DBSCAN(eps, min_pts)
        sk_dbscan = sklearn_DBSCAN(eps=eps, min_samples=min_pts)
        dbscan.fit(X_train)
        result = dbscan.predict(X_test)
        accuracy, dict = clustering_accuracy_score(np.asarray(y_test), np.asarray(result))
        dbscan_accuracy += accuracy
        print('DBSCAN')
        print('Epsilon : {} Min Points : {}'.format(eps, min_pts))
        print('Accuracy\t', accuracy)
        print('Format {Real class : cluster}')
        print('Dict\t\t', str(dict))
        print()
    k += 1
def test_raises_error_for_invalid_min_samples(self):
    with self.assertRaises(ValueError):
        DBSCAN(min_samples=0)
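# The two validation tests above require the constructor to reject
# non-positive eps and min_samples with a ValueError. A minimal sketch of the
# implied guard (the class name is hypothetical, and the defaults mirror the
# eps=0.5 / min_samples=5 convention of sklearn-style APIs):
class ValidatingDBSCAN:
    def __init__(self, eps=0.5, min_samples=5):
        if eps <= 0:
            raise ValueError("eps must be positive, got %r" % (eps,))
        if min_samples <= 0:
            raise ValueError("min_samples must be positive, got %r" % (min_samples,))
        self.eps = eps
        self.min_samples = min_samples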