def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    # Regression test for
    # https://github.com/scikit-learn/scikit-learn/issues/4641:
    # degenerate precomputed distance matrices (identity, all-zeros)
    # must produce exactly one distinct label across all samples.
    for degenerate_matrix in (np.eye(10), np.zeros((10, 10))):
        fitted = DBSCAN(eps=0.5, metric='precomputed').fit(degenerate_matrix)
        assert_equal(len(set(fitted.labels_)), 1)
def cluster(data_values, all_fv, plot_options=None, save_plot=False,
            write=False, adress="../"):
    """Cluster feature vectors with DBSCAN on a precomputed DTW matrix.

    Parameters
    ----------
    data_values : array-like
        Raw series, used only for plotting.
    all_fv : array-like
        Feature vectors passed to ``create_distance_matrix``.
    plot_options : list of bool, optional
        ``[do_plot, normalize_before_plot]``; defaults to ``[True, True]``.
    save_plot : bool
        If True, save each cluster figure as a PNG under ``adress``.
    write : bool
        If True, write the labels to ``adress + "clusters.txt"``.
    adress : str
        Output directory prefix for plots and the labels file.

    Returns
    -------
    array-like
        DBSCAN cluster label per sample (-1 marks noise).
    """
    # None sentinel avoids the shared-mutable-default pitfall while keeping
    # the historical [True, True] default behavior.
    if plot_options is None:
        plot_options = [True, True]

    dtw_all = create_distance_matrix(all_fv)
    clusters = DBSCAN(metric="precomputed").fit(dtw_all).labels_

    if plot_options[0]:
        if plot_options[1]:
            data = pd.DataFrame(normalize_dataframe(data_values))
        else:
            data = pd.DataFrame(data_values)
        data['Clusters'] = clusters
        for f_i, i in enumerate(np.unique(data['Clusters']), start=1):
            plt.figure(f_i)
            plt.title("Cluster " + str(i))
            # .to_numpy() replaces DataFrame.as_matrix(), which was removed
            # from pandas; drop the label column before plotting.
            plt.plot(data[data['Clusters'] == i]
                     .iloc[:, 0:(data.shape[1] - 1)]
                     .to_numpy().transpose(), color="b")
            if save_plot:
                plt.savefig(adress + 'Cluster_' + str(i) + '.png')
        plt.show()

    if write:
        # The original called clusters.astype(int) and discarded the result;
        # fmt='%i' already writes integers, so the cast is unnecessary.
        clusters = np.asarray(clusters)
        np.savetxt(adress + "clusters.txt", clusters, fmt='%i', delimiter=",")
    return clusters
def test_dbscan_optics_parity(eps, min_samples):
    # OPTICS with a dbscan extraction must disagree with plain DBSCAN on
    # at most 5% of the samples, and core samples must match exactly.
    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750, centers=centers,
                                cluster_std=0.4, random_state=0)

    # OPTICS with a DBSCAN-style label extraction at the given epsilon.
    optics_est = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = optics_est.extract_dbscan(eps)

    # Reference labels from plain DBSCAN.
    dbscan_est = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(dbscan_est.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree

    # Core sample indices must be identical between the two algorithms.
    assert_array_equal(core_optics, dbscan_est.core_sample_indices_)

    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)
    # Label mismatch must stay within 5%.
    assert percent_mismatch <= 0.05
def test_dbscan_optics_parity(eps, min_samples):
    # OPTICS(cluster_method='dbscan') must disagree with plain DBSCAN on at
    # most 5% of the samples, for both dense and sparse inputs.
    for metric in ['minkowski', 'euclidean']:
        centers = [[1, 1], [-1, -1], [1, -1]]
        _X, labels_true = make_blobs(n_samples=750, centers=centers,
                                     cluster_std=0.4, random_state=0)
        # Exercise a sparse input on the euclidean branch.
        X = _X if metric == 'minkowski' else sparse.lil_matrix(_X)

        # OPTICS configured for DBSCAN-style extraction at `eps`.
        optics_est = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                            eps=eps, metric=metric).fit(X)

        # Plain DBSCAN on the same data.
        dbscan_est = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

        contingency = contingency_matrix(dbscan_est.labels_,
                                         optics_est.labels_)
        agree = min(np.sum(np.max(contingency, axis=0)),
                    np.sum(np.max(contingency, axis=1)))
        disagree = X.shape[0] - agree
        percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

        # Label mismatch must stay within 5%.
        assert percent_mismatch <= 0.05
def test_dbscan_callable():
    # DBSCAN must accept a callable metric (a function reference rather
    # than a string key). eps differs from the other tests because the
    # distance is not normalised here.
    eps = 0.8
    min_samples = 10
    metric = distance.euclidean

    # Functional interface.
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')
    found_functional = len(set(labels)) - int(-1 in labels)  # ignore noise
    assert_equal(found_functional, n_clusters)

    # Estimator interface must agree.
    estimator = DBSCAN(metric=metric, eps=eps, min_samples=min_samples,
                       algorithm='ball_tree')
    est_labels = estimator.fit(X).labels_
    found_estimator = len(set(est_labels)) - int(-1 in est_labels)
    assert_equal(found_estimator, n_clusters)
def route_clustering(self, params: dict) -> list:
    """Cluster routes on the precomputed dissimilarity matrix with DBSCAN.

    Parameters
    ----------
    params : dict
        Keyword arguments forwarded verbatim to ``DBSCAN`` (e.g. ``eps``,
        ``min_samples``, ``metric``).

    Returns
    -------
    Cluster label per route; -1 marks noise.  (NOTE(review): the return
    annotation says ``list`` but ``fit_predict`` yields a numpy array —
    callers that rely on list semantics should confirm.)
    """
    # The commented-out HDBSCAN alternative that used to live here was
    # removed as dead code; restore from version control if needed.
    # n_jobs=-1: parallelize the neighbor queries across all cores.
    clf = DBSCAN(**params, n_jobs=-1)
    return clf.fit_predict(self.dissimilarity_matrix)
def test_dbscan_no_core_samples():
    # With mostly-zero data and min_samples=6 no point can reach core
    # status: every label must be noise (-1) and components_ empty, for
    # both the dense and the sparse representation.
    rng = np.random.RandomState(0)
    data = rng.rand(40, 10)
    data[data < .8] = 0

    for variant in [data, sparse.csr_matrix(data)]:
        model = DBSCAN(min_samples=6).fit(variant)
        assert_array_equal(model.components_,
                           np.empty((0, variant.shape[1])))
        assert_array_equal(model.labels_, -1)
        assert_equal(model.core_sample_indices_.shape, (0,))
def test_dbscan_balltree():
    # Tree-based neighbor searches (ball_tree / kd_tree, various p and
    # leaf_size settings) must all find the same number of clusters as the
    # precomputed-distance reference run.
    eps = 0.8
    min_samples = 10

    # Reference: functional interface on a precomputed distance matrix.
    D = pairwise_distances(X)
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    found = len(set(labels)) - int(-1 in labels)  # ignore noise
    assert_equal(found, n_clusters)

    # Same four estimator configurations as before, run in sequence.
    configurations = [
        dict(p=2.0, algorithm='ball_tree'),
        dict(p=2.0, algorithm='kd_tree'),
        dict(p=1.0, algorithm='ball_tree'),
        dict(leaf_size=20, algorithm='ball_tree'),
    ]
    for extra_params in configurations:
        estimator = DBSCAN(eps=eps, min_samples=min_samples, **extra_params)
        est_labels = estimator.fit(X).labels_
        est_found = len(set(est_labels)) - int(-1 in est_labels)
        assert_equal(est_found, n_clusters)
def test_dbscan_precomputed_metric_with_initial_rows_zero():
    # Sparse precomputed matrix whose first two rows are entirely zero:
    # the expected labels mark those first two samples as noise (-1).
    ar = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
                   [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
                   [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0]])
    matrix = sparse.csr_matrix(ar)
    model = DBSCAN(eps=0.2, metric='precomputed', min_samples=2).fit(matrix)
    assert_array_equal(model.labels_, [-1, -1, 0, 0, 0, 1, 1])
def runDBSCAN(distance_matrix, my_eps, my_min_samples, number_of_threads):
    """Run DBSCAN on a precomputed distance matrix and return the labels.

    Parameters
    ----------
    distance_matrix : array-like
        Square pairwise-distance matrix (metric='precomputed').
    my_eps : float
        DBSCAN neighborhood radius.
    my_min_samples : int
        Minimum neighborhood size for a core sample.
    number_of_threads : int
        Passed to DBSCAN's ``n_jobs`` for parallel neighbor queries.

    Returns
    -------
    list
        Cluster label per sample; -1 marks noise.
    """
    db = DBSCAN(eps=my_eps, min_samples=my_min_samples,
                metric='precomputed', n_jobs=number_of_threads)
    db.fit(distance_matrix)
    labels = db.labels_
    # Noise points (-1) do not count as a cluster.
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)
    # The original prints ran the count straight into the text
    # ("Number of clusters3"); add the missing separator.
    print('Number of clusters: ' + str(n_clusters))
    print('Number of noises: ' + str(n_noises))
    return list(labels)
def test_dbscan_metric_params():
    # Passing Minkowski's p via metric_params, via the p keyword, or using
    # 'manhattan' directly (equivalent to p=1) must all produce identical
    # core samples and labels.
    eps = 0.8
    min_samples = 10
    p = 1

    # Route 1: metric_params argument.
    fitted = DBSCAN(metric='minkowski', metric_params={'p': p}, eps=eps,
                    min_samples=min_samples, algorithm='ball_tree').fit(X)
    core_sample_1, labels_1 = fitted.core_sample_indices_, fitted.labels_

    # Route 2: p passed directly.
    fitted = DBSCAN(metric='minkowski', eps=eps, min_samples=min_samples,
                    algorithm='ball_tree', p=p).fit(X)
    core_sample_2, labels_2 = fitted.core_sample_indices_, fitted.labels_

    assert_array_equal(core_sample_1, core_sample_2)
    assert_array_equal(labels_1, labels_2)

    # Route 3: manhattan metric (== minkowski with p=1).
    fitted = DBSCAN(metric='manhattan', eps=eps, min_samples=min_samples,
                    algorithm='ball_tree').fit(X)
    core_sample_3, labels_3 = fitted.core_sample_indices_, fitted.labels_

    assert_array_equal(core_sample_1, core_sample_3)
    assert_array_equal(labels_1, labels_3)
def DBSCAN_user(user_dict, eps=0.5):
    """Cluster users by their feature vectors with DBSCAN.

    Writes the labels to 'result/DBSCAN_user.csv' as a side effect.

    Parameters
    ----------
    user_dict : dict
        Maps a user key to that user's feature vector; rows follow the
        dict's iteration order.
    eps : float
        DBSCAN neighborhood radius.

    Returns
    -------
    tuple
        ``(labels, user_dict)`` — the label array and the input dict.
    """
    user_mtx = [user_dict[key] for key in user_dict]
    # NOTE(review): a StandardScaler step was commented out here; confirm
    # whether the features are meant to be used unscaled.
    db = DBSCAN(eps=eps).fit(user_mtx)
    labels = db.labels_
    # The original also built a core-sample boolean mask that was never
    # used anywhere; it has been removed as dead code.
    dataHandler.writeFile('result/DBSCAN_user.csv', labels)
    return (labels, user_dict)
def detect_abnormal_flight_clustering(self, flight_data, is_viz=False):
    """Detect flights that depart and land outside the terminal cluster.

    Args:
        flight_data (pd.DataFrame): flight data from source to destination;
            assumes rows are ordered so the first/last row per flight id is
            the departure/landing point — TODO confirm against caller.
        is_viz (bool): if True, scatter-plot inliers (blue) vs. outliers
            (red) with matplotlib.

    Returns:
        set: flight identifiers whose terminal points DBSCAN labels as
        noise (-1).
    """
    # Local import so the pandas module-level API is guaranteed in scope;
    # DataFrame.append was removed in pandas 2.0, concat replaces it.
    import pandas as pd

    land_flights = flight_data.drop_duplicates(subset=self.flight_column,
                                               keep='last')
    depart_flights = flight_data.drop_duplicates(subset=self.flight_column,
                                                 keep='first')
    terminal_flights = pd.concat([land_flights, depart_flights])
    # .to_numpy() replaces the removed DataFrame.as_matrix().
    terminal_coors = terminal_flights[[self.lon_column,
                                       self.lat_column]].to_numpy()
    # Require at least half the flights in a neighborhood to call it the
    # terminal cluster.
    min_sample = int(len(land_flights) / 2)
    labels = DBSCAN(min_samples=min_sample,
                    n_jobs=-1).fit_predict(terminal_coors)
    outlier_flights = set(
        terminal_flights[labels == -1][self.flight_column])

    # Visualization of inliers vs. outliers.
    if is_viz:
        plt.style.use('ggplot')
        plt.scatter(x=terminal_coors[labels != -1][:, 0],
                    y=terminal_coors[labels != -1][:, 1],
                    marker='o', s=10, c='blue')
        plt.scatter(x=terminal_coors[labels == -1][:, 0],
                    y=terminal_coors[labels == -1][:, 1],
                    marker='o', s=10, c='red')
        plt.xlabel("Longitude")
        plt.ylabel("Latitude")
        plt.title("Outlier clustering detect %s/%s outliers"
                  % (len(outlier_flights), len(land_flights)))
        fig = plt.gcf()
        fig.set_size_inches((11, 8.5), forward=False)
        plt.show()
    return outlier_flights
def test_dbscan_feature():
    # Run DBSCAN on a raw feature-vector array via both interfaces.
    # eps differs from other tests because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = 'euclidean'

    # Functional interface.
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples)
    found_functional = len(set(labels)) - int(-1 in labels)  # ignore noise
    assert_equal(found_functional, n_clusters)

    # Estimator interface must agree.
    est_labels = DBSCAN(metric=metric, eps=eps,
                        min_samples=min_samples).fit(X).labels_
    found_estimator = len(set(est_labels)) - int(-1 in est_labels)
    assert_equal(found_estimator, n_clusters)
def test_dbscan_similarity():
    # DBSCAN on a normalised precomputed distance matrix must recover the
    # expected number of clusters via both interfaces.
    eps = 0.15
    min_samples = 10

    # Pairwise distances scaled into [0, 1].
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)

    # Functional interface.
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    found_functional = len(set(labels)) - (1 if -1 in labels else 0)
    assert_equal(found_functional, n_clusters)

    # Estimator interface must agree.
    est_labels = DBSCAN(metric="precomputed", eps=eps,
                        min_samples=min_samples).fit(D).labels_
    found_estimator = len(set(est_labels)) - int(-1 in est_labels)
    assert_equal(found_estimator, n_clusters)
def cluster_textDB_wc(docs):
    """Cluster texts with DBSCAN on hashed word-count (TF) features.

    The previous docstring claimed named-entity features; this function
    actually vectorizes raw tokens with a HashingVectorizer.

    NOTE(review): the *docs* parameter is never used — features come from
    the module-level ``token_dict``. Confirm whether this should operate
    on *docs* instead.

    Returns:
        collections.defaultdict: DBSCAN label -> list of document keys
        (label -1 collects noise documents).
    """
    tfidf = HashingVectorizer(tokenizer=tokenize, stop_words='english')
    # Sort by key so feature rows line up with sortedLabels.
    sortedValues = [token_dict[key] for key in sorted(token_dict.keys())]
    sortedLabels = [key for key in sorted(token_dict.keys())]
    tfidf_model = tfidf.fit_transform(sortedValues).todense()

    eps = .37          # neighborhood radius
    min_samples = 2    # samples needed to form a cluster
    metric = distance.cosine
    dbscan_model = DBSCAN(eps=eps, min_samples=min_samples,
                          metric=metric).fit(tfidf_model)

    # Group document keys by their assigned cluster label.
    tfidf_cluster = collections.defaultdict(list)
    for idx, label in enumerate(dbscan_model.labels_):
        tfidf_cluster[label].append(sortedLabels[idx])
    print(tfidf_cluster)
    return tfidf_cluster
def cluster_textDB_ent(docs):
    """
    Transform texts to coordinates using named entities and cluster texts
    using DBSCAN.
    """
    vec = DictVectorizer()

    # Pair each document name with its keyword/entity feature dict, then
    # split the pairs into parallel lists.
    labeled_features = [(docName, getKwEntityFeatures(doc))
                        for docName, doc in docs.items()]
    labels = [name for name, _ in labeled_features]
    docFeatures = [features for _, features in labeled_features]
    model = vec.fit_transform(docFeatures).todense()

    eps = .6           # neighborhood radius
    min_samples = 2    # samples needed to form a cluster
    metric = distance.cosine
    dbscan_model = DBSCAN(eps=eps, min_samples=min_samples,
                          metric=metric).fit(model)

    # Group document names by their assigned cluster label.
    tfidf_cluster = collections.defaultdict(list)
    for idx, label in enumerate(dbscan_model.labels_):
        tfidf_cluster[label].append(labels[idx])
    return tfidf_cluster
'AdaBoostClassifier':AdaBoostClassifier(), 'AdaBoostRegressor':AdaBoostRegressor(), 'AdditiveChi2Sampler':AdditiveChi2Sampler(), 'AffinityPropagation':AffinityPropagation(), 'AgglomerativeClustering':AgglomerativeClustering(), 'BaggingClassifier':BaggingClassifier(), 'BaggingRegressor':BaggingRegressor(), 'BayesianGaussianMixture':BayesianGaussianMixture(), 'BayesianRidge':BayesianRidge(), 'BernoulliNB':BernoulliNB(), 'BernoulliRBM':BernoulliRBM(), 'Binarizer':Binarizer(), 'Birch':Birch(), 'CCA':CCA(), 'CalibratedClassifierCV':CalibratedClassifierCV(), 'DBSCAN':DBSCAN(), 'DPGMM':DPGMM(), 'DecisionTreeClassifier':DecisionTreeClassifier(), 'DecisionTreeRegressor':DecisionTreeRegressor(), 'DictionaryLearning':DictionaryLearning(), 'ElasticNet':ElasticNet(), 'ElasticNetCV':ElasticNetCV(), 'EmpiricalCovariance':EmpiricalCovariance(), 'ExtraTreeClassifier':ExtraTreeClassifier(), 'ExtraTreeRegressor':ExtraTreeRegressor(), 'ExtraTreesClassifier':ExtraTreesClassifier(), 'ExtraTreesRegressor':ExtraTreesRegressor(), 'FactorAnalysis':FactorAnalysis(), 'FastICA':FastICA(), 'FeatureAgglomeration':FeatureAgglomeration(), 'FunctionTransformer':FunctionTransformer(),
def test_weighted_dbscan():
    """Exercise sample_weight handling in dbscan() and DBSCAN."""
    # ensure sample_weight is validated: wrong lengths must raise
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect: with min_samples=6 a point is a
    # core sample exactly when its (weighted) neighborhood mass reaches 6
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
                                  min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
                                  min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
                                   min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
                                      min_samples=6)[0])

    # points within eps of each other: neighbor weights accumulate
    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
                                      sample_weight=[5, 1],
                                      min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
                                  eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
                                  eps=1.5, min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert_equal(len(label1), len(X))

    # weight w on a sample must behave like w literal copies of it
    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight),
                       core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    # ... and with fit_predict, which must set the same attributes
    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
def test_pickle():
    # A round-trip through pickle must preserve the estimator's class.
    estimator = DBSCAN()
    payload = pickle.dumps(estimator)
    assert_equal(type(pickle.loads(payload)), estimator.__class__)
def test_input_validation():
    # DBSCAN.fit must accept a plain list of lists without raising.
    data = [[1., 2.], [3., 4.]]
    DBSCAN().fit(data)  # must not raise exception
def test_pickle():
    # Pickling then unpickling must yield an object of the same class.
    original = DBSCAN()
    restored = pickle.loads(pickle.dumps(original))
    assert type(restored) == original.__class__