Example #1
def test_dbscan_precomputed_metric_with_degenerate_input_arrays():
    # see https://github.com/scikit-learn/scikit-learn/issues/4641 for
    # more details
    X = np.eye(10)
    labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
    assert_equal(len(set(labels)), 1)

    X = np.zeros((10, 10))
    labels = DBSCAN(eps=0.5, metric='precomputed').fit(X).labels_
    assert_equal(len(set(labels)), 1)
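With metric='precomputed', DBSCAN treats the input as a pairwise distance matrix, so both degenerate inputs above place every point within eps of every other point and everything collapses into a single cluster. A minimal standalone sketch of that behaviour, using a plain assert instead of the test helpers:

import numpy as np
from sklearn.cluster import DBSCAN

D = np.zeros((10, 10))  # every pairwise distance is 0, far below eps
labels = DBSCAN(eps=0.5, metric='precomputed').fit(D).labels_
assert set(labels) == {0}  # a single cluster, no noise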
Example #2
def cluster(data_values,
            all_fv,
            plot_options=[True, True],
            save_plot=False,
            write=False,
            adress="../"):
    dtw_all = create_distance_matrix(all_fv)
    DB = DBSCAN(metric="precomputed").fit(dtw_all)
    clusters = DB.labels_

    if plot_options[0]:
        if plot_options[1]:
            data = pd.DataFrame(normalize_dataframe(data_values))
        else:
            data = pd.DataFrame(data_values)
        data['Clusters'] = clusters
        f_i = 0
        for i in np.unique(data['Clusters']):
            f_i = f_i + 1
            plt.figure(f_i)
            plt.title("Cluster " + str(i))

            plt.plot(data[data['Clusters'] == i].iloc[:, 0:(
                data.shape[1] - 1)].to_numpy().transpose(),
                     color="b")
            if save_plot:
                plt.savefig(adress + 'Cluster_' + str(i) + '.png')
        plt.show()
    if write:
        clusters = np.asarray(clusters).astype(int)
        np.savetxt(adress + "clusters.txt", clusters, fmt='%i', delimiter=",")
    return clusters
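The create_distance_matrix helper used above is not shown; a minimal sketch of the same precomputed-metric pattern, with scipy's cdist standing in for the distance computation (the original presumably used a DTW distance; the data here is illustrative):

import numpy as np
from scipy.spatial.distance import cdist
from sklearn.cluster import DBSCAN

feature_vectors = np.random.rand(50, 8)          # placeholder feature vectors
dist = cdist(feature_vectors, feature_vectors)   # dense pairwise distance matrix
labels = DBSCAN(eps=0.5, metric="precomputed").fit(dist).labels_
print(np.unique(labels))                         # cluster ids; -1 marks noise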
Example #3
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels differ from DBSCAN labels on at most 5% of samples

    centers = [[1, 1], [-1, -1], [1, -1]]
    X, labels_true = make_blobs(n_samples=750,
                                centers=centers,
                                cluster_std=0.4,
                                random_state=0)

    # calculate OPTICS with DBSCAN extraction at the given epsilon
    op = OPTICS(min_samples=min_samples).fit(X)
    core_optics, labels_optics = op.extract_dbscan(eps)

    # calculate dbscan labels
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

    contingency = contingency_matrix(db.labels_, labels_optics)
    agree = min(np.sum(np.max(contingency, axis=0)),
                np.sum(np.max(contingency, axis=1)))
    disagree = X.shape[0] - agree

    # verify core_labels match
    assert_array_equal(core_optics, db.core_sample_indices_)

    non_core_count = len(labels_optics) - len(core_optics)
    percent_mismatch = np.round((disagree - 1) / non_core_count, 2)

    # verify label mismatch is <= 5% labels
    assert percent_mismatch <= 0.05
Example #4
def test_dbscan_optics_parity(eps, min_samples):
    # Test that OPTICS clustering labels differ from DBSCAN labels on at most 5% of samples

    for metric in ['minkowski', 'euclidean']:

        centers = [[1, 1], [-1, -1], [1, -1]]
        _X, labels_true = make_blobs(n_samples=750, centers=centers,
                                     cluster_std=0.4, random_state=0)
        X = _X if metric == 'minkowski' else sparse.lil_matrix(_X)

        # calculate OPTICS with DBSCAN extraction at the given epsilon
        op = OPTICS(min_samples=min_samples, cluster_method='dbscan',
                    eps=eps,
                    metric=metric).fit(X)

        # calculate dbscan labels
        db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)

        contingency = contingency_matrix(db.labels_, op.labels_)
        agree = min(np.sum(np.max(contingency, axis=0)),
                    np.sum(np.max(contingency, axis=1)))
        disagree = X.shape[0] - agree

        percent_mismatch = np.round((disagree - 1) / X.shape[0], 2)

        # verify label mismatch is <= 5% labels
        assert percent_mismatch <= 0.05
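As a usage note, the same OPTICS-versus-DBSCAN comparison can be run outside a test harness; a sketch with an illustrative eps of 0.3 and the adjusted Rand index as the agreement measure (both choices are assumptions, not taken from the test above):

from sklearn.cluster import DBSCAN, OPTICS
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

X, _ = make_blobs(n_samples=750, centers=[[1, 1], [-1, -1], [1, -1]],
                  cluster_std=0.4, random_state=0)
op = OPTICS(min_samples=10, cluster_method='dbscan', eps=0.3).fit(X)
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
print(adjusted_rand_score(db.labels_, op.labels_))  # expected close to 1.0 here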
Example #5
def test_dbscan_callable():
    # Tests the DBSCAN algorithm with a callable metric.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    # metric is the function reference, not the string key.
    metric = distance.euclidean
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X,
                                  metric=metric,
                                  eps=eps,
                                  min_samples=min_samples,
                                  algorithm='ball_tree')

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
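Beyond library functions such as distance.euclidean, any callable taking two 1-D arrays and returning a float can serve as the metric; a sketch with a hand-written Manhattan distance on synthetic blobs (note that Python-level callables are considerably slower than the built-in string metrics):

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

def manhattan(a, b):
    # custom metric: sum of absolute coordinate differences
    return np.abs(a - b).sum()

X_blobs, _ = make_blobs(n_samples=200, centers=3, random_state=0)
labels = DBSCAN(eps=1.5, min_samples=5, metric=manhattan,
                algorithm='ball_tree').fit_predict(X_blobs)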
Example #6
    def route_clustering(self, params: dict) -> list:
        clf = DBSCAN(**params, n_jobs=-1)
        # clf = hdbscan.HDBSCAN(
        #     algorithm='best', alpha=1.0, approx_min_span_tree=True,
        #     gen_min_span_tree=False, leaf_size=40, memory=Memory(cachedir=None),
        #     metric=params['metric'], min_cluster_size=params['eps'],
        #     min_samples=params['min_samples'],
        #     p=None)
        return clf.fit_predict(self.dissimilarity_matrix)
Example #7
def test_dbscan_no_core_samples():
    rng = np.random.RandomState(0)
    X = rng.rand(40, 10)
    X[X < .8] = 0

    for X_ in [X, sparse.csr_matrix(X)]:
        db = DBSCAN(min_samples=6).fit(X_)
        assert_array_equal(db.components_, np.empty((0, X_.shape[1])))
        assert_array_equal(db.labels_, -1)
        assert_equal(db.core_sample_indices_.shape, (0,))
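A practical corollary of this test: calling code should be prepared for the all-noise outcome. A short sketch of the check, reusing the same kind of sparse random data:

import numpy as np
from sklearn.cluster import DBSCAN

rng = np.random.RandomState(0)
X_sparse = rng.rand(40, 10)
X_sparse[X_sparse < .8] = 0          # mostly zeros: no dense neighbourhoods
db = DBSCAN(min_samples=6).fit(X_sparse)
if (db.labels_ == -1).all():
    print("no clusters found: every sample is noise")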
Example #8
def test_dbscan_balltree():
    # Tests the DBSCAN algorithm with balltree for neighbor calculation.
    eps = 0.8
    min_samples = 10

    D = pairwise_distances(X)
    core_samples, labels = dbscan(D,
                                  metric="precomputed",
                                  eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)

    db = DBSCAN(p=2.0, eps=eps, min_samples=min_samples, algorithm='kd_tree')
    labels = db.fit(X).labels_

    n_clusters_3 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_3, n_clusters)

    db = DBSCAN(p=1.0, eps=eps, min_samples=min_samples, algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_4 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_4, n_clusters)

    db = DBSCAN(leaf_size=20,
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree')
    labels = db.fit(X).labels_

    n_clusters_5 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_5, n_clusters)
Example #9
def test_dbscan_precomputed_metric_with_initial_rows_zero():
    # sample matrix with initial two row all zero
    ar = np.array([[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
                   [0.0, 0.0, 0.0, 0.0, 0.1, 0.0, 0.0],
                   [0.0, 0.0, 0.1, 0.1, 0.0, 0.0, 0.3],
                   [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.1],
                   [0.0, 0.0, 0.0, 0.0, 0.3, 0.1, 0.0]])
    matrix = sparse.csr_matrix(ar)
    labels = DBSCAN(eps=0.2, metric='precomputed',
                    min_samples=2).fit(matrix).labels_
    assert_array_equal(labels, [-1, -1, 0, 0, 0, 1, 1])
Example #10
def runDBSCAN(distance_matrix, my_eps, my_min_samples, number_of_threads):
    db = DBSCAN(eps=my_eps,
                min_samples=my_min_samples,
                metric='precomputed',
                n_jobs=number_of_threads)
    db.fit(distance_matrix)

    labels = db.labels_
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    n_noises = list(labels).count(-1)

    print('Number of clusters: ' + str(n_clusters))
    print('Number of noise points: ' + str(n_noises))

    return list(labels)
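A hedged usage sketch for the function above, building the precomputed matrix with sklearn's pairwise_distances (data and parameter values are purely illustrative):

import numpy as np
from sklearn.metrics import pairwise_distances

data = np.random.rand(100, 5)
distance_matrix = pairwise_distances(data, metric='euclidean')
cluster_labels = runDBSCAN(distance_matrix, my_eps=0.4,
                           my_min_samples=5, number_of_threads=2)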
Example #11
def test_dbscan_metric_params():
    # Tests that DBSCAN works with the metrics_params argument.
    eps = 0.8
    min_samples = 10
    p = 1

    # Compute DBSCAN with metric_params arg
    db = DBSCAN(metric='minkowski',
                metric_params={
                    'p': p
                },
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree').fit(X)
    core_sample_1, labels_1 = db.core_sample_indices_, db.labels_

    # Test that sample labels are the same as passing Minkowski 'p' directly
    db = DBSCAN(metric='minkowski',
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree',
                p=p).fit(X)
    core_sample_2, labels_2 = db.core_sample_indices_, db.labels_

    assert_array_equal(core_sample_1, core_sample_2)
    assert_array_equal(labels_1, labels_2)

    # Minkowski with p=1 should be equivalent to Manhattan distance
    db = DBSCAN(metric='manhattan',
                eps=eps,
                min_samples=min_samples,
                algorithm='ball_tree').fit(X)
    core_sample_3, labels_3 = db.core_sample_indices_, db.labels_

    assert_array_equal(core_sample_1, core_sample_3)
    assert_array_equal(labels_1, labels_3)
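The Minkowski/Manhattan equivalence the last assertion relies on can also be checked directly on the distance matrices; a one-off sketch:

import numpy as np
from sklearn.metrics import pairwise_distances

pts = np.random.rand(20, 3)
d_minkowski = pairwise_distances(pts, metric='minkowski', p=1)
d_manhattan = pairwise_distances(pts, metric='manhattan')
assert np.allclose(d_minkowski, d_manhattan)  # equal up to float rounding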
Example #12
def DBSCAN_user(user_dict, eps=0.5):
    user_mtx = [user_dict[key] for key in user_dict]

    # rec = StandardScaler().fit_transform(user_mtx)

    db = DBSCAN(eps=eps).fit(user_mtx)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_
    dataHandler.writeFile('result/DBSCAN_user.csv', labels)

    # Number of clusters in labels, ignoring noise if present.
    # n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # print n_clusters_

    return (labels, user_dict)
Example #13
    def detect_abnormal_flight_clustering(self, flight_data, is_viz=False):
        """
        Detect flights that depart and land out of terminal
        Args:
          flight_data (pd DataFrame): flight data from source to des
          is_viz (boolean):

        Returns:
          pd DataFrame: filtered flight data

        """
        land_flights = flight_data.drop_duplicates(subset=self.flight_column,
                                                   keep='last')
        depart_flights = flight_data.drop_duplicates(subset=self.flight_column,
                                                     keep='first')
        terminal_flights = pd.concat([land_flights, depart_flights])
        terminal_coors = terminal_flights[[self.lon_column,
                                           self.lat_column]].to_numpy()
        min_sample = int(len(land_flights) / 2)
        labels = DBSCAN(min_samples=min_sample,
                        n_jobs=-1).fit_predict(terminal_coors)
        outlier_flights = set(
            terminal_flights[labels == -1][self.flight_column])
        # visualisation
        if is_viz:
            plt.style.use('ggplot')
            plt.scatter(x=terminal_coors[labels != -1][:, 0],
                        y=terminal_coors[labels != -1][:, 1],
                        marker='o',
                        s=10,
                        c='blue')
            plt.scatter(x=terminal_coors[labels == -1][:, 0],
                        y=terminal_coors[labels == -1][:, 1],
                        marker='o',
                        s=10,
                        c='red')
            plt.xlabel("Longitude")
            plt.ylabel("Latitude")
            plt.title("Outlier clustering detect %s/%s outliers" %
                      (len(outlier_flights), len(land_flights)))
            fig = plt.gcf()
            fig.set_size_inches((11, 8.5), forward=False)
            # fig.savefig(pic_name, dpi=500)
            # plt.close()
            plt.show()

        return outlier_flights
Example #14
def test_dbscan_feature():
    # Tests the DBSCAN algorithm with a feature vector array.
    # Parameters chosen specifically for this task.
    # Different eps to other test, because distance is not normalised.
    eps = 0.8
    min_samples = 10
    metric = 'euclidean'
    # Compute DBSCAN
    # parameters chosen for task
    core_samples, labels = dbscan(X, metric=metric, eps=eps,
                                  min_samples=min_samples)

    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric=metric, eps=eps, min_samples=min_samples)
    labels = db.fit(X).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #15
def test_dbscan_similarity():
    # Tests the DBSCAN algorithm with a similarity array.
    # Parameters chosen specifically for this task.
    eps = 0.15
    min_samples = 10
    # Compute similarities
    D = distance.squareform(distance.pdist(X))
    D /= np.max(D)
    # Compute DBSCAN
    core_samples, labels = dbscan(D, metric="precomputed", eps=eps,
                                  min_samples=min_samples)
    # number of clusters, ignoring noise if present
    n_clusters_1 = len(set(labels)) - (1 if -1 in labels else 0)

    assert_equal(n_clusters_1, n_clusters)

    db = DBSCAN(metric="precomputed", eps=eps, min_samples=min_samples)
    labels = db.fit(D).labels_

    n_clusters_2 = len(set(labels)) - int(-1 in labels)
    assert_equal(n_clusters_2, n_clusters)
Example #16
def cluster_textDB_wc(docs):
    """ Transform texts to hashed bag-of-words coordinates and cluster them using DBSCAN """
    tfidf = HashingVectorizer(tokenizer=tokenize, stop_words='english')
    sortedValues = [token_dict[key] for key in sorted(token_dict.keys())]
    sortedLabels = [key for key in sorted(token_dict.keys())]
    tfidf_model = tfidf.fit_transform(sortedValues).todense()
    eps = .37  #radius
    min_samples = 2  #number of samples in a cluster

    metric = distance.cosine
    dbscan_model = DBSCAN(eps=eps, min_samples=min_samples,
                          metric=metric).fit(tfidf_model)

    tfidf_cluster = collections.defaultdict(list)

    for idx, label in enumerate(dbscan_model.labels_):
        tfidf_cluster[label].append(sortedLabels[idx])

    print(tfidf_cluster)
    #plot(tfidf_model, dbscan_model, sortedLabels)
    return tfidf_cluster
Example #17
def cluster_textDB_ent(docs):
    """ Transform texts to coordinates using named entities and cluster texts using DBSCAN """

    vec = DictVectorizer()
    docFeaturesLabeled = [(docName, getKwEntityFeatures(doc))
                          for docName, doc in docs.items()]
    docFeatures = [item[1] for item in docFeaturesLabeled]
    labels = [item[0] for item in docFeaturesLabeled]
    model = vec.fit_transform(docFeatures).todense()
    eps = .6  #radius
    min_samples = 2  #number of samples in a cluster

    metric = distance.cosine
    dbscan_model = DBSCAN(eps=eps, min_samples=min_samples,
                          metric=metric).fit(model)

    tfidf_cluster = collections.defaultdict(list)

    for idx, label in enumerate(dbscan_model.labels_):
        tfidf_cluster[label].append(labels[idx])

    #plot(tfidf_model, dbscan_model, sortedLabels)
    return tfidf_cluster
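Both functions above lean on module-level state (token_dict, tokenize, getKwEntityFeatures). A self-contained variant of the same idea, assuming plain TF-IDF features and the string metric 'cosine' (which lets DBSCAN work on the sparse matrix directly) instead of the scipy callable:

import collections
from sklearn.cluster import DBSCAN
from sklearn.feature_extraction.text import TfidfVectorizer

docs = {"a": "cats and dogs", "b": "dogs and cats", "c": "stock market news"}
names = sorted(docs)
vectors = TfidfVectorizer().fit_transform([docs[n] for n in names])
labels = DBSCAN(eps=0.4, min_samples=2, metric='cosine').fit_predict(vectors)

clusters = collections.defaultdict(list)
for name, label in zip(names, labels):
    clusters[label].append(name)
print(dict(clusters))  # e.g. the two pet documents grouped, the news item as noise (-1)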
Example #18
			'AdaBoostClassifier':AdaBoostClassifier(),
			'AdaBoostRegressor':AdaBoostRegressor(),
			'AdditiveChi2Sampler':AdditiveChi2Sampler(),
			'AffinityPropagation':AffinityPropagation(),
			'AgglomerativeClustering':AgglomerativeClustering(),
			'BaggingClassifier':BaggingClassifier(),
			'BaggingRegressor':BaggingRegressor(),
			'BayesianGaussianMixture':BayesianGaussianMixture(),
			'BayesianRidge':BayesianRidge(),
			'BernoulliNB':BernoulliNB(),
			'BernoulliRBM':BernoulliRBM(),
			'Binarizer':Binarizer(),
			'Birch':Birch(),
			'CCA':CCA(),
			'CalibratedClassifierCV':CalibratedClassifierCV(),
			'DBSCAN':DBSCAN(),
			'DPGMM':DPGMM(),
			'DecisionTreeClassifier':DecisionTreeClassifier(),
			'DecisionTreeRegressor':DecisionTreeRegressor(),
			'DictionaryLearning':DictionaryLearning(),
			'ElasticNet':ElasticNet(),
			'ElasticNetCV':ElasticNetCV(),
			'EmpiricalCovariance':EmpiricalCovariance(),
			'ExtraTreeClassifier':ExtraTreeClassifier(),
			'ExtraTreeRegressor':ExtraTreeRegressor(),
			'ExtraTreesClassifier':ExtraTreesClassifier(),
			'ExtraTreesRegressor':ExtraTreesRegressor(),
			'FactorAnalysis':FactorAnalysis(),
			'FastICA':FastICA(),
			'FeatureAgglomeration':FeatureAgglomeration(),
			'FunctionTransformer':FunctionTransformer(),
Example #19
def test_weighted_dbscan():
    # ensure sample_weight is validated
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2])
    assert_raises(ValueError, dbscan, [[0], [1]], sample_weight=[2, 3, 4])

    # ensure sample_weight has an effect
    assert_array_equal([], dbscan([[0], [1]], sample_weight=None,
                                  min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 5],
                                  min_samples=6)[0])
    assert_array_equal([0], dbscan([[0], [1]], sample_weight=[6, 5],
                                   min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 6],
                                      min_samples=6)[0])

    # points within eps of each other:
    assert_array_equal([0, 1], dbscan([[0], [1]], eps=1.5,
                                      sample_weight=[5, 1], min_samples=6)[0])
    # and effect of non-positive and non-integer sample_weight:
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[5, 0],
                                  eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[5.9, 0.1],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([0, 1], dbscan([[0], [1]], sample_weight=[6, 0],
                                      eps=1.5, min_samples=6)[0])
    assert_array_equal([], dbscan([[0], [1]], sample_weight=[6, -1],
                                  eps=1.5, min_samples=6)[0])

    # for non-negative sample_weight, cores should be identical to repetition
    rng = np.random.RandomState(42)
    sample_weight = rng.randint(0, 5, X.shape[0])
    core1, label1 = dbscan(X, sample_weight=sample_weight)
    assert_equal(len(label1), len(X))

    X_repeated = np.repeat(X, sample_weight, axis=0)
    core_repeated, label_repeated = dbscan(X_repeated)
    core_repeated_mask = np.zeros(X_repeated.shape[0], dtype=bool)
    core_repeated_mask[core_repeated] = True
    core_mask = np.zeros(X.shape[0], dtype=bool)
    core_mask[core1] = True
    assert_array_equal(np.repeat(core_mask, sample_weight), core_repeated_mask)

    # sample_weight should work with precomputed distance matrix
    D = pairwise_distances(X)
    core3, label3 = dbscan(D, sample_weight=sample_weight,
                           metric='precomputed')
    assert_array_equal(core1, core3)
    assert_array_equal(label1, label3)

    # sample_weight should work with estimator
    est = DBSCAN().fit(X, sample_weight=sample_weight)
    core4 = est.core_sample_indices_
    label4 = est.labels_
    assert_array_equal(core1, core4)
    assert_array_equal(label1, label4)

    est = DBSCAN()
    label5 = est.fit_predict(X, sample_weight=sample_weight)
    core5 = est.core_sample_indices_
    assert_array_equal(core1, core5)
    assert_array_equal(label1, label5)
    assert_array_equal(label1, est.labels_)
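The core invariant this test exercises: a sample with integer weight w behaves exactly like w coincident unweighted points when deciding core status. A minimal sketch of just that equivalence:

import numpy as np
from sklearn.cluster import dbscan

# a single point of weight 6 reaches min_samples on its own...
cores, _ = dbscan([[0.0]], sample_weight=[6], min_samples=6)
assert list(cores) == [0]

# ...just like six coincident unweighted points
cores, _ = dbscan(np.zeros((6, 1)), min_samples=6)
assert len(cores) == 6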
Example #20
def test_pickle():
    obj = DBSCAN()
    s = pickle.dumps(obj)
    assert_equal(type(pickle.loads(s)), obj.__class__)
Example #21
def test_input_validation():
    # DBSCAN.fit should accept a list of lists.
    X = [[1., 2.], [3., 4.]]
    DBSCAN().fit(X)             # must not raise exception
Example #22
def test_pickle():
    obj = DBSCAN()
    s = pickle.dumps(obj)
    assert type(pickle.loads(s)) == obj.__class__
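Both pickle tests only round-trip an unfitted estimator. A sketch extending the check to a fitted one, asserting that the learned labels survive serialization (an assumed extension, not part of the original tests):

import pickle
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs

X_fit, _ = make_blobs(n_samples=100, centers=3, random_state=0)
db = DBSCAN(eps=1.0, min_samples=5).fit(X_fit)
restored = pickle.loads(pickle.dumps(db))
assert np.array_equal(db.labels_, restored.labels_)  # fitted state round-trips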