Code Example #1
import numpy as np


def fit_dbscan(data, eps, min_samples, normalize=True,
               show=True, juxta_cluster_indices_grouped=None, threshold_legend=None):
    # Data arrives as (features, samples); transpose to the (samples, features)
    # layout that scikit-learn expects.
    X = np.transpose(data)

    if normalize:
        from sklearn.preprocessing import minmax_scale
        minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False)

    from sklearn.cluster import DBSCAN
    from sklearn import metrics
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    # Silhouette is computed on a random subsample of at most 5000 points to keep it cheap.
    score = metrics.silhouette_score(X, labels, sample_size=5000)
    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)

    return db, n_clusters_, labels, core_samples_mask, score
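
A minimal smoke test of the helper above on synthetic data (a sketch, not part of the original code: make_blobs stands in for the real t-SNE output, show=False avoids the external pf plotting module, and the eps/min_samples values are placeholders):

import numpy as np
from sklearn.datasets import make_blobs

# Hypothetical 2-D blobs in place of the real t-SNE embedding.
points, _ = make_blobs(n_samples=500, centers=3, cluster_std=0.3, random_state=0)
data = points.T  # fit_dbscan expects (features, samples) and transposes internally

db, n_clusters_, labels, core_mask, score = fit_dbscan(
    data, eps=0.3, min_samples=10, normalize=False, show=False)
print(n_clusters_, int(np.sum(labels == -1)))  # cluster count and number of noise points
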
Code Example #2
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn import metrics


def fit_dbscan(data, eps, min_samples, show=True, juxta_cluster_indices_grouped=None, threshold_legend=None):
    # Data arrives as (features, samples); transpose to (samples, features) for scikit-learn.
    X = np.transpose(data)

    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    db_labels = db.labels_

    # Number of clusters in db_labels, ignoring noise if present.
    n_clusters_ = len(set(db_labels)) - (1 if -1 in db_labels else 0)
    score = metrics.silhouette_score(X, db_labels, sample_size=5000)
    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)

    return db, n_clusters_, db_labels, core_samples_mask, score
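
The labels and mask returned above follow scikit-learn's DBSCAN conventions: -1 marks noise, and core_samples_mask flags points that have at least min_samples points (themselves included) within distance eps. A small sketch (assuming db_labels and core_samples_mask come from a call to the function above) that splits each cluster into core and border points:

import numpy as np

noise = db_labels == -1
core = core_samples_mask
border = ~core & ~noise  # clustered points that are not core points

for cluster_id in sorted(set(db_labels) - {-1}):
    members = db_labels == cluster_id
    print('cluster {}: {} points ({} core, {} border)'.format(
        cluster_id, members.sum(), (members & core).sum(), (members & border).sum()))
print('noise points: {}'.format(noise.sum()))
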
Code Example #3
import numpy as np


def fit_dbscan(data, eps, min_samples, normalize=True,
               show=True, juxta_cluster_indices_grouped=None, threshold_legend=None):
    X = np.transpose(data)

    if normalize:
        from sklearn.preprocessing import minmax_scale
        minmax_scale(X, feature_range=(-1, 1), axis=0, copy=False)

    from sklearn.cluster import DBSCAN
    from sklearn import metrics
    db = DBSCAN(eps=eps, min_samples=min_samples).fit(X)
    core_samples_mask = np.zeros_like(db.labels_, dtype=bool)
    core_samples_mask[db.core_sample_indices_] = True
    labels = db.labels_

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    score = metrics.silhouette_score(X, labels, sample_size=5000)
    print('For eps={}, min_samples={}, estimated number of clusters={}'.format(eps, min_samples, n_clusters_))
    print("Silhouette Coefficient: {}".format(score))

    if show:
        pf.show_clustered_tsne(db, X, juxta_cluster_indices_grouped, threshold_legend)

    return db, n_clusters_, labels, core_samples_mask, score


db, n_clusters_, labels, core_samples_mask, score = fit_dbscan(t_tsne, eps=0.0155, min_samples=43, normalize=True,
                                                               show=True)
# Re-plot explicitly with the same transposed data the function clustered on
# (X is local to fit_dbscan, so it is rebuilt here).
pf.show_clustered_tsne(db, np.transpose(t_tsne), juxta_cluster_indices_grouped=None, threshold_legend=None)



from scipy import stats



# Loop over values of eps and min_samples to find best fit for DBSCAN
epses = np.arange(0.1, 0.5, 0.05)
min_sampleses = np.arange(5, 100, 5)
clustering_scores = []
params = []
for eps in epses:
    for min_samples in min_sampleses:
        db, n_clusters_, labels, core_samples_mask, score = fit_dbscan(t_tsne, eps, min_samples, show=False)
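        # Presumed continuation (a sketch; the rest of the sweep is not shown in
        # this excerpt): record each parameter pair with its silhouette score.
        clustering_scores.append(score)
        params.append((eps, min_samples))

# One possible way to pick the best combination afterwards: highest silhouette score.
best_eps, best_min_samples = params[int(np.argmax(clustering_scores))]
print('Best fit: eps={}, min_samples={}'.format(best_eps, best_min_samples))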