Example #1
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds cluster of users in data using DBSCAN

    :param data: pd.DataFrame with features for clustering indexed by users (sessions)
    :param use_csi: if True, then cluster stability index will be calculated (may take a lot of time)
    :param epsq: quantile of nearest neighbor positive distance between dots (value of it will be an eps),
        if None, then eps from key-words will be used.

    :param max_cl_number: maximal number of clusters for aggregation of small clusters
    :param kwargs: keyword arguments for sklearn.cluster.KMeans

    :return: np.array of clusters
    """
    kmargs = {
        i: j
        for i, j in kwargs.items() if i in DBSCAN.get_params(DBSCAN)
    }
    if epsq is not None:
        kmargs.update({'eps': find_best_eps(data, epsq)})
    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
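Note: find_best_eps, calc_all_metrics, cluster_stability_index and aggregate_cl are library helpers not shown here. A minimal sketch of find_best_eps, assuming only what the docstring states (eps taken as a quantile of the positive nearest-neighbor distances), might be:

import numpy as np
from sklearn.neighbors import NearestNeighbors

def find_best_eps(data, q=0.05):
    # Distance from every point to its nearest neighbour (column 0 is the
    # point itself at distance 0, so take column 1).
    dists, _ = NearestNeighbors(n_neighbors=2).fit(data.values).kneighbors(data.values)
    nearest = dists[:, 1]
    # Use the q-quantile of the strictly positive distances as eps.
    return np.quantile(nearest[nearest > 0], q)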
Example #2
def dbscan(training_vectors,
           clean_vectors,
           anomalous_vectors,
           eps=0.3,
           min_samples=3):
    print("Starting DB-Scan Fitting...")

    # Build the clustering model.
    # eps is the maximum distance between two neighbouring data points;
    # min_samples is the minimum number of samples required to form a cluster.
    # Both parameters must be chosen carefully, depending on the dataset.
    dbscan = DBSCAN(eps=eps, min_samples=min_samples)

    print("Fitting with Parameters: ", dbscan.get_params())
    model = dbscan.fit(training_vectors)

    print("Training done! Switch to testing.")
    print("Start prediction...")

    result_training = __dbscan_predict(model, training_vectors)
    result_clean = __dbscan_predict(model, clean_vectors)
    result_anomalous = __dbscan_predict(model, anomalous_vectors)

    print("Predicting successful!")
    print("**************************")

    return result_clean, result_anomalous, result_training
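DBSCAN has no predict method, and __dbscan_predict is not defined in this snippet. A common workaround, and a plausible sketch of it, assigns each point to the cluster of its nearest core sample if that core sample lies within eps:

import numpy as np

def __dbscan_predict(model, vectors):
    # model.components_ holds the core samples; map each one back to its label.
    core_labels = model.labels_[model.core_sample_indices_]
    labels = np.full(len(vectors), -1, dtype=int)  # default: noise
    for i, v in enumerate(np.asarray(vectors)):
        dists = np.linalg.norm(model.components_ - v, axis=1)
        nearest = dists.argmin()
        if dists[nearest] <= model.eps:
            labels[i] = core_labels[nearest]
    return labels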
Example #3
def affichage_PCA_leur_DBSCAN():
    matrice = np.genfromtxt("matrice.txt", delimiter=" ")
    print(matrice)
    reduced_data = PCA(n_components=2).fit_transform(matrice)
    print(reduced_data)

    # With metric='precomputed', DBSCAN expects `matrice` to be a square
    # distance matrix rather than a raw feature matrix.
    dbscan = DBSCAN(eps=40,
                    min_samples=23,
                    metric='precomputed',
                    algorithm='auto')
    print(dbscan)
    dbscan.fit(matrice)
    print(dbscan.labels_)
    print(dbscan.get_params(deep=True))

    colors = ['g', 'b', 'r', 'y']
    plt.figure(figsize=(4, 4))
    for i in range(4):
        mask = dbscan.labels_ == i
        plt.plot(reduced_data[mask, 0],
                 reduced_data[mask, 1],
                 linestyle='',
                 marker='o',
                 markerfacecolor=colors[i],
                 markeredgecolor='k')
    plt.xlabel("PC1")
    plt.ylabel("PC2")
    plt.title(
        "Representation of a clustering\nobtained with the DBSCAN algorithm")
    plt.text(-350, 350, r'$\epsilon=40,\ \ minPts=23$')
    plt.show()
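For reference, if matrice.txt held raw features rather than distances, the square matrix required by metric='precomputed' would first have to be built; a sketch reusing the file name from above:

import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.metrics import pairwise_distances

features = np.genfromtxt("matrice.txt", delimiter=" ")
distance_matrix = pairwise_distances(features)  # square (n, n), euclidean by default
labels = DBSCAN(eps=40, min_samples=23, metric='precomputed').fit_predict(distance_matrix)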
Example #4
class DBSCANCluster(Intent):
    def __init__(self, eps: float) -> None:
        super().__init__()
        self.dbscan = DBSCAN(eps)

    def to_string(self) -> str:
        return 'Cluster:DBSCAN'

    def compute(self, df: pd.DataFrame) -> pd.DataFrame:
        nan_dropped = df.dropna()

        min_max_scaler = preprocessing.MinMaxScaler()
        scaled = min_max_scaler.fit_transform(nan_dropped.values)

        self.dbscan.fit(scaled)

        labels = pd.DataFrame(data=self.dbscan.labels_,
                              index=nan_dropped.index).applymap(str)

        inc_nan = labels.reindex(index=df.index, fill_value='NaN')
        values = inc_nan.iloc[:, 0].unique()
        result = pd.concat(
            map(
                lambda v: pd.DataFrame(  # type: ignore
                    data=(inc_nan.iloc[:, 0] == v).astype('int').values,
                    columns=[self.to_string() + ":" + v],
                    index=df.index,
                    dtype=int),
                values),
            axis='columns')
        return result

    def info(self) -> Optional[Dict[str, Any]]:
        return {"params": self.dbscan.get_params()}
Example #5
    def perform_dbscan(self):
        dbscan_clusterer = DBSCAN(**self.dbscan_params, metric="precomputed")
        dbscan_clusterer.fit(self.distance_matrix)
        self.dbscan_results = {
            "parameters": dbscan_clusterer.get_params(),
            "labels": dbscan_clusterer.labels_,
            "n_clusters": np.unique(dbscan_clusterer.labels_).max() + 1,
            "clusters": label_cnt_dict(dbscan_clusterer.labels_)
        }

        print_dict(self.dbscan_results)
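label_cnt_dict and print_dict are not defined in this snippet; plausible minimal versions consistent with how they are used:

import numpy as np

def label_cnt_dict(labels):
    # Map each cluster label (including -1 for noise) to its member count.
    unique, counts = np.unique(labels, return_counts=True)
    return dict(zip(unique.tolist(), counts.tolist()))

def print_dict(d):
    for key, value in d.items():
        print(key, ":", value)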
Example #6
    def perform_dbscan(self):
        '''
        TODO: use ELKI's DBSCAN implementation instead of scikit-learn's
        Reference: https://stackoverflow.com/questions/16381577/scikit-learn-dbscan-memory-usage
        '''
        dbscan_clusterer = DBSCAN(**self.dbscan_params, metric="precomputed")
        # sklearn's DBSCAN.fit only accepts (X, y=None, sample_weight=None);
        # an extra hdf5_file keyword would raise a TypeError.
        dbscan_clusterer.fit(self.distance_matrix)
        self.dbscan_results = {
            "parameters": dbscan_clusterer.get_params(),
            "labels": dbscan_clusterer.labels_,
            "n_clusters": np.unique(dbscan_clusterer.labels_).max() + 1,
            "clusters": label_cnt_dict(dbscan_clusterer.labels_)
        }

        print_dict(self.dbscan_results)
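The linked thread concerns DBSCAN's memory usage; short of porting to ELKI, one documented mitigation is to feed DBSCAN a sparse radius-neighbors graph instead of a dense distance matrix. A sketch of that approach (not the original code):

from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

def dbscan_sparse(X, eps, min_samples):
    # Only pairwise distances within eps are stored, so the (n, n) matrix stays sparse.
    graph = NearestNeighbors(radius=eps).fit(X).radius_neighbors_graph(X, mode='distance')
    return DBSCAN(eps=eps, min_samples=min_samples, metric='precomputed').fit_predict(graph)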
Example #7
def dbscan(data, use_csi=True, epsq=None, max_cl_number=None, **kwargs):
    """
    Finds cluster of users in data using DBSCAN

    Parameters
    -------
    data: pd.DataFrame
        Dataframe with features for clustering indexed by users (sessions)
    use_csi: bool, optional
        If ``True``, then cluster stability index will be calculated. IMPORTANT: it may take a lot of time. Default: ``True``
    epsq: float, optional
        Quantile of nearest neighbor positive distance between dots, its value will be an eps. If ``None``, then eps from keywords will be used. Default: ``None``
    max_cl_number: int, optional
        Maximal number of clusters for aggregation of small clusters. Default: ``None``
    kwargs: optional
        Parameters for ``sklearn.cluster.KMeans``

    Returns
    --------
    Array of clusters

    Return type
    -------
    np.array
    """
    kmargs = {
        i: j
        for i, j in kwargs.items() if i in DBSCAN.get_params(DBSCAN)
    }
    if epsq is not None:
        kmargs.update({'eps': find_best_eps(data, epsq)})
    km = DBSCAN(**kmargs)
    cl = km.fit_predict(data.values)
    bs = pd.get_dummies(cl)
    bs.index = data.index
    metrics = calc_all_metrics(data, km)
    if use_csi:
        metrics['csi'] = cluster_stability_index(data, km, bs, **kwargs)
    if max_cl_number is not None:
        cl = aggregate_cl(cl, max_cl_number)
    return cl, metrics
Example #8
class sklearn_DB(FitClusterBase):

    """
    https://scikit-learn.org/stable/modules/generated/sklearn.cluster.DBSCAN.html
    eps : float, optional
        The maximum distance between two samples for one to be considered as in the neighborhood of the other.
        This is not a maximum bound on the distances of points within a cluster.
        This is the most important DBSCAN parameter to choose appropriately for your dataset and distance function.

    min_samples : int, optional
        The number of samples (or total weight) in a neighborhood for a point to be considered as a core point.
        This includes the point itself.

    metric : string, or callable
        The metric to use when calculating distance between instances in a feature array.
        If metric is a string or callable, it must be one of the options allowed by sklearn.metrics.pairwise_distances for its metric parameter.
        If metric is “precomputed”, X is assumed to be a distance matrix and must be square.
        X may be a sparse matrix, in which case only “nonzero” elements may be considered neighbors for DBSCAN.
    """
    _pairwise = True

    def __init__(self, eps=None, min_samples=None, metrictype='precomputed'):
        super().__init__()
        self.name = 'DBSCAN'
        # distance metric, e.g. 'euclidean' or 'precomputed'
        self.metric = metrictype
        # maximum neighborhood distance
        self.eps = eps
        # the number of samples in a neighborhood for a point to be considered as a core point
        self.min_samples = min_samples
        self.db = None

    def fit(self, dmatrix, rho=None):
        self.db = DBSCAN(eps=self.eps, min_samples=self.min_samples, metric=self.metric).fit(dmatrix)
        return self.db.labels_

    def pack(self):
        """return all the info"""
        return self.db.get_params()
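A hypothetical usage of sklearn_DB with its default precomputed metric (the data below is invented):

import numpy as np
from sklearn.metrics import pairwise_distances

X = np.random.RandomState(0).rand(100, 3)
dmatrix = pairwise_distances(X)           # square distance matrix
clusterer = sklearn_DB(eps=0.3, min_samples=5)
labels = clusterer.fit(dmatrix)
print(clusterer.pack())                   # all parameters via get_params()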
Example #9
colors = [
    plt.cm.Spectral(each) for each in np.linspace(0, 1, len(unique_labels))
]
plt.subplot(211)
for k, col in zip(unique_labels, colors):
    if k == -1:
        # Black used for noise.
        col = [0, 0, 0, 1]

    class_member_mask = (labels == k)

    xy = X[class_member_mask & core_samples_mask]
    plt.plot(xy[:, 0],
             xy[:, 1],
             'o',
             markerfacecolor=tuple(col),
             markeredgecolor='k',
             markersize=14)

    xy = X[class_member_mask & ~core_samples_mask]
    plt.plot(xy[:, 0],
             xy[:, 1],
             'o',
             markerfacecolor=tuple(col),
             markeredgecolor='k',
             markersize=6)

plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.subplot(212)
plt.scatter(X[:, 0], X[:, 1], s=1)
print(db.get_params())
plt.show()
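This plotting snippet presupposes labels, core_samples_mask, unique_labels and n_clusters_; in the scikit-learn DBSCAN demo it is adapted from, they are derived from the fitted model like this:

labels = db.labels_
core_samples_mask = np.zeros_like(labels, dtype=bool)
core_samples_mask[db.core_sample_indices_] = True
n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
unique_labels = set(labels)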
Example #10
            label="outlier",
            marker='1')
plt.legend(prop={'size': 15})
plt.title("Hasil Pengelompokkan")
plt.show()

kluster["Cluster"] = kluster["Cluster"].astype("str")

px.scatter(data_frame=kluster,
           x="Income",
           y="SpendScore",
           color="Cluster",
           template='plotly_dark')

label_1 = klustering.labels_
params = klustering.get_params()
print("Dengan parameter")
print("eps\t\t\t\t:", params["eps"])
print("min_samples\t\t\t:", params["min_samples"])
print("Nilai silhoutte yang didapatkan\t:", silhouette_score(x, label_1))
"""# Mencari eps dan minPts terbaik"""

ls_eps = np.arange(8, 15.25, 0.25)
ls_min_samples = np.arange(3, 11)
dbscan_params = [(x, y) for x in ls_eps for y in ls_min_samples]

dbscan_params

jumlah_cluster = []  # number of clusters per parameter pair
sil_score = []       # silhouette score per parameter pair
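A plausible continuation of the search (the loop itself is not shown above; x is the feature matrix used earlier):

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

for eps, min_samples in dbscan_params:
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(x)
    n = len(np.unique(labels))
    jumlah_cluster.append(n)
    # silhouette_score needs at least 2 distinct labels
    sil_score.append(silhouette_score(x, labels) if n > 1 else -1)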
Example #11
def do_clustering(target_csv, cluster_method):
    num_cluster = 24
    df_data = pd.read_csv(os.path.join(CONFIG.CSV_PATH, target_csv + '.csv'),
                          index_col=0,
                          header=0,
                          encoding='utf-8-sig')
    df_data.index.name = 'short_code'
    print(df_data.iloc[:100])
    print(df_data.shape)

    start_time = time.time()
    if cluster_method == 0:
        clustering = DBSCAN(eps=0.3, min_samples=1000)
        clustering.fit(df_data)
        csv_name = 'clustered_dbscan_' + target_csv + '.csv'
    elif cluster_method == 1:
        clustering = OPTICS(min_samples=1000, metric='cosine')
        clustering.fit(df_data)
        csv_name = 'clustered_optics_' + target_csv + '.csv'
    elif cluster_method == 2:
        clustering = AgglomerativeClustering(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_ward_' + target_csv + '.csv'
    elif cluster_method == 3:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='complete',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_complete_' + target_csv + '.csv'
    elif cluster_method == 4:
        clustering = AgglomerativeClustering(affinity='cosine',
                                             linkage='single',
                                             n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_agglo_single_' + target_csv + '.csv'
    elif cluster_method == 5:
        clustering = Birch(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_birch_' + target_csv + '.csv'
    elif cluster_method == 6:
        clustering = KMeans(n_clusters=num_cluster)
        clustering.fit(df_data)
        csv_name = 'clustered_kmeans_' + target_csv + '.csv'
    elif cluster_method == 7:
        clustering = SpectralClustering(n_clusters=num_cluster,
                                        random_state=42,
                                        assign_labels='discretize')
        clustering.fit(df_data)
        csv_name = 'clustered_spectral_' + target_csv + '.csv'
    print("time elapsed for clustering: " + str(time.time() - start_time))
    print(clustering.get_params())
    print(clustering.labels_)
    count_percentage(clustering.labels_)
    result_df = pd.DataFrame(data=clustering.labels_,
                             index=df_data.index,
                             columns=['cluster'])

    start_time = time.time()
    print("calinski_harabasz_score: ",
          calinski_harabasz_score(df_data, result_df['cluster'].squeeze()))
    print("silhouette_score: ",
          silhouette_score(df_data, result_df['cluster'].squeeze()))
    print("davies_bouldin_score: ",
          davies_bouldin_score(df_data, result_df['cluster'].squeeze()))
    print("time elapsed for scoring: " + str(time.time() - start_time))
    result_df.to_csv(os.path.join(CONFIG.CSV_PATH, csv_name),
                     encoding='utf-8-sig')
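A hypothetical call (the CSV name is invented; cluster_method=0 selects DBSCAN):

do_clustering('feature_matrix', cluster_method=0)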
Example #12
                score = silhouette_score(data, model.fit_predict(data))
                # keep the one with the best silhouette score
                if score > best_score:
                    best_model = model
                    best_score = score
    return best_model


# will hold the scores for each value of K and for each model: KMeans (col 0), GaussianMixture (col 1),
# CAH (col 2) and DBSCAN (col 3)
K_silhouette_scores = np.zeros((19, 4))
# Optimal model for DBSCAN
eps = np.arange(0.1, 2, 0.1)
min_samples = range(3, 20)
Dbscan = best_model_Dbscan(Z, eps, min_samples)
print(Dbscan.get_params())
clusters = Dbscan.fit_predict(Z)
print("Nombre de clusters pour DBSCAN : ", clusters)
#Calcul du score de Silhouette pour DBSCAN
K_silhouette_scores[:, 3] = silhouette_score(Z, clusters)
M = hierarchy.linkage(Z, method='average', metric='euclidean')
# Clustering for different numbers of classes
# Matrix to hold the within-cluster inertias for each value of K and each model
inertia = []
for K in range(2, 21):
    K_model_scores = []
    # Different models
    models = {
        'KMeans':
        KMeans(n_clusters=K, random_state=0),
        'Gaussian':
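The snippet above is truncated; a sketch of best_model_Dbscan consistent with the visible tail (grid over eps and min_samples, keeping the best silhouette score) might be:

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score

def best_model_Dbscan(data, eps_values, min_samples_values):
    best_model, best_score = None, -1.0
    for eps in eps_values:
        for min_samples in min_samples_values:
            model = DBSCAN(eps=eps, min_samples=min_samples)
            labels = model.fit_predict(data)
            if len(set(labels)) > 1:  # silhouette needs at least 2 labels
                score = silhouette_score(data, labels)
                # keep the one with the best silhouette score
                if score > best_score:
                    best_model, best_score = model, score
    return best_model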
Example #13
    dpgmm_labels = dpgmm.fit_predict(Ysc)
    toc = time()
    dpgmm_posterior = dpgmm.predict_proba(Ysc)
    dpgmm.get_params()
    print("elapsed time: %.4f sec" % (toc - tic))

    I_label = np.reshape(dpgmm_labels, (num_rows, num_cols, 1))
    I_seg = I_label * 255.0 / np.max(I_label)  # scale
    I_seg = I_seg.astype(np.uint8)  # cast
    I_seg_bgr = cv2.cvtColor(I_seg, cv2.COLOR_GRAY2BGR)
    cv2.imshow('DPGMM seg', I_seg_bgr)
    #cv2.imwrite('./figures/dpgmm_seg.png', I_seg_bgr)

    # DBSCAN
    print("running DBSCAN...")
    dbscan = DBSCAN(eps=0.5, min_samples=50, metric='euclidean', algorithm='auto')
    tic = time()
    dbscan_labels = dbscan.fit_predict(Ysc)
    toc = time()
    print("elapsed time: %.4f sec" % (toc - tic))
    print(dbscan.get_params())

    # noise samples are labeled -1; shift so noise maps to 0 in the image
    dbscan_labels = dbscan_labels + 1

    I2_label = np.reshape(dbscan_labels, (num_rows, num_cols, 1))
    I2_seg = I2_label * 255.0 / np.max(I2_label)  # scale
    I2_seg = I2_seg.astype(np.uint8)  # cast
    I2_seg_bgr = cv2.cvtColor(I2_seg, cv2.COLOR_GRAY2BGR)
    cv2.imshow('DBSCAN seg', I2_seg_bgr)
    #cv2.imwrite('./figures/dbscan_seg.png', I2_seg_bgr)
Example #14
    dpgmm.get_params()
    print "elapsed time: %.4f sec" % (toc - tic)

    I_label = np.reshape(dpgmm_labels, (num_rows, num_cols, 1))
    I_seg = I_label * 255.0 / np.max(I_label)  #scale
    I_seg = I_seg.astype(np.uint8)  #cast
    I_seg_bgr = cv2.cvtColor(I_seg, cv2.COLOR_GRAY2BGR)
    cv2.imshow('DPGMM seg', I_seg_bgr)
    #cv2.imwrite('./figures/dpgmm_seg.png', I_seg_bgr)

    # DBSCAN
    print("running DBSCAN...")
    dbscan = DBSCAN(eps=0.5,
                    min_samples=50,
                    metric='euclidean',
                    algorithm='auto')
    tic = time()
    dbscan_labels = dbscan.fit_predict(Ysc)
    toc = time()
    print "elapsed time: %.4f sec" % (toc - tic)
    dbscan.get_params()

    # noise samples are labeled -1; shift so noise maps to 0 in the image
    dbscan_labels = dbscan_labels + 1

    I2_label = np.reshape(dbscan_labels, (num_rows, num_cols, 1))
    I2_seg = I2_label * 255.0 / np.max(I2_label)  #scale
    I2_seg = I2_seg.astype(np.uint8)  #cast
    I2_seg_bgr = cv2.cvtColor(I2_seg, cv2.COLOR_GRAY2BGR)
    cv2.imshow('DBSCAN seg', I2_seg_bgr)
    #cv2.imwrite('./figures/dbscan_seg.png', I2_seg_bgr)