def clustering(the_image_autoencoded,
               the_image_shape,
               number_of_clusters,
               extra_parameters=""):
    """Cluster autoencoded samples with OPTICS and return a 2-D label map.

    OPTICS is fitted on ``the_image_autoencoded`` and a DBSCAN-style
    extraction at ``eps=0.5`` yields one label per sample.  The first
    ``rows * cols`` labels are laid out row by row into an array of shape
    ``(the_image_shape[0], the_image_shape[1])``.

    Parameters
    ----------
    the_image_autoencoded : array-like
        Samples handed to ``OPTICS.fit``.  Presumably one row per pixel in
        row-major order -- TODO confirm against the caller.
    the_image_shape : sequence of int
        Image shape; only the first two entries (rows, columns) are used.
    number_of_clusters : any
        Unused; kept so existing callers keep working.
    extra_parameters : str
        Unused; kept so existing callers keep working.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(rows, cols)`` holding the cluster labels.

    Raises
    ------
    ValueError
        If fewer than ``rows * cols`` labels are available.
    """
    print()
    print("***   OPTICS clustering   ***")
    print("---------------------------------")
    # https://scikit-learn.org/stable/modules/clustering.html
    # https://scikit-learn.org/stable/auto_examples/cluster/plot_optics.html
    # #sphx-glr-auto-examples-cluster-plot-optics-py
    # https://scikit-learn.org/stable/modules/clustering.html#optics

    print("Image shape: ", the_image_shape)

    print("OPTICS clustering")
    clust = OPTICS(min_samples=10, xi=.0005, min_cluster_size=.005)

    print("Running fit function for OPTICS clustering")
    clust.fit(the_image_autoencoded)

    # Only the eps=0.5 cut is used for the output; the former eps=2 and
    # eps=3 extractions were computed and then discarded.
    labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                       core_distances=clust.core_distances_,
                                       ordering=clust.ordering_,
                                       eps=0.5)

    print("---------------------------")
    reachability = clust.reachability_[clust.ordering_]
    print("Reachability: ", reachability)
    print("---------------------------")

    print("Creating list for clustered data")
    rows, cols = the_image_shape[0], the_image_shape[1]
    # Row-major reshape replaces the manual x/y walk; dtype stays float to
    # match the previous np.zeros-based output.
    clustered_data = np.asarray(labels_050[:rows * cols],
                                dtype=float).reshape(rows, cols)
    print("Clustered data shape:  ", np.shape(clustered_data))

    return clustered_data
コード例 #2
0
def optics_fit_predict(X, min_samples=50, cluster_method='dbscan', eps=2):
    """Fit OPTICS on ``X`` and return one cluster label per sample.

    Parameters
    ----------
    X : array, shape (n_samples, n_features), or (n_samples, n_samples)
        Data passed to ``OPTICS.fit``.
    min_samples : int or float
        The number of samples in a neighborhood for a point to be
        considered as a core point.
    cluster_method : str
        ``'dbscan'`` (default) or ``'xi'``.  With ``'dbscan'`` the labels
        come from a DBSCAN-style cut of the reachability at ``eps``; with
        any other accepted method the labels computed by the fitted
        estimator (``labels_``) are returned, so the choice is honoured.
    eps : float
        The maximum distance between two samples for one to be considered
        as in the neighborhood of the other.  Only used for ``'dbscan'``.

    Returns
    -------
    labels : array
        Cluster label of every sample.
    """
    method = str(cluster_method)
    opt = OPTICS(min_samples=min_samples, cluster_method=method)
    opt.fit(X)

    if method != 'dbscan':
        # Previously the DBSCAN extraction was returned unconditionally,
        # which silently ignored cluster_method='xi'.
        return opt.labels_

    labels = cluster_optics_dbscan(reachability=opt.reachability_,
                                   core_distances=opt.core_distances_,
                                   ordering=opt.ordering_,
                                   eps=eps)

    return labels
コード例 #3
0
    def _extract_best_optics(self, clusterer):
        """Choose the DBSCAN-style cut of ``clusterer`` with the best silhouette.

        Epsilon values from ``arange(0.01, 0.5, 0.01)`` are tried in turn.
        Cuts that produce a single label or as many labels as there are
        items in ``self.data`` are skipped; the remaining ones are scored
        with ``silhouette_score`` using ``self.distance_metric``.

        Returns the output of ``self._process_noise_as_singletons`` applied
        to the best-scoring labels, or to the labels of the last cut tried
        when no cut could be scored.
        """
        top_score = -inf
        top_labels = None

        for cut in arange(0.01, 0.5, 0.01):
            candidate = cluster_optics_dbscan(
                reachability=clusterer.reachability_,
                core_distances=clusterer.core_distances_,
                ordering=clusterer.ordering_,
                eps=cut)

            # Silhouette is only scored for more than one and fewer than
            # n distinct labels.
            if len(unique(candidate)) in (1, len(self.data)):
                continue

            current = silhouette_score(X=self.data,
                                       labels=candidate,
                                       metric=self.distance_metric,
                                       random_state=13712)
            if current > top_score:
                top_score, top_labels = current, candidate

        if top_labels is None:
            # Every cut gave either one cluster or n clusters: fall back to
            # the last cut.
            top_labels = candidate
        return self._process_noise_as_singletons(top_labels)
コード例 #4
0
def UseDBScan():
    """Cluster one subscriber's positions with DBSCAN and OPTICS.

    Reads ``starttime, longitude, latitude`` rows for the ``imsi`` taken
    from the request form, clusters the (longitude, latitude) columns with
    a great-circle distance in metres, and returns a JSON object mapping
    the row index to ``time``, ``longitude``, ``latitude``, ``dbscan`` and
    ``optics`` values.
    """
    db = get_db()
    cursor = db.cursor()
    cursor.execute(
        "Select starttime,longitude,latitude from userdata Where imsi = %s order by starttime",
        (request.form['imsi'], ))
    rows = cursor.fetchall()

    points = np.array(rows)
    coords = points[:, 1:3]

    def haversine_m(p1, p2):
        """Great-circle distance in metres between two (lng, lat) points."""
        # Degrees to radians.
        lng1 = radians(float(p1[0]))
        lat1 = radians(float(p1[1]))
        lng2 = radians(float(p2[0]))
        lat2 = radians(float(p2[1]))
        dlon = lng2 - lng1
        dlat = lat2 - lat1
        a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2
        # 6378.137 km is the radius used here (not the 6371 km mean radius).
        return 2 * asin(sqrt(a)) * 6378.137 * 1000

    dbscan_cluster = DBSCAN(eps=500, min_samples=5,
                            metric=haversine_m).fit(coords)

    optics_cluster = OPTICS(min_samples=5,
                            cluster_method='dbscan',
                            metric=haversine_m).fit(coords)

    print(optics_cluster.reachability_)

    optics_label = cluster_optics_dbscan(
        reachability=optics_cluster.reachability_,
        core_distances=optics_cluster.core_distances_,
        ordering=optics_cluster.ordering_,
        eps=300)

    print(optics_label)

    merged = np.c_[points, dbscan_cluster.labels_, optics_label].tolist()

    payload = {
        position: {
            'time': record[0],
            'longitude': record[1],
            'latitude': record[2],
            'dbscan': record[3],
            'optics': record[4],
        }
        for position, record in enumerate(merged)
    }

    return jsonify(payload)
コード例 #5
0
def get_dbscan_and_reachability_figs(hdf_filename, mds_hdf_key, optics_hdf_key, first_dim, second_dim, cutoff):
    """Build the DBSCAN scatter figure and the OPTICS reachability figure.

    Parameters
    ----------
    hdf_filename : path of the HDF store to read.
    mds_hdf_key : key of the table holding the embedding coordinates.
    optics_hdf_key : key of the table holding the OPTICS output; the code
        reads its ``labels``, ``ordering``, ``reachability`` and
        ``core_distances`` columns.
    first_dim, second_dim : column labels of the two dimensions to plot;
        when the labels are not found they are used as integer positions.
    cutoff : epsilon for the DBSCAN-style cut, or ``None`` to skip it.

    Returns
    -------
    (dbscan_fig, reach_fig)
        ``dbscan_fig`` is an empty ``go.Figure`` when ``cutoff`` is None.
    """
    with pd.HDFStore(hdf_filename, 'r') as store:
        mds = store[mds_hdf_key]
        try:
            mds = mds[[first_dim, second_dim]]
        except KeyError:
            mds = mds.iloc[:, [int(first_dim), int(second_dim)]]
        optics = store[optics_hdf_key]

    df = pd.merge(mds, optics, left_index=True, right_index=True)
    labels = df.labels[optics.ordering]
    names = df.index[optics.ordering]
    space = np.arange(len(df.index))
    reachability = df.reachability[optics.ordering]

    reach_fig = px.scatter(x=space, y=reachability, color=labels, hover_name=names, range_x=[min(space), max(space)+1])
    dbscan_fig = go.Figure()

    if cutoff is not None:
        reach_fig.add_annotation(
            x=1,
            y=cutoff+.05,
            text="DBSCAN cutoff",
            xref="paper",
            showarrow=False,
            font_size=12
        )
        reach_fig.add_shape(
            type="line",
            xref='paper',
            x0=0,
            y0=cutoff,
            x1=1,
            y1=cutoff,
            line=dict(color="RoyalBlue", width=3)
        )

        # ``mds`` was already reduced to the two requested dimensions, so
        # after the merge they are always the first two columns of ``df``.
        # Indexing with int(first_dim)/int(second_dim) here picked the wrong
        # columns for anything but dimensions 0 and 1.
        x = df.iloc[:, 0]
        y = df.iloc[:, 1]
        labels_db = cluster_optics_dbscan(reachability=optics.reachability,
                                           core_distances=optics.core_distances,
                                           ordering=optics.ordering, eps=cutoff)
        labels_db_text = [f"cluster {label}" for label in labels_db]

        def _dim_name(dim):
            # Keep the integer rendering where possible, but do not fail on
            # label-style dimensions accepted by the selection above.
            try:
                return int(dim)
            except (TypeError, ValueError):
                return dim

        dbscan_fig = px.scatter(
            x=x,
            y=y,
            color=labels_db_text,
            hover_name=df.index,
            labels={
                "x": f"Component {_dim_name(first_dim)}",
                "y": f"Component {_dim_name(second_dim)}",
                "color": "Clusters"
            },
            title=f"DBSCAN clustering for epsilon {cutoff}"
        )

    return dbscan_fig, reach_fig
コード例 #6
0
def OPTICS_Clustering(X):
    """Preprocess ``X``, fit OPTICS and return the estimator's labels.

    The labels returned are ``labels_`` of the fitted estimator
    (``min_samples=100``, ``xi=.05``, ``min_cluster_size=.05``).  A
    DBSCAN-style extraction used to be computed here as well, but its
    result was overwritten immediately, so it has been removed.
    """
    X = preprocess(X)
    cluster = OPTICS(min_samples=100, xi=.05, min_cluster_size=.05)
    cluster.fit(X)
    return cluster.labels_
コード例 #7
0
ファイル: optics_clustering.py プロジェクト: asteca/optics
 def mskNoise(eps_v):
     """Return the entries of ``data`` that are not noise for ``eps_v``.

     Labels are extracted from ``model_OPTIC`` with a DBSCAN-style cut at
     ``eps_v``; points labeled -1 (noise) are filtered out.
     """
     keep = skclust.cluster_optics_dbscan(
         reachability=model_OPTIC.reachability_,
         core_distances=model_OPTIC.core_distances_,
         ordering=model_OPTIC.ordering_,
         eps=eps_v) != -1
     return data[keep]
コード例 #8
0
 def cluster_optics(self, similarity_matrix):
     """Cluster ``similarity_matrix`` with OPTICS and return shifted labels.

     The estimator's ``labels_`` are taken in cluster order
     (``labels_[ordering_]``) and incremented by one, so noise (-1)
     becomes 0.  The DBSCAN-style extraction that used to be computed here
     was never used and has been removed.
     """
     print('Clustering with optics.')
     #TODO: Fix
     clust = OPTICS(min_samples=2, xi=0.005)
     clust.fit(similarity_matrix)

     # NOTE(review): indexing with ``ordering_`` returns labels in OPTICS
     # visiting order, not in the row order of ``similarity_matrix`` --
     # confirm this is what callers expect.
     labels = clust.labels_[clust.ordering_]
     return np.add(labels, 1)
コード例 #9
0
def get_dbscan_and_reachability_figs(hdf_filename, pcoa_hdf_key, optics_hdf_key, first_pc, second_pc, cutoff):
    """Return ``(dbscan_fig, reach_fig)`` for a stored PCoA / OPTICS result.

    The two requested PCoA columns and the OPTICS table are read from the
    HDF store and merged on their index.  The reachability figure is
    always built; when ``cutoff`` is not None it also gets a cutoff line
    and annotation, and the DBSCAN figure shows the labels extracted at
    that epsilon.  Otherwise the DBSCAN figure is an empty ``go.Figure``.
    """
    with pd.HDFStore(hdf_filename, 'r') as store:
        pcoa = store[pcoa_hdf_key][[first_pc, second_pc]]
        optics = store[optics_hdf_key]

    merged = pd.merge(pcoa, optics, left_index=True, right_index=True)
    order = optics.ordering
    positions = np.arange(len(merged.index))

    reach_fig = px.scatter(
        x=positions,
        y=merged.reachability[order],
        color=merged.labels[order],
        hover_name=merged.index[order],
        range_x=[min(positions), max(positions) + 1],
    )

    if cutoff is None:
        return go.Figure(), reach_fig

    reach_fig.add_annotation(x=1, y=cutoff + .05, text="DBSCAN cutoff",
                             xref="paper", showarrow=False, font_size=12)
    reach_fig.add_shape(type="line", xref='paper',
                        x0=0, y0=cutoff, x1=1, y1=cutoff,
                        line={"color": "RoyalBlue", "width": 3})

    cut_labels = cluster_optics_dbscan(reachability=optics.reachability,
                                       core_distances=optics.core_distances,
                                       ordering=optics.ordering, eps=cutoff)
    cut_names = [f"cluster {label}" for label in cut_labels]

    dbscan_fig = px.scatter(x=merged[first_pc], y=merged[second_pc],
                            color=cut_names, hover_name=merged.index)

    return dbscan_fig, reach_fig
コード例 #10
0
def cluster_mask_OPTICS(skel, show_image=False):
    """Fit OPTICS on the points of ``skel`` and print the fitted arrays.

    Work in progress: the DBSCAN-style labels at ``eps=2`` are computed
    but not used yet, ``show_image`` is ignored, and the function always
    returns ``None``.
    """
    xs, ys = convert_mask_to_regression(skel)
    points = np.array(list(zip(xs, ys)))

    model = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    model.fit(points)

    for fitted in (model.reachability_,
                   model.core_distances_,
                   model.ordering_):
        print(fitted)

    cluster_optics_dbscan(reachability=model.reachability_,
                          core_distances=model.core_distances_,
                          ordering=model.ordering_,
                          eps=2)

    # Not figured out yet.
    return None
コード例 #11
0
ファイル: cluster.py プロジェクト: vinkrishna/spectral
def cluster(data, **kwargs):
    """
    Fit OPTICS once and score DBSCAN-style cuts over a range of epsilons.

    Parameters
    -----------
    data: array
        Samples passed to ``OPTICS.fit`` and to the score function
        (presumably a 2-D STFT array or low-dimensional embedding from
        `embed()` -- TODO confirm the expected layout).
    **kwargs
        Accepted for interface compatibility; currently unused.

    Returns
    -------
    res: array
        One Calinski-Harabasz score per epsilon in ``np.arange(0, 2, 0.01)``;
        0 where the cut yields at most one non-noise cluster.
    nclust: array
        Sorted distinct numbers of non-noise clusters seen over all cuts.
    """

    clust = OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.1, n_jobs=-1)
    clust.fit(data)

    # Collect in lists and convert once; np.append copied the whole array
    # on every iteration.
    scores = []
    counts = []

    for e in np.arange(0, 2, step=0.01):
        labels = cluster_optics_dbscan(
            reachability=clust.reachability_,
            core_distances=clust.core_distances_,
            ordering=clust.ordering_,
            eps=e,
        )

        n_found = len(np.unique(labels[labels > -1]))
        counts.append(n_found)
        if n_found <= 1:
            scores.append(0)
        else:
            scores.append(calinski_harabasz_score(data, labels))

    # Float dtype matches what the np.append-based version produced.
    res = np.array(scores, dtype=float)
    nclust = np.unique(np.array(counts, dtype=float))

    return res, nclust
コード例 #12
0
ファイル: analyse_photo.py プロジェクト: qrider71/PyPhotoLab
def cluster(cluster_min_samples=5, cluster_eps=2.0 / RADIUS_EARTH_KM):
    """Cluster stored photo coordinates with OPTICS and save the clusters.

    Coordinates are read from the database, clustered with the haversine
    metric, cut DBSCAN-style at ``cluster_eps`` and written back together
    with the hull curves of each cluster.

    Parameters
    ----------
    cluster_min_samples : int
        ``min_samples`` passed to OPTICS.
    cluster_eps : float
        Epsilon of the DBSCAN-style cut.  The default divides 2.0 by
        ``RADIUS_EARTH_KM``, i.e. presumably 2 km expressed in radians.
    """
    print("Start clustering")
    conn = db_connect()
    try:
        # Keys of the map are the coordinates handed to OPTICS (presumably
        # in radians); values are what gets stored per cluster.
        coords_map = db_select_photos_coords(conn)
        coords_rad = list(coords_map.keys())
        clustering = OPTICS(min_samples=cluster_min_samples, metric='haversine').fit(coords_rad)

        labels_dbscan = cluster_optics_dbscan(reachability=clustering.reachability_,
                                              core_distances=clustering.core_distances_,
                                              ordering=clustering.ordering_, eps=cluster_eps)

        coords_rad_labels = zip(coords_rad, labels_dbscan)
        map_coords_deg_cluster = {coords_map[coord_rad]: label for (coord_rad, label) in coords_rad_labels}

        map_hull_points_idx = compute_hull_curves(map_coords_deg_cluster)

        db_create_clusters(conn, map_coords_deg_cluster, map_hull_points_idx)
    finally:
        # Release the connection even when clustering or the DB write fails.
        conn.close()
    print("Finished clustering")
コード例 #13
0
ファイル: cluster.py プロジェクト: surf-sci-bc/agfalta_tools
def _remodel_optics(model, target="xi", **kwargs):
    """Re-extract cluster labels from an already fitted OPTICS model.

    Parameters
    ----------
    model : fitted OPTICS estimator; its ``reachability_``, ``ordering_``
        and ``predecessor_`` / ``core_distances_`` attributes are read.
    target : ``"xi"`` for the xi extraction; any other value selects the
        DBSCAN-style extraction.
    **kwargs : optional overrides -- ``xi`` (0.03), ``min_cluster_size``
        (0.01) and ``min_samples`` (0.03) for ``"xi"``; ``eps`` (0.5)
        otherwise.

    Returns
    -------
    The result of ``sort_labels`` applied to the extracted labels.
    """
    if target == "xi":
        xi = kwargs.get("xi", 0.03)
        min_cluster_size = kwargs.get("min_cluster_size", 0.01)
        min_samples = kwargs.get("min_samples", 0.03)
        # cluster_optics_xi returns (labels, clusters); only the labels are
        # wanted here.  Previously the whole tuple was passed on.
        labels, _ = sk_cluster.cluster_optics_xi(
            min_samples=min_samples,
            min_cluster_size=min_cluster_size,
            xi=xi,
            reachability=model.reachability_,
            predecessor=model.predecessor_,
            ordering=model.ordering_)
    else:
        eps = kwargs.get("eps", 0.5)
        labels = sk_cluster.cluster_optics_dbscan(
            eps=eps,
            reachability=model.reachability_,
            core_distances=model.core_distances_,
            ordering=model.ordering_,
        )
    return sort_labels(labels)
コード例 #14
0
def test_dbscan(Dl, args, logger, deepFD, epoch):
    """Evaluate embeddings as a noise detector with OPTICS and DBSCAN cuts.

    The embeddings of ``deepFD`` are saved, clustered once with OPTICS and
    evaluated with ``_eval`` against ``Dl.<ds>_labels``: first with the
    estimator's own labels, then with DBSCAN-style cuts at several
    epsilons.  Points labeled as noise become the positive class (1), all
    clustered points 0.  Every result row is logged and appended to
    ``<args.out_path>/results.txt``.
    """
    header = ' pre  \t rec  \t  f1  \t  ap  \tpr_auc\troc_auc\t h_pre\t h_rec\t h_f1'
    metric_keys = ('pre', 'rec', 'f1', 'ap', 'pr_auc', 'roc_auc',
                   'h_pre', 'h_rec', 'h_f1')
    row_fmt = '\t'.join(['{:.4f}'] * len(metric_keys))

    logger.info('Testing with DBSCAN...')
    labels = getattr(Dl, Dl.ds+'_labels')
    features = get_embeddings(deepFD, Dl).cpu().numpy()
    save_embeddings(features, args.out_path, epoch)

    def report(fa, title, log_msg, logists):
        # Binarize in place: clustered points -> 0, noise (-1) -> 1.
        logists[logists >= 0] = 0
        logists[logists < 0] = 1
        logger.info(log_msg)
        results = _eval(labels, logists, logists)
        row = row_fmt.format(*(results[key] for key in metric_keys))
        logger.info(header)
        logger.info(row)
        fa.write(title + '\n')
        # The file header carries a trailing space before the newline.
        fa.write(header + ' \n')
        fa.write(row + '\n')

    resultfile = f'{args.out_path}/results.txt'
    # The context manager closes the file even if fitting or evaluation
    # raises.
    with open(resultfile, 'a') as fa:
        fa.write(f'====== Epoch {epoch} ======\n')
        # optics
        optics = OPTICS()
        optics.fit(features)
        report(fa, 'OPTICS', 'evaluating with optics', optics.labels_)

        # dbscan with different epsilon
        epsilons = [0.5, 2, 5, 10]
        for ep in epsilons:
            logists = cluster_optics_dbscan(reachability=optics.reachability_, core_distances=optics.core_distances_, ordering=optics.ordering_, eps=ep)
            report(fa, f'DBSCAN at {ep}', f'evaluating with dbscan at {ep}', logists)
コード例 #15
0
# Six 2-D Gaussian blobs with different centres and spreads;
# ``n_points_per_cluster`` is defined outside this fragment.
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
# Stack all blobs into one (n_samples, 2) array.
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# DBSCAN-style label extractions from the same fit at eps=0.5 and eps=2.
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_,
                                   eps=2)

# Reachability and labels rearranged into OPTICS ordering.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# 2x3 grid: one axis across the whole top row, two axes in the bottom row.
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
コード例 #16
0
ファイル: optics_clustering.py プロジェクト: asteca/optics
def main():
    """Estimate membership probabilities for every input file with OPTICS.

    For each file returned by ``readFiles()`` and each ``min_samples``
    value in ``np.arange(min_samps, max_samps, step)``, the data is
    processed ``Nruns`` times (first run on the original values, later
    runs on re-sampled values): PCA reduction, OPTICS fit, automatic eps
    selection and a DBSCAN-style cut.  A point scores 1 in a run when it
    is accepted by ``msk_accpt`` and not labeled noise.  Per
    ``min_samples`` the scores are averaged over the runs, then averaged
    over all ``min_samples`` values, and the table is written to
    ``output/<name>_probs.<ext>``.

    Processing of a file stops at the first OPTICS fit that labels no
    point as noise.
    """
    min_samples_rng = np.arange(min_samps, max_samps, step)
    # Process all files in 'input_folder'
    files = readFiles()
    for file_path in files:
        # Extract required data from file
        data_all, data_id, data_c, data_err, msk_accpt = dataExtract(file_path)

        # This dictionary will hold all the runs
        probs_dict = {"ID": data_id}

        # For all the 'min_samples' values in 'min_samples_rng'
        no_outliers = False
        for min_samples in min_samples_rng:
            print("min_sample={}".format(min_samples))

            # For all the re-sample runs
            probs_all = []
            for run in range(Nruns):
                print("  Re-sample N={}".format(run))

                # Use non-resampled values in the first run
                if run == 0:
                    data_arr = np.array(
                        [data_c[col] for col in data_c.columns]).T
                else:
                    # Re-sample data
                    data_arr = reSampleData(data_c, data_err)
                # Apply PCA reduction
                print("  PCA dimension reduction...")
                data_pca = dimReduc(data_arr, PCAdims)

                # Obtain OPTICS model
                print("  OPTICS model...")
                model_OPTIC = runOPTICS(data_pca, min_samples)
                labels = model_OPTIC.labels_[model_OPTIC.ordering_]
                if (labels == -1).sum() == 0:
                    no_outliers = True
                    break

                # Auto eps selection
                print("  eps selection...")
                eps_final = findEps(data_pca, model_OPTIC, perc_cut)

                # DBSCAN labels
                print("  DBSCAN labels...")
                labels_dbs = skclust.cluster_optics_dbscan(
                    reachability=model_OPTIC.reachability_,
                    core_distances=model_OPTIC.core_distances_,
                    ordering=model_OPTIC.ordering_,
                    eps=eps_final)

                # 'msk_memb' only covers the accepted entries, so 'j' walks
                # it while 'i' walks the full 'msk_accpt' mask.
                msk_memb = labels_dbs != -1
                probs = np.zeros(len(msk_accpt))
                j = 0
                for i, st_f in enumerate(msk_accpt):
                    if st_f:
                        if msk_memb[j]:
                            probs[i] = 1
                        j += 1
                probs_all.append(probs)

            if no_outliers is True:
                print("No more outliers. Breaking")
                break
            probs_dict[str(min_samples)] = np.round(np.mean(probs_all, 0), 3)

        # Estimate mean probabilities
        all_vals = [vals for k, vals in probs_dict.items() if k != 'ID']
        probs_mean = np.round(np.array(all_vals).mean(0), 3)

        # Write to file
        probs_dict['probs_mean'] = probs_mean
        # Split on the last dot only, so names with several dots (or none)
        # no longer break the two-value unpacking.
        base_name = file_path.parts[-1]
        fname, dot, fext = base_name.rpartition('.')
        if not dot:
            fname, fext = base_name, ''
        fout = 'output/' + fname + "_probs" + dot + fext
        ascii.write(probs_dict, fout, overwrite=True)
コード例 #17
0
# 2-D Gaussian blobs with different centres and spreads; ``X1`` and
# ``n_diem`` are defined outside this fragment.
X2 = [4, -1] + .1 * np.random.randn(n_diem, 2)
X3 = [1, -2] + .2 * np.random.randn(n_diem, 2)
X4 = [-2, 3] + .3 * np.random.randn(n_diem, 2)
X5 = [3, -2] + 1.6 * np.random.randn(n_diem, 2)
X6 = [5, 6] + 2 * np.random.randn(n_diem, 2)
X = np.vstack((X1, X2, X3, X4, X5, X6))  # Stack the blobs vertically, in this order

clust_optics = OPTICS(min_samples=50, xi=0.05,
                      min_cluster_size=0.05)  # Pass the parameters to the estimator
# OPTICS with min_samples=50, xi=0.05

# Run the fit
clust_optics.fit(X)  # Run OPTICS

labels_050 = cluster_optics_dbscan(reachability=clust_optics.reachability_,
                                   core_distances=clust_optics.core_distances_,
                                   ordering=clust_optics.ordering_,
                                   eps=0.5)  # Run the DBSCAN extraction with eps=0.5
labels_200 = cluster_optics_dbscan(reachability=clust_optics.reachability_,
                                   core_distances=clust_optics.core_distances_,
                                   ordering=clust_optics.ordering_,
                                   eps=2)  # Run the DBSCAN extraction with eps=2

space = np.arange(len(X))  # One position per sample in the data array
reachability = clust_optics.reachability_[clust_optics.ordering_]
labels = clust_optics.labels_[clust_optics.ordering_]

plt.figure(figsize=(10, 7))  # Figure size
G = gridspec.GridSpec(2, 3)  # Grid of axis positions: 2 rows, 3 columns
ax1 = plt.subplot(G[0, :])  # ax1 spans row 0 across all columns
ax2 = plt.subplot(G[1, 0])  # ax2 sits at row 1, column 0
ax3 = plt.subplot(G[1, 1])  # ax3 sits at row 1, column 1
コード例 #18
0
"""

import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan
import matplotlib.pyplot as plt

# Configuration: location of the embedding to cluster.
data_file = 'results/00_TSNE/HANDS17_DPREN_ShapeSplit_val_normalized_DEFAULT_combined_50.npy'
########################################################################

data = np.load(data_file)

# Fit OPTICS once; each epsilon below is only a different cut of that fit.
epsilons = [2.0, 4.0]
clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=1000)
clust.fit(data)

for epsilon in epsilons:
    print('Starting clustering for epsilon {}'.format(epsilon))
    labels = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=epsilon)
    cluster_labels = set(labels)
    for i in cluster_labels:
        mask = labels == i
        cluster = data[mask]
        # Noise (-1) is drawn opaque in black; real clusters are translucent
        # and take the next colour of the default cycle.
        if i == -1:
            look = {'alpha': 1.0, 'color': 'black'}
        else:
            look = {'alpha': 0.2}
        plt.scatter(cluster[:, 0], cluster[:, 1], s=1, label=i, **look)
    plt.show()
コード例 #19
0
 # Load a stored OPTICS result; ``dirr``, ``sp`` and ``mins`` come from
 # code outside this fragment.
 ff = np.load(dirr+'OPTICS_sp%d_smin%d.npz'%(sp, mins))
 lon0 = ff['lon']
 lat0 = ff['lat']
 # Reachability is rescaled by 1/1000 (presumably metres to kilometres --
 # TODO confirm the stored unit).
 reachability = ff['reachability'] / 1000
 ordering = ff['ordering']
 predecessor = ff['predecessor']
 core_distances = ff['core_distances']
 #%% Create the clusters from the reachabilities, given the xi value
 # Each entry of ``opts`` is (method, parameter): "xi" uses the xi
 # extraction with xi=parameter, anything else a DBSCAN-style cut with
 # eps=parameter.
 labels = []
 for op in opts:
     m, c = op[0], op[1]
     if m == "xi":
         # cluster_optics_xi returns (labels, clusters); keep the labels.
         l, _ = cluster_optics_xi(reachability, predecessor, ordering, mins, xi=c)
     else:
         l = cluster_optics_dbscan(reachability=reachability,
                                             core_distances=core_distances,
                                            ordering=ordering, eps=c)
     labels.append(l) 
 
 # One discrete colour normalisation per label set, with bin edges at
 # half-integers from -0.5 up to just above the largest label.
 norms = []
 for l in labels:
     bounds = np.arange(-.5,np.max(l)+1.5,1)
     norms.append(matplotlib.colors.BoundaryNorm(bounds, len(bounds)))
 
 #%%
 # Map extent and latitude grid lines (presumably used for plotting
 # further down -- not visible in this fragment).
 exte=[18, 360-70, -75, 0]; latlines=[-75,-50, -25, 0, 25, 50, 75, 100];
 
 # Read Foram data
 # NOTE(review): hard-coded absolute path; only valid on the author's machine.
 readData = '/Volumes/HD/network_clustering/'
 data = nwf.readForamset(readData + 'ForamData.csv')
 Foramspecies = nwf.readForamset(readData + 'ForamDataHeader.txt')[0][21:]
コード例 #20
0
def graph_optics_neighborhoods(X):
    """Fit OPTICS on ``X`` and show a four-panel comparison figure.

    Panels: the reachability plot (with horizontal lines at 2.0 and 0.5),
    the automatic OPTICS clustering, and the DBSCAN-style cuts at
    epsilon 0.5 and 2.0.  Only the first two columns of ``X`` are drawn.
    The figure is displayed with ``plt.show()``; nothing is returned.
    """
    model = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)
    model.fit(X)

    def dbscan_cut(eps):
        # DBSCAN-style labels from the shared fit at the given epsilon.
        return cluster_optics_dbscan(reachability=model.reachability_,
                                     core_distances=model.core_distances_,
                                     ordering=model.ordering_,
                                     eps=eps)

    cut_050 = dbscan_cut(0.5)
    cut_200 = dbscan_cut(2)

    positions = np.arange(len(X))
    ordered_reach = model.reachability_[model.ordering_]
    ordered_labels = model.labels_[model.ordering_]

    plt.figure(figsize=(10, 7))
    grid = gridspec.GridSpec(2, 3)
    ax_reach = plt.subplot(grid[0, :])
    ax_optics = plt.subplot(grid[1, 0])
    ax_050 = plt.subplot(grid[1, 1])
    ax_200 = plt.subplot(grid[1, 2])

    # Reachability plot
    for klass, fmt in enumerate(['g.', 'r.', 'b.', 'y.', 'c.']):
        members = ordered_labels == klass
        ax_reach.plot(positions[members], ordered_reach[members],
                      fmt, alpha=0.3)
    ax_reach.plot(positions[ordered_labels == -1],
                  ordered_reach[ordered_labels == -1], 'k.', alpha=0.3)
    ax_reach.plot(positions, np.full_like(positions, 2., dtype=float),
                  'k-', alpha=0.5)
    ax_reach.plot(positions, np.full_like(positions, 0.5, dtype=float),
                  'k-.', alpha=0.5)
    ax_reach.set_ylabel('Reachability (epsilon distance)')
    ax_reach.set_title('Reachability Plot')

    # OPTICS
    for klass, fmt in enumerate(['g.', 'r.', 'b.', 'y.', 'c.']):
        group = X[model.labels_ == klass]
        ax_optics.plot(group[:, 0], group[:, 1], fmt, alpha=0.3)
    ax_optics.plot(X[model.labels_ == -1, 0],
                   X[model.labels_ == -1, 1],
                   'k+',
                   alpha=0.1)
    ax_optics.set_title('Automatic Clustering\nOPTICS')

    # DBSCAN at 0.5
    for klass, fmt in enumerate(
            ['g', 'greenyellow', 'olive', 'r', 'b', 'c']):
        group = X[cut_050 == klass]
        ax_050.plot(group[:, 0], group[:, 1], fmt, alpha=0.3, marker='.')
    ax_050.plot(X[cut_050 == -1, 0], X[cut_050 == -1, 1], 'k+', alpha=0.1)
    ax_050.set_title('Clustering at 0.5 epsilon cut\nDBSCAN')

    # DBSCAN at 2.
    for klass, fmt in enumerate(['g.', 'm.', 'y.', 'c.']):
        group = X[cut_200 == klass]
        ax_200.plot(group[:, 0], group[:, 1], fmt, alpha=0.3)
    ax_200.plot(X[cut_200 == -1, 0], X[cut_200 == -1, 1], 'k+', alpha=0.1)
    ax_200.set_title('Clustering at 2.0 epsilon cut\nDBSCAN')

    plt.tight_layout()
    plt.show()
コード例 #21
0
ファイル: optics.py プロジェクト: ashrafhussain17/ML-and-DS
# Wrap the normalized values in a DataFrame again; ``X_normalized`` and
# ``X`` are created outside this fragment.
X_normalized = pd.DataFrame(X_normalized)

# Renaming the columns
X_normalized.columns = X.columns

X_normalized.head()

# OPTICS Clustering model
optics_model = OPTICS(min_samples=10, min_cluster_size=0.05)

# Training the model
optics_model.fit(X_normalized)

# DBSCAN-style extraction with eps = 0.3
labels1 = cluster_optics_dbscan(reachability=optics_model.reachability_,
                                core_distances=optics_model.core_distances_,
                                ordering=optics_model.ordering_,
                                eps=0.3)

# DBSCAN-style extraction with eps = 1.0
labels2 = cluster_optics_dbscan(reachability=optics_model.reachability_,
                                core_distances=optics_model.core_distances_,
                                ordering=optics_model.ordering_,
                                eps=1.0)

# One integer position per sample: 0 .. len(X_normalized) - 1
space = np.arange(len(X_normalized))

# Reachability distance of each point, arranged in OPTICS ordering
reachability = optics_model.reachability_[optics_model.ordering_]
コード例 #22
0
# Six 2-D Gaussian blobs with different centres and spreads;
# ``n_points_per_cluster`` is defined outside this fragment.
C1 = [-5, -2] + .8 * np.random.randn(n_points_per_cluster, 2)
C2 = [4, -1] + .1 * np.random.randn(n_points_per_cluster, 2)
C3 = [1, -2] + .2 * np.random.randn(n_points_per_cluster, 2)
C4 = [-2, 3] + .3 * np.random.randn(n_points_per_cluster, 2)
C5 = [3, -2] + 1.6 * np.random.randn(n_points_per_cluster, 2)
C6 = [5, 6] + 2 * np.random.randn(n_points_per_cluster, 2)
# Stack all blobs into one (n_samples, 2) array.
X = np.vstack((C1, C2, C3, C4, C5, C6))

clust = OPTICS(min_samples=50, xi=.05, min_cluster_size=.05)

# Run the fit
clust.fit(X)

# DBSCAN-style label extractions from the same fit at eps=0.5 and eps=2.
labels_050 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=0.5)
labels_200 = cluster_optics_dbscan(reachability=clust.reachability_,
                                   core_distances=clust.core_distances_,
                                   ordering=clust.ordering_, eps=2)

# Reachability and labels rearranged into OPTICS ordering.
space = np.arange(len(X))
reachability = clust.reachability_[clust.ordering_]
labels = clust.labels_[clust.ordering_]

# 2x3 grid: one axis across the whole top row, three axes in the bottom row.
plt.figure(figsize=(10, 7))
G = gridspec.GridSpec(2, 3)
ax1 = plt.subplot(G[0, :])
ax2 = plt.subplot(G[1, 0])
ax3 = plt.subplot(G[1, 1])
ax4 = plt.subplot(G[1, 2])
コード例 #23
0
    def machine_learning(self, df, plugin_options):
        """Apply the scikit-learn OPTICS machine learning algorithm to the supplied data set, returning the results and centroids.

        Args:
            df (Pandas DataFrame): DataFrame containing the machine learning ready version of the dataset to be processed.
            plugin_options (dictionary): Dictionary containing any optional parameters for plugins being used.
                Recognised keys: "OPTICS_eps" and "OPTICS_min_samples" (override and store
                ``self.eps`` / ``self.min_samples``) and "OPTICS_skip_normalization"
                (its mere presence disables the StandardScaler step).

        Returns:
            Dictionary: "OPTICS_Results" holds the DBSCAN-style labels extracted at ``self.eps``;
            "OPTICS_Centroids" holds one centroid row per cluster, computed on the data
            that was clustered (normalized unless normalization was skipped).
        """
        print("\n")
        print("--Beginning:  Machine Learning")
        print("\tMachine learning algorithm:  scikit-learn OPTICS")

        if ("OPTICS_eps" in plugin_options):
            self.eps = float(plugin_options["OPTICS_eps"])
            print("\tOverriding default eps, it is set to: %g" % self.eps)
        else:
            print("\tUsing default setting for eps: %g" % self.eps)

        if ("OPTICS_min_samples" in plugin_options):
            self.min_samples = int(plugin_options["OPTICS_min_samples"])
            print("\tOverriding default min_samples, it is set to: %i" %
                  self.min_samples)
        else:
            print("\tUsing default setting for min_samples: %g" %
                  self.min_samples)

        # Capture start time.
        start_time = time.time()

        # Create an instance of OPTICS, a normalizer, and create a pipeline for
        # automatic execution of both.
        # cluster_method = "xi" or "dbscan"
        optics = OPTICS(max_eps=self.eps,
                        min_samples=self.min_samples,
                        cluster_method="xi")
        normalizer = StandardScaler(copy=False,
                                    with_mean=self.with_mean,
                                    with_std=True)

        print("\tBeginning:  fitting")
        start_time_fitting = time.time()

        # Check to see if the user wants to skip normalization of the data before
        # applying OPTICS to the data.
        if "OPTICS_skip_normalization" not in plugin_options:
            normalized_data = normalizer.fit_transform(df)
        else:
            print("\t\tNOT normalizing data as requested by user...")
            normalized_data = df
        optics.fit(normalized_data)
        results = cluster_optics_dbscan(reachability=optics.reachability_,
                                        core_distances=optics.core_distances_,
                                        ordering=optics.ordering_,
                                        eps=self.eps)

        # Number of clusters in labels, ignoring noise if present.
        n_clusters = len(set(results)) - (1 if -1 in results else 0)
        n_noise = list(results).count(-1)
        print("\tn_clusters = %i, n_noise = %i" % (n_clusters, n_noise))

        # compute centroids of the clusters
        # Work on a plain array: when normalization is skipped the data is
        # still a DataFrame, where indexing with a list of integers selects
        # columns instead of the member rows.
        data_array = np.asarray(normalized_data)
        centroids = np.zeros((n_clusters, data_array.shape[1]))
        for i in range(0, n_clusters):
            members = data_array[results == i]
            centroids[i] = np.sum(members, axis=0) / len(members)

        self.fitting_time = time.time() - start_time_fitting
        print("\tFinished:  fitting")

        print("\n\tFitting Time: %.4f seconds" % self.fitting_time)
        print("\tMachine Learning Total Time: %.4f seconds" %
              (time.time() - start_time))

        print("--Finished:  Machine Learning")

        # Return a dictionary containing specific components created or calculated
        # as part of the machine learning process.  These may be used to perform
        # additional tasks (saving data to files, graphing, etc.).
        return {"OPTICS_Results": results, "OPTICS_Centroids": centroids}