Code Example #1
def create_clusters_from_optics(mod: model,
                                rejection_ratio=0.5,
                                maxima_ratio=0.5,
                                min_elements=5,
                                iter=100,
                                metric="euclidean",
                                max_bound=np.inf):
    mod.setname("OPTICS rejection_ratio=" +
                str(round(rejection_ratio * 1000) / 1000) + " maxima_ratio=" +
                str(maxima_ratio) + " min_elements=" + str(min_elements))

    X = mod.mesures()
    mod.start_treatment()
    # renamed from `model` to avoid shadowing the `model` type used in the signature
    optics_model: sk.OPTICS = sk.OPTICS(max_bound=max_bound,
                                        maxima_ratio=maxima_ratio,
                                        rejection_ratio=rejection_ratio,
                                        min_samples=min_elements,
                                        n_jobs=-1,
                                        metric=metric)

    optics_model.fit(X)
    mod.clusters_from_labels(optics_model.labels_, "cl_optics")

    mod.end_treatment()

    return mod
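
Note: the parameters maxima_ratio, rejection_ratio and max_bound above belong to a pre-release OPTICS interface; the estimator that shipped in scikit-learn 0.21 exposes max_eps and an xi-based cluster extraction instead. A minimal sketch of roughly the same call against the released sklearn.cluster.OPTICS API, with a random matrix standing in for mod.mesures():

import numpy as np
from sklearn.cluster import OPTICS

X = np.random.RandomState(0).rand(200, 3)  # hypothetical stand-in for mod.mesures()

# max_bound became max_eps; the maxima/rejection ratios were replaced by the
# xi-based extraction (cluster_method="xi").
model = OPTICS(max_eps=np.inf,
               min_samples=5,
               metric="euclidean",
               cluster_method="xi",
               xi=0.05,
               n_jobs=-1)
labels = model.fit_predict(X)  # -1 marks noise points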
Code Example #2
File: traj_analysis.py  Project: jr-marchand/caviar
def cluster_print_sklearn(list_distances,
                          matrix_distances,
                          dict_corresp,
                          n_fr,
                          dist_threshold=0.2,
                          clustering="dbscan",
                          min_occu=5.0):
    """
	Same as cluster_print but using sklearn's dbscan and optics clustering methods (https://scikit-learn.org/stable/modules/clustering.html)
	"""

    import sklearn.cluster as c
    if clustering == "dbscan":
        model = c.DBSCAN(eps=dist_threshold, metric="precomputed").fit(matrix_distances)
    elif clustering == "optics":
        model = c.OPTICS(metric="precomputed").fit(matrix_distances)
    else:
        raise ValueError(f"Unsupported clustering method: {clustering}")
    clusters = model.labels_

    u, count = np.unique(clusters, return_counts=True)
    count_sort_ind = np.argsort(-count)  # sort clusters by descending occupancy/size
    clust_names = u[count_sort_ind]
    counts = count[count_sort_ind]

    for i in range(len(clust_names)):
        occupancy = np.round((counts[i] / n_fr) * 100, 1)
        if clust_names[i] > -1 and occupancy > float(min_occu):
            cluster_of_int = np.where(clusters == clust_names[i])[0]
            center = find_center(cluster_of_int, dict_corresp,
                                 matrix_distances)
            print(
                f"Cluster {clust_names[i]} has an occupancy of {occupancy}% and its representative structure is {center[0]}, with an average distance of {np.round(center[1], 2)} to the other cluster members"
            )

    return clusters
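
Both branches pass metric="precomputed", so matrix_distances must be a square, symmetric matrix of pairwise distances between frames (e.g. RMSD values). A small self-contained sketch of that pattern, using random points and scipy to build the matrix:

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import OPTICS

rng = np.random.RandomState(0)
points = rng.rand(50, 3)                      # stand-in for per-frame coordinates
matrix_distances = squareform(pdist(points))  # (50, 50) symmetric distance matrix

labels = OPTICS(metric="precomputed", min_samples=5).fit_predict(matrix_distances)
print(np.unique(labels, return_counts=True))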
Code Example #3
def optics_fit(img, xi=-0.15, min_samples=2):
    from sklearn import cluster
    print(img)
    X = np.concatenate((img[0][..., np.newaxis], img[1][..., np.newaxis]),
                       axis=1)
    print(X)
    optics = cluster.OPTICS(min_samples=min_samples, cluster_method='xi', xi=xi)

    optics.fit(X)

    # OPTICS has no predict(); the fitted labels_ already cover every sample
    labels = optics.labels_.astype(int)

    unique_labels = set(labels)

    clusters = []
    for l in unique_labels:
        class_member_mask = (labels == l)
        members = X[class_member_mask]  # renamed to avoid shadowing the sklearn `cluster` module
        clusters.append(members)
    print(clusters)
    return clusters
Code Example #4
def get_data(session_ds, inc_eval_ds, ms_band, db_eps):
    session_data = list(session_ds)
    inc_eval_data = list(inc_eval_ds)
    session_emb = np.squeeze([utils.t2a(d[0][0]) for d in session_data])
    session_lab = np.squeeze([d[1] for d in session_data])

    inc_eval_emb = np.squeeze([utils.t2a(d[0][0]) for d in inc_eval_data])
    inc_eval_lab = np.squeeze([d[1] for d in inc_eval_data])

    X = np.concatenate((session_emb, inc_eval_emb))
    y = np.concatenate((session_lab, inc_eval_lab))

    meanshifts = [cl.MeanShift(bandwidth=b).fit_predict(X) for b in ms_band]
    optics = cl.OPTICS(min_samples=1).fit_predict(X)
    dbscans = [cl.DBSCAN(eps=e, min_samples=1).fit_predict(X) for e in db_eps]

    res = np.array(meanshifts + dbscans + [optics])
    inc_pred = res[:, session_lab.size:]

    aris = [adjusted_rand_score(p, inc_eval_lab) for p in inc_pred]
    amis = [
        adjusted_mutual_info_score(p, inc_eval_lab, average_method='max')
        for p in inc_pred
    ]

    return np.array(aris), np.array(amis), inc_pred, inc_eval_lab
Code Example #5
File: clustering.py  Project: lucmichalski/ODS-QA
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    # select clustering algorithm
    if algorithm == 'kmeans':
        algorithm = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        algorithm = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        algorithm = cluster.OPTICS(min_samples=10,
                                   eps=10,
                                   cluster_method='dbscan',
                                   n_jobs=-1)
    elif algorithm == 'birch':
        algorithm = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        algorithm = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                               eigen_solver='arpack',
                                               affinity="nearest_neighbors",
                                               n_jobs=-1)
    elif algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # predict cluster memberships
    algorithm.fit(vectors)
    if hasattr(algorithm, 'labels_'):
        labels = algorithm.labels_.astype(int)  # np.int was removed from NumPy; use the builtin int
    else:
        labels = algorithm.predict(vectors)

    #transform categorical labels to digits
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    elif isinstance(clusters[0], (int, np.integer)):
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" %
          metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" %
          metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" %
          metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" %
          metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" %
          metrics.silhouette_score(vectors, labels))

    return labels, algorithm
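
A possible call, assuming compute_clusters is importable and that vectors is a 2-D embedding matrix with one ground-truth group name per row (the function prints the standard scikit-learn clustering metrics and returns the predicted labels together with the fitted estimator):

import numpy as np

rng = np.random.RandomState(0)
vectors = rng.rand(200, 16)  # hypothetical document embeddings
clusters = rng.choice(["politics", "sport", "tech"], size=200).tolist()

labels, model = compute_clusters(vectors, clusters, algorithm='kmeans')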
Code Example #6
def cluster_fps(fps: List[np.ndarray],
                ncluster: int = 100,
                method: str = 'minibatch',
                ncpu: Optional[int] = None) -> np.ndarray:
    """Cluster the molecular fingerprints, fps, by a given method

    Parameters
    ----------
    fps : List[np.ndarray]
        a list of bit vectors corresponding to a given molecule's Morgan
        fingerprint (radius=2, length=1024)
    ncluster : int (Default = 100)
        the number of clusters to form with the given fingerprints (if the
        input method requires this parameter)
    method : str (Default = 'minibatch')
        the clustering method to use.
        Choices include:
        - k-means clustering: 'kmeans'
        - mini-batch k-means clustering: 'minibatch'
        - OPTICS clustering: 'optics'
    ncpu : Optional[int]
        the number of cores to parallelize clustering over, if possible

    Returns
    -------
    cluster_ids : np.ndarray
        the cluster id corresponding to a given fingerprint
    """
    begin = timeit.default_timer()

    fps = sparse.vstack(fps, format='csr')

    if method == 'kmeans':
        clusterer = cluster.KMeans(n_clusters=ncluster, n_jobs=ncpu)
    elif method == 'minibatch':
        clusterer = cluster.MiniBatchKMeans(n_clusters=ncluster,
                                            n_init=10,
                                            batch_size=100,
                                            init_size=1000)
    elif method == 'optics':
        clusterer = cluster.OPTICS(min_samples=0.01,
                                   metric='jaccard',
                                   n_jobs=ncpu)
    else:
        raise ValueError(f'{method} is not a supported clustering method')

    cluster_ids = clusterer.fit_predict(fps)

    elapsed = timeit.default_timer() - begin
    print(f'Clustering and predictions took: {elapsed:0.3f}s')

    return cluster_ids
Code Example #7
def cluster_faces_by_OPTICS(data):
    encodings = [d["encoding"] for d in data]
    clt = cluster.OPTICS(cluster_method="xi",
                         max_eps=2,
                         min_samples=5,
                         metric="euclidean",
                         n_jobs=-1)
    clt.fit(encodings)

    # print(clt.core_distances_)
    labels = list(clt.labels_)
    for (i, label) in enumerate(labels):
        if label == -1:
            # -1 marks a noise point that belongs to no cluster; give it its own unique label
            labels[i] = len(labels) + i
    return labels
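
Because OPTICS marks every noise sample with the label -1, the loop above gives each noise point a singleton label (len(labels) + i) that cannot collide with an existing cluster id. A quick standalone check of that relabelling, with random vectors standing in for the 128-dimensional face encodings:

import numpy as np
from sklearn import cluster

rng = np.random.RandomState(0)
encodings = rng.rand(40, 128)  # hypothetical face encodings
labels = list(cluster.OPTICS(min_samples=5, max_eps=2).fit(encodings).labels_)

for i, label in enumerate(labels):
    if label == -1:            # noise: assign a unique singleton label
        labels[i] = len(labels) + i

assert -1 not in labels
print(sorted(set(labels)))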
Code Example #8
def find_best_min_samples_optics(X = None, START = 1, END = 100, EPS = 0.5):
#     Scale the data. Makes the algorithm more correct
    # Should start from 0.3, otherwise it causes a lot of problems

    sils = []
    N_clusters = []
    SAMPLES = np.arange(START, END)
    for MIN_SAMPLE in SAMPLES:
        print("-- Progress: "+str(int(((MIN_SAMPLE-START)/(END-START))*10000)/100)+"%\r", end='')
#         Clustering
        clusterer = cl.OPTICS(max_eps=EPS,
                    min_samples=MIN_SAMPLE).fit(X)
#         Get the labels (-1 is noise points)
        labels = clusterer.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

#         Keeps track of the number of clusters
        N_clusters.append(n_clusters_)
#         Keeps track of the silhouette results; noise points (label -1) are excluded
        X_score = X[np.where(labels > -1)]
        labels_score = labels[np.where(labels > -1)]
        try:
            sils.append(ms.silhouette_score(X_score, labels_score))
        except ValueError:
            print("All points labelled as noise. Stopping.")
            break

#     Save the best Silhouette and its position
    position_of_best = int(np.argmax(sils))
    print("Best Silhouette:", sils[position_of_best])
    print("Number of Clusters for it:", N_clusters[position_of_best])
    print("Min_sample value for it:", SAMPLES[position_of_best])

#     Plot the MIN_SAMPLE value (step) against the Silhouette coefficient
    plt.plot(SAMPLES, sils)
    plt.axhline(y = 0, c = 'red')
    plt.show()
    return {"Silhouette": sils[position_of_best],
            "N_cluster": N_clusters[position_of_best],
            "Min": SAMPLES[position_of_best]}
Code Example #9
File: optics_clustering.py  Project: asteca/optics
def runOPTICS(data, min_samples):
    """
    min_samples : int > 1 or float between 0 and 1 (default=5)
      The number of samples in a neighborhood for a point to be considered as
      a core point. Also, up and down steep regions can't have more than
      min_samples consecutive non-steep points. Expressed as an absolute
      number or a fraction of the number of samples (rounded to be at least 2).
    """
    model_OPTIC = skclust.OPTICS(min_samples=min_samples)
    # Fit the model
    model_OPTIC.fit(data)

    # import matplotlib.pyplot as plt
    # space = np.arange(len(data))
    # reachability = model_OPTIC.reachability_[model_OPTIC.ordering_]
    # labels = model_OPTIC.labels_[model_OPTIC.ordering_]
    # plt.plot(space[labels != -1], reachability[labels != -1], 'g.', alpha=0.25)
    # plt.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.25)
    # plt.show()

    return model_OPTIC
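
The commented-out block above hints at a reachability plot; a self-contained sketch of the same idea, assuming matplotlib and synthetic blobs:

import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS
from sklearn.datasets import make_blobs

data, _ = make_blobs(n_samples=400, centers=3, cluster_std=0.5, random_state=0)
model_OPTIC = OPTICS(min_samples=10).fit(data)

space = np.arange(len(data))
reachability = model_OPTIC.reachability_[model_OPTIC.ordering_]
labels = model_OPTIC.labels_[model_OPTIC.ordering_]

plt.plot(space[labels != -1], reachability[labels != -1], 'g.', alpha=0.25)  # clustered points
plt.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.25)  # noise
plt.ylabel('Reachability distance')
plt.show()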
Code Example #10
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (('MiniBatchKMeans', two_means),
                             ('AffinityPropagation', affinity_propagation),
                             ('MeanShift', ms), ('SpectralClustering',
Code Example #11
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    # ============
    # Create cluster objects
    # ============
    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(
        n_clusters=params['n_clusters'], linkage='ward',
        connectivity=connectivity)
    spectral = cluster.SpectralClustering(
        n_clusters=params['n_clusters'], eigen_solver='arpack',
        affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    # Note: maxima_ratio / rejection_ratio belong to the pre-release OPTICS API;
    # the released estimator replaced them with xi / min_cluster_size.
    optics = cluster.OPTICS(min_samples=30, maxima_ratio=.8,
                            rejection_ratio=.4)
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average", affinity="cityblock",
        n_clusters=params['n_clusters'], connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(
        n_components=params['n_clusters'], covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation),
        ('MeanShift', ms),
        ('SpectralClustering', spectral),
        ('Ward', ward),
Code Example #12
    def run(self):

        for i_dataset, (dataset, algo_params) in enumerate(self.datasets):
            # update parameters with dataset-specific values
            params = self.default_base.copy()
            params.update(algo_params)

            X, y = dataset

            # normalize dataset for easier parameter selection
            X = self.scaler.fit_transform(X)

            print(f'mean{self.scaler.mean_}, var{self.scaler.var_}, n_samples[{self.scaler.n_samples_seen_}], scale[{self.scaler.scale_}]')

            # estimate bandwidth for mean shift
            bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

            # connectivity matrix for structured Ward
            connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)

            # make connectivity symmetric
            connectivity = 0.5 * (connectivity + connectivity.T)

            # ============
            # Create cluster objects
            # ============
            for alg in self.selected_clustering_algorithms:
                # compare algorithm names with ==; 'is' tests object identity and is unreliable for strings
                if alg == 'MiniBatchKMeans':
                    self.clustering_algorithms['MiniBatchKMeans'] = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
                elif alg == 'AffinityPropagation':
                    self.clustering_algorithms['AffinityPropagation'] = cluster.AffinityPropagation(damping=params['damping'],preference=params['preference'])
                elif alg == 'MeanShift':
                    self.clustering_algorithms['MeanShift'] = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
                elif alg == 'SpectralClustering':
                    self.clustering_algorithms['SpectralClustering'] = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors")
                elif alg == 'Ward':
                    self.clustering_algorithms['Ward'] = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity)
                elif alg == 'AgglomerativeClustering':
                    self.clustering_algorithms['AgglomerativeClustering'] = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity)
                elif alg == 'DBSCAN':
                    self.clustering_algorithms['DBSCAN'] = cluster.DBSCAN(eps=params['eps'])
                elif alg == 'OPTICS':
                    self.clustering_algorithms['OPTICS'] = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size'])
                elif alg == 'Birch':
                    self.clustering_algorithms['Birch'] = cluster.Birch(n_clusters=params['n_clusters'])
                elif alg == 'GaussianMixture':
                    self.clustering_algorithms['GaussianMixture'] = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full')
                elif alg == 'OneDGaussianKernel':
                    self.clustering_algorithms['OneDGaussianKernel'] = OneDimensionalGaussianKernel(bandwidth=params['bandwidth'])
                elif alg == 'MultiDGaussianKernel':
                    self.clustering_algorithms['MultiDGaussianKernel'] = MultiDimensionalGaussianKernel(bandwidth=params['bandwidth'])


            # self.clustering_algorithms['MiniBatchKMeans'] = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
            # self.clustering_algorithms['AffinityPropagation'] = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference'])
            # self.clustering_algorithms['MeanShift'] = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
            # self.clustering_algorithms['SpectralClustering'] = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors")
            # self.clustering_algorithms['Ward'] = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity)
            # self.clustering_algorithms['AgglomerativeClustering'] = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity)
            # self.clustering_algorithms['DBSCAN'] = cluster.DBSCAN(eps=params['eps'])
            # self.clustering_algorithms['OPTICS'] = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size'])
            # self.clustering_algorithms['Birch'] = cluster.Birch(n_clusters=params['n_clusters'])
            # self.clustering_algorithms['GaussianMixture'] = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full')

            for name, algorithm in self.clustering_algorithms.items():
                t0 = time.time()

                # catch warnings related to kneighbors_graph
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message="the number of connected components of the " +
                        "connectivity matrix is [0-9]{1,2}" +
                        " > 1. Completing it to avoid stopping the tree early.",
                        category=UserWarning)
                    warnings.filterwarnings(
                        "ignore",
                        message="Graph is not fully connected, spectral embedding" +
                        " may not work as expected.",
                        category=UserWarning)
                    algorithm.fit(X)

                t1 = time.time()
                if hasattr(algorithm, 'labels_'):
                    y_pred = algorithm.labels_.astype(int)
                else:
                    y_pred = algorithm.predict(X)
                    algorithm.labels_ = y_pred



# # Test code
# X_data = np.array([[667. , 7],
#  [693.3, 7],
#  [732.9, 6],
#  [658.9, 1],
#  [702.8, 7],
#  [697.2, 1],
#  [658.7, 2],
#  [723.1, 1],
#  [719.5, 3],
#  [687.4, 1],
#  [704.1, 1],
#  [658.8, 4],
#  [667.8, 3],
#  [703.4, 3]])
# Y = np.array([[667. ],
#  [693.3],
#  [732.9],
#  [658.9],
#  [702.8],
#  [697.2],
#  [658.7],
#  [723.1],
#  [719.5],
#  [687.4],
#  [704.1],
#  [658.8],
#  [667.8],
#  [703.4]])
#
# cl = clusters()
# # np_X = np.concatenate((X_data, Y), axis=1)
# cl.set_data(X_data, Y)
# cl.run()
#
# data, _, _ = cl.get_clustered_data('GaussianMixture')
#
# print(data)
Code Example #13
File: ptcl_class.py  Project: jlaehne/ParticleSpy
    def cluster_particles(self,
                          algorithm='Kmeans',
                          properties=None,
                          n_clusters=2,
                          eps=0.2,
                          min_samples=5):
        """
        Cluster particles in to different populations based on specified properties.
        
        Parameters
        ----------
        algorithm: str
            The algorithm to use for clustering.
            Options are 'Kmeans','DBSCAN','OPTICS','AffinityPropagation'.
        properties: list
            A list of the properties upon which to base the clustering.
        n_clusters: int
            The number of clusters to split the data into.
            Used for Kmeans.
        eps: float
            The maximum distance between two samples for them to be considered
            neighbours. Used for DBSCAN.
        min_samples: int
            The minimum number of samples within the eps distance to be classed as a cluster.
            Used for DBSCAN and OPTICS.
        
        Returns
        -------
        List of Particle_list() objects.
        """
        vec, feature_array = _extract_features(self, properties)

        feature_array = preprocessing.scale(feature_array)

        if algorithm == 'Kmeans':
            cluster_out = cluster.KMeans(
                n_clusters=n_clusters).fit_predict(feature_array)
            start = 0
        elif algorithm == 'DBSCAN':
            cluster_out = cluster.DBSCAN(
                eps=eps, min_samples=min_samples).fit_predict(feature_array)
            start = -1
        elif algorithm == 'OPTICS':
            cluster_out = cluster.OPTICS(
                min_samples=min_samples).fit_predict(feature_array)
            start = -1
        elif algorithm == 'AffinityPropagation':
            cluster_out = cluster.AffinityPropagation().fit_predict(
                feature_array)
            start = 0

        for i, p in enumerate(self.list):
            p.cluster_number = cluster_out[i]

        plist_clusters = []
        for n in range(start, cluster_out.max() + 1):
            p_list_new = Particle_list()
            p_list_new.list = list(
                it.compress(self.list, [c == n for c in cluster_out]))
            plist_clusters.append(p_list_new)

        return (plist_clusters)
Code Example #14
def optic(samples, samples_to_predict):
    # Note: OPTICS has no predict(); samples_to_predict is accepted but unused here.
    op = cluster.OPTICS(min_samples=2, n_jobs=-1)
    op.fit(samples)
    return op.labels_
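
OPTICS has no predict() method, which is why samples_to_predict is unused above. One common workaround (an assumption, not part of the original code) is to give each new sample the label of its nearest non-noise training point:

import numpy as np
from sklearn import cluster
from sklearn.neighbors import NearestNeighbors

def optic_with_assignment(samples, samples_to_predict):
    op = cluster.OPTICS(min_samples=2, n_jobs=-1).fit(samples)
    labels = op.labels_

    clustered = labels != -1  # ignore noise when assigning new points
    if not clustered.any():
        return labels, np.full(len(samples_to_predict), -1)

    nn = NearestNeighbors(n_neighbors=1).fit(np.asarray(samples)[clustered])
    _, idx = nn.kneighbors(samples_to_predict)
    return labels, labels[clustered][idx.ravel()]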
Code Example #15
 # Create cluster objects
 # ============
 ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
 two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
 ward = cluster.AgglomerativeClustering(
     n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
 )
 spectral = cluster.SpectralClustering(
     n_clusters=params["n_clusters"],
     eigen_solver="arpack",
     affinity="nearest_neighbors",
 )
 dbscan = cluster.DBSCAN(eps=params["eps"])
 optics = cluster.OPTICS(
     min_samples=params["min_samples"],
     xi=params["xi"],
     min_cluster_size=params["min_cluster_size"],
 )
 affinity_propagation = cluster.AffinityPropagation(
     damping=params["damping"], preference=params["preference"], random_state=0
 )
 average_linkage = cluster.AgglomerativeClustering(
     linkage="average",
     affinity="cityblock",
     n_clusters=params["n_clusters"],
     connectivity=connectivity,
 )
 birch = cluster.Birch(n_clusters=params["n_clusters"])
 gmm = mixture.GaussianMixture(
     n_components=params["n_clusters"], covariance_type="full"
 )
Code Example #16
    def __init__(self, inpath, outpath, filename, *args, **kwargs):

        fullpath = os.path.join(inpath, filename)
        mat = scipy.io.loadmat(fullpath)

        tmpdict = {}
        if len(kwargs) > 0:
            for key, value in kwargs.items():
                tmpdict[key] = value
        else:
            print "clustering to 10 clusters"
            self.n_clusters = 10

        if len(kwargs) > 0:

            if "varname" in tmpdict.keys():
                varname = tmpdict["varname"]
            else:
                varname = "data"

            if 'method' in kwargs.keys():
                self.methodname = kwargs['method']

                randint_ = random.randint(0, 100)
                if 'KMeans' in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklcl.KMeans(n_clusters=n['n_clusters'],
                                                   random_state=randint_)
                    else:
                        self.method = sklcl.KMeans(random_state=0)

                if 'DBSCAN' in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        # DBSCAN takes no random_state
                        self.method = sklcl.DBSCAN(
                            eps=n['eps'],
                            min_samples=n['min_samples'])
                    else:
                        self.method = sklcl.DBSCAN()

                if 'OPTICS' in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        # OPTICS takes no random_state
                        self.method = sklcl.OPTICS(
                            max_eps=n['max_eps'],
                            min_samples=n['min_samples'])
                    else:
                        self.method = sklcl.OPTICS()

                if 'AffinityPropagation' in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklcl.AffinityPropagation(
                            damping=n['damping'], random_state=randint_)
                    else:
                        self.method = sklcl.AffinityPropagation()

                if 'MeanShift' in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        # MeanShift takes no random_state
                        self.method = sklcl.MeanShift(bandwidth=n['bandwidth'])
                    else:
                        self.method = sklcl.MeanShift()

                if "SpectralClustering" in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklcl.SpectralClustering(
                            n_clusters=n['n_clusters'], random_state=randint_)
                    else:
                        self.method = sklcl.SpectralClustering()

                if "Birch" in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklcl.Birch(threshold=n['threshold'],
                                                  n_clusters=n['n_clusters'])
                    else:
                        self.method = sklcl.Birch()

                if "AgglomerativeClustering" in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklcl.AgglomerativeClustering(
                            n_clusters=n['n_clusters'], linkage=n['linkage'])
                        self.n_clusters = n['n_clusters']
                    else:
                        self.method = sklcl.AgglomerativeClustering()
                        self.n_clusters = 2

                if "GaussianMixture" in self.methodname:
                    if 'xtra' in kwargs.keys():
                        n = kwargs['xtra']
                        self.method = sklmx.GaussianMixture(
                            covariance_type=n['covariance_type'],
                            n_components=n['n_components'],
                            random_state=randint_)
                    else:
                        self.method = sklmx.GaussianMixture()

                else:
                    pass
        else:
            print "clustering with KMeans"
            self.method = sklcl.KMeans(random_state=0)
            self.methodname = 'KMeans'

        print "------------"
        print(self.methodname)

        # if 'n_clusters' in tmpdict.keys():
        #     self.n_clusters =  tmpdict['n_clusters']
        # else:
        #     self.n_clusters = 10

        try:
            self.pos_quat = self.get_data_pos_quat(mat, varname)
            self.pos_quat_saved = self.pos_quat
        except KeyError as e:
            raise e
        self.pos_dir_vec = np.array(
            [pos_q_2_pos_vec(row) for row in self.pos_quat])

        # self.method = sklcl.KMeans(n_clusters=self.n_clusters, random_state=randint_)

        self.filename = filename
        self.outpath = outpath
        title = self.filename.split('.mat')
        self.title = title[0]

        self.centres = []
        self.centres_q = []
        self.centres_dirvec = []
Code Example #17
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)

kmeans = KMeans(n_clusters=n_clusters, random_state=rng)
kmedoid = KMedoids(n_clusters=n_clusters, random_state=rng)

two_means = cluster.MiniBatchKMeans(n_clusters=n_clusters, random_state=rng)
spectral = cluster.SpectralClustering(
    n_clusters=n_clusters,
    eigen_solver="arpack",
    affinity="nearest_neighbors",
    random_state=rng,
)
dbscan = cluster.DBSCAN()
optics = cluster.OPTICS(min_samples=20, xi=0.1, min_cluster_size=0.2)
affinity_propagation = cluster.AffinityPropagation(damping=0.75,
                                                   preference=-220,
                                                   random_state=rng)
birch = cluster.Birch(n_clusters=n_clusters)
gmm = mixture.GaussianMixture(n_components=n_clusters,
                              covariance_type="full",
                              random_state=rng)

for n_samples in [300, 600]:
    # Construct the dataset
    X, labels_true = make_blobs(n_samples=n_samples,
                                centers=centers,
                                cluster_std=0.4,
                                random_state=rng)
Code Example #18
def do_clustering(df_app, output_dir, options):

    df_app_tmp = df_app.copy()
    df_app_tmp.drop('ANNOTATE', axis=1, inplace=True)
    if options['CLASS'] != '':
        df_app_tmp.drop('CLASS', axis=1, inplace=True)

    scaler_list = (StandardScaler(), RobustScaler(), QuantileTransformer(),
                   Normalizer())

    for scaler in scaler_list:
        scaler_text = 'Preprocessing: ' + scaler.__class__.__name__

        pca = PCA()

        df_app_tmp2 = scaler.fit_transform(df_app_tmp)
        ea_new = pca.fit_transform(df_app_tmp2)
        X = ea_new
        Y = df_app['CLASS']

        params = {
            'quantile': .3,
            'eps': .3,
            'damping': .9,
            'preference': -200,
            'n_neighbors': 10,
            'n_clusters': 3,
            'min_samples': 20,
            'xi': 0.05,
            'min_cluster_size': 0.1
        }

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X,
                                        n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============

        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                               linkage='ward',
                                               connectivity=connectivity)
        spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['eps'])
        optics = cluster.OPTICS(min_samples=params['min_samples'],
                                xi=params['xi'],
                                min_cluster_size=params['min_cluster_size'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average",
            affinity="cityblock",
            n_clusters=params['n_clusters'],
            connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'])
        gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                      covariance_type='full')
        kmeans = cluster.KMeans(n_clusters=params['n_clusters'])

        clustering_algorithms = (('MiniBatchKMeans', two_means),
                                 ('AffinityPropagation', affinity_propagation),
                                 ('MeanShift', ms), ('SpectralClustering',
                                                     spectral), ('Ward', ward),
                                 ('AgglomerativeClustering', average_linkage),
                                 ('DBSCAN', dbscan), ('OPTICS', optics),
                                 ('Birch', birch), ('GaussianMixture',
                                                    gmm), ('KMeans', kmeans))

        for algo_name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            colors = (0, 0, 0)
            area = np.pi * 3

            # Plot
            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])

            plt.figure()  # size in inches
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])
            plt.title(
                ea_decode.options_title(options) + '\n' + scaler_text +
                ' - Clustering: ' + algo_name +
                ' ({})'.format(params['n_clusters']))
            plt.xlabel('x')
            plt.ylabel('y')

            for x_coord, y_coord, annotation, label in zip(
                    X[:, 0], X[:, 1], df_app['ANNOTATE'], y_pred):
                plt.annotate(annotation,
                             xy=(x_coord, y_coord),
                             c=colors[label])

            file_out = ea_decode.options_filename(
                options
            ) + '_' + scaler.__class__.__name__ + '_cluster' + '_' + algo_name + '_' + '{}'.format(
                params['n_clusters'])

            if output_dir == '':
                plt.show()
            else:
                plt.savefig(os.path.join(output_dir, file_out))
                plt.close()

            foo = 1
Code Example #19
def compute_all(channels,
                start,
                stop,
                history=timedelta(hours=2),
                filename=DEFAULT_FILENAME,
                **kwargs):
    # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute).
    duration = (stop - start).total_seconds() / 60
    assert (stop - start).total_seconds() / 60 == (stop -
                                                   start).total_seconds() // 60
    duration = int(duration)
    logger.info(
        f'Clustering data from {start} to {stop} ({duration} minutes).')

    # download data using TimeSeries.get(), including history of point at t0.
    logger.debug(
        f'Initiating download from {start} to {stop} with history={history}...'
    )
    dl = TimeSeriesDict.get(channels,
                            start=to_gps(start - history),
                            end=to_gps(stop))
    logger.info(f'Downloaded from {start} to {stop} with history={history}.')

    if exists('input.npy'):
        input_data = np.load('input.npy')
        logger.info('Loaded input matrix.')
    else:
        # generate input matrix of the form [sample1;...;sampleN] with sampleK = [feature1,...,featureN]
        # for sklearn.cluster algorithms. This is the slow part of the function, so a progress bar is shown.
        logger.debug(f'Initiating input matrix generation...')
        with Progress('building input', (duration * 60)) as progress:
            input_data = stack([
                concatenate([
                    progress(dl[channel].crop,
                             t,
                             start=to_gps(start + timedelta(seconds=t) -
                                          history),
                             end=to_gps(start + timedelta(seconds=t))).value
                    for channel in channels
                ]) for t in range(0, int(duration * 60), 60)
            ])

        # verify input matrix dimensions.
        assert input_data.shape == (duration,
                                    int(
                                        len(channels) *
                                        history.total_seconds() / 60))
        np.save('input.npy', input_data)
        logger.info('Completed input matrix generation.')

    params = {
        'quantile': .3,
        'eps': .3,
        'damping': .9,
        'preference': -200,
        'n_neighbors': 10,
        'n_clusters': 15,
        'min_samples': 20,
        'xi': 0.05,
        'min_cluster_size': 0.1
    }

    if exists('X.npy'):
        X = np.load('X.npy')
        logger.info('Loaded X')
    else:
        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(input_data)
        np.save('X.npy', X)
        logger.info('Generated X')

    if exists('bandwidth.npy'):
        bandwidth = np.load('bandwidth.npy')
        logger.info('Loaded bandwidth')
    else:
        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])
        np.save('bandwidth.npy', bandwidth)
        logger.info('Generated bandwidth')

    if exists('connectivity.npy'):
        connectivity = np.load('connectivity.npy', allow_pickle=True)
        logger.info('Loaded connectivity')
    else:
        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(X,
                                        n_neighbors=params['n_neighbors'],
                                        include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)
        np.save('connectivity.npy', connectivity)
        logger.info('Generated connectivity')

    ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
    two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
    ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                           linkage='ward',
                                           connectivity=connectivity)
    spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                          eigen_solver='arpack',
                                          affinity="nearest_neighbors")
    dbscan = cluster.DBSCAN(eps=params['eps'])
    optics = cluster.OPTICS(min_samples=params['min_samples'],
                            xi=params['xi'],
                            min_cluster_size=params['min_cluster_size'])
    affinity_propagation = cluster.AffinityPropagation(
        damping=params['damping'], preference=params['preference'])
    average_linkage = cluster.AgglomerativeClustering(
        linkage="average",
        affinity="cityblock",
        n_clusters=params['n_clusters'],
        connectivity=connectivity)
    birch = cluster.Birch(n_clusters=params['n_clusters'])
    gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                                  covariance_type='full')

    clustering_algorithms = (
        ('MiniBatchKMeans', two_means),
        ('AffinityPropagation', affinity_propagation), ('MeanShift', ms),
        ('SpectralClustering', spectral), ('DBSCAN', dbscan),
        ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm)
        # ('Ward', ward),
        # ('AgglomerativeClustering', average_linkage),
    )

    for name, algorithm in clustering_algorithms:
        if exists(f'part-{name}-{filename}'):
            labels = TimeSeries.read(f'part-{name}-{filename}',
                                     f'{name}-labels')
            logger.debug(f'LOADED {name}.')
        else:
            logger.debug(f'doing {name}...')
            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding"
                    + " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)
            # cast the output labels to a TimeSeries so that cropping is easy later on.
            labels = TimeSeries(
                y_pred,
                times=dl[channels[0]].crop(start=to_gps(start),
                                           end=to_gps(stop)).times,
                name=f'{name}-labels')

            labels.write(f'part-{name}-{filename}')
        # put labels in data download dictionary for easy saving.
        dl[labels.name] = labels

    # write data download and labels to specified filename.
    cache_file = abspath(filename)
    if exists(cache_file):
        remove(cache_file)
    dl.write(cache_file)
    logger.info(f'Wrote cache to {filename}')
Code Example #20
def plot_clustering():
    # ============
    # Generate datasets. We choose the size big enough to see the scalability
    # of the algorithms, but not too big to avoid too long running times
    # ============
    n_samples = 1500
    noisy_circles = make_circles(n_samples=n_samples, factor=.5,
                                          noise=.05)
    noisy_moons = make_moons(n_samples=n_samples, noise=.05)
    blobs = make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropicly distributed data
    random_state = 170
    X, y = make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = make_blobs(n_samples=n_samples,
                                 cluster_std=[1.0, 2.5, 0.5],
                                 random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    fig = plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {'quantile': .3,
                    'eps': .3,
                    'damping': .9,
                    'preference': -200,
                    'n_neighbors': 10,
                    'n_clusters': 3,
                    'min_samples': 20,
                    'xi': 0.05,
                    'min_cluster_size': 0.1}

    datasets = [
        (noisy_circles, {'damping': .77, 'preference': -240,
                         'quantile': .2, 'n_clusters': 2,
                         'min_samples': 20, 'xi': 0.25}),
        (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}),
        (varied, {'eps': .18, 'n_neighbors': 2,
                  'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}),
        (aniso, {'eps': .15, 'n_neighbors': 2,
                 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}),
        (blobs, {}),
        (no_structure, {})]

    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # estimate bandwidth for mean shift
        bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

        # connectivity matrix for structured Ward
        connectivity = kneighbors_graph(
            X, n_neighbors=params['n_neighbors'], include_self=False)
        # make connectivity symmetric
        connectivity = 0.5 * (connectivity + connectivity.T)

        # ============
        # Create cluster objects
        # ============
        ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
        two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        ward = cluster.AgglomerativeClustering(
            n_clusters=params['n_clusters'], linkage='ward',
            connectivity=connectivity)
        spectral = cluster.SpectralClustering(
            n_clusters=params['n_clusters'], eigen_solver='arpack',
            affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['eps'])
        optics = cluster.OPTICS(min_samples=params['min_samples'],
                                xi=params['xi'],
                                min_cluster_size=params['min_cluster_size'])
        affinity_propagation = cluster.AffinityPropagation(
            damping=params['damping'], preference=params['preference'])
        average_linkage = cluster.AgglomerativeClustering(
            linkage="average", affinity="cityblock",
            n_clusters=params['n_clusters'], connectivity=connectivity)
        birch = cluster.Birch(n_clusters=params['n_clusters'])
        gmm = mixture.GaussianMixture(
            n_components=params['n_clusters'], covariance_type='full')

        clustering_algorithms = (
            ('MiniBatchKMeans', two_means),
            ('AffinityPropagation', affinity_propagation),
            ('MeanShift', ms),
            ('SpectralClustering', spectral),
            ('Ward', ward),
            ('AgglomerativeClustering', average_linkage),
            ('DBSCAN', dbscan),
            ('OPTICS', optics),
            ('Birch', birch),
            ('GaussianMixture', gmm)
        )

        for name, algorithm in clustering_algorithms:
            t0 = time.time()

            # catch warnings related to kneighbors_graph
            with warnings.catch_warnings():
                warnings.filterwarnings(
                    "ignore",
                    message="the number of connected components of the " +
                    "connectivity matrix is [0-9]{1,2}" +
                    " > 1. Completing it to avoid stopping the tree early.",
                    category=UserWarning)
                warnings.filterwarnings(
                    "ignore",
                    message="Graph is not fully connected, spectral embedding" +
                    " may not work as expected.",
                    category=UserWarning)
                algorithm.fit(X)

            t1 = time.time()
            if hasattr(algorithm, 'labels_'):
                y_pred = algorithm.labels_.astype(int)
            else:
                y_pred = algorithm.predict(X)

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                                 '#f781bf', '#a65628', '#984ea3',
                                                 '#999999', '#e41a1c', '#dede00']),
                                          int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                     transform=plt.gca().transAxes, size=15,
                     horizontalalignment='right')
            plot_num += 1

    mpl.pyplot.close("all")

    return fig
Code Example #21
def main():
    """Comaprison between K-means, Spectral Clustering, CURE, DBSCAN, OPTICS and SNN clustering algorithms
    in artificial data sets. Choose between trhee options of data sets (comment or undo the comment for the
      desired set of data sets):

      1) Small artificial datasets
      2) Complex artificial datasets
      3) Varying densities artificial datasets

      """

    # ===========================
    # === ARTIFICIAL DATASETS ===
    # ===========================
    from sklearn import cluster, datasets
    np.random.seed(0)
    # ============

    # Generate datasets (taken from SKLEARN example)
    n_samples = 1500
    noisy_circles = datasets.make_circles(n_samples=n_samples,
                                          factor=.5,
                                          noise=.05)
    noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05)
    blobs = datasets.make_blobs(n_samples=n_samples, random_state=8)
    no_structure = np.random.rand(n_samples, 2), None

    # Anisotropicly distributed data
    random_state = 170
    X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state)
    transformation = [[0.6, -0.6], [-0.4, 0.8]]
    X_aniso = np.dot(X, transformation)
    aniso = (X_aniso, y)

    # blobs with varied variances
    varied = datasets.make_blobs(n_samples=n_samples,
                                 cluster_std=[1.0, 2.5, 0.5],
                                 random_state=random_state)

    # ============
    # Set up cluster parameters
    # ============
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {
        'eps': .5,
        'MinPts_fraction': 0.5,
        'n_neighbors': 20,
        'n_clusters': 3,
        'min_samples': 20,
        'xi': 0.05,
        'min_cluster_size': 0.1
    }

    # ========= Small artificial datasets ===============
    datasets = [(noisy_circles, {
        'name': 'noisy_circles',
        'quantile': .2,
        'n_clusters': 2,
        'min_samples': 20,
        'xi': 0.25,
        'eps': 0.5,
        'd_eps': 0.15
    }), (noisy_moons, {
        'name': 'noisy_moons',
        'n_clusters': 2,
        'd_eps': 0.3
    }),
                (varied, {
                    'name': 'varied',
                    'eps': .5,
                    'd_eps': 0.18,
                    'min_samples': 5,
                    'xi': 0.035,
                    'min_cluster_size': .2
                }),
                (aniso, {
                    'name': 'aniso',
                    'eps': .5,
                    'd_eps': 0.15,
                    'min_samples': 20,
                    'xi': 0.1,
                    'min_cluster_size': .2
                }), (blobs, {
                    'name': 'blobs',
                    'd_eps': 0.3
                }), (no_structure, {
                    'name': 'no_structure',
                    'd_eps': 0.15
                })]

    # ========= Complex shape artificial data sets =========

    # datasets = [
    #     (None, {'name': 'complex9', 'n_clusters': 9, 'n_neighbors': 40, 'eps': 0.45, 'd_eps': 0.15, 'MinPts_fraction': 0.5}),
    #     (None,{'name': 'cure-t0-2000n-2D', 'n_clusters': 3, 'n_neighbors': 35, 'eps': 0.45, 'd_eps': 0.15,
    #      'MinPts_fraction': 0.4}),
    #     (None, {'name': 'cure-t1-2000n-2D', 'n_clusters': 6, 'n_neighbors': 35, 'eps': 0.40, 'd_eps': 0.15, 'xi': 0.035,
    #      'MinPts_fraction': 0.4}),
    #     (None,{'name': '3-spiral', 'n_clusters': 3, 'n_neighbors': 10, 'eps': 0.45, 'd_eps': 0.15, 'MinPts_fraction': 0.35})]

    # ======== Varying densities artificial data sets======

    # datasets = [
    #     (None,{'name': 'triangle1', 'n_clusters': 4, 'n_neighbors': 50, 'eps': 0.5, 'd_eps': 0.15, 'MinPts_fraction': 0.5}),
    #     (None,{'name': 'triangle2', 'n_clusters': 4, 'n_neighbors': 50, 'eps': 0.5, 'd_eps': 0.15, 'MinPts_fraction': 0.5}),
    #     (None,{'name': 'st900', 'n_clusters': 9, 'n_neighbors': 50, 'eps': 0.4, 'd_eps': 0.15, 'MinPts_fraction': 0.5}),
    #     (None,{'name': 'compound', 'n_clusters': 6, 'n_neighbors': 25, 'eps': 0.4, 'd_eps': 0.15, 'MinPts_fraction': 0.5})]

    results = []
    for i_dataset, (dataset, algo_params) in enumerate(datasets):

        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        if dataset is None:
            name = params['name']
            pd_dataset = pd.read_csv('./csv_files/' + name + '.csv')
            X = pd_dataset.iloc[:, :-1].to_numpy()
            y = pd_dataset.iloc[:, -1].to_numpy()
        else:

            X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster objects
        # ============

        k_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['d_eps'])
        optics = cluster.OPTICS(min_samples=params['min_samples'],
                                xi=params['xi'],
                                min_cluster_size=params['min_cluster_size'])
        snn = SNN(K=params['n_neighbors'],
                  Eps=params['eps'],
                  MinPts_fraction=params['MinPts_fraction'])

        clustering_algorithms = (('k_means', k_means), ('SpectralClustering',
                                                        spectral),
                                 ('CURE', cure), ('DBSCAN', dbscan),
                                 ('OPTICS', optics), ('SNN', snn))

        for name, algorithm in clustering_algorithms:

            if name == 'CURE':
                cure_inst = algorithm(X, params['n_clusters'])
                cure_inst.process()
                clusters = cure_inst.get_clusters()
                y_pred = [0] * len(X)
                for i in range(len(clusters)):
                    cluster_cure = clusters[i]
                    for index in cluster_cure:
                        y_pred[index] = i
            else:
                algorithm.fit(X)
                if hasattr(algorithm, 'labels_'):
                    y_pred = algorithm.labels_.astype(int)
                else:
                    y_pred = algorithm.predict(X)

            # EVALUATION
            mutual_info = None
            rand_index = None
            fowlkes_mallows = None
            calinski_score = None
            davies_bouldin = None
            silhouette = None

            if len(np.unique(y_pred)) > 1 and len(np.unique(y)) > 1:
                # External indices:
                mutual_info = round(
                    adjusted_mutual_info_score(y,
                                               y_pred,
                                               average_method='arithmetic'), 3)
                rand_index = round(adjusted_rand_score(y, y_pred), 3)
                fowlkes_mallows = round(fowlkes_mallows_score(y, y_pred), 3)

                # Internal indexes
                calinski_score = round(calinski_harabaz_score(X, y_pred), 3)
                davies_bouldin = round(davies_bouldin_score(X, y_pred), 3)
                silhouette = round(silhouette_score(X, y_pred), 3)

            results.append([
                params['name'], name, mutual_info, rand_index, fowlkes_mallows,
                calinski_score, davies_bouldin, silhouette
            ])

            # Plot the results
            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))

            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

            plt.xlim(-2.5, 2.5)
            plt.ylim(-2.5, 2.5)
            plt.xticks(())
            plt.yticks(())
            plot_num += 1

    outputfile = "artificial_datasets_sklearn"
    #outputfile = "artificial_datasets"
    #outputfile = "artificial_datasets_densities"

    # Save evaluation metrics:
    results_df = pd.DataFrame(results,
                              columns=[
                                  'Dataset', 'Algorithm', 'AMI', 'ARI', 'FM',
                                  'CHI', 'DBI', 'Silhouette'
                              ])
    results_df.to_csv('./results/' + outputfile + '_metrics.csv',
                      index=False,
                      header=True)
    results_df.to_excel('./results/' + outputfile + '_metrics.xlsx',
                        index=False,
                        header=True)

    # Save plots:
    plt.savefig('./results/' + outputfile + '.png')
    plt.show()
Code example #22
0
File: MARKERS.py Project: insilicolife/micti
    def cluster_cells(self,
                      numberOfCluster=None,
                      subspace=False,
                      min_sample=10,
                      method="kmeans",
                      maxiter=10e3,
                      alpha=1,
                      gamma=1,
                      eta=0.01,
                      eps=0.5,
                      min_samples=5,
                      metric='euclidean',
                      xi=.05,
                      min_cluster_size=.05):

        if (subspace == False):
            data = self.data
        else:
            svd = TruncatedSVD(n_components=500)
            data = svd.fit_transform(self.data.toarray())

        if method == "kmeans":
            kmean = Kmeans.Kmeans(data, numberOfCluster, self.geneNames,
                                  self.cellNames)
            _, self.cluster_assignment = kmean.kmeans_multiple_runs(maxiter, 5)
            self.k = len(set(self.cluster_assignment))
        elif method == "GM":
            EM_GM = GM.GM(data, numberOfCluster, self.geneNames,
                          self.cellNames)
            EM_GMM = EM_GM.EM_for_high_dimension()
            self.cluster_assignment = np.argmax(EM_GMM["resp"], axis=1)
            self.k = len(set(self.cluster_assignment))
        elif method == "hdp":
            corpusData = pa.DataFrame(data.toarray())
            corpusData.columns = self.geneNames
            corpusData.index = self.cellNames
            cc, id2g, id2c = self.cellMatrix2cellCorpus(corpusData)
            hdp = HdpModel(cc, id2g, alpha=alpha, gamma=gamma, eta=eta)
            tp_dist = hdp.__getitem__(cc)
            cell_tp = [max(dict(i), key=dict(i).get) for i in tp_dist]
            low_conf_cluster = np.where(np.bincount(cell_tp) < min_sample)
            filter_noise = [
                False if i in low_conf_cluster[0] else True for i in cell_tp
            ]
            new_assignment = np.array([
                cell_tp[i] if filter_noise[i] else 100
                for i in range(len(filter_noise))
            ])
            new_assignment[new_assignment > sorted(set(new_assignment))
                           [-2]] = sorted(set(new_assignment))[-2] + 1
            self.cluster_assignment = new_assignment
            self.k = len(set(new_assignment))
        elif method == "lda":
            corpusData = pa.DataFrame(data.toarray())
            corpusData.columns = self.geneNames
            corpusData.index = self.cellNames
            cc, id2g, id2c = self.cellMatrix2cellCorpus(corpusData)
            lda = LdaModel(corpus=cc,
                           id2word=id2g,
                           num_topics=numberOfCluster,
                           update_every=1,
                           passes=1,
                           alpha=alpha,
                           eta=eta)
            cell_type = lda.get_document_topics(cc)
            cell_type_lda = [max(dict(i), key=dict(i).get) for i in cell_type]
            self.cluster_assignment = cell_type_lda
            self.k = len(set(cell_type_lda))
        elif method == "aggl":
            aggl_clustering = cluster.AgglomerativeClustering(
                n_clusters=numberOfCluster).fit(data.toarray())
            self.cluster_assignment = aggl_clustering.labels_
            self.k = len(set(aggl_clustering.labels_))
        elif method == "birch":
            birch_clustering = cluster.Birch(n_clusters=numberOfCluster).fit(
                data.toarray())
            self.cluster_assignment = birch_clustering.predict(data.toarray())
            self.k = len(set(list(self.cluster_assignment)))
        elif method == "dbscan":
            dbscan_clustering = cluster.DBSCAN(eps=eps,
                                               min_samples=min_samples,
                                               metric=metric).fit(
                                                   data.toarray())
            dbscan_labels = dbscan_clustering.labels_
            dbscan_labels[dbscan_labels < 0] = dbscan_labels.max() + 1
            self.cluster_assignment = dbscan_labels
            self.k = len(set(dbscan_labels))
        elif method == "knn":
            knn_sparce_connectivity = kneighbors_graph(data.toarray(),
                                                       min_sample)
            n_components, labels = csgraph.connected_components(
                knn_sparce_connectivity)
            labels[labels < 0] = labels.max() + 1
            self.cluster_assignment = labels
            self.k = len(set(labels))
        elif method == "optics":
            optics_clustering = cluster.OPTICS(
                min_samples=min_samples,
                xi=xi,
                min_cluster_size=min_cluster_size,
                metric=metric).fit(data.toarray())
            # keep labels in the original cell order (indexing labels_ by
            # ordering_ would permute them relative to cellNames)
            optics_label = optics_clustering.labels_.copy()
            optics_label[optics_label < 0] = optics_label.max() + 1
            self.cluster_assignment = optics_label
            self.k = len(set(optics_label))
        self.cluster_label = [str(i) for i in range(self.k)]
        return None
Code example #23
0
# NOTE: tr_aug_features is used below but was not loaded in the original snippet;
# the path here is an assumed location following the other feature files.
tr_aug_features = np.load('./data/voc12/features/train_aug_features.npy',
                          allow_pickle=True)
tr_features = np.load('./data/voc12/features/train_features.npy',
                      allow_pickle=True)
val_features = np.load('./data/voc12/features/val_features.npy',
                       allow_pickle=True)
features = (tr_aug_features.tolist() + tr_features.tolist() +
            val_features.tolist())

df = pd.DataFrame.from_records(features)
df.drop_duplicates('img_name', inplace=True)
df['feature'] = df['feature'].apply(lambda x: x[0].reshape(-1).tolist())
X = np.array(df['feature'].values.tolist())

for eps in range(2, 50, 3):
    for min_sample in [2, 3, 4, 8, 16, 32]:
        print(eps, min_sample)
        # note: OPTICS only uses `eps` when cluster_method='dbscan'; with the
        # default 'xi' method this sweep value has no effect on the labels
        cls = cluster.OPTICS(eps=eps, n_jobs=-1, min_samples=min_sample)
        cls = cls.fit(X)
        labels = cls.labels_
        label_d = dict()
        category_size = len(set(cls.labels_))
        for img_name, label in zip(df['img_name'].values, cls.labels_):
            cluster_label = np.zeros(category_size)
            cluster_label[label] = 1
            label_d[img_name] = cluster_label
        print(eps, category_size, min_sample)
        if len(set(cls.labels_)) > 1:
            print("Silhouette Coefficient: %0.3f" %
                  metrics.silhouette_score(X, labels))

        np.save('./data/voc12/cls_optics%s_%s_labels.npy' % (eps, min_sample),
                label_d)
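
# Follow-up sketch (not part of the original snippet): since `eps` has no effect
# with the default cluster_method='xi', one way to choose a setting is to sweep
# min_samples alone and keep the value with the best silhouette score. X is
# assumed from above; best_score / best_min_samples are illustrative names.
best_score, best_min_samples = -1.0, None
for min_sample in [2, 3, 4, 8, 16, 32]:
    labels = cluster.OPTICS(n_jobs=-1, min_samples=min_sample).fit(X).labels_
    if len(set(labels)) > 1:
        score = metrics.silhouette_score(X, labels)
        if score > best_score:
            best_score, best_min_samples = score, min_sample
if best_min_samples is not None:
    print("best silhouette %.3f at min_samples=%d" %
          (best_score, best_min_samples))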
Code example #24
0
def main():
    """Comaprison between K-means, Spectral Clustering, CURE, DBSCAN, OPTICS and SNN clustering algorithms
    in small and medium real world data sets. Comparison with K-means and SNN clustering algorithm in large
    real world data set.

          """
    # ===============================
    # SMALL AND MEDIUM REAL DATA SETS
    # ===============================
    from sklearn import datasets
    plt.figure(figsize=(9 * 2 + 3, 12.5))
    plt.subplots_adjust(left=.02,
                        right=.98,
                        bottom=.001,
                        top=.96,
                        wspace=.05,
                        hspace=.01)

    plot_num = 1

    default_base = {
        'eps': .5,
        'MinPts_fraction': 0.5,
        'n_neighbors': 20,
        'n_clusters': 3,
        'min_samples': 20,
        'xi': 0.05,
        'min_cluster_size': 0.1,
        'width': 2.5,
        'height': 2.5
    }

    # Small and medium world real datasets
    iris = datasets.load_iris(return_X_y=True)
    breast_cancer = datasets.load_breast_cancer(return_X_y=True)

    datasets = [
        (iris, {
            'name': 'iris',
            'n_clusters': 3,
            'd_eps': 0.8,
            'coord_x': 2,
            'coord_y': 1,
            'n_neighbors': 30,
            'eps': 0.35,
            'MinPts_fraction': 0.5
        }),
        (breast_cancer, {
            'name': 'breast_cancer',
            'n_clusters': 2,
            'd_eps': 2,
            'coord_x': 2,
            'coord_y': 3,
            'n_neighbors': 60,
            'eps': 0.5,
            'MinPts_fraction': 0.5
        }),
    ]

    snn_parameters = []
    results = []
    total_ypred = []
    for i_dataset, (dataset, algo_params) in enumerate(datasets):
        # update parameters with dataset-specific values
        params = default_base.copy()
        params.update(algo_params)

        snn_parameters.append([
            params['name'], params['n_neighbors'], params['eps'],
            params['MinPts_fraction']
        ])

        X, y = dataset

        # normalize dataset for easier parameter selection
        X = StandardScaler().fit_transform(X)

        # ============
        # Create cluster algorithms
        # ============

        k_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
        spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                              eigen_solver='arpack',
                                              affinity="nearest_neighbors")
        dbscan = cluster.DBSCAN(eps=params['d_eps'])
        optics = cluster.OPTICS(min_samples=params['min_samples'],
                                xi=params['xi'],
                                min_cluster_size=params['min_cluster_size'])
        snn = SNN(K=params['n_neighbors'],
                  Eps=params['eps'],
                  MinPts_fraction=params['MinPts_fraction'])

        clustering_algorithms = (('Original', None), ('K_means', k_means),
                                 ('SpectralClustering', spectral),
                                 ('CURE', cure), ('DBSCAN', dbscan),
                                 ('OPTICS', optics), ('SNN', snn))

        for name, algorithm in clustering_algorithms:

            if name == 'CURE':
                cure_inst = algorithm(X, params['n_clusters'])
                cure_inst.process()
                clusters = cure_inst.get_clusters()
                y_pred = [0] * len(X)
                for i in range(len(clusters)):
                    cluster_cure = clusters[i]
                    for index in cluster_cure:
                        y_pred[index] = i
            elif name == 'Original':
                y_pred = y

            else:
                algorithm.fit(X)
                if hasattr(algorithm, 'labels_'):
                    y_pred = algorithm.labels_.astype(int)
                else:
                    y_pred = algorithm.predict(X)

            total_ypred.append(y_pred)

            mutual_info = None
            rand_index = None
            fowlkes_mallows = None
            calinski_score = None
            davies_bouldin = None
            silhouette = None

            if len(np.unique(y_pred)) > 1 and len(np.unique(y)) > 1:
                # External indices:
                mutual_info = round(
                    adjusted_mutual_info_score(y,
                                               y_pred,
                                               average_method='arithmetic'), 3)
                rand_index = round(adjusted_rand_score(y, y_pred), 3)
                fowlkes_mallows = round(fowlkes_mallows_score(y, y_pred), 3)

                # Internal indexes
                calinski_score = round(calinski_harabaz_score(X, y_pred), 3)
                davies_bouldin = round(davies_bouldin_score(X, y_pred), 3)
                silhouette = round(silhouette_score(X, y_pred), 3)

            results.append([
                params['name'], name, mutual_info, rand_index, fowlkes_mallows,
                calinski_score, davies_bouldin, silhouette
            ])

            plt.subplot(len(datasets), len(clustering_algorithms), plot_num)
            if i_dataset == 0:
                plt.title(name, size=18)

            colors = np.array(
                list(
                    islice(
                        cycle([
                            '#377eb8', '#ff7f00', '#4daf4a', '#f781bf',
                            '#a65628', '#984ea3', '#999999', '#e41a1c',
                            '#dede00'
                        ]), int(max(y_pred) + 1))))
            # add black color for outliers (if any)
            colors = np.append(colors, ["#000000"])
            plt.scatter(X[:, params['coord_x']],
                        X[:, params['coord_y']],
                        s=10,
                        color=colors[y_pred])

            plt.xlim(-params['width'], params['width'])
            plt.ylim(-params['height'], params['height'])
            plt.xticks(())
            plt.yticks(())
            plot_num += 1

    outputfile = "./results/real_datasets_sklearn_metrics"
    results_df = pd.DataFrame(results,
                              columns=[
                                  'Dataset', 'Algorithm', 'AMI', 'ARI', 'FM',
                                  'CHI', 'DBI', 'Silhouette'
                              ])
    results_df.to_csv(outputfile + '.csv', index=False, header=True)
    results_df.to_excel(outputfile + '.xlsx', index=False, header=True)

    plt.savefig('./results/real_datasets_sklearn.png')
    plt.show()

    # ===============================
    # LARGE REAL DATA SET
    # ===============================

    def correct_detections(y_pred):
        """Print correct de"""
        dos_cor, normal_cor, probe_cor, r2l_cor, u2r_cor = 0, 0, 0, 0, 0

        for val in y_pred[0:1000]:
            if val == 0:
                dos_cor += 1
        for val in y_pred[1000:2000]:
            if val == 1:
                normal_cor += 1
        for val in y_pred[2000:3000]:
            if val == 2:
                probe_cor += 1
        for val in y_pred[3000:4000]:
            if val == 3:
                r2l_cor += 1
        for val in y_pred[4000:5000]:
            if val == 4:
                u2r_cor += 1

        print(dos_cor, normal_cor, probe_cor, r2l_cor, u2r_cor)

    # ===== K means clustering ======

    pd_dataset = pd.read_csv('./csv_files/KDD.csv')
    X = pd_dataset.iloc[:, :-1].to_numpy()
    y = pd_dataset.iloc[:, -1].to_numpy()

    k_means = cluster.MiniBatchKMeans(n_clusters=5, random_state=42)
    k_means.fit(X)
    if hasattr(k_means, 'labels_'):
        y_pred_kmeans = k_means.labels_.astype(int)
    else:
        y_pred_kmeans = k_means.predict(X)

    # Count detections per cluster
    unique, counts = np.unique(y_pred_kmeans, return_counts=True)
    print(dict(zip(unique, counts)))

    # Evaluation
    print(classification_report(y, y_pred_kmeans))
    correct_detections(y_pred_kmeans)

    # ===== SNN clustering ======

    snn = SNN(K=300, Eps=0.4, MinPts_fraction=0.5)
    snn.fit(X)
    if hasattr(snn, 'labels_'):
        y_pred = snn.labels_.astype(int)
    else:
        y_pred = snn.predict(X)

    # Count detections per cluster
    unique, counts = np.unique(y_pred, return_counts=True)
    print(dict(zip(unique, counts)))

    # Evaluation
    print(classification_report(y, y_pred))
    correct_detections(y_pred)

    # External and Internal indices evaluation :
    results = []
    total_ypred = [('Original', y), ('k-means', y_pred_kmeans),
                   ('SNN', y_pred)]
    for name, y_pred in total_ypred:
        silhouette = round(silhouette_score(X, y_pred), 3)
        results.append(['KDD CUP 99', name, silhouette])

    outputfile = "./results/real_datasets_KDD_CUP_metrics"
    results_df = pd.DataFrame(
        results, columns=['Dataset', 'Algorithm', 'Silhouette'])
    results_df.to_csv(outputfile + '.csv', index=False, header=True)
    results_df.to_excel(outputfile + '.xlsx', index=False, header=True)
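
    # Optional sketch (not in the original code): cluster ids are arbitrary, so the
    # positional comparison in correct_detections can undercount. Aligning predicted
    # labels to the true labels via the confusion matrix and a Hungarian assignment
    # before counting is one common remedy. Everything below is illustrative.
    from scipy.optimize import linear_sum_assignment
    from sklearn.metrics import confusion_matrix

    def align_labels(y_true, y_hat):
        labels_all = np.unique(
            np.concatenate([np.asarray(y_true), np.asarray(y_hat)]))
        cm = confusion_matrix(y_true, y_hat, labels=labels_all)
        row_ind, col_ind = linear_sum_assignment(-cm)  # maximize matched counts
        mapping = {labels_all[c]: labels_all[r] for r, c in zip(row_ind, col_ind)}
        return np.array([mapping.get(lbl, lbl) for lbl in y_hat])

    correct_detections(align_labels(y, y_pred_kmeans))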
Code example #25
0
def opt(m, e, s, x):
    # Build an OPTICS model; note that `e` is accepted but not forwarded to
    # cluster.OPTICS as written.
    model = cluster.OPTICS(min_samples=int(m), xi=x, min_cluster_size=s)
    return model
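
# Usage sketch (illustrative, not from the source): fit the helper above on a
# small synthetic dataset. make_blobs and the demo variable names are additions.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=200, centers=3, random_state=0)
demo_model = opt(m=20, e=0.5, s=0.1, x=0.05)  # `e` is not used by opt() as written
demo_labels = demo_model.fit(X_demo).labels_
print(len(set(demo_labels)) - (1 if -1 in demo_labels else 0), "clusters found")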
Code example #26
0
# NOTE: this snippet starts mid-loop in the source; the loop head below is a
# reconstruction modelled on the OPTICS loop further down (mixture.GaussianMixture,
# its n_components argument, and the sklearn.mixture import are assumptions).
y = []
for c, x in zip(classes, X):
    model = mixture.GaussianMixture(n_components=c)
    model.fit(x)
    if hasattr(model, "labels_"):
        y.append(model.labels_.astype(int))
    else:
        y.append(model.predict(x))

fig = plt.figure(figsize=(27, 9))
fig.suptitle("GaussianMixture", fontsize=48)
for i in range(6):
    ax = plt.subplot(2, 3, i+1)
    ax.scatter(X[i][:,0], X[i][:,1], c=y[i])

plt.savefig("GaussianMixture.eps", format="eps")

# =============================================================================

y = []
for c, x in zip(classes, X):
    model = cluster.OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.1)
    model.fit(x)
    if hasattr(model, "labels_"):
        y.append(model.labels_.astype(int))
    else:
        y.append(model.predict(x))
        
fig = plt.figure(figsize=(27, 9))
fig.suptitle("OPTICS", fontsize=48)
for i in range(6):
    ax = plt.subplot(2, 3, i+1)
    ax.scatter(X[i][:,0], X[i][:,1], c=y[i])

plt.savefig("OPTICS.eps", format="eps")

# =============================================================================
Code example #27
0
dhtml('DBSCAN, OPTICS, & Others')

n_clusters=2
km=scl.KMeans(n_clusters=n_clusters,random_state=0)
y3_km=km.fit_predict(X3)
ac=scl.AgglomerativeClustering(n_clusters=n_clusters,
                               affinity='euclidean',
                               linkage='complete')
y3_ac=ac.fit_predict(X3)
sp=scl.SpectralClustering(n_clusters=n_clusters)
y3_sp=sp.fit_predict(X3)
db=scl.DBSCAN(eps=.2,min_samples=15,
              metric='euclidean')
y3_db=db.fit_predict(X3)
op=scl.OPTICS(eps=.2,min_samples=30)
y3_op=op.fit_predict(X3)
cl=['KMeans','Agglomerative','Spectral',
    'DBSCAN','OPTICS']
py3=[y3_km,y3_ac,y3_sp,y3_db,y3_op]

np.unique(y3_db)

f,ax=pl.subplots(5,1,figsize=(6,18))
for c in range(5):
    for i in np.unique(py3[c]):
        color=pl.cm.cool(.4*(i+1))
        ax[c].scatter(X3[py3[c]==i,0],
                      X3[py3[c]==i,1],
               s=30,color=color,marker='v',
               edgecolor='#aa33ff')
Code example #28
0
    def run(self):
        plt.figure(figsize=(9 * 2 + 3, len(self.datasets)*2))
        # plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01)

        plot_num = 1

        for i_dataset, (dataset, algo_params) in enumerate(self.datasets):
            # update parameters with dataset-specific values
            params = self.default_base.copy()
            params.update(algo_params)

            X, y = dataset

            # normalize dataset for easier parameter selection
            X = StandardScaler().fit_transform(X)

            # estimate bandwidth for mean shift
            bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile'])

            # connectivity matrix for structured Ward
            connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False)

            # make connectivity symmetric
            connectivity = 0.5 * (connectivity + connectivity.T)

            # ============
            # Create cluster objects
            # ============
            ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
            two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
            ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity)
            spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors")
            dbscan = cluster.DBSCAN(eps=params['eps'])
            optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size'])
            affinity_propagation = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference'])
            average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity)
            birch = cluster.Birch(n_clusters=params['n_clusters'])
            gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full')

            clustering_algorithms = (
                ('MiniBatchKMeans', two_means),
                ('AffinityPropagation', affinity_propagation),
                ('MeanShift', ms),
                ('SpectralClustering', spectral),
                ('Ward', ward),
                ('AgglomerativeClustering', average_linkage),
                ('DBSCAN', dbscan),
                ('OPTICS', optics),
                ('Birch', birch),
                ('GaussianMixture', gmm)
            )

            for name, algorithm in clustering_algorithms:
                t0 = time.time()

                # catch warnings related to kneighbors_graph
                with warnings.catch_warnings():
                    warnings.filterwarnings(
                        "ignore",
                        message="the number of connected components of the " +
                        "connectivity matrix is [0-9]{1,2}" +
                        " > 1. Completing it to avoid stopping the tree early.",
                        category=UserWarning)
                    warnings.filterwarnings(
                        "ignore",
                        message="Graph is not fully connected, spectral embedding" +
                        " may not work as expected.",
                        category=UserWarning)
                    algorithm.fit(X)

                t1 = time.time()
                if hasattr(algorithm, 'labels_'):
                    y_pred = algorithm.labels_.astype(int)
                else:
                    y_pred = algorithm.predict(X)

                plt.subplot(len(self.datasets), len(clustering_algorithms), plot_num)
                if i_dataset == 0:
                    plt.title(name, size=9)

                colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a',
                                                     '#f781bf', '#a65628', '#984ea3',
                                                     '#999999', '#e41a1c', '#dede00']),
                                              int(max(y_pred) + 1))))
                # add black color for outliers (if any)
                colors = np.append(colors, ["#000000"])
                plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred])

                plt.xlim(-2.5, 2.5)
                plt.ylim(-2.5, 2.5)
                plt.xticks(())
                plt.yticks(())
                plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'),
                         transform=plt.gca().transAxes, size=8,
                         horizontalalignment='right')
                plot_num += 1

        plt.show()


# Test code


#
# X_data = np.array([[667. ],
#  [693.3],
#  [732.9],
#  [658.9],
#  [702.8],
#  [697.2],
#  [658.7],
#  [723.1],
#  [719.5],
#  [687.4],
#  [704.1],
#  [658.8],
#  [667.8],
#  [703.4]])
# Y = np.array([38.36,
#  11.06,
#   8.13,
#  45.23,
#  11.16,
#  11.96,
#  40.27,
#   7.01,
#   7.25,
#  11.28,
#   7.21,
#  40.4 ,
#  32.2 ,
#  11.18])
#
# cl = cluster_test(X_data, Y)
# cl.run()