def create_clusters_from_optics(mod: model, rejection_ratio=0.5, maxima_ratio=0.5,
                                min_elements=5, iter=100, metric="euclidean",
                                max_bound=np.inf):
    # Record the parameters used in the model's name.
    mod.setname("OPTICS rejection_ratio=" + str(round(rejection_ratio * 1000) / 1000) +
                " maxima_ratio=" + str(maxima_ratio) +
                " min_elements=" + str(min_elements))
    X = mod.mesures()
    mod.start_treatment()
    # Note: max_bound, maxima_ratio and rejection_ratio come from the pre-release
    # OPTICS API; released scikit-learn versions expose max_eps and the xi/dbscan
    # cluster_method parameters instead.
    model: sk.OPTICS = sk.OPTICS(max_bound=max_bound, maxima_ratio=maxima_ratio,
                                 rejection_ratio=rejection_ratio,
                                 min_samples=min_elements, n_jobs=-1, metric=metric)
    model.fit(X)
    mod.clusters_from_labels(model.labels_, "cl_optics")
    mod.end_treatment()
    return mod
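# For comparison, a rough equivalent using the released scikit-learn OPTICS API,
# where max_eps and xi replace the pre-release max_bound / maxima_ratio /
# rejection_ratio parameters. This is only a sketch: the helper name and the
# parameter mapping are assumptions, not a drop-in replacement.
import numpy as np
from sklearn.cluster import OPTICS

def fit_optics_released_api(X, min_elements=5, metric="euclidean",
                            max_eps=np.inf, xi=0.05):
    # Hypothetical helper: the released OPTICS exposes max_eps and xi
    # instead of max_bound / maxima_ratio / rejection_ratio.
    model = OPTICS(max_eps=max_eps, xi=xi, min_samples=min_elements,
                   metric=metric, n_jobs=-1)
    model.fit(X)
    return model.labels_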
def cluster_print_sklearn(list_distances, matrix_distances, dict_corresp, n_fr,
                          dist_threshold=0.2, clustering="dbscan", min_occu=5.0):
    """
    Same as cluster_print but using sklearn's dbscan and optics clustering methods
    (https://scikit-learn.org/stable/modules/clustering.html)
    """
    import sklearn.cluster as c

    if clustering == "dbscan":
        model = c.DBSCAN(eps=dist_threshold, metric="precomputed").fit(matrix_distances)
    elif clustering == "optics":
        model = c.OPTICS(metric="precomputed").fit(matrix_distances)
    clusters = model.labels_

    # sort cluster labels descending by occupancy/size
    u, count = np.unique(clusters, return_counts=True)
    count_sort_ind = np.argsort(-count)
    clust_names = u[count_sort_ind]
    counts = count[count_sort_ind]

    for i in range(len(clust_names)):
        occupancy = np.round((counts[i] / n_fr) * 100, 1)
        # skip the noise label (-1) and clusters below the occupancy threshold
        if clust_names[i] > -1 and occupancy > float(min_occu):
            cluster_of_int = np.where(clusters == clust_names[i])[0]
            center = find_center(cluster_of_int, dict_corresp, matrix_distances)
            print(
                f"Cluster {clust_names[i]} has an occupancy of {occupancy}% and the "
                f"representative structure is {center[0]}, with an average distance of "
                f"{np.round(center[1], 2)} to other cluster members"
            )
    return clusters
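# For reference, a minimal, self-contained sketch of running OPTICS on a
# precomputed distance matrix; the toy data here is an assumption and the
# find_center helper from the function above is not used.
import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.cluster import OPTICS

rng = np.random.default_rng(0)
points = rng.normal(size=(50, 3))             # e.g. 50 structures in a 3-D feature space
dist_matrix = squareform(pdist(points))       # symmetric pairwise distance matrix

labels = OPTICS(metric="precomputed", min_samples=5).fit_predict(dist_matrix)
print(np.unique(labels, return_counts=True))  # label -1 marks noise points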
def optics_fit(img, xi=0.15, min_samples=2):
    # xi must lie in [0, 1) for scikit-learn's xi cluster extraction.
    from sklearn import cluster

    print(img)
    # Stack the two coordinate arrays into an (n_samples, 2) feature matrix.
    X = np.concatenate((img[0][..., np.newaxis], img[1][..., np.newaxis]), axis=1)
    print(X)

    optics = cluster.OPTICS(min_samples=min_samples, cluster_method='xi', xi=xi)
    optics.fit(X)
    if hasattr(optics, 'labels_'):
        y_pred = optics.labels_.astype(int)
    else:
        y_pred = optics.predict(X)

    # Group the samples by their cluster label.
    labels = optics.labels_
    unique_labels = set(labels)
    clusters = []
    for l in unique_labels:
        class_member_mask = (labels == l)
        members = X[class_member_mask]   # renamed from `cluster`, which shadowed the module
        clusters.append(members)
    print(clusters)
    return clusters
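# A minimal usage sketch; the two-array input format is inferred from the
# concatenation inside the function, and the toy pixel coordinates are an assumption.
import numpy as np

rng = np.random.default_rng(0)
ys, xs = np.nonzero(rng.random((20, 20)) > 0.8)   # toy pixel coordinates
groups = optics_fit((ys, xs), xi=0.15, min_samples=2)
print(len(groups), "groups found (one entry per label, including noise)")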
def get_data(session_ds, inc_eval_ds, ms_band, db_eps):
    session_data = list(session_ds)
    inc_eval_data = list(inc_eval_ds)

    # Stack embeddings and labels for the session and incremental-evaluation sets.
    session_emb = np.squeeze([utils.t2a(d[0][0]) for d in session_data])
    session_lab = np.squeeze([d[1] for d in session_data])
    inc_eval_emb = np.squeeze([utils.t2a(d[0][0]) for d in inc_eval_data])
    inc_eval_lab = np.squeeze([d[1] for d in inc_eval_data])
    X = np.concatenate((session_emb, inc_eval_emb))
    y = np.concatenate((session_lab, inc_eval_lab))

    # Run several clustering algorithms over the combined embeddings.
    meanshifts = [cl.MeanShift(bandwidth=b).fit_predict(X) for b in ms_band]
    optics = cl.OPTICS(min_samples=1).fit_predict(X)
    dbscans = [cl.DBSCAN(eps=e, min_samples=1).fit_predict(X) for e in db_eps]
    res = np.array(meanshifts + dbscans + [optics])

    # Score only the incremental-evaluation portion of the predictions.
    inc_pred = res[:, session_lab.size:]
    aris = [adjusted_rand_score(p, inc_eval_lab) for p in inc_pred]
    amis = [
        adjusted_mutual_info_score(p, inc_eval_lab, average_method='max')
        for p in inc_pred
    ]
    return np.array(aris), np.array(amis), inc_pred, inc_eval_lab
def compute_clusters(vectors, clusters, algorithm='kmeans'):
    # select clustering algorithm
    if algorithm == 'kmeans':
        algorithm = cluster.MiniBatchKMeans(n_clusters=len(set(clusters)))
    elif algorithm == 'dbscan':
        algorithm = cluster.DBSCAN(eps=1.25, n_jobs=-1)
    elif algorithm == 'optics':
        algorithm = cluster.OPTICS(min_samples=10, eps=10,
                                   cluster_method='dbscan', n_jobs=-1)
    elif algorithm == 'birch':
        algorithm = cluster.Birch(n_clusters=len(set(clusters)))
    elif algorithm == 'spectral':
        algorithm = cluster.SpectralClustering(n_clusters=len(set(clusters)),
                                               eigen_solver='arpack',
                                               affinity="nearest_neighbors",
                                               n_jobs=-1)
    elif algorithm == 'affinity':
        algorithm = cluster.AffinityPropagation(damping=.9, preference=-200)
    else:
        raise NotImplementedError(f"Not implemented for algorithm {algorithm}")

    # predict cluster memberships
    algorithm.fit(vectors)
    if hasattr(algorithm, 'labels_'):
        labels = algorithm.labels_.astype(int)
    else:
        labels = algorithm.predict(vectors)

    # transform categorical labels to digits
    if isinstance(clusters[0], str):
        labels_true = LabelEncoder().fit_transform(clusters)
    elif isinstance(clusters[0], (int, np.integer)):
        labels_true = clusters

    # Number of clusters in labels, ignoring noise if present.
    n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
    n_noise_ = list(labels).count(-1)

    print('Estimated number of clusters: %d' % n_clusters_)
    print('Estimated number of noise points: %d' % n_noise_)
    print("Homogeneity: %0.3f" % metrics.homogeneity_score(labels_true, labels))
    print("Completeness: %0.3f" % metrics.completeness_score(labels_true, labels))
    print("V-measure: %0.3f" % metrics.v_measure_score(labels_true, labels))
    print("Adjusted Rand Index: %0.3f" % metrics.adjusted_rand_score(labels_true, labels))
    print("Adjusted Mutual Information: %0.3f" % metrics.adjusted_mutual_info_score(labels_true, labels))
    print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(vectors, labels))
    return labels, algorithm
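# A usage sketch on synthetic data, assuming the module-level imports the function
# relies on (sklearn.cluster as cluster, from sklearn import metrics, LabelEncoder,
# numpy as np). The widely separated blobs are chosen so the OPTICS/DBSCAN
# extraction at eps=10 yields more than one cluster.
from sklearn.datasets import make_blobs

vectors_demo, true_labels = make_blobs(
    n_samples=300, centers=[[0, 0], [50, 50], [100, 0]],
    cluster_std=1.0, random_state=42)
labels, fitted = compute_clusters(vectors_demo, list(true_labels), algorithm='optics')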
def cluster_fps(fps: List[np.ndarray], ncluster: int = 100,
                method: str = 'minibatch',
                ncpu: Optional[int] = None) -> np.ndarray:
    """Cluster the molecular fingerprints, fps, by a given method

    Parameters
    ----------
    fps : List[np.ndarray]
        a list of bit vectors corresponding to a given molecule's Morgan
        fingerprint (radius=2, length=1024)
    ncluster : int (Default = 100)
        the number of clusters to form with the given fingerprints (if the
        input method requires this parameter)
    method : str (Default = 'minibatch')
        the clustering method to use. Choices include:
        - k-means clustering: 'kmeans'
        - mini-batch k-means clustering: 'minibatch'
        - OPTICS clustering: 'optics'
    ncpu : Optional[int]
        the number of cores to parallelize clustering over, if possible

    Returns
    -------
    cluster_ids : np.ndarray
        the cluster id corresponding to a given fingerprint
    """
    begin = timeit.default_timer()

    fps = sparse.vstack(fps, format='csr')

    if method == 'kmeans':
        clusterer = cluster.KMeans(n_clusters=ncluster, n_jobs=ncpu)
    elif method == 'minibatch':
        clusterer = cluster.MiniBatchKMeans(n_clusters=ncluster, n_init=10,
                                            batch_size=100, init_size=1000)
    elif method == 'optics':
        clusterer = cluster.OPTICS(min_samples=0.01, metric='jaccard', n_jobs=ncpu)
    else:
        raise ValueError(f'{method} is not a supported clustering method')

    cluster_ids = clusterer.fit_predict(fps)

    elapsed = timeit.default_timer() - begin
    print(f'Clustering and predictions took: {elapsed:0.3f}s')

    return cluster_ids
def cluster_faces_by_OPTICS(data):
    encodings = [d["encoding"] for d in data]
    clt = cluster.OPTICS(cluster_method="xi", max_eps=2, min_samples=5,
                         metric="euclidean", n_jobs=-1)
    clt.fit(encodings)
    # print(clt.core_distances_)
    labels = list(clt.labels_)
    for (i, label) in enumerate(labels):
        if label == -1:
            # -1 marks a noise point that belongs to no cluster,
            # so give it a unique label of its own.
            labels[i] = len(labels) + i
    return labels
def find_best_min_samples_optics(X=None, START=1, END=100, EPS=0.5):
    # Scale the data. Makes the algorithm more correct
    # I have to start from 0.3, otherwise it gives a lot of problems
    sils = []
    N_clusters = []
    SAMPLES = np.arange(START, END)

    for MIN_SAMPLE in SAMPLES:
        print("-- Progress: " +
              str(int(((MIN_SAMPLE - START) / (END - START)) * 10000) / 100) + "%\r",
              end='')

        # Clustering
        clusterer = cl.OPTICS(max_eps=EPS, min_samples=MIN_SAMPLE).fit(X)

        # Get the labels (-1 is noise points)
        labels = clusterer.labels_

        # Number of clusters in labels, ignoring noise if present.
        n_clusters_ = len(set(labels)) - (1 if -1 in labels else 0)
        n_noise_ = list(labels).count(-1)

        # Keeps trace of the number of clusters
        N_clusters.append(n_clusters_)

        # Keeps trace of the silhouette results, excluding noise points (label -1)
        X_score = X[np.where(labels > -1)]
        labels_score = labels[np.where(labels > -1)]
        try:
            sils.append(ms.silhouette_score(X_score, labels_score))
        except Exception:
            print("All points labelled as noise. Stopping.")
            break

    # Save the best Silhouette and its position
    position_of_best = int(np.argmax(sils))
    print("Best Silhouette:", sils[position_of_best])
    print("Number of Clusters for it:", N_clusters[position_of_best])
    print("Min_sample value for it:", SAMPLES[position_of_best])

    # Plot the MIN_SAMPLE value (step) against the Silhouette coefficient
    plt.plot(SAMPLES, sils)
    plt.axhline(y=0, c='red')
    plt.show()

    return {"Silhouette": sils[position_of_best],
            "N_cluster": N_clusters[position_of_best],
            "Min": SAMPLES[position_of_best]}
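# A usage sketch on synthetic data, assuming the module aliases used above
# (cl for sklearn.cluster, ms for sklearn.metrics, plt for matplotlib.pyplot);
# the scaling step mentioned in the comment is applied explicitly here.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=0)
X_demo = StandardScaler().fit_transform(X_demo)
best = find_best_min_samples_optics(X_demo, START=5, END=30, EPS=2.0)
print(best)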
def runOPTICS(data, min_samples):
    """
    min_samples : int > 1 or float between 0 and 1 (default=5)
        The number of samples in a neighborhood for a point to be considered
        as a core point. Also, up and down steep regions can't have more than
        min_samples consecutive non-steep points. Expressed as an absolute
        number or a fraction of the number of samples (rounded to be at
        least 2).
    """
    model_OPTIC = skclust.OPTICS(min_samples=min_samples)
    # Fit the model
    model_OPTIC.fit(data)

    # import matplotlib.pyplot as plt
    # space = np.arange(len(data))
    # reachability = model_OPTIC.reachability_[model_OPTIC.ordering_]
    # labels = model_OPTIC.labels_[model_OPTIC.ordering_]
    # plt.plot(space[labels != -1], reachability[labels != -1], 'g.', alpha=0.25)
    # plt.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.25)
    # plt.show()

    return model_OPTIC
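# The commented-out block above sketches a reachability plot; a standalone version
# could look like the following. The helper name and the demo data are assumptions.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import OPTICS
from sklearn.datasets import make_blobs

def plot_reachability(model, ax=None):
    """Plot reachability distances in cluster order; noise points are drawn in black."""
    ax = ax or plt.gca()
    space = np.arange(len(model.ordering_))
    reachability = model.reachability_[model.ordering_]
    labels = model.labels_[model.ordering_]
    ax.plot(space[labels != -1], reachability[labels != -1], 'g.', alpha=0.25)
    ax.plot(space[labels == -1], reachability[labels == -1], 'k.', alpha=0.25)
    ax.set_ylabel("reachability distance")

X_demo, _ = make_blobs(n_samples=400, centers=3, random_state=0)
plot_reachability(OPTICS(min_samples=10).fit(X_demo))
plt.show()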
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'],
                                       linkage='ward',
                                       connectivity=connectivity)
spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'],
                                      eigen_solver='arpack',
                                      affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
optics = cluster.OPTICS(min_samples=params['min_samples'],
                        xi=params['xi'],
                        min_cluster_size=params['min_cluster_size'])
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(n_components=params['n_clusters'],
                              covariance_type='full')

clustering_algorithms = (('MiniBatchKMeans', two_means),
                         ('AffinityPropagation', affinity_propagation),
                         ('MeanShift', ms),
                         ('SpectralClustering',
# make connectivity symmetric
connectivity = 0.5 * (connectivity + connectivity.T)

# ============
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters'])
ward = cluster.AgglomerativeClustering(
    n_clusters=params['n_clusters'], linkage='ward',
    connectivity=connectivity)
spectral = cluster.SpectralClustering(
    n_clusters=params['n_clusters'], eigen_solver='arpack',
    affinity="nearest_neighbors")
dbscan = cluster.DBSCAN(eps=params['eps'])
# Note: maxima_ratio and rejection_ratio belong to the pre-release OPTICS API;
# released scikit-learn versions use xi / min_cluster_size instead.
optics = cluster.OPTICS(min_samples=30, maxima_ratio=.8, rejection_ratio=.4)
affinity_propagation = cluster.AffinityPropagation(
    damping=params['damping'], preference=params['preference'])
average_linkage = cluster.AgglomerativeClustering(
    linkage="average", affinity="cityblock",
    n_clusters=params['n_clusters'], connectivity=connectivity)
birch = cluster.Birch(n_clusters=params['n_clusters'])
gmm = mixture.GaussianMixture(
    n_components=params['n_clusters'], covariance_type='full')

clustering_algorithms = (
    ('MiniBatchKMeans', two_means),
    ('AffinityPropagation', affinity_propagation),
    ('MeanShift', ms),
    ('SpectralClustering', spectral),
    ('Ward', ward),
def run(self): for i_dataset, (dataset, algo_params) in enumerate(self.datasets): # update parameters with dataset-specific values params = self.default_base.copy() params.update(algo_params) X, y = dataset # normalize dataset for easier parameter selection X = self.scaler.fit_transform(X) print(f'mean{self.scaler.mean_}, var{self.scaler.var_}, n_samples[{self.scaler.n_samples_seen_}], scale[{self.scaler.scale_}]') # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ for alg in self.selected_clustering_algorithms: if alg is 'MiniBatchKMeans': self.clustering_algorithms['MiniBatchKMeans'] = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) elif alg is 'AffinityPropagation': self.clustering_algorithms['AffinityPropagation'] = cluster.AffinityPropagation(damping=params['damping'],preference=params['preference']) elif alg is 'MeanShift': self.clustering_algorithms['MeanShift'] = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) elif alg is 'SpectralClustering': self.clustering_algorithms['SpectralClustering'] = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") elif alg is 'Ward': self.clustering_algorithms['Ward'] = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) elif alg is 'AgglomerativeClustering': self.clustering_algorithms['AgglomerativeClustering'] = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) elif alg is 'DBSCAN': self.clustering_algorithms['DBSCAN'] = cluster.DBSCAN(eps=params['eps']) elif alg is 'OPTICS': self.clustering_algorithms['OPTICS'] = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) elif alg is 'Birch': self.clustering_algorithms['Birch'] = cluster.Birch(n_clusters=params['n_clusters']) elif alg is 'GaussianMixture': self.clustering_algorithms['GaussianMixture'] = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') elif alg is 'OneDGaussianKernel': self.clustering_algorithms['OneDGaussianKernel'] = OneDimensionalGaussianKernel(bandwidth=params['bandwidth']) elif alg is 'MultiDGaussianKernel': self.clustering_algorithms['MultiDGaussianKernel'] = MultiDimensionalGaussianKernel(bandwidth=params['bandwidth']) # self.clustering_algorithms['MiniBatchKMeans'] = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) # self.clustering_algorithms['AffinityPropagation'] = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference']) # self.clustering_algorithms['MeanShift'] = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) # self.clustering_algorithms['SpectralClustering'] = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") # self.clustering_algorithms['Ward'] = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) # self.clustering_algorithms['AgglomerativeClustering'] = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) # 
self.clustering_algorithms['DBSCAN'] = cluster.DBSCAN(eps=params['eps']) # self.clustering_algorithms['OPTICS'] = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) # self.clustering_algorithms['Birch'] = cluster.Birch(n_clusters=params['n_clusters']) # self.clustering_algorithms['GaussianMixture'] = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') for name, algorithm in self.clustering_algorithms.items(): t0 = time.time() # catch warnings related to kneighbors_graph with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message="Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) t1 = time.time() if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) algorithm.labels_ = y_pred # # Test code # X_data = np.array([[667. , 7], # [693.3, 7], # [732.9, 6], # [658.9, 1], # [702.8, 7], # [697.2, 1], # [658.7, 2], # [723.1, 1], # [719.5, 3], # [687.4, 1], # [704.1, 1], # [658.8, 4], # [667.8, 3], # [703.4, 3]]) # Y = np.array([[667. ], # [693.3], # [732.9], # [658.9], # [702.8], # [697.2], # [658.7], # [723.1], # [719.5], # [687.4], # [704.1], # [658.8], # [667.8], # [703.4]]) # # cl = clusters() # # np_X = np.concatenate((X_data, Y), axis=1) # cl.set_data(X_data, Y) # cl.run() # # data, _, _ = cl.get_clustered_data('GaussianMixture') # # print(data)
def cluster_particles(self, algorithm='Kmeans', properties=None, n_clusters=2,
                      eps=0.2, min_samples=5):
    """
    Cluster particles into different populations based on specified properties.

    Parameters
    ----------
    algorithm: str
        The algorithm to use for clustering. Options are
        'Kmeans', 'DBSCAN', 'OPTICS', 'AffinityPropagation'.
    properties: list
        A list of the properties upon which to base the clustering.
    n_clusters: int
        The number of clusters to split the data into. Used for Kmeans.
    eps: float
        The distance between samples. Used for DBSCAN.
    min_samples: int
        The minimum number of samples within the eps distance to be classed
        as a cluster. Used for DBSCAN and OPTICS.

    Returns
    -------
    List of Particle_list() objects.
    """
    vec, feature_array = _extract_features(self, properties)
    feature_array = preprocessing.scale(feature_array)

    if algorithm == 'Kmeans':
        cluster_out = cluster.KMeans(
            n_clusters=n_clusters).fit_predict(feature_array)
        start = 0
    elif algorithm == 'DBSCAN':
        cluster_out = cluster.DBSCAN(
            eps=eps, min_samples=min_samples).fit_predict(feature_array)
        start = -1
    elif algorithm == 'OPTICS':
        cluster_out = cluster.OPTICS(
            min_samples=min_samples).fit_predict(feature_array)
        start = -1
    elif algorithm == 'AffinityPropagation':
        cluster_out = cluster.AffinityPropagation().fit_predict(feature_array)
        start = 0

    for i, p in enumerate(self.list):
        p.cluster_number = cluster_out[i]

    plist_clusters = []
    for n in range(start, cluster_out.max() + 1):
        p_list_new = Particle_list()
        p_list_new.list = list(
            it.compress(self.list, [c == n for c in cluster_out]))
        plist_clusters.append(p_list_new)
    return plist_clusters
def optic(samples, samples_to_predict):
    # Note: OPTICS has no predict() method, so samples_to_predict is currently
    # unused; only the labels of the fitted samples are returned.
    op = cluster.OPTICS(min_samples=2, n_jobs=-1)
    op.fit(samples)
    return op.labels_
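# Since OPTICS has no predict method, one common workaround, shown here only as a
# sketch of what samples_to_predict might be intended for, is to give each new
# sample the label of its nearest non-noise training sample.
import numpy as np
from sklearn.cluster import OPTICS
from sklearn.neighbors import NearestNeighbors

def optics_fit_then_assign(samples, samples_to_predict, min_samples=2):
    """Fit OPTICS on `samples`, then label new points by their nearest non-noise neighbor."""
    samples = np.asarray(samples)
    op = OPTICS(min_samples=min_samples, n_jobs=-1).fit(samples)
    labels = op.labels_
    clustered = labels != -1
    if not clustered.any():                       # everything was labelled noise
        return labels, np.full(len(samples_to_predict), -1)
    nn = NearestNeighbors(n_neighbors=1).fit(samples[clustered])
    _, idx = nn.kneighbors(samples_to_predict)
    return labels, labels[clustered][idx.ravel()]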
# Create cluster objects
# ============
ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True)
two_means = cluster.MiniBatchKMeans(n_clusters=params["n_clusters"])
ward = cluster.AgglomerativeClustering(
    n_clusters=params["n_clusters"], linkage="ward", connectivity=connectivity
)
spectral = cluster.SpectralClustering(
    n_clusters=params["n_clusters"],
    eigen_solver="arpack",
    affinity="nearest_neighbors",
)
dbscan = cluster.DBSCAN(eps=params["eps"])
optics = cluster.OPTICS(
    min_samples=params["min_samples"],
    xi=params["xi"],
    min_cluster_size=params["min_cluster_size"],
)
affinity_propagation = cluster.AffinityPropagation(
    damping=params["damping"], preference=params["preference"], random_state=0
)
average_linkage = cluster.AgglomerativeClustering(
    linkage="average",
    affinity="cityblock",
    n_clusters=params["n_clusters"],
    connectivity=connectivity,
)
birch = cluster.Birch(n_clusters=params["n_clusters"])
gmm = mixture.GaussianMixture(
    n_components=params["n_clusters"], covariance_type="full"
)
def __init__(self, inpath, outpath, filename, *args, **kwargs): fullpath = os.path.join(inpath, filename) mat = scipy.io.loadmat(fullpath) tmpdict = {} if len(kwargs) > 0: for key, value in kwargs.items(): tmpdict[key] = value else: print "clustering to 10 clusters" self.n_clusters = 10 if len(kwargs) > 0: if "varname" in tmpdict.keys(): varname = tmpdict["varname"] else: varname = "data" if 'method' in kwargs.keys(): self.methodname = kwargs['method'] randint_ = random.randint(0, 100) if 'KMeans' in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.KMeans(n_clusters=n['n_clusters'], random_state=randint_) else: self.method = sklcl.KMeans(random_state=0) if 'DBSCAN' in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.DBSCAN( eps=n['eps'], min_samples=n['min_samples'], random_state=randint_) else: self.method = sklcl.DBSCAN() if 'OPTICS' in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.OPTICS( max_eps=n['max_eps'], min_samples=n['min_samples'], random_state=randint_) else: self.method = sklcl.OPTICS() if 'AffinityPropagation' in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.AffinityPropagation( damping=n['damping'], random_state=randint_) else: self.method = sklcl.AffinityPropagation() if 'MeanShift' in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.MeanShift(bandwidth=n['bandwidth'], random_state=randint_) else: self.method = sklcl.MeanShift() if "SpectralClustering" in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.SpectralClustering( n_clusters=n['n_clusters'], random_state=randint_) else: self.method = sklcl.SpectralClustering() if "Birch" in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.Birch(threshold=n['threshold'], n_clusters=n['n_clusters']) else: self.method = sklcl.Birch() if "AgglomerativeClustering" in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklcl.AgglomerativeClustering( n_clusters=n['n_clusters'], linkage=n['linkage']) self.n_clusters = n['n_clusters'] else: self.method = sklcl.AgglomerativeClustering() self.n_clusters = 2 if "GaussianMixture" in self.methodname: if 'xtra' in kwargs.keys(): n = kwargs['xtra'] self.method = sklmx.GaussianMixture( covariance_type=n['covariance_type'], n_components=n['n_components'], random_state=randint_) else: self.method = sklmx.GaussianMixture() else: pass else: print "clustering with KMeans" self.method = sklcl.KMeans(random_state=0) self.methodname = 'KMeans' print "------------" print(self.methodname) # if 'n_clusters' in tmpdict.keys(): # self.n_clusters = tmpdict['n_clusters'] # else: # self.n_clusters = 10 try: self.pos_quat = self.get_data_pos_quat(mat, varname) self.pos_quat_saved = self.pos_quat except KeyError as e: raise e self.pos_dir_vec = np.array( [pos_q_2_pos_vec(row) for row in self.pos_quat]) # self.method = sklcl.KMeans(n_clusters=self.n_clusters, random_state=randint_) self.filename = filename self.outpath = outpath title = self.filename.split('.mat') self.title = title[0] self.centres = [] self.centres_q = [] self.centres_dirvec = []
centers = [[1, 1], [-1, -1], [1, -1]]
n_clusters = len(centers)

kmeans = KMeans(n_clusters=n_clusters, random_state=rng)
kmedoid = KMedoids(n_clusters=n_clusters, random_state=rng)
two_means = cluster.MiniBatchKMeans(n_clusters=n_clusters, random_state=rng)
spectral = cluster.SpectralClustering(
    n_clusters=n_clusters,
    eigen_solver="arpack",
    affinity="nearest_neighbors",
    random_state=rng,
)
dbscan = cluster.DBSCAN()
optics = cluster.OPTICS(min_samples=20, xi=0.1, min_cluster_size=0.2)
affinity_propagation = cluster.AffinityPropagation(damping=0.75, preference=-220,
                                                   random_state=rng)
birch = cluster.Birch(n_clusters=n_clusters)
gmm = mixture.GaussianMixture(n_components=n_clusters, covariance_type="full",
                              random_state=rng)

for n_samples in [300, 600]:
    # Construct the dataset
    X, labels_true = make_blobs(n_samples=n_samples, centers=centers,
                                cluster_std=0.4, random_state=rng)
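# The body of the for-loop over n_samples is not shown here; a minimal,
# self-contained sketch of fitting such a collection of estimators on one of these
# blob datasets and scoring it against the ground truth (the estimator subset and
# the ARI scoring are assumptions) might be:
from sklearn import cluster
from sklearn.datasets import make_blobs
from sklearn.metrics import adjusted_rand_score

X_demo, labels_true_demo = make_blobs(
    n_samples=300, centers=[[1, 1], [-1, -1], [1, -1]],
    cluster_std=0.4, random_state=0)

estimators = [
    ("KMeans", cluster.KMeans(n_clusters=3, n_init=10, random_state=0)),
    ("DBSCAN", cluster.DBSCAN()),
    ("OPTICS", cluster.OPTICS(min_samples=20, xi=0.1, min_cluster_size=0.2)),
]
for name, est in estimators:
    pred = est.fit_predict(X_demo)
    print(f"{name}: ARI = {adjusted_rand_score(labels_true_demo, pred):.3f}")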
def do_clustering(df_app, output_dir, options): df_app_tmp = df_app.copy() df_app_tmp.drop('ANNOTATE', axis=1, inplace=True) if options['CLASS'] != '': df_app_tmp.drop('CLASS', axis=1, inplace=True) scaler_list = (StandardScaler(), RobustScaler(), QuantileTransformer(), Normalizer()) for scaler in scaler_list: scaler_text = 'Preprocessing: ' + scaler.__class__.__name__ pca = PCA() df_app_tmp2 = scaler.fit_transform(df_app_tmp) ea_new = pca.fit_transform(df_app_tmp2) X = ea_new Y = df_app['CLASS'] params = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 3, 'min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1 } # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') kmeans = cluster.KMeans(n_clusters=params['n_clusters']) clustering_algorithms = (('MiniBatchKMeans', two_means), ('AffinityPropagation', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('Ward', ward), ('AgglomerativeClustering', average_linkage), ('DBSCAN', dbscan), ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm), ('KMeans', kmeans)) for algo_name, algorithm in clustering_algorithms: t0 = time.time() # catch warnings related to kneighbors_graph with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. 
Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message="Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) t1 = time.time() if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) colors = (0, 0, 0) area = np.pi * 3 # Plot colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.figure() # size in inches plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.title( ea_decode.options_title(options) + '\n' + scaler_text + ' - Clustering: ' + algo_name + ' ({})'.format(params['n_clusters'])) plt.xlabel('x') plt.ylabel('y') for x_coord, y_coord, annotation, label in zip( X[:, 0], X[:, 1], df_app['ANNOTATE'], y_pred): plt.annotate(annotation, xy=(x_coord, y_coord), c=colors[label]) file_out = ea_decode.options_filename( options ) + '_' + scaler.__class__.__name__ + '_cluster' + '_' + algo_name + '_' + '{}'.format( params['n_clusters']) if output_dir == '': plt.show() else: plt.savefig(os.path.join(output_dir, file_out)) plt.close() foo = 1
def compute_all(channels, start, stop, history=timedelta(hours=2), filename=DEFAULT_FILENAME, **kwargs): # set up duration (minute-trend data has dt=1min, so reject intervals not on the minute). duration = (stop - start).total_seconds() / 60 assert (stop - start).total_seconds() / 60 == (stop - start).total_seconds() // 60 duration = int(duration) logger.info( f'Clustering data from {start} to {stop} ({duration} minutes).') # download data using TimeSeries.get(), including history of point at t0. logger.debug( f'Initiating download from {start} to {stop} with history={history}...' ) dl = TimeSeriesDict.get(channels, start=to_gps(start - history), end=to_gps(stop)) logger.info(f'Downloaded from {start} to {stop} with history={history}.') if exists('input.npy'): input_data = np.load('input.npy') logger.info('Loaded input matrix.') else: # generate input matrix of the form [sample1;...;sampleN] with sampleK = [feature1,...,featureN] # for sklearn.cluster algorithms. This is the slow part of the function, so a progress bar is shown. logger.debug(f'Initiating input matrix generation...') with Progress('building input', (duration * 60)) as progress: input_data = stack([ concatenate([ progress(dl[channel].crop, t, start=to_gps(start + timedelta(seconds=t) - history), end=to_gps(start + timedelta(seconds=t))).value for channel in channels ]) for t in range(0, int(duration * 60), 60) ]) # verify input matrix dimensions. assert input_data.shape == (duration, int( len(channels) * history.total_seconds() / 60)) np.save('input.npy', input_data) logger.info('Completed input matrix generation.') params = { 'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 15, 'min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1 } if exists('X.npy'): X = np.load('X.npy') logger.info('Loaded X') else: # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(input_data) np.save('X.npy', X) logger.info('Generated X') if exists('bandwidth.npy'): bandwidth = np.load('bandwidth.npy') logger.info('Loaded bandwidth') else: # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) np.save('bandwidth.npy', bandwidth) logger.info('Generated bandwidth') if exists('connectivity.npy'): connectivity = np.load('connectivity.npy', allow_pickle=True) logger.info('Loaded connectivity') else: # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) np.save('connectivity.npy', connectivity) logger.info('Generated connectivity') ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) 
gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = ( ('MiniBatchKMeans', two_means), ('AffinityPropagation', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('DBSCAN', dbscan), ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm) # ('Ward', ward), # ('AgglomerativeClustering', average_linkage), ) for name, algorithm in clustering_algorithms: if exists(f'part-{name}-{filename}'): labels = TimeSeries.read(f'part-{name}-{filename}', f'{name}-labels') logger.debug(f'LOADED {name}.') else: logger.debug(f'doing {name}...') # catch warnings related to kneighbors_graph with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message="Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) # cast the output labels to a TimeSeries so that cropping is easy later on. labels = TimeSeries( y_pred, times=dl[channels[0]].crop(start=to_gps(start), end=to_gps(stop)).times, name=f'{name}-labels') labels.write(f'part-{name}-{filename}') # put labels in data download dictionary for easy saving. dl[labels.name] = labels # write data download and labels to specified filename. cache_file = abspath(filename) if exists(cache_file): remove(cache_file) dl.write(cache_file) logger.info(f'Wrote cache to {filename}')
def plot_clustering(): # ============ # Generate datasets. We choose the size big enough to see the scalability # of the algorithms, but not too big to avoid too long running times # ============ n_samples = 1500 noisy_circles = make_circles(n_samples=n_samples, factor=.5, noise=.05) noisy_moons = make_moons(n_samples=n_samples, noise=.05) blobs = make_blobs(n_samples=n_samples, random_state=8) no_structure = np.random.rand(n_samples, 2), None # Anisotropicly distributed data random_state = 170 X, y = make_blobs(n_samples=n_samples, random_state=random_state) transformation = [[0.6, -0.6], [-0.4, 0.8]] X_aniso = np.dot(X, transformation) aniso = (X_aniso, y) # blobs with varied variances varied = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state) # ============ # Set up cluster parameters # ============ fig = plt.figure(figsize=(9 * 2 + 3, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 default_base = {'quantile': .3, 'eps': .3, 'damping': .9, 'preference': -200, 'n_neighbors': 10, 'n_clusters': 3, 'min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1} datasets = [ (noisy_circles, {'damping': .77, 'preference': -240, 'quantile': .2, 'n_clusters': 2, 'min_samples': 20, 'xi': 0.25}), (noisy_moons, {'damping': .75, 'preference': -220, 'n_clusters': 2}), (varied, {'eps': .18, 'n_neighbors': 2, 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2}), (aniso, {'eps': .15, 'n_neighbors': 2, 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2}), (blobs, {}), (no_structure, {})] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values params = default_base.copy() params.update(algo_params) X, y = dataset # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward connectivity = kneighbors_graph( X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering( n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering( n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) affinity_propagation = cluster.AffinityPropagation( damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering( linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) gmm = mixture.GaussianMixture( n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = ( ('MiniBatchKMeans', two_means), ('AffinityPropagation', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('Ward', ward), ('AgglomerativeClustering', average_linkage), ('DBSCAN', dbscan), ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm) ) for name, algorithm in clustering_algorithms: t0 = time.time() 
# catch warnings related to kneighbors_graph with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message="Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) t1 = time.time() if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) plt.subplot(len(datasets), len(clustering_algorithms), plot_num) if i_dataset == 0: plt.title(name, size=18) colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00']), int(max(y_pred) + 1)))) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), transform=plt.gca().transAxes, size=15, horizontalalignment='right') plot_num += 1 mpl.pyplot.close("all") return fig
def main(): """Comaprison between K-means, Spectral Clustering, CURE, DBSCAN, OPTICS and SNN clustering algorithms in artificial data sets. Choose between trhee options of data sets (comment or undo the comment for the desired set of data sets): 1) Small artificial datasets 2) Complex artificial datasets 3) Varying densities artificial datasets """ # =========================== # === ARTIFICIAL DATASETS === # =========================== from sklearn import cluster, datasets np.random.seed(0) # ============ # Generate datasets (taken from SKLEARN example) n_samples = 1500 noisy_circles = datasets.make_circles(n_samples=n_samples, factor=.5, noise=.05) noisy_moons = datasets.make_moons(n_samples=n_samples, noise=.05) blobs = datasets.make_blobs(n_samples=n_samples, random_state=8) no_structure = np.random.rand(n_samples, 2), None # Anisotropicly distributed data random_state = 170 X, y = datasets.make_blobs(n_samples=n_samples, random_state=random_state) transformation = [[0.6, -0.6], [-0.4, 0.8]] X_aniso = np.dot(X, transformation) aniso = (X_aniso, y) # blobs with varied variances varied = datasets.make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state) # ============ # Set up cluster parameters # ============ plt.figure(figsize=(9 * 2 + 3, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 default_base = { 'eps': .5, 'MinPts_fraction': 0.5, 'n_neighbors': 20, 'n_clusters': 3, 'min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1 } # ========= Small artificial datasets =============== datasets = [(noisy_circles, { 'name': 'noisy_circles', 'quantile': .2, 'n_clusters': 2, 'min_samples': 20, 'xi': 0.25, 'eps': 0.5, 'd_eps': 0.15 }), (noisy_moons, { 'name': 'noisy_moons', 'n_clusters': 2, 'd_eps': 0.3 }), (varied, { 'name': 'varied', 'eps': .5, 'd_eps': 0.18, 'min_samples': 5, 'xi': 0.035, 'min_cluster_size': .2 }), (aniso, { 'name': 'aniso', 'eps': .5, 'd_eps': 0.15, 'min_samples': 20, 'xi': 0.1, 'min_cluster_size': .2 }), (blobs, { 'name': 'blobs', 'd_eps': 0.3 }), (no_structure, { 'name': 'no_structure', 'd_eps': 0.15 })] # ========= Complex shape artificial data sets ========= # datasets = [ # (None, {'name': 'complex9', 'n_clusters': 9, 'n_neighbors': 40, 'eps': 0.45, 'd_eps': 0.15, 'MinPts_fraction': 0.5}), # (None,{'name': 'cure-t0-2000n-2D', 'n_clusters': 3, 'n_neighbors': 35, 'eps': 0.45, 'd_eps': 0.15, # 'MinPts_fraction': 0.4}), # (None, {'name': 'cure-t1-2000n-2D', 'n_clusters': 6, 'n_neighbors': 35, 'eps': 0.40, 'd_eps': 0.15, 'xi': 0.035, # 'MinPts_fraction': 0.4}), # (None,{'name': '3-spiral', 'n_clusters': 3, 'n_neighbors': 10, 'eps': 0.45, 'd_eps': 0.15, 'MinPts_fraction': 0.35})] # ======== Varying densities artificial data sets====== # datasets = [ # (None,{'name': 'triangle1', 'n_clusters': 4, 'n_neighbors': 50, 'eps': 0.5, 'd_eps': 0.15, 'MinPts_fraction': 0.5}), # (None,{'name': 'triangle2', 'n_clusters': 4, 'n_neighbors': 50, 'eps': 0.5, 'd_eps': 0.15, 'MinPts_fraction': 0.5}), # (None,{'name': 'st900', 'n_clusters': 9, 'n_neighbors': 50, 'eps': 0.4, 'd_eps': 0.15, 'MinPts_fraction': 0.5}), # (None,{'name': 'compound', 'n_clusters': 6, 'n_neighbors': 25, 'eps': 0.4, 'd_eps': 0.15, 'MinPts_fraction': 0.5})] results = [] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values params = default_base.copy() params.update(algo_params) if dataset == None: name = params['name'] pd_dataset = pd.read_csv('./csv_files/' + name + '.csv') X 
= pd_dataset.iloc[:, :-1].to_numpy() y = pd_dataset.iloc[:, -1].to_numpy() else: X, y = dataset # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # ============ # Create cluster objects # ============ k_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['d_eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) snn = SNN(K=params['n_neighbors'], Eps=params['eps'], MinPts_fraction=0.5) clustering_algorithms = (('k_means', k_means), ('SpectralClustering', spectral), ('CURE', cure), ('DBSCAN', dbscan), ('OPTICS', optics), ('SNN', snn)) for name, algorithm in clustering_algorithms: if name == 'CURE': cure_inst = algorithm(X, params['n_clusters']) cure_inst.process() clusters = cure_inst.get_clusters() y_pred = [0] * len(X) for i in range(len(clusters)): cluster_cure = clusters[i] for index in cluster_cure: y_pred[index] = i else: algorithm.fit(X) if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) # EVALUATION mutual_info = None rand_index = None fowlkes_mallows = None calinski_score = None davies_bouldin = None silhouette = None if len(np.unique(y_pred)) > 1 and len(np.unique(y)) > 1: # External indices: mutual_info = round( adjusted_mutual_info_score(y, y_pred, average_method='arithmetic'), 3) rand_index = round(adjusted_rand_score(y, y_pred), 3) fowlkes_mallows = round(fowlkes_mallows_score(y, y_pred), 3) # Internal indexes calinski_score = round(calinski_harabaz_score(X, y_pred), 3) davies_bouldin = round(davies_bouldin_score(X, y_pred), 3) silhouette = round(silhouette_score(X, y_pred), 3) results.append([ params['name'], name, mutual_info, rand_index, fowlkes_mallows, calinski_score, davies_bouldin, silhouette ]) # Plot the results plt.subplot(len(datasets), len(clustering_algorithms), plot_num) if i_dataset == 0: plt.title(name, size=18) colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) plot_num += 1 outputfile = "artificial_datasets_sklearn" #outputfile = "artificial_datasets" #outputfile = "artificial_datasets_densities" # Save evaluation metrics: results_df = pd.DataFrame(results, columns=[ 'Dataset', 'Algorithm', 'AMI', 'ARI', 'FM', 'CHI', 'DBI', 'Silhouette' ]) results_df.to_csv('./results/' + outputfile + '_metrics.csv', index=False, header=True) results_df.to_excel('./results/' + outputfile + '_metrics.xlsx', index=False, header=True) # Save plots: plt.savefig('./results/' + outputfile + '.png') plt.show()
def cluster_cells(self, numberOfCluster=None, subspace=False, min_sample=10, method="kmeans", maxiter=10e3, alpha=1, gamma=1, eta=0.01, eps=0.5, min_samples=5, metric='euclidean', xi=.05, min_cluster_size=.05): if (subspace == False): data = self.data else: svd = TruncatedSVD(n_components=500) data = svd.fit_transform(mictiObject_1.data.toarray()) if method == "kmeans": kmean = Kmeans.Kmeans(data, numberOfCluster, self.geneNames, self.cellNames) _, self.cluster_assignment = kmean.kmeans_multiple_runs(maxiter, 5) self.k = len(set(self.cluster_assignment)) elif method == "GM": EM_GM = GM.GM(data, numberOfCluster, self.geneNames, self.cellNames) EM_GMM = EM_GM.EM_for_high_dimension() self.cluster_assignment = np.argmax(EM_GMM["resp"], axis=1) self.k = len(set(self.cluster_assignment)) elif method == "hdp": corpusData = pa.DataFrame(data.toarray()) corpusData.columns = self.geneNames corpusData.index = self.cellNames cc, id2g, id2c = self.cellMatrix2cellCorpus(corpusData) hdp = HdpModel(cc, id2g, alpha=alpha, gamma=gamma, eta=eta) tp_dist = hdp.__getitem__(cc) cell_tp = [max(dict(i), key=dict(i).get) for i in tp_dist] low_conf_cluster = np.where(np.bincount(cell_tp) < min_sample) filter_noise = [ False if i in low_conf_cluster[0] else True for i in cell_tp ] new_assignment = np.array([ cell_tp[i] if filter_noise[i] else 100 for i in range(len(filter_noise)) ]) new_assignment[new_assignment > sorted(set(new_assignment)) [-2]] = sorted(set(new_assignment))[-2] + 1 self.cluster_assignment = new_assignment self.k = len(new_assignment) elif method == "lda": corpusData = pa.DataFrame(data.toarray()) corpusData.columns = self.geneNames corpusData.index = self.cellNames cc, id2g, id2c = self.cellMatrix2cellCorpus(corpusData) lda = LdaModel(corpus=cc, id2word=id2g, num_topics=numberOfCluster, update_every=1, passes=1, alpha=alpha, eta=eta) cell_type = lda.get_document_topics(cc) cell_type_lda = [max(dict(i), key=dict(i).get) for i in cell_type] self.cluster_assignment = cell_type_lda self.k = len(set(cell_type_lda)) elif method == "aggl": aggl_clustering = cluster.AgglomerativeClustering( n_clusters=numberOfCluster).fit(data.toarray()) self.cluster_assignment = aggl_clustering.labels_ self.k = len(set(aggl_clustering.labels_)) elif method == "birch": birch_clustering = cluster.Birch(n_clusters=numberOfCluster).fit( data.toarray()) self.cluster_assignment = birch_clustering.predict(data.toarray()) self.k = len(set(list(self.cluster_assignment))) elif method == "dbscan": dbscan_clustering = cluster.DBSCAN(eps=eps, min_samples=min_samples, metric=metric).fit( data.toarray()) dbscan_lables = dbscan_clustering.labels_ dbscan_lables[dbscan_lables < 0] = dbscan_lables.max() + 1 self.cluster_assignment = dbscan_lables self.k = len(set(dbscan_lables)) elif method == "knn": knn_sparce_connectivity = kneighbors_graph(data.toarray(), min_sample) n_components, labels = csgraph.connected_components( knn_sparce_connectivity) labels[labels < 0] = labels.max() + 1 self.cluster_assignment = labels self.k = len(set(labels)) elif method == "optics": optics_clustering = cluster.OPTICS( min_samples=min_samples, xi=xi, min_cluster_size=min_cluster_size, metric=metric).fit(data.toarray()) optics_label = optics_clustering.labels_[ optics_clustering.ordering_] optics_label[optics_label < 0] = optics_label.max() + 1 self.cluster_assignment = optics_label self.k = len(set(optics_label)) self.cluster_label = [str(i) for i in range(self.k)] return None
tr_features = np.load('./data/voc12/features/train_features.npy', allow_pickle=True)
val_features = np.load('./data/voc12/features/val_features.npy', allow_pickle=True)
features = tr_aug_features.tolist() + tr_features.tolist() + val_features.tolist()

df = pd.DataFrame.from_records(features)
df.drop_duplicates('img_name', inplace=True)
df['feature'] = df['feature'].apply(lambda x: x[0].reshape(-1).tolist())
X = np.array(df['feature'].values.tolist())

for eps in range(2, 50, 3):
    for min_sample in [2, 3, 4, 8, 16, 32]:
        print(eps, min_sample)
        # Note: OPTICS only uses `eps` when cluster_method='dbscan'; with the
        # default xi extraction this parameter has no effect on the labels.
        cls = cluster.OPTICS(eps=eps, n_jobs=-1, min_samples=min_sample)
        cls = cls.fit(X)
        labels = cls.labels_

        # Build a one-hot cluster label per image.
        label_d = dict()
        category_size = len(set(cls.labels_))
        for img_name, label in zip(df['img_name'].values, cls.labels_):
            cluster_label = np.zeros(category_size)
            cluster_label[label] = 1
            label_d[img_name] = cluster_label

        print(eps, category_size, min_sample)
        if len(set(cls.labels_)) > 1:
            print("Silhouette Coefficient: %0.3f" % metrics.silhouette_score(X, labels))
        np.save('./data/voc12/cls_optics%s_%s_labels.npy' % (eps, min_sample), label_d)
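# Since OPTICS only honours eps when cluster_method='dbscan', refitting for every
# eps value is unnecessary; this is a sketch of sweeping eps from a single OPTICS
# fit with cluster_optics_dbscan (the demo data is an assumption).
import numpy as np
from sklearn.cluster import OPTICS, cluster_optics_dbscan
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=500, centers=4, random_state=0)
opt = OPTICS(min_samples=8).fit(X_demo)          # single fit, reused below
for eps in (0.5, 1.0, 2.0):
    labels = cluster_optics_dbscan(reachability=opt.reachability_,
                                   core_distances=opt.core_distances_,
                                   ordering=opt.ordering_, eps=eps)
    n_clusters = len(set(labels)) - (1 if -1 in labels else 0)
    print(f"eps={eps}: {n_clusters} clusters")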
def main(): """Comaprison between K-means, Spectral Clustering, CURE, DBSCAN, OPTICS and SNN clustering algorithms in small and medium real world data sets. Comparison with K-means and SNN clustering algorithm in large real world data set. """ # =============================== # SMALL AND MEDIUM REAL DATA SETS # =============================== from sklearn import datasets plt.figure(figsize=(9 * 2 + 3, 12.5)) plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 default_base = { 'eps': .5, 'MinPts_fraction': 0.5, 'n_neighbors': 20, 'n_clusters': 3, 'min_samples': 20, 'xi': 0.05, 'min_cluster_size': 0.1, 'width': 2.5, 'height': 2.5 } # Small and medium world real datasets iris = datasets.load_iris(return_X_y=True) breast_cancer = datasets.load_breast_cancer(return_X_y=True) datasets = [ (iris, { 'name': 'iris', 'n_clusters': 3, 'd_eps': 0.8, 'coord_x': 2, 'coord_y': 1, 'n_neighbors': 30, 'eps': 0.35, 'MinPts_fraction': 0.5 }), (breast_cancer, { 'name': 'breast_cancer', 'n_clusters': 2, 'd_eps': 2, 'coord_x': 2, 'coord_y': 3, 'n_neighbors': 60, 'eps': 0.5, 'MinPts_fraction': 0.5 }), ] snn_parameters = [] results = [] total_ypred = [] for i_dataset, (dataset, algo_params) in enumerate(datasets): # update parameters with dataset-specific values params = default_base.copy() params.update(algo_params) snn_parameters.append([ params['name'], params['n_neighbors'], params['eps'], params['MinPts_fraction'] ]) X, y = dataset # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # ============ # Create cluster algorithms # ============ k_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['d_eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) snn = SNN(K=params['n_neighbors'], Eps=params['eps'], MinPts_fraction=0.5) clustering_algorithms = (('Original', None), ('K_means', k_means), ('SpectralClustering', spectral), ('CURE', cure), ('DBSCAN', dbscan), ('OPTICS', optics), ('SNN', snn)) for name, algorithm in clustering_algorithms: if name == 'CURE': cure_inst = algorithm(X, params['n_clusters']) cure_inst.process() clusters = cure_inst.get_clusters() y_pred = [0] * len(X) for i in range(len(clusters)): cluster_cure = clusters[i] for index in cluster_cure: y_pred[index] = i elif name == 'Original': y_pred = y else: algorithm.fit(X) if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) total_ypred.append(y_pred) mutual_info = None rand_index = None fowlkes_mallows = None calinski_score = None davies_bouldin = None silhouette = None if len(np.unique(y_pred)) > 1 and len(np.unique(y)) > 1: # External indices: mutual_info = round( adjusted_mutual_info_score(y, y_pred, average_method='arithmetic'), 3) rand_index = round(adjusted_rand_score(y, y_pred), 3) fowlkes_mallows = round(fowlkes_mallows_score(y, y_pred), 3) # Internal indexes calinski_score = round(calinski_harabaz_score(X, y_pred), 3) davies_bouldin = round(davies_bouldin_score(X, y_pred), 3) silhouette = round(silhouette_score(X, y_pred), 3) results.append([ params['name'], name, mutual_info, rand_index, fowlkes_mallows, calinski_score, davies_bouldin, silhouette ]) plt.subplot(len(datasets), len(clustering_algorithms), plot_num) if i_dataset == 0: plt.title(name, 
size=18) colors = np.array( list( islice( cycle([ '#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00' ]), int(max(y_pred) + 1)))) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.scatter(X[:, params['coord_x']], X[:, params['coord_y']], s=10, color=colors[y_pred]) plt.xlim(-params['width'], params['width']) plt.ylim(-params['height'], params['height']) plt.xticks(()) plt.yticks(()) plot_num += 1 outputfile = "./results/real_datasets_sklearn_metrics" results_df = pd.DataFrame(results, columns=[ 'Dataset', 'Algorithm', 'AMI', 'ARI', 'FM', 'CHI', 'DBI', 'Silhouette' ]) results_df.to_csv(outputfile + '.csv', index=False, header=True) results_df.to_excel(outputfile + '.xlsx', index=False, header=True) plt.savefig('./results/real_datasets_sklearn.png') plt.show() # =============================== # LARGE REAL DATA SET # =============================== def correct_detections(y_pred): """Print correct de""" dos_cor, normal_cor, probe_cor, r2l_cor, u2r_cor = 0, 0, 0, 0, 0 for val in y_pred[0:999]: if val == 0: dos_cor += 1 for val in y_pred[1000:1999]: if val == 1: normal_cor += 1 for val in y_pred[2000:2999]: if val == 2: probe_cor += 1 for val in y_pred[3000:3999]: if val == 3: r2l_cor += 1 for val in y_pred[4000:4999]: if val == 4: u2r_cor += 1 print(dos_cor, normal_cor, probe_cor, r2l_cor, u2r_cor) # ===== K means clustering ====== pd_dataset = pd.read_csv('./csv_files/KDD.csv') X = pd_dataset.iloc[:, :-1].to_numpy() y = pd_dataset.iloc[:, -1].to_numpy() k_means = cluster.MiniBatchKMeans(n_clusters=5, random_state=42) k_means.fit(X) if hasattr(k_means, 'labels_'): y_pred_kmeans = k_means.labels_.astype(np.int) else: y_pred_kmeans = k_means.predict(X) # Count detections per cluster unique, counts = np.unique(y_pred_kmeans, return_counts=True) print(dict(zip(unique, counts))) # Evaluation print(classification_report(y, y_pred_kmeans)) correct_detections(y_pred_kmeans) # ===== SNN clustering ====== snn = SNN(K=300, Eps=0.4, MinPts_fraction=0.5) snn.fit(X) if hasattr(snn, 'labels_'): y_pred = snn.labels_.astype(np.int) else: y_pred = snn.predict(X) # Count detections per cluster unique, counts = np.unique(y_pred, return_counts=True) print(dict(zip(unique, counts))) # Evaluation print(classification_report(y, y_pred)) correct_detections(y_pred) # External and Internal indices evaluation : results = [] total_ypred = [('Original', y), ('k-means', y_pred_kmeans), ('SNN', y_pred)] for name, y_pred in total_ypred: silhouette = round(silhouette_score(X, y_pred), 3) results.append(['KDD CUP 99', name, silhouette]) outputfile = "./results/real_datasets_KDD_CUP_metrics" results_df = pd.DataFrame( results, columns=['Dataset', 'Algorithm', 'Silhouette']) results_df.to_csv(outputfile + '.csv', index=False, header=True) results_df.to_excel(outputfile + '.xlsx', index=False, header=True)
def opt(m, e, s, x):
    # Note: the `e` argument is currently unused.
    model = cluster.OPTICS(min_samples=int(m), xi=x, min_cluster_size=s)
    return model
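# A usage sketch for this small factory, assuming the module-level
# `from sklearn import cluster` import; the demo data is an assumption.
from sklearn.datasets import make_blobs

X_demo, _ = make_blobs(n_samples=300, centers=3, random_state=1)
model = opt(m=15, e=0.5, s=0.1, x=0.05)
labels = model.fit_predict(X_demo)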
    else:
        y.append(model.predict(x))

fig = plt.figure(figsize=(27, 9))
fig.suptitle("GaussianMixture", fontsize=48)
for i in range(6):
    ax = plt.subplot(2, 3, i + 1)
    ax.scatter(X[i][:, 0], X[i][:, 1], c=y[i])
plt.savefig("GaussianMixture.eps", format="eps")

# =============================================================================
y = []
for c, x in zip(classes, X):
    model = cluster.OPTICS(min_samples=20, xi=0.05, min_cluster_size=0.1)
    model.fit(x)
    if hasattr(model, "labels_"):
        y.append(model.labels_.astype(int))
    else:
        y.append(model.predict(x))

fig = plt.figure(figsize=(27, 9))
fig.suptitle("OPTICS", fontsize=48)
for i in range(6):
    ax = plt.subplot(2, 3, i + 1)
    ax.scatter(X[i][:, 0], X[i][:, 1], c=y[i])
plt.savefig("OPTICS.eps", format="eps")

# =============================================================================
dhtml('DBSCAN, OPTICS, & Others')

n_clusters = 2
km = scl.KMeans(n_clusters=n_clusters, random_state=0)
y3_km = km.fit_predict(X3)
ac = scl.AgglomerativeClustering(n_clusters=n_clusters,
                                 affinity='euclidean',
                                 linkage='complete')
y3_ac = ac.fit_predict(X3)
sp = scl.SpectralClustering(n_clusters=n_clusters)
y3_sp = sp.fit_predict(X3)
db = scl.DBSCAN(eps=.2, min_samples=15,
                metric='euclidean')
y3_db = db.fit_predict(X3)
op = scl.OPTICS(eps=.2, min_samples=30)
y3_op = op.fit_predict(X3)
cl = ['KMeans', 'Agglomerative', 'Spectral',
      'DBSCAN', 'OPTICS']
py3 = [y3_km, y3_ac, y3_sp, y3_db, y3_op]
np.unique(y3_db)

f, ax = pl.subplots(5, 1, figsize=(6, 18))
for c in range(5):
    for i in np.unique(py3[c]):
        color = pl.cm.cool(.4 * (i + 1))
        ax[c].scatter(X3[py3[c] == i, 0],
                      X3[py3[c] == i, 1],
                      s=30, color=color, marker='v',
                      edgecolor='#aa33ff',
def run(self): plt.figure(figsize=(9 * 2 + 3, len(self.datasets)*2)) # plt.subplots_adjust(left=.02, right=.98, bottom=.001, top=.96, wspace=.05, hspace=.01) plot_num = 1 for i_dataset, (dataset, algo_params) in enumerate(self.datasets): # update parameters with dataset-specific values params = self.default_base.copy() params.update(algo_params) X, y = dataset # normalize dataset for easier parameter selection X = StandardScaler().fit_transform(X) # estimate bandwidth for mean shift bandwidth = cluster.estimate_bandwidth(X, quantile=params['quantile']) # connectivity matrix for structured Ward connectivity = kneighbors_graph(X, n_neighbors=params['n_neighbors'], include_self=False) # make connectivity symmetric connectivity = 0.5 * (connectivity + connectivity.T) # ============ # Create cluster objects # ============ ms = cluster.MeanShift(bandwidth=bandwidth, bin_seeding=True) two_means = cluster.MiniBatchKMeans(n_clusters=params['n_clusters']) ward = cluster.AgglomerativeClustering(n_clusters=params['n_clusters'], linkage='ward', connectivity=connectivity) spectral = cluster.SpectralClustering(n_clusters=params['n_clusters'], eigen_solver='arpack', affinity="nearest_neighbors") dbscan = cluster.DBSCAN(eps=params['eps']) optics = cluster.OPTICS(min_samples=params['min_samples'], xi=params['xi'], min_cluster_size=params['min_cluster_size']) affinity_propagation = cluster.AffinityPropagation(damping=params['damping'], preference=params['preference']) average_linkage = cluster.AgglomerativeClustering(linkage="average", affinity="cityblock", n_clusters=params['n_clusters'], connectivity=connectivity) birch = cluster.Birch(n_clusters=params['n_clusters']) gmm = mixture.GaussianMixture(n_components=params['n_clusters'], covariance_type='full') clustering_algorithms = ( ('MiniBatchKMeans', two_means), ('AffinityPropagation', affinity_propagation), ('MeanShift', ms), ('SpectralClustering', spectral), ('Ward', ward), ('AgglomerativeClustering', average_linkage), ('DBSCAN', dbscan), ('OPTICS', optics), ('Birch', birch), ('GaussianMixture', gmm) ) for name, algorithm in clustering_algorithms: t0 = time.time() # catch warnings related to kneighbors_graph with warnings.catch_warnings(): warnings.filterwarnings( "ignore", message="the number of connected components of the " + "connectivity matrix is [0-9]{1,2}" + " > 1. Completing it to avoid stopping the tree early.", category=UserWarning) warnings.filterwarnings( "ignore", message="Graph is not fully connected, spectral embedding" + " may not work as expected.", category=UserWarning) algorithm.fit(X) t1 = time.time() if hasattr(algorithm, 'labels_'): y_pred = algorithm.labels_.astype(np.int) else: y_pred = algorithm.predict(X) plt.subplot(len(self.datasets), len(clustering_algorithms), plot_num) if i_dataset == 0: plt.title(name, size=9) colors = np.array(list(islice(cycle(['#377eb8', '#ff7f00', '#4daf4a', '#f781bf', '#a65628', '#984ea3', '#999999', '#e41a1c', '#dede00']), int(max(y_pred) + 1)))) # add black color for outliers (if any) colors = np.append(colors, ["#000000"]) plt.scatter(X[:, 0], X[:, 1], s=10, color=colors[y_pred]) plt.xlim(-2.5, 2.5) plt.ylim(-2.5, 2.5) plt.xticks(()) plt.yticks(()) plt.text(.99, .01, ('%.2fs' % (t1 - t0)).lstrip('0'), transform=plt.gca().transAxes, size=8, horizontalalignment='right') plot_num += 1 plt.show() # Test code # # X_data = np.array([[667. 
], # [693.3], # [732.9], # [658.9], # [702.8], # [697.2], # [658.7], # [723.1], # [719.5], # [687.4], # [704.1], # [658.8], # [667.8], # [703.4]]) # Y = np.array([38.36, # 11.06, # 8.13, # 45.23, # 11.16, # 11.96, # 40.27, # 7.01, # 7.25, # 11.28, # 7.21, # 40.4 , # 32.2 , # 11.18]) # # cl = cluster_test(X_data, Y) # cl.run()