def cluster_clique(self, intervals, threshold):
    """Cluster the stored points with CLIQUE and record a label per point.

    The point-only view of ``self.data_frame`` is handed to ``clique``
    together with ``intervals`` and ``threshold``. Every point index that
    CLIQUE places in a cluster is written to the result column with a
    1-based cluster number. ``self.cluster_count`` and
    ``self.clustering_result`` are then refreshed from that column.

    Args:
        intervals: Forwarded to ``clique`` as its second positional argument.
        threshold: Forwarded to ``clique`` as its third positional argument.

    Returns:
        The name of the result column (``"clique clustering"``).
    """
    name = "clique clustering"
    self.clustering_name = name

    points = self.data_frame.get_point_only_df().values
    model = clique(points, intervals, threshold)
    model.process()
    found_clusters = model.get_clusters()

    # Register the result column first. -2 is its initial value, presumably
    # what remains for points CLIQUE does not assign to a cluster (confirm
    # against add_result_name).
    self.data_frame.add_result_name(name, -2, ColType.CLUSTER_LABEL)

    # Cluster numbers start at 1.
    for label, members in enumerate(found_clusters, start=1):
        for point_index in members:
            self.data_frame.add_result(name, point_index, label)

    labels = self.data_frame.df[name].tolist()
    self.cluster_count = len(set(labels))
    self.clustering_result = labels
    return name
def visualize(path, levels, threshold, ccore_enabled, **kwargs):
    """Run CLIQUE on the sample stored at ``path`` and show its grid.

    Args:
        path: Location of the sample, read with ``read_sample``.
        levels: Forwarded to ``clique`` as its second positional argument.
        threshold: Forwarded to ``clique`` as its third positional argument.
        ccore_enabled: Passed to ``clique`` as ``ccore``.
        **kwargs: Accepted but not used.
    """
    points = read_sample(path)

    algorithm = clique(points, levels, threshold, ccore=ccore_enabled)
    algorithm.process()

    clique_visualizer.show_grid(algorithm.get_cells(), points)
def fit(self, data):
    """Run CLIQUE on ``data`` and store one integer label per row.

    After the call ``self.CLIQUE`` holds the processed ``clique`` instance
    and ``self.labels_`` holds, for every row, the 0-based index of the
    cluster that contains it. Rows that appear in no cluster are labelled
    ``-1``.

    Args:
        data: Object exposing ``.values`` (e.g. a pandas DataFrame) whose
            rows are the points to cluster.
    """
    data = data.values
    self.CLIQUE = clique(data, self.intervals, self.threshold)
    self.CLIQUE.process()

    # np.full rather than np.empty: rows that belong to no cluster are never
    # written below, and np.empty would leave arbitrary memory contents there.
    self.labels_ = np.full(data.shape[0], -1, dtype=int)
    for cluster_id, members in enumerate(self.CLIQUE.get_clusters()):
        self.labels_[np.asarray(members, dtype=int)] = cluster_id
def exception(type, sample_storage, levels, threshold, ccore_enabled):
    """Assert that running CLIQUE on the sample raises ``type``.

    Args:
        type: Exception class expected from constructing or processing the
            ``clique`` instance. The name shadows the builtin ``type``
            inside this function and is kept for caller compatibility.
        sample_storage: Either the sample itself or, when a ``str``, a path
            that is loaded with ``read_sample``.
        levels: Forwarded to ``clique`` as its second positional argument.
        threshold: Forwarded to ``clique`` as its third positional argument.
        ccore_enabled: Passed to ``clique`` as ``ccore``.

    Raises:
        AssertionError: If a different exception, or no exception at all,
            was raised.
    """
    try:
        sample = sample_storage
        if isinstance(sample_storage, str):
            sample = read_sample(sample_storage)

        clique_instance = clique(sample, levels, threshold, ccore=ccore_enabled)
        clique_instance.process()
    except type:
        return
    except Exception as ex:
        # `type` is the expected exception class here, not the builtin, so
        # the actual class must be read from the instance itself.
        raise AssertionError(
            "Expected: '%s', Actual: '%s'" % (type, ex.__class__.__name__)
        ) from ex

    # Wrap in a tuple so %-formatting also works when `type` is a tuple.
    raise AssertionError("Expected: '%s', Actual: 'None'" % (type,))
def analyze_manifold_old(model: VaeWrapper, sess, xs, ys, stage=1):
    """Inspect the latent encodings of a random subset of ``xs``.

    Up to 1000 randomly chosen samples are encoded with ``model.encode``.
    The correlation matrix of the latent dimensions is printed, CLIQUE is
    run on the encodings, and its cluster count and cluster encoding are
    printed. For a 2-D latent space HDBSCAN is fitted as well and its
    condensed tree is plotted. For a 3-D latent space the encodings are
    shown as a 3-D scatter plot coloured by ``ys``.

    Args:
        model: Wrapper providing ``encode(xs, stage=...)`` and ``latent_dim``.
        sess: Not used by this function.
        xs: Inputs; must support ``len`` and indexing with a list of ints.
        ys: Values aligned with ``xs``; used as marker colours in the
            3-D plot.
        stage: Forwarded to ``model.encode``.
    """
    # TODO (3/17): deprecate
    inds = list(range(len(xs)))
    cnt = 1000
    np.random.shuffle(inds)
    # Truncate the index list first so only the sampled rows are copied.
    inds = inds[:cnt]
    xs, ys = xs[inds], ys[inds]

    zs = model.encode(xs, stage=stage)
    zst = zs.T
    corr = np.corrcoef(zst)
    print(corr)

    # create CLIQUE algorithm for processing
    intervals = 20  # defines amount of cells in grid in each dimension
    threshold = 0
    clique_instance = clique(zs, intervals, threshold)

    # start clustering process and obtain results
    clique_instance.process()
    clusters = clique_instance.get_clusters()  # allocated clusters
    print("Amount of clusters:", len(clusters))
    encodings = clique_instance.get_cluster_encoding()
    print(encodings)

    if model.latent_dim == 2:
        # The CLIQUE grid and clusters can also be drawn with
        # clique_visualizer.show_grid / show_clusters if needed.
        import hdbscan

        clusterer = hdbscan.HDBSCAN(min_cluster_size=4)
        clusterer.fit(zs)
        # Labels are 0-based (noise is -1), so the count is max label + 1.
        print("# clusters by HDBSCAN", clusterer.labels_.max() + 1)
        print("probabilities", clusterer.probabilities_)
        print("labels", clusterer.labels_)
        clusterer.condensed_tree_.plot()
        plt.show()
    elif model.latent_dim == 3:
        fig = go.Figure(data=[go.Scatter3d(
            x=zst[0],
            y=zst[1],
            z=zst[2],
            mode='markers',
            marker=dict(size=2, color=ys, colorscale='Viridis', opacity=0.8),
        )])
        # tight layout
        fig.update_layout(margin=dict(l=0, r=0, b=0, t=0))
        fig.show()
def setup(self, keywords={}): """ Setup the algorithms """ for p in keywords.keys(): setattr(self, p, keywords[p]) if self.method == "bang": self.obj = bang(self.data_list, self.levels, ccore=self.ccore, density_threshold=self.density_threshold, amount_threshold=self.amount_threshold) if self.method == "clique": self.obj = clique(self.data_list, self.amount_threshold, self.density_threshold, ccore=self.ccore) return
def template_clustering(data_path, intervals, density_threshold, **kwargs):
    """Cluster one sample file with CLIQUE and display the outcome.

    Prints the sample's file name and the size of every cluster, shows the
    CLIQUE grid, and then shows the clusters with noise points drawn as
    ``'x'`` markers.

    Args:
        data_path: Path of the sample, read with ``read_sample``.
        intervals: Forwarded to ``clique`` as its second positional argument.
        density_threshold: Forwarded to ``clique`` as its third positional
            argument.
        **kwargs: Accepted but not used.
    """
    sample_name = os.path.basename(data_path)
    print("Sample: '%s'." % sample_name)

    points = read_sample(data_path)
    algorithm = clique(points, intervals, density_threshold)
    algorithm.process()

    found_clusters = algorithm.get_clusters()
    outliers = algorithm.get_noise()
    grid_cells = algorithm.get_cells()

    print(list(map(len, found_clusters)))

    clique_visualizer.show_grid(grid_cells, points)

    figure = cluster_visualizer()
    figure.append_clusters(found_clusters, points)
    figure.append_cluster(outliers, points, marker='x')
    figure.show()
def clustering(path, intervals, density_threshold, expected_clusters, expected_noise, ccore_enabled, **kwargs):
    """Run CLIQUE on a sample and check the result against expectations.

    Checks, in order:
      * the grid has ``intervals ** dimension`` cells;
      * cluster sizes plus noise size equal the sample size;
      * the noise size equals ``expected_noise``;
      * when ``expected_clusters`` is not ``None``, the number of clusters
        and the sorted cluster sizes match it;
      * the point indexes stored in the cells cover the whole sample.

    Args:
        path: Location of the sample, read with ``read_sample``.
        intervals: Forwarded to ``clique`` as its second positional argument.
        density_threshold: Forwarded to ``clique`` as its third positional
            argument.
        expected_clusters: Expected cluster sizes in ascending order, or
            ``None`` to skip the per-cluster checks.
        expected_noise: Expected number of noise points.
        ccore_enabled: Passed to ``clique`` as ``ccore``.
        **kwargs: Accepted but not used.

    Returns:
        The processed ``clique`` instance.
    """
    sample = read_sample(path)
    dimension = len(sample[0])

    clique_instance = clique(sample, intervals, density_threshold, ccore=ccore_enabled)
    clique_instance.process()

    clusters = clique_instance.get_clusters()
    noise = clique_instance.get_noise()
    cells = clique_instance.get_cells()

    assertion.eq(len(cells), intervals ** dimension)

    cluster_sizes = sorted(len(members) for members in clusters)
    accounted_points = len(noise) + sum(cluster_sizes)

    assertion.eq(len(sample), accounted_points)
    assertion.eq(expected_noise, len(noise))

    if expected_clusters is not None:
        assertion.eq(len(expected_clusters), len(clusters))
        assertion.eq(expected_clusters, cluster_sizes)

    # Every point index must appear in at least one cell.
    covered_points = set()
    for cell in cells:
        covered_points.update(cell.points)

    assertion.eq(len(sample), len(covered_points))
    return clique_instance
data = pd.read_csv("Mall_Customers.csv") data.rename(columns={ 'Annual Income (k$)': 'Annual_Income', 'Spending Score (1-100)': 'Spending_Score' }, inplace=True) data['Gender'] = data['Gender'].replace(['Male', 'Female'], [0, 1]) data.drop(["CustomerID"], axis=1, inplace=True) data_values = data.values # Define the number of grid cells in each dimension intervals = 5 # Density threshold threshold = 0 clique_instance = clique(data_values, intervals, threshold) clique_instance.process() clique_cluster = clique_instance.get_clusters() noise = clique_instance.get_noise() cells = clique_instance.get_cells() print("Amount of clusters:", len(clique_cluster)) for cluster in clique_cluster: print(cluster) labelList = [0] * 200 j = 1 for cluster in clique_cluster: for x in cluster:
# Parse the day.month.year date strings into datetime objects.
sales.date = sales.date.apply(
    lambda x: datetime.datetime.strptime(x, '%d.%m.%Y'))

# Aggregate per (month block, shop, item). The columns are selected with a
# list: subsetting a groupby with a bare tuple of several names is rejected
# by current pandas.
monthly_sales = sales.groupby(
    ["date_block_num", "shop_id", "item_id"]
)[["date_block_num", "item_price", "item_cnt_day", "shop_id"]].agg({
    "date_block_num": "mean",
    "item_price": "mean",
    "item_cnt_day": "sum",
    "shop_id": "mean"
})

# CLIQUE is given a plain 2-D array built from the aggregated columns.
df = pd.DataFrame(monthly_sales)
df = np.array(df)

from pyclustering.cluster.clique import clique, clique_visualizer
from pyclustering.utils import read_sample
from pyclustering.samples.definitions import FCPS_SAMPLES

intervals = 10
threshold = 0

clique_instance = clique(df, intervals, threshold)
clique_instance.process()

clusters = clique_instance.get_clusters()
noise = clique_instance.get_noise()
cells = clique_instance.get_cells()

print("Amount of clusters:", len(clusters))
data_M = np.array(data) return data_M # file_list = ["five_cluster.txt", "spiral.txt", # "ThreeCircles.txt", "Twomoons.txt"] data_M = loadDataSet('Twomoons.txt') TestData = data_M[:, [1, 2]] g_truth = data_M[:, 0] # 创建 CLIQUE 算法进行处理 # 定义每个维度中网格单元的数量 intervals = 15 # 密度阈值 threshold = 10 clique_instance = clique(TestData, intervals, threshold) ''' five_cluster.txt intervals = 15,threshold = 10 0.812 t=0.039999961853027344 spiral.txt no ThreeCircles.txt intervals = 20,threshold = 1 0.990563419372745 t=0.0800008773803711 Twomoons.txt intervals = 15,threshold = 1 0.9880159786950732 t=0.04000043869018555 ''' # 开始聚类过程并获得结果 t1 = time.time_ns() clique_instance.process() t2 = time.time_ns() clique_cluster = clique_instance.get_clusters() # allocated clusters # 被认为是异常值的点(噪点) noise = clique_instance.get_noise() # CLIQUE形成的网格单元
def test_high_dimension_data_failure(self):
    """Check that processing 13-dimensional data raises ``RuntimeError``.

    Two 13-dimensional points are clustered with 15 intervals and a
    threshold of 0; ``process`` is expected to fail (presumably because the
    grid would be too large - confirm against the clique implementation).
    """
    first_point = [0, 1, 2, 1, 3, 4, 5, 1, 2, 3, 3, 1, 3]
    second_point = [0, 1, 0, 1, 3, 8, 5, 5, 3, 3, 3, 0, 0]

    clique_instance = clique([first_point, second_point], 15, 0)

    assertion.exception(RuntimeError, clique_instance.process)
del X1, X2, X3, X4, X5 # correct cluster labels in an array col = [] for i in range(n_cluster): for j in range(n): col.append(i) for i in range(n_noise): col.append(-1) col = (col) del i, j # apply DBSCAN multiple times X_cliq = X.values.tolist() t0 = time.time() clique_instance = clique(X_cliq, intervall, threshold) clique_instance.process() clusters = clique_instance.get_clusters() noise_c = clique_instance.get_noise() t1 = time.time() clusters.append(noise_c) ## plot the result # array with all the color per cluster color = [] help_color = np.array([np.zeros(len(X))] * 2).T counter = 0 for j in range(len(clusters)): for index, i in enumerate(clusters[j]): help_color[counter, 0] = i help_color[counter, 1] = j