def cluster(self, method='ward'):
    """Cluster ``self.X`` with the agglomerative L-method.

    Stores the suggested number of sub-clusters and fits a
    DividableClustering model on the resulting labels.

    Args:
        method: linkage method forwarded to ``agglomerative_l_method``.
    """
    model = agglomerative_l_method(self.X, method=method)
    self.sub_clusters_cnt = len(model.cluster_centers_)
    self.clustering_model = DividableClustering()
    self.clustering_model.fit(self.X, model.labels_)
def fn(inst):
    """Run the L-method on ``inst['x']`` and attach its results.

    Args:
        inst: mapping-like record with an ``'x'`` entry and an
            immutable-style ``set(key, value)`` method.

    Returns:
        A new record with ``'prediction'`` (labels) and ``'centroids'``
        (cluster centers) set.

    Raises:
        Exception: if ``inst`` has no ``'x'`` key.
    """
    # Idiomatic membership test (`'x' not in inst`) instead of `not 'x' in inst`.
    if 'x' not in inst:
        raise Exception('no x')
    x = inst['x']
    model = l_method.agglomerative_l_method(x)
    # Parenthesized chain instead of a backslash line continuation.
    return (inst.set('prediction', model.labels_)
                .set('centroids', model.cluster_centers_))
def l_method(ax, X, method):
    """Cluster X with the agglomerative L-method and plot each cluster.

    Each of the suggested clusters is drawn on ``ax`` in its own color.

    Args:
        ax: matplotlib-style axes to draw on.
        X: sequence of points to cluster.
        method: linkage method forwarded to ``agglomerative_l_method``.
    """
    model = agglomerative_l_method(X, method=method)
    suggest_n = len(model.cluster_centers_)
    cmap = get_cmap(suggest_n + 1)
    for label in range(suggest_n):
        # Points assigned to this cluster label.
        members = [x for x, lbl in zip(X, model.labels_) if lbl == label]
        plot(ax, members, c=cmap(label), edgecolors='none')
def cluster(self):
    """First-tier clustering of ``self.X`` using the agglomerative L-method.

    Fits a DividableClustering model on the labels produced by the
    L-method.  (Removed a slab of commented-out debug code that re-ran
    AgglomerativeClustering and printed both label sets.)
    """
    model = agglomerative_l_method(self.X)
    # first tier clustering, using agglomerative clustering
    self.clustering_model = DividableClustering()
    self.clustering_model.fit(self.X, model.labels_)
def cluster(self, method='ward'):
    """Two-tier clustering of ``self.X`` constrained by ``self.y_seed``.

    Tier 1: agglomerative L-method labels, fed into DividableClustering.
    Tier 2: any sub-cluster whose seeded labels collide is split further
    with KMeans, searching for a small cluster count that removes the
    collision.

    Args:
        method: linkage method forwarded to ``agglomerative_l_method``.
    """
    # One seed label per sample is required for collision checks below.
    assert len(self.X) == len(self.y_seed)
    l_method = agglomerative_l_method(self.X, method=method)
    # first tier clustering, using agglomerative clustering
    self.clustering_model = DividableClustering()
    self.clustering_model.fit(self.X, l_method.labels_)
    # second tier, using kmeans
    for suspect_label in range(self.clustering_model.latest_label):
        # Members of this sub-cluster as (point, original-index) pairs.
        ind_X = self.clustering_model.get_X_with_idx(suspect_label)
        y_seed = []
        X = []
        for x, idx in ind_X:
            X.append(x)
            y_seed.append(self.y_seed[idx])
        # no collision in this sub-group
        if not self.has_collision(X, y_seed):
            continue
        # there is collisions in this sub-group
        # Binary-search the smallest KMeans k (in [2, len(X)]) whose
        # clustering has no collision.
        low_cnt = 2
        high_cnt = len(X)
        last_possible_labels = None
        while low_cnt <= high_cnt:
            # 1/4 biased binary search
            # (probe closer to the low end, preferring fewer clusters)
            cluster_cnt = int((high_cnt - low_cnt) * 1/4 + low_cnt)
            kmeans = KMeans(cluster_cnt)
            kmeans.fit(X)
            if not self.has_collision(X, y_seed, kmeans):
                # Collision-free: remember labels, try fewer clusters.
                last_possible_labels = kmeans.labels_
                high_cnt = cluster_cnt - 1
            else:
                low_cnt = cluster_cnt + 1
        # NOTE(review): `cluster_cnt` here is the last value *probed*, not
        # necessarily the k of `last_possible_labels` — confirm intended.
        self.splitting_score += cluster_cnt
        print('split sub_clusters_cnt:', cluster_cnt, 'cnt:', len(X), 'main cnt:', self.cnt)
        # NOTE(review): if no probed k was collision-free,
        # `last_possible_labels` is still None when passed to split() —
        # verify split() tolerates that.
        self.clustering_model.split(suspect_label, last_possible_labels)
        self.clustering_model.relabel()
from dividable_clustering import DividableClustering
from agglomerative_clustering import AgglomerativeClustering
from sklearn.cluster import KMeans
from dataset import *
from sklearn.neighbors import BallTree
from l_method import agglomerative_l_method

# Demo: label the iris dataset with the agglomerative L-method, then
# check that DividableClustering reproduces those labels via predict().
dataset = get_iris()
l_method = agglomerative_l_method(dataset.X)

model = DividableClustering()
model.fit(dataset.X, l_method.labels_)

print('labels:', l_method.labels_)
print('predicts:', model.predict(dataset.X))
def fit(self, x):
    """Fit on ``x`` via the agglomerative L-method and store labels.

    Args:
        x: sequence of samples to cluster.

    Side effects:
        Sets ``self.x`` and ``self.labels_`` (a numpy array of labels).
    """
    self.x = x
    # agglomerative_l_method returns a fitted model exposing `.labels_`
    # (consistent with its other call sites); the original treated the
    # return value itself as the label list, which would make
    # np.array(...) wrap the model object instead of the labels.
    model = agglomerative_l_method(x)
    self.labels_ = np.array(model.labels_)
def static(index): return 'rgb'[index] return map_index_to_rgb_color dataset = get_iris() pca = PCA(2) pca.fit(dataset.X) X = pca.transform(dataset.X) # X = dataset.X # X = list(map(lambda x: x[:2], dataset.X)) print('X:', X) l_method = agglomerative_l_method(X) clusters_cnt = len(l_method.cluster_centers_) # agg = AgglomerativeClustering(clusters_cnt).fit(X) # labels = agg.labels_ labels = l_method.labels_ print('X:', X) X_by_label = {} for x, label in zip(X, labels): if label not in X_by_label: X_by_label[label] = [] X_by_label[label].append(x)