def test_birch_example_reproducibility(example_id):
    # check reproducibility of the Birch example
    rng = np.random.RandomState(42)

    X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

    cluster_model = Birch(threshold=0.9, branching_factor=20,
                          compute_sample_indices=True)
    cluster_model.fit(X)
    # assert len(cluster_model.root_.subclusters_[1].child_.subclusters_) == 3

    htree, n_subclusters = birch_hierarchy_wrapper(cluster_model)

    assert htree.tree_size == n_subclusters

    # same random seed as in the birch hierarchy example
    assert htree.tree_size == 78

    sc = htree.flatten()[example_id]

    if example_id == 34:
        # this is true in both cases, but example_id fails on circle ci
        assert sc.current_depth == 1
        assert len(sc.children) == 3

    assert_array_equal([sc['cluster_id'] for sc in htree.flatten()],
                       np.arange(htree.tree_size))
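# The parametrize decorator for the test above is not part of this fragment;
# a plausible pytest setup is sketched below. The particular example ids are
# illustrative only, chosen within the expected 78 subclusters.
import pytest

@pytest.mark.parametrize('example_id', [0, 1, 34, 77])
def test_birch_example_reproducibility(example_id):
    ...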
def test_birch_clustering_single_nodes():
    basename = os.path.dirname(__file__)
    X = np.load(
        os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))

    branching_factor = 5
    mod = Birch(n_clusters=None, threshold=0.1,
                branching_factor=branching_factor,
                compute_labels=False, compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # let's compute cluster similarity
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X, row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # accessing these attributes must not raise
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]

    # make sure that we have no clusters with a single child
    assert sum(len(el.children) == 1 for el in htree.flatten()) == 0
def test_birch_make_hierarchy(dataset, optimal_sampling):
    if dataset == 'random':
        np.random.seed(9999)

        X = np.random.rand(1000, 100)
        X = normalize(X)
        branching_factor = 10
    elif dataset == 'birch_hierarchical':
        basename = os.path.dirname(__file__)
        X = np.load(
            os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))
        branching_factor = 2

    mod = Birch(n_clusters=None, threshold=0.1,
                branching_factor=branching_factor,
                compute_labels=False, compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # let's compute cluster similarity
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X, row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # accessing these attributes must not raise
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]

    if optimal_sampling:
        s_samples_1 = compute_optimal_sampling(htree, min_similarity=0.85,
                                               min_coverage=0.9)
        for row in s_samples_1:
            assert len(row['document_similarity']) == 1
            assert len(row['document_id_accumulated']) == 1
        s_samples_2 = compute_optimal_sampling(htree, min_similarity=0.85,
                                               min_coverage=0.2)
        s_samples_3 = compute_optimal_sampling(htree, min_similarity=0.9,
                                               min_coverage=0.9)

        assert len(s_samples_1) > len(s_samples_2)
        assert len(s_samples_1) < len(s_samples_3)
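# Similarly, a plausible (hypothetical) parametrization for this test,
# sweeping both datasets with and without optimal sampling; the real
# decorator is not shown in this fragment.
import itertools
import pytest

@pytest.mark.parametrize(
    'dataset, optimal_sampling',
    list(itertools.product(['random', 'birch_hierarchical'], [True, False])))
def test_birch_make_hierarchy(dataset, optimal_sampling):
    ...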
We start by computing BIRCH clustering on some random structured data,
"""
import numpy as np
from sklearn.datasets import make_blobs

from freediscovery.cluster import Birch, birch_hierarchy_wrapper

rng = np.random.RandomState(42)

X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

cluster_model = Birch(threshold=0.9, branching_factor=20,
                      compute_sample_indices=True)
cluster_model.fit(X)

###############################################################################
#
# Next we wrap each subcluster in the cluster hierarchy
# (``cluster_model.root_``) with the
# :class:`~freediscovery.cluster.BirchSubcluster` class
# that allows easier manipulation of the hierarchical tree.

htree, _ = birch_hierarchy_wrapper(cluster_model)

print('Total number of subclusters:', htree.tree_size)

###############################################################################
#
# Visualizing the hierarchy
# -------------------------
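#
# A minimal sketch of one way to print the tree, assuming only the
# ``cluster_id`` / ``cluster_size`` fields and the ``current_depth``
# attribute used elsewhere in this repository; the indentation reflects
# each subcluster's depth in the hierarchy.

for sc in htree.flatten():
    print('  ' * sc.current_depth,
          'cluster_id={} (size={})'.format(sc['cluster_id'],
                                           sc['cluster_size']))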
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn import metrics

from freediscovery.cluster import Birch, birch_hierarchy_wrapper
# the ``bcluster`` node class is defined elsewhere in this project


class birch(object):
    def __init__(self, threshold=0.5, branching_factor=20, n_clusters=None,
                 outlier_threshold=0.5):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        self.Birch_clusterer = Birch(threshold=self.threshold,
                                     branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,
                                     compute_sample_indices=True)

    # Fitting the model with train_X
    def fit(self, data):
        self.data = data
        # self.data.drop(self.data.columns[len(self.data.columns)-1],
        #                axis=1, inplace=True)
        self.Birch_clusterer.fit(self.data)

    # Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            node = bcluster()
            sub_cluster = self.htree.flatten()[i]
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            if depth > max_depth:
                max_depth = depth
            if i not in clusters:
                clusters[i] = {}
            if sub_cluster.current_depth == 0:
                node.set_parent()
            else:
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            cluster_size = sub_cluster['cluster_size']
            node.set_size(cluster_size)
            data_points = sub_cluster['document_id_accumulated']
            data_points_names = self.data.iloc[
                data_points].index.values.tolist()
            node.set_data_points(data_points_names)
            centroid = self.data.iloc[
                sub_cluster['document_id_accumulated'], :].mean(axis=0).values
            node.set_centroid(centroid)
            # distances are computed on the feature vectors of the cluster
            # members, not on their document indices
            data_vectors = self.data.iloc[data_points].values
            d1, d1_v = self.calculate_d1(centroid, data_vectors)
            d2 = self.calculate_d2(centroid, data_vectors, d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
        self.cluster_tree = clusters
        return self.cluster_tree, max_depth

    # Calculate the d1 distance (point farthest away from the centroid)
    def calculate_d1(self, centroid, data_points):
        d1 = 0
        u = centroid
        d1_v = None
        for point in data_points:
            v = point
            distance = euclidean(u, v)
            if distance > d1:
                d1 = distance
                d1_v = v
        return d1, d1_v

    # Calculate the d2 distance (point farthest away from d1,
    # measured by its distance from the centroid)
    def calculate_d2(self, centroid, data_points, d1_v):
        d2_d1 = 0
        u = d1_v
        d2_v = None
        for point in data_points:
            v = point
            distance = euclidean(u, v)
            if distance > d2_d1:
                d2_d1 = distance
                d2_v = v
        d2 = euclidean(centroid, d2_v)
        return d2

    # Displays the tree
    def show_cluster_tree(self):
        self.htree.display_tree()

    # Prediction function with depth-based assignment and outlier detection
    def predict(self, test_X, depth):
        predicted = []
        for test_instance in test_X.iterrows():
            test_sample = test_instance[1].values
            min_distance = float('inf')
            selected_cluster = None
            for cluster_id in self.cluster_tree:
                if self.cluster_tree[cluster_id].depth != depth:
                    continue
                u = self.cluster_tree[cluster_id].centroid
                v = np.asarray(test_sample, dtype='float64')
                distance = euclidean(u, v)
                if distance < min_distance:
                    min_distance = distance
                    selected_cluster = cluster_id
            self.cluster_tree[selected_cluster].add_test_points(
                test_instance[0])
            # Outlier identifier
            # if self.cluster_tree[selected_cluster].check_outlier(min_distance):
            #     self.cluster_tree[selected_cluster].add_outlier_points(test_instance[0])
            # _predicted_label = self.cluster_tree[selected_cluster].classifier.predict([test_sample])
            # self.cluster_tree[selected_cluster].add_predicted(_predicted_label)
            predicted.append(selected_cluster)
        return predicted

    # Model certification creator
    def certify_model(self, cluster_tree, test_y):
        for cluster_id in cluster_tree:
            if len(cluster_tree[cluster_id].test_points) == 0:
                continue
            cluster_tree[cluster_id].set_test_labels(
                test_y[cluster_tree[cluster_id].test_points].values)
            precision = metrics.precision_score(
                cluster_tree[cluster_id].test_labels,
                cluster_tree[cluster_id].predicted,
                average='weighted')
            recall = metrics.recall_score(cluster_tree[cluster_id].test_labels,
                                          cluster_tree[cluster_id].predicted,
                                          average='weighted')
            f1_Score = metrics.f1_score(cluster_tree[cluster_id].test_labels,
                                        cluster_tree[cluster_id].predicted,
                                        average='weighted')
            score = {
                'precision': precision,
                'recall': recall,
                'f1_Score': f1_Score
            }
            cluster_tree[cluster_id].set_score(score)
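# A minimal usage sketch for the wrapper class above. It assumes the feature
# matrix is a pandas DataFrame (the class relies on ``.iloc`` and
# ``.iterrows()``) and that the ``bcluster`` node class defined elsewhere in
# this project is importable; the data and the chosen depth are illustrative.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
train_X = pd.DataFrame(rng.rand(200, 10),
                       index=['doc_%d' % i for i in range(200)])
test_X = pd.DataFrame(rng.rand(20, 10),
                      index=['query_%d' % i for i in range(20)])

model = birch(threshold=0.5, branching_factor=20)
model.fit(train_X)

# build the cluster-feature tree and inspect how deep it goes
cluster_tree, max_depth = model.get_cluster_tree()
model.show_cluster_tree()

# assign each test document to the closest centroid at a given depth
predicted = model.predict(test_X, depth=max_depth)
print(predicted[:5])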