def test_birch_example_reproducibility(example_id):
    # check reproducibility of the Birch example
    rng = np.random.RandomState(42)

    X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

    cluster_model = Birch(threshold=0.9,
                          branching_factor=20,
                          compute_sample_indices=True)
    cluster_model.fit(X)
    # assert len(cluster_model.root_.subclusters_[1].child_.subclusters_) == 3

    htree, n_subclusters = birch_hierarchy_wrapper(cluster_model)

    assert htree.tree_size == n_subclusters

    # same random seed as in the birch hierarchy example
    assert htree.tree_size == 78
    sc = htree.flatten()[example_id]
    if example_id == 34:
        # this is true in both cases, but example_id fails on CircleCI
        assert sc.current_depth == 1
        assert len(sc.children) == 3

    assert_array_equal([sc['cluster_id'] for sc in htree.flatten()],
                       np.arange(htree.tree_size))


def test_birch_clustering_single_nodes():
    basename = os.path.dirname(__file__)
    X = np.load(
        os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))
    branching_factor = 5

    mod = Birch(n_clusters=None,
                threshold=0.1,
                branching_factor=branching_factor,
                compute_labels=False,
                compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # let's compute cluster similarity
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X, row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # attribute access on each wrapped subcluster should not raise
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]

    # make sure that we have no clusters with a single child
    assert sum(len(el.children) == 1 for el in htree.flatten()) == 0


def test_birch_make_hierarchy(dataset, optimal_sampling):
    if dataset == 'random':
        np.random.seed(9999)

        X = np.random.rand(1000, 100)
        X = normalize(X)
        branching_factor = 10
    elif dataset == 'birch_hierarchical':
        basename = os.path.dirname(__file__)
        X = np.load(
            os.path.join(basename, '..', 'data', 'ds_lsi_birch', 'data.npy'))
        branching_factor = 2

    mod = Birch(n_clusters=None,
                threshold=0.1,
                branching_factor=branching_factor,
                compute_labels=False,
                compute_sample_indices=True)
    mod.fit(X)

    htree, n_subclusters = birch_hierarchy_wrapper(mod)

    # let's compute cluster similarity
    for row in htree.flatten():
        inertia, S_sim = centroid_similarity(X, row['document_id_accumulated'])
        row['document_similarity'] = S_sim
        row['cluster_similarity'] = inertia

    assert htree.tree_size == n_subclusters

    doc_count = 0
    for el in htree.flatten():
        doc_count += len(el['document_id'])
        # attribute access on each wrapped subcluster should not raise
        el.current_depth
        el.document_id_accumulated
    assert doc_count == len(htree['document_id_accumulated'])
    assert doc_count == X.shape[0]
    assert htree.document_count == X.shape[0]
    if optimal_sampling:
        s_samples_1 = compute_optimal_sampling(htree,
                                               min_similarity=0.85,
                                               min_coverage=0.9)

        for row in s_samples_1:
            assert len(row['document_similarity']) == 1
            assert len(row['document_id_accumulated']) == 1
        s_samples_2 = compute_optimal_sampling(htree,
                                               min_similarity=0.85,
                                               min_coverage=0.2)
        s_samples_3 = compute_optimal_sampling(htree,
                                               min_similarity=0.9,
                                               min_coverage=0.9)

        assert len(s_samples_1) > len(s_samples_2)
        assert len(s_samples_1) < len(s_samples_3)
Example #4
    def __init__(self,
                 threshold=0.5,
                 branching_factor=20,
                 n_clusters=None,
                 outlier_threshold=0.5):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        self.Birch_clusterer = Birch(threshold=self.threshold,
                                     branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,
                                     compute_sample_indices=True)
Example #5
    def birch(self, n_clusters=None, threshold=0.5, branching_factor=50,
              max_tree_depth=None):
        """
        Perform Birch clustering

        Parameters
        ----------
        n_clusters : int or None
            number of clusters; if None, the full hierarchical tree is kept
        threshold : float
            BIRCH threshold
        branching_factor : int
            BIRCH branching factor
        max_tree_depth : {int, None}
            maximum depth of the hierarchical tree
        """
        pars = {'threshold': threshold, 'is_hierarchical': n_clusters is None,
                'max_tree_depth': max_tree_depth, "metric": self.metric}
        if 'lsi' not in self.pipeline:
            raise ValueError("you must use lsi with birch clustering "
                             "for scaling reasons.")

        if n_clusters is None:
            compute_labels = False
        else:
            compute_labels = True

        km = Birch(n_clusters=n_clusters, threshold=threshold,
                   branching_factor=branching_factor,
                   compute_labels=compute_labels,
                   compute_sample_indices=True)

        return self._cluster_func(n_clusters, km, pars)
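
The important detail above is that ``n_clusters=None`` keeps the full BIRCH hierarchy and skips flat-label computation. Below is a minimal standalone sketch of the same estimator setup outside the wrapper method, on hypothetical toy data; it uses only the freediscovery imports already shown elsewhere in this listing:

from sklearn.datasets import make_blobs
from freediscovery.cluster import Birch, birch_hierarchy_wrapper

X, _ = make_blobs(n_samples=200, n_features=8, random_state=0)

n_clusters = None  # None -> keep the full hierarchical tree, no flat labels
km = Birch(n_clusters=n_clusters, threshold=0.5, branching_factor=50,
           compute_labels=n_clusters is not None,
           compute_sample_indices=True)
km.fit(X)

htree, n_subclusters = birch_hierarchy_wrapper(km)
print('Total number of subclusters:', n_subclusters)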
Example #6
"""
Building the cluster hierarchy
------------------------------

We start by computing BIRCH clustering on some random structured data.
"""

import numpy as np
from sklearn.datasets import make_blobs
from freediscovery.cluster import Birch, birch_hierarchy_wrapper

rng = np.random.RandomState(42)

X, y = make_blobs(n_samples=1000, n_features=10, random_state=rng)

cluster_model = Birch(threshold=0.9,
                      branching_factor=20,
                      compute_sample_indices=True)
cluster_model.fit(X)

###############################################################################
#
# Next we wrap each subcluster in the cluster hierarchy
# (``cluster_model.root_``) with the
# :class:`~freediscovery.cluster.BirchSubcluster` class
# that allows easier manipulation of the hierarchical tree.

htree, _ = birch_hierarchy_wrapper(cluster_model)
print('Total number of subclusters:', htree.tree_size)

###############################################################################
#
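# For instance, the wrapped subclusters can then be walked directly. A minimal
# sketch (it only uses attributes exercised in the tests earlier in this
# listing; the printed layout is illustrative, not part of the API):

for sc in htree.flatten():
    # each wrapped subcluster exposes its id, depth and children
    print('  ' * sc.current_depth,
          'cluster_id=%d' % sc['cluster_id'],
          'n_children=%d' % len(sc.children))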
Example #7
import numpy as np
from scipy.spatial.distance import euclidean
from sklearn import metrics
from freediscovery.cluster import Birch, birch_hierarchy_wrapper
# NOTE: this snippet also relies on a ``bcluster`` node class defined
# elsewhere in the same project (not shown in this listing).


class birch(object):
    def __init__(self,
                 threshold=0.5,
                 branching_factor=20,
                 n_clusters=None,
                 outlier_threshold=0.5):
        self.threshold = threshold
        self.branching_factor = branching_factor
        self.n_clusters = n_clusters
        self.outlier_threshold = outlier_threshold
        self.Birch_clusterer = Birch(threshold=self.threshold,
                                     branching_factor=self.branching_factor,
                                     n_clusters=self.n_clusters,
                                     compute_sample_indices=True)

    # Fitting the model with train_X
    def fit(self, data):
        self.data = data
        #self.data.drop(self.data.columns[len(self.data.columns)-1], axis=1, inplace=True)
        self.Birch_clusterer.fit(self.data)

    #Defines and builds the Cluster Feature Tree
    def get_cluster_tree(self):
        self.htree, n_clusters = birch_hierarchy_wrapper(self.Birch_clusterer)
        clusters = {}
        max_depth = 0
        for i in range(n_clusters):
            node = bcluster()
            sub_cluster = self.htree.flatten()[i]
            node.set_cluster_id(sub_cluster['cluster_id'])
            depth = sub_cluster.current_depth
            node.set_depth(depth)
            if depth > max_depth:
                max_depth = depth
            if i not in clusters.keys():
                clusters[i] = {}
            if sub_cluster.current_depth == 0:
                node.set_parent()
            else:
                node.set_parent(clusters[sub_cluster.parent['cluster_id']])
            cluster_size = sub_cluster['cluster_size']
            node.set_size(cluster_size)
            data_points = sub_cluster['document_id_accumulated']
            data_points_names = self.data.iloc[
                data_points].index.values.tolist()
            node.set_data_points(data_points_names)
            centroid = self.data.iloc[
                sub_cluster['document_id_accumulated'], :].mean(axis=0).values
            node.set_centroid(centroid)
            d1, d1_v = self.calculate_d1(centroid, data_points)
            d2 = self.calculate_d2(centroid, data_points, d1_v)
            node.add_d1(d1)
            node.add_d2(d2)
            node.calculate_threshold(self.outlier_threshold)
            clusters[i] = node
            self.cluster_tree = clusters
        return self.cluster_tree, max_depth

    # Calculates the d1 distance (distance to the point farthest from the centroid)
    def calculate_d1(self, centroid, data_points):
        d1 = 0
        u = centroid
        d1_v = None
        for point in data_points:
            # data_points holds row indices, so look the actual vector up first
            v = self.data.iloc[point].values
            distance = euclidean(u, v)
            if distance > d1:
                d1 = distance
                d1_v = v
        return d1, d1_v

    # Calculates the d2 distance (distance from the centroid to the point
    # farthest away from the d1 point)
    def calculate_d2(self, centroid, data_points, d1_v):
        d2_d1 = 0
        u = d1_v
        d2_v = None
        for point in data_points:
            # data_points holds row indices, so look the actual vector up first
            v = self.data.iloc[point].values
            distance = euclidean(u, v)
            if distance > d2_d1:
                d2_d1 = distance
                d2_v = v
        d2 = euclidean(centroid, d2_v)
        return d2

    # Displays the tree
    def show_clutser_tree(self):
        self.htree.display_tree()

    # Depth-based prediction: assigns each test row to the closest cluster
    # centroid at the given depth (outlier detection is left commented out)
    def predict(self, test_X, depth):
        predicted = []
        for test_instance in test_X.iterrows():
            test_sample = test_instance[1].values
            min_distance = float('inf')
            selected_cluster = None
            for cluster_id in self.cluster_tree:
                if self.cluster_tree[cluster_id].depth != depth:
                    continue
                u = self.cluster_tree[cluster_id].centroid
                v = np.asarray(test_sample, dtype='float64')
                distance = euclidean(u, v)
                if distance < min_distance:
                    min_distance = distance
                    selected_cluster = cluster_id
            self.cluster_tree[selected_cluster].add_test_points(
                test_instance[0])
            # Outlier identifier
            #if self.cluster_tree[selected_cluster].check_outlier(min_distance):
            #    self.cluster_tree[selected_cluster].add_outlier_points(test_instance[0])
            #_predicted_label = self.cluster_tree[selected_cluster].classifier.predict([test_sample])
            #self.cluster_tree[selected_cluster].add_predicted(_predicted_label)
            predicted.append(selected_cluster)
        return predicted

    # Model certification: per-cluster precision/recall/F1 on the test data
    def certify_model(self, cluster_tree, test_y):
        for cluster_id in cluster_tree:
            if len(cluster_tree[cluster_id].test_points) == 0:
                continue
            cluster_tree[cluster_id].set_test_labels(
                test_y[cluster_tree[cluster_id].test_points].values)
            precision = metrics.precision_score(
                cluster_tree[cluster_id].test_labels,
                cluster_tree[cluster_id].predicted,
                average='weighted')
            recall = metrics.recall_score(cluster_tree[cluster_id].test_labels,
                                          cluster_tree[cluster_id].predicted,
                                          average='weighted')
            f1_Score = metrics.f1_score(cluster_tree[cluster_id].test_labels,
                                        cluster_tree[cluster_id].predicted,
                                        average='weighted')
            score = {
                'precision': precision,
                'recall': recall,
                'f1_Score': f1_Score
            }
            cluster_tree[cluster_id].set_score(score)
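
A hypothetical usage sketch of the class above (it assumes ``train_X`` and ``test_X`` are pandas DataFrames, since the class indexes rows with ``.iloc``/``.index``, and that the ``bcluster`` helper is available):

import pandas as pd
from sklearn.datasets import make_blobs

X_arr, _ = make_blobs(n_samples=300, n_features=5, random_state=0)
train_X = pd.DataFrame(X_arr[:200])
test_X = pd.DataFrame(X_arr[200:], index=range(200, 300))

model = birch(threshold=0.5, branching_factor=20)
model.fit(train_X)

cluster_tree, max_depth = model.get_cluster_tree()
model.show_clutser_tree()

# assign each test row to the closest cluster centroid at the given depth
predicted = model.predict(test_X, depth=max_depth)

Note that ``certify_model`` additionally expects per-cluster predictions from the commented-out classifier step, so it is not exercised in this sketch.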


def test_birch_hierarchy_fitted():
    model = Birch()

    with pytest.raises(NotFittedError):
        birch_hierarchy_wrapper(model)