Exemple #1
0
 def test_build(self):
     """Check the layer count after successive builds and the first graph's population."""
     manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(1))
     self.assertEqual(2, len(manifold.layers))

     # Building again with a deeper limit should add exactly one more layer.
     manifold.build(criterion.MaxDepth(2))
     self.assertEqual(3, len(manifold.layers))

     expected_population = len(self.data)
     self.assertEqual(expected_population, manifold.graphs[0].population)
Exemple #2
0
 def test_partition_backends(self):
     """Check that `_partition_single` and `_partition_threaded` give equal results."""
     data, _ = datasets.random(n=100, dimensions=5)

     via_single = Manifold(data, 'euclidean')._partition_single(
         [criterion.MaxDepth(5)])
     via_threads = Manifold(data, 'euclidean')._partition_threaded(
         [criterion.MaxDepth(5)])

     self.assertEqual(via_single, via_threads)
Exemple #3
0
 def test_random_no_limits(self):
     """Build on random data with no criteria and expect singleton clusters."""
     data = np.random.randn(100, 3)
     num_points = data.shape[0]
     query = data[0]

     # No criteria are passed, so nothing limits the build.
     manifold = Manifold(data, 'euclidean').build()

     # Every point should end up in its own cluster.
     self.assertEqual(num_points, manifold.graph.population)
     self.assertEqual(1, len(manifold.find_clusters(query, 0., -1)))
     self.assertEqual(1, len(manifold.find_points(query, 0.)))
     self.assertEqual(num_points, manifold.layers[-1].cardinality)
Exemple #4
0
    def test_eq(self):
        """Check manifold equality against itself and against differently configured manifolds."""
        reference = self.manifold
        self.assertEqual(reference, reference)

        # Same data and metric, but built from a 0.2 fraction of argpoints.
        subsampled = Manifold(self.data, 'euclidean', argpoints=0.2).build(
            criterion.MaxDepth(10),
            criterion.LFDRange(60, 50),
        )
        self.assertNotEqual(reference, subsampled)
        self.assertEqual(subsampled, subsampled)

        # Same data, different metric.
        cosine = Manifold(self.data, 'cosine')
        self.assertNotEqual(reference, cosine)
Exemple #5
0
 def test_random_large(self):
     """Compare `find_points` against a linear search on 1000 random points.

     Ten query points are drawn at random from the data set. For each one,
     the number of hits from the manifold search must equal the number of
     hits from `linear_search` with the same radius and metric.
     """
     data = np.random.randn(1000, 3)
     manifold = Manifold(data, 'euclidean').build(
         criterion.MaxDepth(10),
         criterion.LFDRange(60, 50),
     )
     search_radius = 0.5
     for _ in range(10):
         # Sample the query index from all points; sampling from 3 (the
         # dimensionality) only ever exercised the first three points.
         point = int(np.random.choice(data.shape[0]))
         linear_results = linear_search(data[point], search_radius, data,
                                        manifold.metric)
         self.assertEqual(
             len(linear_results),
             len(manifold.find_points(data[point], search_radius)))
Exemple #6
0
def depth_distribution(shape: str, filename: str):
    """Plot a histogram of the depths of the clusters in the last layer.

    `shape` selects a generator from `SHAPES`; the title and `filename` are
    forwarded to `hist_plot`.
    """
    points = SHAPES[shape](num_points=1000).T
    manifold: Manifold = Manifold(points, 'euclidean').build()

    depths: List[int] = []
    for leaf in manifold.layers[-1].clusters:
        depths.append(leaf.depth)

    title = f'depths of leaves for {shape}'
    hist_plot(depths, manifold.depth, False, 'depths', title, filename)
Exemple #7
0
    def test_replace(self):
        """Repeatedly swap sampled clusters for their children and check the graph.

        Up to ten rounds are run. Each round samples a tenth of the graph's
        clusters, replaces those that have children with their children, and
        then checks that:

        1. none of the removed clusters remain in the graph,
        2. all added clusters are in the graph,
        3. no edge still mentions a removed cluster,
        4. the graph cache is empty.
        """
        manifold = Manifold(self.data, 'euclidean').build(
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
        )
        graph = manifold.layers[-1].build_edges()

        for i in range(10):
            indexed: Dict[int, Cluster] = dict(zip(range(graph.cardinality), graph.clusters))
            if len(indexed) < 10:
                # Too few clusters left to sample a tenth of them.
                break
            sample_size = len(indexed) // 10
            samples: List[int] = [
                int(s) for s in np.random.choice(graph.cardinality, size=sample_size, replace=False)
            ]
            # Only clusters that have children can be replaced by them.
            removals: Set[Cluster] = {indexed[c] for c in samples if indexed[c].children}
            additions: Set[Cluster] = set()
            for cluster in removals:
                additions.update(cluster.children)

            graph.replace_clusters(
                removals=removals,
                additions=additions,
            )

            remaining: Set[Cluster] = set(graph.clusters)

            self.assertEqual(0, len(removals.intersection(remaining)), f'\n1. Some removals clusters were still in the graph. iter {i}')
            self.assertTrue(additions.issubset(remaining), f'\n2. Some additions clusters were not in the graph. iter {i}')

            removal_edges: Set[Edge] = {edge for cluster in removals for edge in graph.edges if cluster in edge}
            self.assertEqual(0, len(removal_edges), f'\n3. Some removals clusters were still found among edges. iter {i}')

            self.assertEqual(0, len(graph.cache), f'\n4. Graph cache had some elements. {list(graph.cache.keys())}. iter {i}')
Exemple #8
0
 def test_argsamples(self):
     """Check that a root cluster has at least `i + 1` argsamples after each appended row."""
     data = np.zeros((100, 100))
     for i in range(10):
         # Append one more constant row whose value is the loop index.
         new_row = np.ones((1, 100)) * i
         data = np.concatenate([data, new_row], axis=0)

         manifold = Manifold(data, 'euclidean')
         root = Cluster(manifold, manifold.argpoints, '')
         self.assertLessEqual(i + 1, len(root.argsamples))
Exemple #9
0
 def setUpClass(cls) -> None:
     """Create random data and labels, and a manifold built on that data, shared by the class."""
     cls.data, cls.labels = datasets.random(n=1000, dimensions=3)
     cls.manifold = Manifold(cls.data, 'euclidean')

     criteria = (criterion.MaxDepth(8), criterion.LFDRange(60, 50))
     cls.manifold.build(*criteria)
Exemple #10
0
    def test_pruned(self):
        """Check that the pruned graph is no larger and disjoint from what it subsumed."""
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(10), criterion.Layer(8))
        full_graph = manifold.graphs[0]
        pruned, subsumed_by = full_graph.pruned_graph

        self.assertLessEqual(pruned.cardinality, full_graph.cardinality)

        kept = set(pruned.clusters)
        # The mapping must be keyed by exactly the clusters kept in the pruned graph.
        self.assertSetEqual(kept, set(subsumed_by.keys()))
        # No subsumed cluster may also appear in the pruned graph.
        for subsumed in subsumed_by.values():
            self.assertEqual(0, len(subsumed.intersection(kept)))
Exemple #11
0
 def test_two_points_with_dups(self):
     """Build on two distinct points, each duplicated 500 times, and expect two clusters."""
     block = np.ones((500, 2))
     data = np.concatenate([block * -2, block * 2])

     manifold = Manifold(data, 'euclidean').build()

     # Building is expected to stop once the two groups are separated.
     self.assertEqual(
         2,
         manifold.graph.cardinality,
         f'Expected 2 clusters, got {manifold.graph.cardinality}',
     )
Exemple #12
0
    def test_init(self):
        """Check the `Manifold` constructor for valid and invalid argpoints values."""
        # A fresh manifold starts with a single layer.
        self.assertEqual(1, len(Manifold(self.data, 'euclidean').layers))

        # An explicit list of indices is kept as-is.
        explicit = Manifold(self.data, 'euclidean', [1, 2, 3])
        self.assertListEqual([1, 2, 3], explicit.argpoints)

        # A float selects that fraction of the data.
        fraction = 0.2
        sampled = Manifold(self.data, 'euclidean', fraction)
        self.assertEqual(int(len(self.data) * fraction), len(sampled.argpoints))

        # Non-numeric argpoints must be rejected.
        for bad_argpoints in (['a', 'b', 'c'], 'apples'):
            with self.assertRaises(ValueError):
                # noinspection PyTypeChecker
                Manifold(self.data, 'euclidean', bad_argpoints)
Exemple #13
0
def radii_distribution(shape: str, filename: str):
    """Plot a histogram of the positive cluster radii across all layers.

    `shape` selects a generator from `SHAPES`; the title and `filename` are
    forwarded to `hist_plot`.
    """
    points = SHAPES[shape](num_points=1000).T
    manifold: Manifold = Manifold(points, 'euclidean').build()

    radii: List[float] = []
    for layer in manifold.layers:
        for cluster in layer.clusters:
            # Clusters with a radius of zero or less are left out.
            if cluster.radius > 0:
                radii.append(cluster.radius)

    title = f'radii of clusters in tree for {shape}'
    hist_plot(radii, 32, True, 'radius', title, filename)
Exemple #14
0
 def setUpClass(cls) -> None:
     """Seed NumPy, then create bullseye data, a built manifold and its first graph for the class."""
     np.random.seed(42)
     cls.data, _ = datasets.bullseye(n=1000, num_rings=2)

     cls.manifold: Manifold = Manifold(cls.data, 'euclidean')
     criteria = (criterion.MaxDepth(10), criterion.Layer(5))
     cls.manifold.build(*criteria)

     cls.graph: Graph = cls.manifold.graphs[0]
Exemple #15
0
    def test_neighbors(self):
        """Compare each cluster's graph edges with neighbors found by brute force.

        Two clusters count as true neighbors when the distance from one
        cluster to the other's medoid is at most the sum of their radii. The
        edges reported by the graph must match that set exactly.
        """
        for dataset in [
                datasets.bullseye,
        ]:  # datasets.spiral_2d, datasets.tori, datasets.skewer, datasets.line]:
            data, _ = dataset()
            manifold = Manifold(data, 'euclidean').build(
                criterion.MaxDepth(12),
                criterion.Layer(8),
            )
            graph = manifold.graphs[0]

            for cluster in graph.clusters:
                others: List[Cluster] = [
                    c for c in graph.clusters if c.name != cluster.name
                ]
                distances: List[float] = list(
                    cluster.distance_from([c.argmedoid for c in others]))

                # Brute-force neighbors: medoid distance within the summed radii.
                expected = {}
                for other, distance in zip(others, distances):
                    if distance <= cluster.radius + other.radius:
                        expected[other] = distance

                # Neighbors according to the graph's edges.
                actual = {}
                for edge in graph.edges_from(cluster):
                    actual[edge.neighbor(cluster)] = edge.distance

                extras = set(actual.keys()) - set(expected.keys())
                extras_report = "\n".join([
                    f"{c.name}, {cluster.radius + c.radius:.6f}"
                    for c in extras
                ])
                self.assertEqual(
                    0,
                    len(extras),
                    msg=f'got extra neighbors: optimal, true {len(expected)}, actual {len(actual)}\n' + extras_report,
                )

                missed = set(expected.keys()) - set(actual.keys())
                missed_report = '\n'.join([
                    f'{c.name}, {cluster.radius + c.radius:.6f}'
                    for c in missed
                ])
                self.assertEqual(
                    0,
                    len(missed),
                    msg=f'missed some neighbors: optimal, true {len(expected)}, actual {len(actual)}\n' + missed_report,
                )
Exemple #16
0
    def test_build_tree(self):
        """Check the layer counts produced by successive `build_tree` calls."""
        manifold = Manifold(self.data, 'euclidean')
        self.assertEqual(1, len(manifold.layers))

        # Two added levels on top of the initial layer gives three layers.
        manifold.build_tree(criterion.AddLevels(2))
        self.assertEqual(3, len(manifold.layers))

        # The tree is already deeper than 1, so MaxDepth(1) should change nothing.
        manifold.build_tree(criterion.MaxDepth(1))
        self.assertEqual(3, len(manifold.layers))

        # With no criteria, the last layer should hold one cluster per point.
        manifold.build_tree()
        num_points = len(self.data)
        self.assertEqual(num_points, manifold.layers[-1].cardinality)
Exemple #17
0
def get_persistent_components(data: np.ndarray) -> np.ndarray:
    """Record, per layer, which component each pair of points shares.

    A manifold is built on `data` with no criteria. The result has one row per
    layer (`manifold.depth + 1` rows) and one column per unordered pair of
    points (`n * (n - 1) / 2` columns). For every pair of points that fall in
    the same component of a layer, the cell holds that component's 1-based
    index within the layer; all other cells stay 0.

    The column for a pair comes from `key(left, right)`, which is defined
    outside this function.

    :param data: the points to build the manifold from.
    :return: integer array of shape `(manifold.depth + 1, n * (n - 1) // 2)`.
    """
    from itertools import combinations

    manifold: Manifold = Manifold(data, 'euclidean').build()
    num_cells = (len(data) * (len(data) - 1)) // 2
    persistence_vectors: np.ndarray = np.zeros(
        shape=(manifold.depth + 1, num_cells),
        dtype=int,
    )
    for depth, layer in enumerate(manifold.layers):
        for i, component in enumerate(layer.components):
            # Gather the point indices of every cluster in this component.
            points: List[int] = [
                point for cluster in component for point in cluster.argpoints
            ]
            # combinations yields the same (earlier, later) pairs as slicing
            # points[j + 1:] did, without copying the list on each step.
            for left, right in combinations(points, 2):
                persistence_vectors[depth, key(left, right)] = i + 1
    return persistence_vectors
Exemple #18
0
    def test_dot_file(self):
        """Round-trip a graph through its dot string and compare clusters and edges."""
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MinRadius(0.2), criterion.Layer(7))
        original: Graph = manifold.graphs[0]
        old_clusters: Set[Cluster] = set(original.clusters)
        old_edges: Set[Edge] = set(original.edges)

        dot_string = manifold.graphs[0].as_dot_string('bullseye_d7')

        restored = original.from_dot_string(dot_string)
        new_clusters: Set[Cluster] = set(restored.clusters)
        new_edges: Set[Edge] = set(restored.edges)

        self.assertEqual(old_clusters, new_clusters, 'Found mismatch between old and new clusters.')
        self.assertEqual(old_edges, new_edges, 'Found mismatch between old and new edges.')
Exemple #19
0
    def test_jaccard(self):
        """Check jaccard indices between layers and between components of the last layer."""
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(5))

        # Every pair of layers, identical or not, is expected to score 1.
        for i, left in enumerate(manifold.layers):
            self.assertEqual(1, left.jaccard(left), 'identical graphs should have a jaccard index of 1.')
            for j, right in enumerate(manifold.layers):
                if i == j:
                    continue
                self.assertEqual(1, left.jaccard(right), 'different layers should have a jaccard index of 1.')

        # Keep deepening until the last layer has at least two components.
        while len(manifold.layers[-1].components) < 2:
            manifold.build(criterion.MaxDepth(manifold.depth + 1))

        deepest = manifold.layers[-1]
        for i, left in enumerate(deepest.components):
            self.assertEqual(1, left.jaccard(left), 'identical components should have a jaccard index of 1.')
            for j, right in enumerate(deepest.components):
                if i == j:
                    continue
                self.assertEqual(0, left.jaccard(right), 'different components should have a jaccard index of 0.')
def create_barcodes(
    data: np.array,
    *,
    normalize: bool = True,
    merge: Optional[int] = 4,
) -> Dict[int, Barcodes]:
    """Compute (birth, death) barcodes for the clusters of a tree built on `data`.

    The tree is built with `MaxDepth(20)`. Clusters are processed from a heap
    of `Code` entries, starting at the root. A cluster with children is born
    at its own radius; a cluster without children is born at zero. A child
    whose radius is not smaller than its parent's gets a zero-length barcode
    at the parent's radius and is not processed further.

    :param data: points to build the manifold from.
    :param normalize: when true, pass the barcodes through `_normalize` with
        the root's radius.
    :param merge: when not None, passed to `_merge_high_cardinalities`.
    :return: barcodes as grouped by `_group_by_cardinality`.
    """
    manifold: Manifold = Manifold(data, 'euclidean').build_tree(
        criterion.MaxDepth(20))
    barcodes: Barcodes = {}

    # The original author describes this as a heap with the highest radius on
    # top; the actual ordering comes from how Code instances compare.
    heap = [Code(manifold.root, manifold.root.radius)]
    heapq.heapify(heap)

    while heap:
        code: Code = heapq.heappop(heap)

        if not code.cluster.children:
            # No children: birth is at zero radius.
            code.set_birth(0.)
        else:
            code.set_birth(code.radius)
            # Exactly two children are expected; the unpacking enforces it.
            left, right = list(code.cluster.children)
            for child in (left, right):
                if child.radius >= code.radius:
                    # Still-born: birth and death are both the parent's radius.
                    barcodes[child] = (code.radius, code.radius)
                else:
                    heapq.heappush(heap, Code(child, code.radius))

        barcodes[code.cluster] = (code.birth, code.death)

    if normalize:
        barcodes = _normalize(manifold.root.radius, barcodes)

    grouped = _group_by_cardinality(barcodes)
    if merge is None:
        return grouped
    return _merge_high_cardinalities(merge, grouped)
Exemple #21
0
def volume_ratios(data: np.ndarray, filename: str) -> pd.DataFrame:
    """Load a table of cluster volume ratios from `filename`, or compute and save it.

    If `filename` exists it is read as CSV, missing values are replaced with
    empty strings, and the frame is returned without touching `data`.

    Otherwise a manifold is built on `data` with `MaxDepth(16)`. Each cluster's
    volume is taken as `radius ** 3`. The table has one row per cluster (in
    sorted order) and one column per depth. For a cluster and each graph
    deeper than it, the cell holds the cluster's volume divided by the summed
    volumes of that graph's clusters whose name starts with the cluster's
    name. The cell at the cluster's own depth is reset to 0. The table is
    written to `filename` with an extra `cluster_names` column.
    """
    # Reuse a previously written table when one exists.
    if os.path.exists(filename):
        cached_df = pd.read_csv(filename)
        cached_df.fillna('', inplace=True)
        return cached_df

    manifold = Manifold(data, 'euclidean').build(criterion.MaxDepth(16), )

    # Volume of every cluster in every layer.
    volumes: Dict[Cluster, float] = {}
    for layer in manifold.layers:
        for cluster in layer.clusters:
            volumes[cluster] = cluster.radius**3

    clusters: List[Cluster] = sorted(volumes)
    row_of: Dict[Cluster, int] = {c: i for i, c in enumerate(clusters)}

    # Seed each row with the cluster's own volume at its own depth.
    ratios = np.zeros(shape=(len(volumes), manifold.depth + 1),
                      dtype=np.float32)
    for c, i in row_of.items():
        ratios[i][c.depth] = c.radius**3

    # eps keeps the division finite when no matching clusters are found.
    tiny = np.finfo(np.float32).eps
    for graph in manifold.graphs:
        for cluster in graph.clusters:
            row = row_of[cluster]
            for g in manifold.graphs[cluster.depth + 1:]:
                total_volume = sum(
                    volumes[c]
                    for c in g
                    if cluster.name == c.name[:cluster.depth]
                ) + tiny
                ratios[row][g.depth] = ratios[row][cluster.depth] / total_volume
            # The seed value was only needed as the numerator above.
            ratios[row][cluster.depth] = 0.

    volumes_df = pd.DataFrame(data=ratios)
    volumes_df['cluster_names'] = [cluster.name for cluster in clusters]
    volumes_df.to_csv(filename, index=False)
    return volumes_df
Exemple #22
0
    def test_load(self):
        """Dump the manifold to a temporary file, load it back, and compare."""
        original = self.manifold
        with TemporaryFile() as fp:
            original.dump(fp)
            fp.seek(0)  # rewind so load reads from the start
            loaded = Manifold.load(fp, self.data)

        self.assertEqual(original, loaded)
        self.assertEqual(set(original.layers[-1]), set(loaded.layers[-1]))
        self.assertEqual(original.graphs[0], loaded.graphs[0])

        # Every loaded cluster should come back with these cache entries.
        expected_cache_keys = (
            'radius',
            'argradius',
            'argsamples',
            'argmedoid',
            'local_fractal_dimension',
        )
        for layer in loaded.layers:
            for cluster in layer:
                for cache_key in expected_cache_keys:
                    self.assertIn(cache_key, cluster.cache)
Exemple #23
0
    def test_jaccard(self):
        """Check jaccard indices among last-layer clusters and with their parents."""
        manifold: Manifold = Manifold(self.data,
                                      'euclidean').build(criterion.MaxDepth(4))
        last_layer = manifold.layers[-1]

        for i, left in enumerate(last_layer.clusters):
            self.assertEqual(
                1,
                left.jaccard(left),
                'identical clusters should have a jaccard index of 1.',
            )

            for j, right in enumerate(last_layer.clusters):
                if i == j:
                    continue
                self.assertEqual(
                    0,
                    left.jaccard(right),
                    'different clusters should have a jaccard index of 0.',
                )

            expected_ratio = left.cardinality / left.parent.cardinality
            self.assertEqual(
                expected_ratio,
                left.jaccard(left.parent),
                'jaccard index with parent should be equal to child/parent cardinality ratio.',
            )
Exemple #24
0
 def test_all_same(self):
     """Build on 1000 identical points and expect a single layer holding all of them."""
     data = np.ones((1000, 3))
     manifold = Manifold(data, 'euclidean').build()

     # Identical points should never be split.
     self.assertEqual(1, len(manifold.layers))

     # An explicit build_tree call should not add layers either.
     manifold.build_tree()
     self.assertEqual(1, len(manifold.layers))

     query = np.asarray([1, 1, 1])
     self.assertEqual(1, len(manifold.find_clusters(query, 0.0, -1)))

     # A zero-radius search from any data point should return all 1000 points.
     self.assertEqual(1000, len(manifold.find_points(data[0], 0.0)))
Exemple #25
0
def build_data(
        method_function: Callable[[Graph], Dict[Cluster, float]],
        graph: Graph,
        manifold: Manifold,
        labels: numpy.ndarray,
):
    """Build training features and targets from per-cluster scores.

    `method_function(graph)` supplies a score per cluster. Each feature row is
    `manifold.cluster_ratios(cluster)`. Each target is `1 - loss`, where loss
    is the mean squared difference between the cluster's score and the labels
    of its points, divided by the cluster's cardinality.

    :return: `(train_x, train_y)`; `train_x` is float32 with six columns and
        both arrays have one entry per scored cluster.
    """
    cluster_scores = list(method_function(graph).items())
    num_clusters = len(cluster_scores)

    train_x = numpy.zeros(shape=(num_clusters, 6), dtype=numpy.float32)
    train_y = numpy.zeros(shape=(num_clusters,))

    for row, (cluster, score) in enumerate(cluster_scores):
        train_x[row] = manifold.cluster_ratios(cluster)

        y_true = numpy.asarray(labels[cluster.argpoints], dtype=numpy.float32)
        squared_error = numpy.square(score - y_true)

        # TODO: Why was this numpy.sum instead of float?
        loss = float(numpy.mean(squared_error)) / cluster.cardinality

        train_y[row] = 1. - loss

    return train_x, train_y
Exemple #26
0
class TestCriterion(unittest.TestCase):
    """Tests for the build criteria, each run on a fresh manifold over bullseye data."""

    @classmethod
    def setUpClass(cls) -> None:
        """Generate the bullseye data set once for the whole class."""
        cls.data = datasets.bullseye(n=500)[0]

    def setUp(self) -> None:
        """Give every test its own unbuilt manifold."""
        self.manifold = Manifold(self.data, 'euclidean')

    def _assert_one_ancestor_in_graph(self) -> None:
        """Assert that each last-layer cluster has exactly one ancestor in the graph."""
        graph_clusters = self.manifold.graph.clusters
        for leaf in self.manifold.layers[-1].clusters:
            ancestry = self.manifold.ancestry(leaf)
            included = sum(
                1 for ancestor in ancestry if ancestor in graph_clusters)
            self.assertEqual(
                1, included,
                f"expected exactly one ancestor to be in graph. Found {included}"
            )

    def test_min_radius(self):
        """Clusters with children exceed the minimum radius; childless ones do not."""
        min_radius = 0.1
        self.manifold.build(criterion.MinRadius(min_radius), )
        self.assertTrue(
            all(cluster.radius > min_radius
                for layer in self.manifold.layers
                for cluster in layer if cluster.children))
        self.assertTrue(
            all(cluster.radius <= min_radius
                for layer in self.manifold.layers
                for cluster in layer if not cluster.children))

    def test_combinations(self):
        """Clusters at either limit (MinPoints or MaxDepth) have at most one child."""
        min_points, max_depth = 10, 8
        self.manifold.build(criterion.MinPoints(min_points),
                            criterion.MaxDepth(max_depth))
        for layer in self.manifold.layers:
            for cluster in layer:
                at_limit = (len(cluster.argpoints) <= min_points
                            or cluster.depth >= max_depth)
                if at_limit:
                    self.assertLessEqual(len(cluster.children), 1)

    def test_medoid_near_centroid(self):
        """Smoke test: building with MedoidNearCentroid must not raise."""
        self.manifold.build(criterion.MedoidNearCentroid(),
                            criterion.MaxDepth(8))

    def test_uniform_distribution(self):
        """Smoke test: building with UniformDistribution must not raise."""
        self.manifold.build(criterion.UniformDistribution(),
                            criterion.MaxDepth(8))

    def test_lfd_range(self):
        """With LFDRange, every last-layer cluster has exactly one ancestor in the graph."""
        self.manifold.build(criterion.MaxDepth(12), criterion.LFDRange(60, 50))
        self._assert_one_ancestor_in_graph()

    def test_minimize_subsumed(self):
        """With MinimizeSubsumed, every last-layer cluster has exactly one ancestor in the graph."""
        fraction: float = 0.2

        self.manifold.build(
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
            criterion.MinimizeSubsumed(fraction),
        )
        self._assert_one_ancestor_in_graph()
Exemple #27
0
 def test_partition(self):
     """Check that partitioning the cluster selected by `''` on xor data yields more than one child."""
     data = datasets.xor()[0]
     manifold = Manifold(data, 'euclidean')

     root = manifold.select('')
     num_children = len(list(root.partition()))

     self.assertGreater(num_children, 1)
Exemple #28
0
 def setUpClass(cls) -> None:
     """Create 1000 random 100-dimensional points and an unbuilt manifold over them."""
     points = np.random.randn(1_000, 100)
     cls.data = points
     cls.manifold = Manifold(points, 'euclidean')
Exemple #29
0
 def setUp(self) -> None:
     """Give each test a fresh, unbuilt euclidean manifold over the shared data."""
     fresh_manifold = Manifold(self.data, 'euclidean')
     self.manifold = fresh_manifold