Ejemplo n.º 1
0
 def test_build(self):
     """Check the layer count after each build and the population of the first graph."""
     manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(1))
     layer_count = len(manifold.layers)
     self.assertEqual(2, layer_count)

     # Building again with a larger depth limit is expected to add one more layer.
     manifold.build(criterion.MaxDepth(2))
     layer_count = len(manifold.layers)
     self.assertEqual(3, layer_count)

     # The first graph should account for every point in the dataset.
     self.assertEqual(
         len(self.data),
         manifold.graphs[0].population,
     )
Ejemplo n.º 2
0
 def test_partition_backends(self):
     """The single and threaded partition backends should give equal results."""
     data = datasets.random(n=100, dimensions=5)[0]

     # Each backend gets its own fresh manifold over the same data.
     single_backend = Manifold(data, 'euclidean')
     m_single = single_backend._partition_single([criterion.MaxDepth(5)])

     threaded_backend = Manifold(data, 'euclidean')
     m_thread = threaded_backend._partition_threaded([criterion.MaxDepth(5)])

     self.assertEqual(m_single, m_thread)
Ejemplo n.º 3
0
    def test_replace(self):
        """Repeatedly swap sampled clusters for their children and check the graph.

        Up to ten rounds are run. In each round roughly a tenth of the graph's
        clusters are sampled; those that have children are removed and their
        children added through ``replace_clusters``. Afterwards the removed
        clusters must be gone from both the clusters and the edges, the added
        clusters must be present, and the graph cache must be empty.
        """
        manifold = Manifold(self.data, 'euclidean').build(
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
        )
        graph = manifold.layers[-1].build_edges()

        for i in range(10):
            # Index the clusters so that sampled integers can be mapped back to them.
            indexed: Dict[int, Cluster] = dict(zip(range(graph.cardinality), graph.clusters))
            if len(indexed) < 10:
                # Too few clusters left to draw a non-empty 10% sample.
                break
            sample_size = len(indexed) // 10
            samples: List[int] = [
                int(s) for s in np.random.choice(graph.cardinality, size=sample_size, replace=False)
            ]
            # Only clusters that have children can be replaced by them.
            removals: Set[Cluster] = {indexed[s] for s in samples if indexed[s].children}
            additions: Set[Cluster] = set()
            for cluster in removals:
                additions.update(cluster.children)

            graph.replace_clusters(
                removals=removals,
                additions=additions,
            )

            remaining: Set[Cluster] = set(graph.clusters)

            self.assertEqual(0, len(removals.intersection(remaining)), f'\n1. Some removals clusters were still in the graph. iter {i}')
            self.assertTrue(additions.issubset(remaining), f'\n2. Some additions clusters were not in the graph. iter {i}')

            removal_edges: Set[Edge] = {edge for cluster in removals for edge in graph.edges if cluster in edge}
            self.assertEqual(0, len(removal_edges), f'\n3. Some removals clusters were still found among edges. iter {i}')

            self.assertEqual(0, len(graph.cache), f'\n4. Graph cache had some elements. {[k for k in graph.cache.keys()]}. iter {i}')
Ejemplo n.º 4
0
    def test_jaccard(self):
        """Check Jaccard indices between layers and between components.

        Every layer is expected to score 1 against itself and against every
        other layer. Within the deepest layer, each component is expected to
        score 1 against itself and 0 against every other component.
        """
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(5))

        for i, left in enumerate(manifold.layers):
            self.assertEqual(1, left.jaccard(left), 'identical graphs should have a jaccard index of 1.')
            for j, right in enumerate(manifold.layers):
                if i == j:
                    continue
                self.assertEqual(1, left.jaccard(right), f'different layers should have a jaccard index of 1.')

        # Keep deepening the tree until the deepest layer has at least two components.
        while len(manifold.layers[-1].components) < 2:
            next_depth = manifold.depth + 1
            manifold.build(criterion.MaxDepth(next_depth))

        for i, left in enumerate(manifold.layers[-1].components):
            self.assertEqual(1, left.jaccard(left), 'identical components should have a jaccard index of 1.')
            for j, right in enumerate(manifold.layers[-1].components):
                if i == j:
                    continue
                self.assertEqual(0, left.jaccard(right), f'different components should have a jaccard index of 0.')
Ejemplo n.º 5
0
 def setUpClass(cls) -> None:
     """Create a random dataset and build the manifold shared by the tests."""
     points, labels = datasets.random(n=1000, dimensions=3)
     cls.data, cls.labels = points, labels

     cls.manifold = Manifold(cls.data, 'euclidean')
     cls.manifold.build(criterion.MaxDepth(8), criterion.LFDRange(60, 50))
Ejemplo n.º 6
0
 def setUpClass(cls) -> None:
     """Seed NumPy, generate a two-ring bullseye and build the shared manifold."""
     # Fixed seed so that the generated data is reproducible between runs.
     np.random.seed(42)

     bullseye = datasets.bullseye(n=1000, num_rings=2)
     cls.data, _ = bullseye

     cls.manifold = Manifold(cls.data, 'euclidean')
     cls.manifold.build(criterion.MaxDepth(10), criterion.LFDRange(60, 50))
Ejemplo n.º 7
0
    def test_pruned(self):
        """Check the pruned graph against the clusters it is reported to subsume.

        The pruned graph must not have more clusters than the original, its
        clusters must be exactly the keys of the subsumed-clusters mapping, and
        no cluster listed as subsumed may still be in the pruned graph.
        """
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(10), criterion.Layer(8))
        graph = manifold.graphs[0]
        pruned_graph, subsumed_clusters = graph.pruned_graph

        self.assertLessEqual(pruned_graph.cardinality, graph.cardinality)

        # Build the set once; it does not change while the assertions run.
        kept_clusters = set(pruned_graph.clusters)
        self.assertSetEqual(kept_clusters, set(subsumed_clusters.keys()))
        for subsumed in subsumed_clusters.values():
            self.assertEqual(0, len(subsumed.intersection(kept_clusters)))
Ejemplo n.º 8
0
 def setUpClass(cls) -> None:
     """Seed NumPy, build a manifold over a two-ring bullseye and keep its first graph."""
     # Fixed seed so that the generated data is reproducible between runs.
     np.random.seed(42)

     bullseye = datasets.bullseye(n=1000, num_rings=2)
     cls.data, _ = bullseye

     cls.manifold: Manifold = Manifold(cls.data, 'euclidean')
     cls.manifold.build(criterion.MaxDepth(10), criterion.Layer(5))

     cls.graph: Graph = cls.manifold.graphs[0]
Ejemplo n.º 9
0
    def test_neighbors(self):
        """Compare each cluster's graph edges with a brute-force overlap check.

        For every cluster in the first graph, the expected neighbors are the
        other clusters whose distance from this cluster (as reported by
        ``distance_from`` on their ``argmedoid``) is no larger than the sum of
        the two radii. The neighbors reachable through the graph's edges must
        match that set exactly: no extras and none missed.
        """
        # Only bullseye is exercised; the other generators
        # (spiral_2d, tori, skewer, line) are currently disabled.
        generators = [datasets.bullseye]

        for dataset in generators:
            data, labels = dataset()
            manifold = Manifold(data, 'euclidean').build(
                criterion.MaxDepth(12),
                criterion.Layer(8),
            )

            for cluster in manifold.graphs[0].clusters:
                candidates: List[Cluster] = [
                    other for other in manifold.graphs[0].clusters
                    if other.name != cluster.name
                ]
                candidate_centers: List[int] = [other.argmedoid for other in candidates]
                distances: List[float] = list(cluster.distance_from(candidate_centers))
                thresholds: List[float] = [cluster.radius + other.radius for other in candidates]

                # Brute force: a candidate is a neighbor when it lies within the summed radii.
                true_neighbors = {}
                for other, distance, threshold in zip(candidates, distances, thresholds):
                    if distance <= threshold:
                        true_neighbors[other] = distance

                # Neighbors as recorded by the graph's edges.
                neighbors = {}
                for edge in manifold.graphs[0].edges_from(cluster):
                    neighbors[edge.neighbor(cluster)] = edge.distance

                extras = set(neighbors) - set(true_neighbors)
                extras_report = "\n".join([
                    f"{c.name}, {cluster.radius + c.radius:.6f}"
                    for c in extras
                ])
                self.assertEqual(
                    0,
                    len(extras),
                    msg=f'got extra neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n' + extras_report,
                )

                missed = set(true_neighbors) - set(neighbors)
                missed_report = '\n'.join([
                    f'{c.name}, {cluster.radius + c.radius:.6f}'
                    for c in missed
                ])
                self.assertEqual(
                    0,
                    len(missed),
                    msg=f'missed some neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n' + missed_report,
                )
Ejemplo n.º 10
0
 def test_combinations(self):
     """Clusters that hit either stopping rule must have at most one child.

     The manifold is built with both a MinPoints and a MaxDepth criterion.
     Any cluster with no more than ``min_points`` points, or at depth
     ``max_depth`` or beyond, is checked.
     """
     min_points, max_depth = 10, 8
     self.manifold.build(criterion.MinPoints(min_points),
                         criterion.MaxDepth(max_depth))

     # A plain loop: the assertions are side effects, not values to collect.
     for layer in self.manifold.layers:
         for cluster in layer:
             if len(cluster.argpoints) <= min_points or cluster.depth >= max_depth:
                 self.assertLessEqual(len(cluster.children), 1)
     # self.plot()
Ejemplo n.º 11
0
    def test_eq(self):
        """Manifolds equal themselves and differ from differently configured ones."""
        shared = self.manifold
        self.assertEqual(shared, shared)

        # Same data and metric, but constructed with argpoints=0.2.
        subsampled = Manifold(self.data, 'euclidean', argpoints=0.2)
        subsampled = subsampled.build(criterion.MaxDepth(10), criterion.LFDRange(60, 50))
        self.assertNotEqual(self.manifold, subsampled)
        self.assertEqual(subsampled, subsampled)

        # Same data, different metric (and not built).
        cosine = Manifold(self.data, 'cosine')
        self.assertNotEqual(self.manifold, cosine)
Ejemplo n.º 12
0
    def test_lfd_range(self):
        """After an LFDRange build, each leaf's ancestry has exactly one cluster in the graph."""
        self.manifold.build(criterion.MaxDepth(12), criterion.LFDRange(60, 50))

        for leaf in self.manifold.layers[-1].clusters:
            # Count how many clusters on the leaf's ancestry were selected for the graph.
            included = 0
            for ancestor in self.manifold.ancestry(leaf):
                if ancestor in self.manifold.graph.clusters:
                    included += 1

            self.assertEqual(
                1,
                included,
                f"expected exactly one ancestor to be in graph. Found {included}",
            )
Ejemplo n.º 13
0
 def test_random_large(self):
     """Manifold range search must find as many points as a linear scan.

     Ten query points are drawn at random from a 1000 x 3 Gaussian dataset
     and, for each, the number of hits within radius 0.5 is compared between
     ``linear_search`` and ``Manifold.find_points``.
     """
     data = np.random.randn(1000, 3)
     manifold = Manifold(data, 'euclidean').build(
         criterion.MaxDepth(10),
         criterion.LFDRange(60, 50),
     )
     radius = 0.5
     for _ in range(10):
         # Sample the query index from the rows of the dataset. The previous
         # choice(3) used the number of columns, so only rows 0-2 were tested.
         point = int(np.random.choice(data.shape[0]))
         query = data[point]
         linear_results = linear_search(query, radius, data, manifold.metric)
         self.assertEqual(len(linear_results),
                          len(manifold.find_points(query, radius)))
Ejemplo n.º 14
0
    def test_build_tree(self):
        """Check how successive build_tree calls change the number of layers."""
        tree = Manifold(self.data, 'euclidean')
        # Before any building, a single layer is expected.
        layer_count = len(tree.layers)
        self.assertEqual(1, layer_count)

        tree.build_tree(criterion.AddLevels(2))
        layer_count = len(tree.layers)
        self.assertEqual(3, layer_count)

        # The tree is already deeper than 1, so MaxDepth(1) should leave it unchanged.
        tree.build_tree(criterion.MaxDepth(1))
        layer_count = len(tree.layers)
        self.assertEqual(3, layer_count)

        # With no criteria, the deepest layer is expected to have as many clusters as data points.
        tree.build_tree()
        self.assertEqual(
            len(self.data),
            tree.layers[-1].cardinality,
        )
def create_barcodes(
    data: np.ndarray,
    *,
    normalize: bool = True,
    merge: Optional[int] = 4,
) -> Dict[int, Barcodes]:
    """Build a cluster tree over ``data`` and derive a (birth, death) pair per cluster.

    The tree is built to a maximum depth of 20 with the euclidean metric.
    Clusters are popped from a heap; a cluster with children is born at its
    own radius, a cluster without children is born at zero. A child whose
    radius is not smaller than its parent's is recorded immediately with
    birth == death == parent radius; any other child is pushed onto the heap
    with the parent's radius as the second ``Code`` argument.

    :param data: dataset to build the manifold from.
    :param normalize: if True, pass the barcodes through ``_normalize`` together
        with the root radius.
    :param merge: if not None, passed to ``_merge_high_cardinalities`` along with
        the grouped barcodes.
    :return: the result of ``_group_by_cardinality``, optionally merged.
    :raises ValueError: if a cluster has children but not exactly two of them.
    """
    manifold: Manifold = Manifold(data, 'euclidean').build_tree(
        criterion.MaxDepth(20))
    barcodes: Barcodes = {}

    # Heap of clusters that still need a birth value.
    # NOTE(review): the original comment says the largest radius is on top;
    # heapq is a min-heap, so that presumably relies on Code's ordering - confirm.
    living_clusters = [Code(manifold.root, manifold.root.radius)]
    heapq.heapify(living_clusters)

    while living_clusters:
        current: Code = heapq.heappop(living_clusters)

        if current.cluster.children:
            current.set_birth(current.radius)
            # Unpacking enforces that there are exactly two children.
            left, right = list(current.cluster.children)

            for child in (left, right):
                if child.radius >= current.radius:
                    # Still-born: the child is no smaller than its parent.
                    barcodes[child] = (current.radius, current.radius)
                else:
                    heapq.heappush(living_clusters, Code(child, current.radius))
        else:
            # Clusters without children are born at zero radius.
            current.set_birth(0.)

        barcodes[current.cluster] = (current.birth, current.death)

    if normalize:
        barcodes = _normalize(manifold.root.radius, barcodes)

    barcodes_by_cardinality = _group_by_cardinality(barcodes)

    if merge is not None:
        barcodes_by_cardinality = _merge_high_cardinalities(
            merge, barcodes_by_cardinality)

    return barcodes_by_cardinality
Ejemplo n.º 16
0
def volume_ratios(data: np.ndarray, filename: str) -> pd.DataFrame:
    """Return a table of cluster volume ratios, using ``filename`` as a CSV cache.

    If ``filename`` already exists it is read back, missing values are replaced
    by empty strings, and ``data`` is not used. Otherwise a manifold is built
    over ``data`` (euclidean metric, maximum depth 16), the table is computed,
    written to ``filename`` and returned.

    A cluster's "volume" is taken to be the cube of its radius. Each row of
    the table belongs to one cluster (rows follow the sorted cluster order).
    For a graph ``g`` deeper than the cluster, column ``g.depth`` holds the
    cluster's volume divided by the summed volume of the clusters in ``g``
    whose name, truncated to ``cluster.depth`` characters, equals the
    cluster's name.

    :param data: dataset to build the manifold from when no cached file exists.
    :param filename: path of the CSV file used as a cache.
    :return: DataFrame of ratios; when freshly computed it also carries a
        ``cluster_names`` column.
    """
    if os.path.exists(filename):
        volumes_df = pd.read_csv(filename)
        volumes_df.fillna('', inplace=True)
    else:
        # Create manifold from data
        manifold = Manifold(data, 'euclidean').build(criterion.MaxDepth(16), )

        # get volumes of all clusters in the manifold
        volumes: Dict[Cluster, float] = {
            cluster: cluster.radius**3
            for layer in manifold.layers for cluster in layer.clusters
        }
        # Sorting fixes the row order of the table (relies on clusters being orderable).
        clusters: List[Cluster] = list(sorted(list(volumes.keys())))
        clusters_enumerations: Dict[Cluster, int] = {
            c: i
            for i, c in enumerate(clusters)
        }

        # Initialize table for volume ratios
        ratios = np.zeros(shape=(len(volumes), manifold.depth + 1),
                          dtype=np.float32)
        # Temporarily store each cluster's own volume in the column of its own
        # depth; it is read back as the numerator below and then zeroed.
        for c, i in clusters_enumerations.items():
            ratios[i][c.depth] = c.radius**3

        # populate table with correct ratios
        for graph in manifold.graphs:
            for cluster in graph.clusters:
                # Only graphs after index cluster.depth are considered.
                for g in manifold.graphs[cluster.depth + 1:]:
                    # Clusters of g whose name prefix matches this cluster's name.
                    children = [
                        c for c in g if cluster.name == c.name[:cluster.depth]
                    ]
                    # The float32 epsilon keeps the denominator non-zero.
                    total_volume = sum(
                        (volumes[c]
                         for c in children)) + np.finfo(np.float32).eps
                    ratios[clusters_enumerations[cluster]][
                        g.depth] = ratios[clusters_enumerations[cluster]][
                            cluster.depth] / total_volume
                # Clear the temporary numerator stored in the cluster's own column.
                # NOTE(review): if a cluster appeared in more than one graph, later
                # visits would divide this zero and overwrite its ratios - confirm
                # graphs are disjoint.
                ratios[clusters_enumerations[cluster]][cluster.depth] = 0.

        # write a .csv of ratios
        volumes_df = pd.DataFrame(data=ratios)
        volumes_df['cluster_names'] = [cluster.name for cluster in clusters]
        volumes_df.to_csv(filename, index=False)

    return volumes_df
Ejemplo n.º 17
0
    def build(self, *, max_depth: Optional[int] = None) -> 'Search':
        """ Builds the search tree up to the leaves, or up to an optional maximum depth.

        Calling this method again with a larger depth deepens the tree further.
        A ``max_depth`` that does not exceed the current depth leaves the tree unchanged.

        :param max_depth: optional maximum depth of the search tree; must be at least 1 when given.
        :return: the modified Search object.
        :raises ValueError: if ``max_depth`` is given and is less than 1.
        """
        # No limit requested: build with the Layer(-1) criterion alone.
        if max_depth is None:
            self.manifold.build(criterion.Layer(-1))
            return self

        if max_depth < 1:
            raise ValueError(
                f'Expected a positive integer for max_depth. Got {max_depth} instead.'
            )

        # Only extend the tree when the requested depth is beyond the current one.
        if max_depth > self.depth:
            self.manifold.build_tree(criterion.MaxDepth(max_depth),
                                     criterion.Layer(-1))
        return self
Ejemplo n.º 18
0
    def test_jaccard(self):
        """Check Jaccard indices between clusters of the deepest layer.

        Each cluster is expected to score 1 against itself, 0 against every
        other cluster in the same layer, and child/parent cardinality ratio
        against its parent.
        """
        manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(4))

        for i, left in enumerate(manifold.layers[-1].clusters):
            self.assertEqual(
                1,
                left.jaccard(left),
                'identical clusters should have a jaccard index of 1.',
            )

            for j, right in enumerate(manifold.layers[-1].clusters):
                if i == j:
                    continue
                self.assertEqual(
                    0,
                    left.jaccard(right),
                    f'different clusters should have a jaccard index of 0.',
                )

            expected_ratio = left.cardinality / left.parent.cardinality
            self.assertEqual(
                expected_ratio,
                left.jaccard(left.parent),
                f'jaccard index with parent should be equal to child/parent cardinality ratio.',
            )
Ejemplo n.º 19
0
    def test_find_knn(self):
        """find_knn must return the same k points as a brute-force distance sort.

        The query is the first point of a bullseye dataset. Values of k are
        0 through 9, then 10 onwards in steps of 1000 up to the dataset size.
        """
        data = datasets.bullseye()[0]
        point = data[0]

        # Brute force: (distance, index) pairs for every point, nearest first.
        distances = cdist(np.asarray([point]), data, 'euclidean')[0]
        points = sorted((d, p) for p, d in enumerate(distances))

        m = Manifold(data, 'euclidean')
        m.build_tree(criterion.MinPoints(10), criterion.MaxDepth(10))

        ks = [*range(10), *range(10, data.shape[0], 1000)]
        for k in ks:
            naive_results = {index for _, index in points[:k]}
            results = m.find_knn(point, k)
            self.assertEqual(k, len(results))
            found = {index for index, _ in results}
            self.assertSetEqual(naive_results, found)
Ejemplo n.º 20
0
    def test_minimize_subsumed(self):
        """With MinimizeSubsumed added, each leaf's ancestry still has exactly one cluster in the graph."""
        fraction: float = 0.2
        selection_criteria = (
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
            criterion.MinimizeSubsumed(fraction),
        )
        self.manifold.build(*selection_criteria)

        for leaf in self.manifold.layers[-1].clusters:
            # Count how many clusters on the leaf's ancestry were selected for the graph.
            included = 0
            for ancestor in self.manifold.ancestry(leaf):
                if ancestor in self.manifold.graph.clusters:
                    included += 1

            self.assertEqual(
                1,
                included,
                f"expected exactly one ancestor to be in graph. Found {included}",
            )
Ejemplo n.º 21
0
 def test_tree_search(self):
     """Compare tree_search with a linear overlap scan, then probe edge cases.

     For every cluster in every layer, a search centred on the cluster's
     medoid with the cluster's radius must not return anything the linear
     overlap scan of that layer misses, and the parents of the overlapping
     clusters must find themselves when searched at their own depth.
     Finally, a zero-radius search with depth -1 must return nothing and
     depth -5 must raise ValueError.
     """
     np.random.seed(42)
     data, labels = datasets.line()
     manifold = Manifold(data, 'euclidean')
     manifold.build_tree(criterion.MinPoints(10), criterion.MaxDepth(5))

     # Queries centred on medoids, i.e. points that are in the data.
     for depth, layer in enumerate(manifold.layers):
         for cluster in layer.clusters:
             linear = {
                 c for c in layer
                 if c.overlaps(cluster.medoid, cluster.radius)
             }
             first_cluster = next(iter(manifold.layers[0]))
             hits = first_cluster.tree_search(cluster.medoid, cluster.radius,
                                              cluster.depth)
             tree = set(hits.keys())
             # Everything the tree search returns must also be found linearly.
             self.assertSetEqual(set(), tree - linear)

             # NOTE(review): the loop variable is unused and `linear` never
             # changes, so every pass repeats the same checks (skipped at
             # depth 0) - confirm whether an ancestry walk was intended.
             for _ in range(depth, 0, -1):
                 parents = {
                     manifold.select(overlapping.name[:-1])
                     for overlapping in linear
                 }
                 for parent in parents:
                     results = parent.tree_search(cluster.medoid,
                                                  cluster.radius,
                                                  parent.depth)
                     self.assertIn(
                         parent,
                         results,
                         msg=f'\n{parent.name} not in {[c.name for c in results]}. '
                         f'got {len(results)} hits.',
                     )

     # Attempting to find points that *may* be in the data
     probe = np.asarray([0, 1])
     results = manifold.root.tree_search(point=probe, radius=0., depth=-1)
     self.assertEqual(0, len(results))

     with self.assertRaises(ValueError):
         _ = manifold.root.tree_search(point=np.asarray([0, 1]),
                                       radius=0.,
                                       depth=-5)
Ejemplo n.º 22
0
    def test_replace(self):
        """Repeatedly swap sampled clusters for their children and check the cached sets.

        Up to 100 rounds are run. In each round roughly a tenth of the graph's
        clusters are sampled; those that have children are removed and their
        children added through ``replace_clusters`` with
        ``recompute_probabilities=True``. Afterwards the cached subsumed and
        walkable clusters must be disjoint, must together equal the graph's
        clusters, and the keys of the cached edge mappings must match.
        """
        self.manifold.build(
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
        )
        self.manifold.layers[-1].build_edges()

        for i in range(100):
            # Index the clusters so that sampled integers can be mapped back to them.
            indexed: Dict[int, Cluster] = dict(
                zip(range(self.manifold.graph.cardinality),
                    self.manifold.graph.clusters))
            if len(indexed) < 10:
                # Too few clusters left to draw a non-empty 10% sample.
                break
            sample_size = len(indexed) // 10
            samples: List[int] = [
                int(s)
                for s in np.random.choice(self.manifold.graph.cardinality,
                                          size=sample_size,
                                          replace=False)
            ]
            # Only clusters that have children can be replaced by them.
            removals: Set[Cluster] = {
                indexed[s]
                for s in samples if indexed[s].children
            }
            additions: Set[Cluster] = set()
            for cluster in removals:
                additions.update(cluster.children)

            self.manifold.graph.replace_clusters(
                removals=removals,
                additions=additions,
                recompute_probabilities=True,
            )

            current_clusters: Set[Cluster] = set(self.manifold.graph.clusters)

            cache = self.manifold.graph.cache
            subsumed_clusters: Set[Cluster] = cache['subsumed_clusters']
            subsumed_edges: Dict[Cluster, Set[Edge]] = cache['subsumed_edges']

            walkable_clusters: Set[Cluster] = cache['walkable_clusters']
            walkable_edges: Dict[Cluster, Set[Edge]] = cache['walkable_edges']

            self.assertTrue(
                subsumed_clusters.issubset(current_clusters),
                f"\n1. subsumed clusters were not subset of clusters. iter: {i}"
            )
            self.assertTrue(
                walkable_clusters.issubset(current_clusters),
                f"\n2. walkable clusters were not subset of clusters. iter: {i}"
            )
            self.assertTrue(
                walkable_clusters.isdisjoint(subsumed_clusters),
                f"\n3. walkable clusters and subsumed clusters were not disjoint sets. iter: {i}"
            )
            self.assertSetEqual(
                current_clusters, subsumed_clusters.union(walkable_clusters),
                f"\n4. union of subsumed and walkable clusters was not the same as all clusters. iter: {i}"
            )
            self.assertSetEqual(
                current_clusters, set(subsumed_edges.keys()),
                f"\n5. keys in subsumed edges were not the same as clusters. iter: {i}"
            )
            self.assertSetEqual(
                walkable_clusters, set(walkable_edges.keys()),
                f"\n6. keys in walkable edges were not the same as walkable clusters. iter: {i}"
            )
Ejemplo n.º 23
0
 def test_medoid_near_centroid(self):
     """Build the shared manifold with MedoidNearCentroid and MaxDepth(8); passes if nothing raises."""
     stopping_rules = (
         criterion.MedoidNearCentroid(),
         criterion.MaxDepth(8),
     )
     self.manifold.build(*stopping_rules)
     # self.plot()
Ejemplo n.º 24
0
 def test_uniform_distribution(self):
     """Build the shared manifold with UniformDistribution and MaxDepth(8); passes if nothing raises."""
     stopping_rules = (
         criterion.UniformDistribution(),
         criterion.MaxDepth(8),
     )
     self.manifold.build(*stopping_rules)
     # self.plot()