def test_build(self):
    """Check the layer count after building to MaxDepth(1) and then MaxDepth(2)."""
    manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(1))
    self.assertEqual(2, len(manifold.layers))

    # Building again with a larger depth limit extends the same manifold.
    manifold.build(criterion.MaxDepth(2))
    self.assertEqual(3, len(manifold.layers))

    # The first graph's population is compared against the dataset size.
    expected_population = len(self.data)
    self.assertEqual(expected_population, manifold.graphs[0].population)
def test_partition_backends(self):
    """Compare the results of the single and threaded partition backends."""

    def fresh_criteria():
        # Each backend gets its own criterion instance, as separate literals would.
        return [criterion.MaxDepth(5)]

    points = datasets.random(n=100, dimensions=5)[0]
    single_result = Manifold(points, 'euclidean')._partition_single(fresh_criteria())
    threaded_result = Manifold(points, 'euclidean')._partition_threaded(fresh_criteria())
    self.assertEqual(single_result, threaded_result)
def test_replace(self):
    """Swap sampled graph clusters for their children and check the graph stays consistent.

    Up to ten rounds are run. Each round samples a tenth of the graph's clusters,
    replaces those that have children with their children, and then checks that
    the removed clusters are gone from both clusters and edges, that the added
    clusters are present, and that the graph cache is empty.
    """
    manifold = Manifold(self.data, 'euclidean').build(
        criterion.MaxDepth(12),
        criterion.LFDRange(80, 20),
    )
    graph = manifold.layers[-1].build_edges()

    for i in range(10):
        indexed: Dict[int, Cluster] = dict(zip(range(graph.cardinality), graph.clusters))
        if len(indexed) < 10:
            # Too few clusters left to sample a tenth of them.
            break

        sample_size = len(indexed) // 10
        chosen = np.random.choice(graph.cardinality, size=sample_size, replace=False)
        samples: List[int] = [int(index) for index in chosen]

        # Only clusters that have children can be replaced by them.
        removals: Set[Cluster] = {indexed[s] for s in samples if indexed[s].children}
        additions: Set[Cluster] = set()
        for cluster in removals:
            additions.update(cluster.children)

        graph.replace_clusters(
            removals=removals,
            additions=additions,
        )

        remaining: Set[Cluster] = set(graph.clusters)
        self.assertEqual(
            0,
            len(removals.intersection(remaining)),
            f'\n1. Some removals clusters were still in the graph. iter {i}',
        )
        self.assertTrue(
            additions.issubset(remaining),
            f'\n2. Some additions clusters were not in the graph. iter {i}',
        )

        removal_edges: Set[Edge] = {
            edge
            for cluster in removals
            for edge in graph.edges
            if cluster in edge
        }
        self.assertEqual(
            0,
            len(removal_edges),
            f'\n3. Some removals clusters were still found among edges. iter {i}',
        )
        self.assertEqual(
            0,
            len(graph.cache),
            f'\n4. Graph cache had some elements. {[k for k in graph.cache.keys()]}. iter {i}',
        )
def test_jaccard(self):
    """Check jaccard indices between layers, then between components of the last layer."""
    manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(5))

    for i, left in enumerate(manifold.layers):
        self.assertEqual(
            1,
            left.jaccard(left),
            'identical graphs should have a jaccard index of 1.',
        )
        for j, right in enumerate(manifold.layers):
            if i == j:
                continue
            self.assertEqual(
                1,
                left.jaccard(right),
                f'different layers should have a jaccard index of 1.',
            )

    # Keep deepening the tree until the last layer has at least two components.
    while len(manifold.layers[-1].components) < 2:
        manifold.build(criterion.MaxDepth(manifold.depth + 1))

    for i, left in enumerate(manifold.layers[-1].components):
        self.assertEqual(
            1,
            left.jaccard(left),
            'identical components should have a jaccard index of 1.',
        )
        for j, right in enumerate(manifold.layers[-1].components):
            if i == j:
                continue
            self.assertEqual(
                0,
                left.jaccard(right),
                f'different components should have a jaccard index of 0.',
            )
def setUpClass(cls) -> None:
    """Create the shared dataset, labels and built manifold used by the tests."""
    dataset = datasets.random(n=1000, dimensions=3)
    cls.data, cls.labels = dataset

    cls.manifold = Manifold(cls.data, 'euclidean')
    cls.manifold.build(criterion.MaxDepth(8), criterion.LFDRange(60, 50))
def setUpClass(cls) -> None:
    """Seed numpy, generate a two-ring bullseye dataset and build the shared manifold."""
    np.random.seed(42)

    bullseye = datasets.bullseye(n=1000, num_rings=2)
    cls.data, _ = bullseye

    cls.manifold = Manifold(cls.data, 'euclidean')
    cls.manifold.build(criterion.MaxDepth(10), criterion.LFDRange(60, 50))
def test_pruned(self):
    """Check the pruned graph against the mapping of subsumed clusters.

    The pruned graph may not have more clusters than the full graph, its
    clusters must be exactly the keys of the subsumed mapping, and no
    subsumed set may contain a cluster of the pruned graph.
    """
    manifold: Manifold = Manifold(self.data, 'euclidean').build(
        criterion.MaxDepth(10),
        criterion.Layer(8),
    )
    graph = manifold.graphs[0]
    pruned_graph, subsumed_clusters = graph.pruned_graph

    self.assertLessEqual(pruned_graph.cardinality, graph.cardinality)

    kept = set(pruned_graph.clusters)
    self.assertSetEqual(kept, set(subsumed_clusters.keys()))

    for subsumed in subsumed_clusters.values():
        overlap = subsumed.intersection(set(pruned_graph.clusters))
        self.assertEqual(0, len(overlap))
def setUpClass(cls) -> None:
    """Seed numpy, build a manifold over a two-ring bullseye and keep its first graph."""
    np.random.seed(42)

    bullseye = datasets.bullseye(n=1000, num_rings=2)
    cls.data, _ = bullseye

    cls.manifold: Manifold = Manifold(cls.data, 'euclidean')
    cls.manifold.build(criterion.MaxDepth(10), criterion.Layer(5))

    # Tests in this class work on the first graph of the manifold.
    cls.graph: Graph = cls.manifold.graphs[0]
def test_neighbors(self):
    """Compare each cluster's graph edges with neighbors found by checking every pair.

    Two clusters count as true neighbors when the distance from one cluster
    to the other's medoid is at most the sum of their radii. The edges of
    the graph must yield exactly that set: no extras and none missed.
    """
    # Other generators that could be enabled here:
    # datasets.spiral_2d, datasets.tori, datasets.skewer, datasets.line
    generators = [datasets.bullseye]

    for dataset in generators:
        data, _ = dataset()
        manifold = Manifold(data, 'euclidean').build(
            criterion.MaxDepth(12),
            criterion.Layer(8),
        )

        for cluster in manifold.graphs[0].clusters:
            candidates: List[Cluster] = [
                other
                for other in manifold.graphs[0].clusters
                if other.name != cluster.name
            ]
            argcenters: List[int] = [other.argmedoid for other in candidates]
            distances: List[float] = list(cluster.distance_from(argcenters))
            radii: List[float] = [cluster.radius + other.radius for other in candidates]

            true_neighbors = {
                other: distance
                for other, distance, reach in zip(candidates, distances, radii)
                if distance <= reach
            }
            neighbors = {
                edge.neighbor(cluster): edge.distance
                for edge in manifold.graphs[0].edges_from(cluster)
            }

            extras = set(neighbors) - set(true_neighbors)
            extras_listing = "\n".join(
                [f"{c.name}, {cluster.radius + c.radius:.6f}" for c in extras]
            )
            self.assertEqual(
                0,
                len(extras),
                msg=f'got extra neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n'
                    + extras_listing,
            )

            missed = set(true_neighbors) - set(neighbors)
            missed_listing = '\n'.join(
                [f'{c.name}, {cluster.radius + c.radius:.6f}' for c in missed]
            )
            self.assertEqual(
                0,
                len(missed),
                msg=f'missed some neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n'
                    + missed_listing,
            )
def test_combinations(self):
    """Build with MinPoints and MaxDepth together and check clusters at either limit.

    Every cluster that has at most ``min_points`` points, or sits at or
    beyond ``max_depth``, is asserted to have at most one child.
    """
    min_points, max_depth = 10, 8
    self.manifold.build(
        criterion.MinPoints(min_points),
        criterion.MaxDepth(max_depth),
    )

    # A plain loop: the assertions are run for their effect, not to build a list.
    for layer in self.manifold.layers:
        for cluster in layer:
            at_limit = len(cluster.argpoints) <= min_points or cluster.depth >= max_depth
            if at_limit:
                self.assertLessEqual(len(cluster.children), 1)
def test_eq(self):
    """Check manifold equality against itself, a subsampled build and a different metric."""
    self.assertEqual(self.manifold, self.manifold)

    # Same data and metric, but constructed with argpoints=0.2.
    subsampled = Manifold(self.data, 'euclidean', argpoints=0.2).build(
        criterion.MaxDepth(10),
        criterion.LFDRange(60, 50),
    )
    self.assertNotEqual(self.manifold, subsampled)
    self.assertEqual(subsampled, subsampled)

    # Same data under a different metric, left unbuilt.
    cosine = Manifold(self.data, 'cosine')
    self.assertNotEqual(self.manifold, cosine)
def test_lfd_range(self):
    """After an LFDRange build, each last-layer cluster must have exactly one ancestor in the graph."""
    self.manifold.build(criterion.MaxDepth(12), criterion.LFDRange(60, 50))

    for leaf in self.manifold.layers[-1].clusters:
        ancestry = self.manifold.ancestry(leaf)
        # Count how many clusters on the leaf's ancestry are present in the graph.
        included = sum(
            1 for ancestor in ancestry
            if ancestor in self.manifold.graph.clusters
        )
        self.assertEqual(
            1,
            included,
            f"expected exactly one ancestor to be in graph. Found {included}",
        )
def test_random_large(self):
    """Compare manifold range search with linear search on random 3-column data.

    Ten query points are drawn from the dataset; for each, the number of hits
    returned by ``find_points`` at radius 0.5 must match ``linear_search``.
    """
    data = np.random.randn(1000, 3)
    manifold = Manifold(data, 'euclidean').build(
        criterion.MaxDepth(10),
        criterion.LFDRange(60, 50),
    )

    for _ in range(10):
        # Draw the query index from all rows. Drawing from range(3), the number
        # of columns, only ever exercised the first three points.
        point = int(np.random.choice(data.shape[0]))
        query = data[point]

        linear_results = linear_search(query, 0.5, data, manifold.metric)
        self.assertEqual(
            len(linear_results),
            len(manifold.find_points(query, 0.5)),
        )
def test_build_tree(self):
    """Check layer counts across successive build_tree calls."""
    manifold = Manifold(self.data, 'euclidean')
    self.assertEqual(1, len(manifold.layers))

    manifold.build_tree(criterion.AddLevels(2))
    self.assertEqual(3, len(manifold.layers))

    # MaxDepth shouldn't do anything in build_tree if we're beyond that depth already.
    manifold.build_tree(criterion.MaxDepth(1))
    self.assertEqual(3, len(manifold.layers))

    # With no criteria, the last layer is expected to have one cluster per point.
    manifold.build_tree()
    deepest = manifold.layers[-1]
    self.assertEqual(len(self.data), deepest.cardinality)
def create_barcodes(
        data: np.array,
        *,
        normalize: bool = True,
        merge: Optional[int] = 4,
) -> Dict[int, Barcodes]:
    """Compute (birth, death) barcodes for the clusters of a tree built over ``data``.

    A tree is built with MaxDepth(20). Codes are popped from a heap one at a
    time, starting with the root. A popped code with children is born at its
    own radius; each child whose radius is not smaller than that is recorded
    immediately with equal birth and death, and every other child is pushed
    onto the heap. A popped code without children is born at radius zero.

    :param data: the dataset handed to Manifold.
    :param normalize: whether to pass the barcodes through ``_normalize``
        together with the root radius.
    :param merge: if not None, forwarded to ``_merge_high_cardinalities``
        along with the grouped barcodes.
    :return: the barcodes after grouping by cardinality (and merging, if requested).
    """
    manifold: Manifold = Manifold(data, 'euclidean').build_tree(
        criterion.MaxDepth(20))
    barcodes: Barcodes = {}

    # living-clusters is a heap with highest radius at the top
    living_clusters = [Code(manifold.root, manifold.root.radius)]
    heapq.heapify(living_clusters)

    while living_clusters:
        current: Code = heapq.heappop(living_clusters)

        if not current.cluster.children:
            # No children: the code is born at zero radius.
            current.set_birth(0.)
        else:
            current.set_birth(current.radius)
            left, right = list(current.cluster.children)
            for child in (left, right):
                if child.radius >= current.radius:
                    # The child is still-born: birth and death coincide.
                    barcodes[child] = (current.radius, current.radius)
                else:
                    # Otherwise the child joins the living clusters.
                    heapq.heappush(living_clusters, Code(child, current.radius))

        # Record the popped code itself.
        barcodes[current.cluster] = (current.birth, current.death)

    if normalize:
        barcodes = _normalize(manifold.root.radius, barcodes)

    grouped = _group_by_cardinality(barcodes)
    if merge is None:
        return grouped
    return _merge_high_cardinalities(merge, grouped)
def volume_ratios(data: np.ndarray, filename: str) -> pd.DataFrame:
    """Return the table of cluster volume ratios, reading it from ``filename`` when it exists.

    If the file exists it is read as CSV and missing values are replaced by
    empty strings. Otherwise a manifold is built to MaxDepth(16), the table
    is computed, written to ``filename`` as CSV and returned.

    A cluster's volume is taken to be ``radius ** 3``. Each row belongs to one
    cluster and each column to a depth: the entry at a deeper depth is the
    cluster's volume divided by the summed volume of the clusters at that
    depth whose name starts with the cluster's name.

    :param data: the dataset handed to Manifold; unused when the file exists.
    :param filename: path of the CSV file to read from or write to.
    :return: the data frame of ratios with an added ``cluster_names`` column.
    """
    if os.path.exists(filename):
        cached = pd.read_csv(filename)
        cached.fillna('', inplace=True)
        return cached

    # Create manifold from data
    manifold = Manifold(data, 'euclidean').build(criterion.MaxDepth(16), )

    # Volume of every cluster in every layer.
    volumes: Dict[Cluster, float] = {}
    for layer in manifold.layers:
        for cluster in layer.clusters:
            volumes[cluster] = cluster.radius**3

    clusters: List[Cluster] = sorted(volumes.keys())
    row_of: Dict[Cluster, int] = {
        cluster: row for row, cluster in enumerate(clusters)
    }

    # One row per cluster, one column per depth; start with each cluster's own volume.
    ratios = np.zeros(shape=(len(volumes), manifold.depth + 1), dtype=np.float32)
    for cluster, row in row_of.items():
        ratios[row][cluster.depth] = cluster.radius**3

    for graph in manifold.graphs:
        for cluster in graph.clusters:
            row = row_of[cluster]
            for deeper in manifold.graphs[cluster.depth + 1:]:
                descendants = [
                    c for c in deeper
                    if cluster.name == c.name[:cluster.depth]
                ]
                # The epsilon keeps the division defined when the sum is zero.
                total_volume = sum(
                    volumes[c] for c in descendants) + np.finfo(np.float32).eps
                ratios[row][deeper.depth] = ratios[row][cluster.depth] / total_volume
            # The cluster's own volume was only needed as the numerator above.
            ratios[row][cluster.depth] = 0.

    # write a .csv of ratios
    volumes_df = pd.DataFrame(data=ratios)
    volumes_df['cluster_names'] = [cluster.name for cluster in clusters]
    volumes_df.to_csv(filename, index=False)
    return volumes_df
def build(self, *, max_depth: Optional[int] = None) -> 'Search':
    """
    Builds the search tree up to the leaves, or up to an optional maximum depth.

    This method can be called repeatedly with higher depth values to make
    the tree deeper. A ``max_depth`` that does not exceed the current depth
    leaves the tree untouched.

    :param max_depth: optional maximum depth of search tree
    :return: the modified Search object.
    :raises ValueError: if max_depth is given and is smaller than 1.
    """
    if max_depth is None:
        # No limit requested.
        self.manifold.build(criterion.Layer(-1))
        return self

    if max_depth < 1:
        raise ValueError(
            f'Expected a positive integer for max_depth. Got {max_depth} instead.'
        )

    if max_depth > self.depth:
        self.manifold.build_tree(
            criterion.MaxDepth(max_depth),
            criterion.Layer(-1),
        )
    return self
def test_jaccard(self):
    """Check jaccard indices between last-layer clusters and against their parents."""
    manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(4))

    for i, left in enumerate(manifold.layers[-1].clusters):
        self.assertEqual(
            1,
            left.jaccard(left),
            'identical clusters should have a jaccard index of 1.',
        )

        for j, right in enumerate(manifold.layers[-1].clusters):
            if i == j:
                continue
            self.assertEqual(
                0,
                left.jaccard(right),
                f'different clusters should have a jaccard index of 0.',
            )

        # Against its parent, the index should equal the share of the parent's points.
        expected_ratio = left.cardinality / left.parent.cardinality
        self.assertEqual(
            expected_ratio,
            left.jaccard(left.parent),
            f'jaccard index with parent should be equal to child/parent cardinality ratio.',
        )
def test_find_knn(self):
    """Compare find_knn with a nearest-neighbor ranking computed directly with cdist."""
    data = datasets.bullseye()[0]
    point = data[0]

    # Rank every point by its distance to the query.
    distances = cdist(np.asarray([point]), data, 'euclidean')[0]
    points = sorted(
        (distance, index)
        for index, distance in zip(range(data.shape[0]), distances)
    )

    m = Manifold(data, 'euclidean')
    m.build_tree(criterion.MinPoints(10), criterion.MaxDepth(10))

    # Every k below 10, then every thousandth k from 10 up to the dataset size.
    ks = [*range(10), *range(10, data.shape[0], 1000)]
    for k in ks:
        naive_results = {index for _, index in points[:k]}
        results = m.find_knn(point, k)
        self.assertEqual(k, len(results))
        self.assertSetEqual(naive_results, {index for index, _ in results})
def test_minimize_subsumed(self):
    """After a MinimizeSubsumed build, each last-layer cluster must have exactly one ancestor in the graph."""
    fraction: float = 0.2
    self.manifold.build(
        criterion.MaxDepth(12),
        criterion.LFDRange(80, 20),
        criterion.MinimizeSubsumed(fraction),
    )

    for leaf in self.manifold.layers[-1].clusters:
        ancestry = self.manifold.ancestry(leaf)
        # Count how many clusters on the leaf's ancestry are present in the graph.
        included = sum(
            1 for ancestor in ancestry
            if ancestor in self.manifold.graph.clusters
        )
        self.assertEqual(
            1,
            included,
            f"expected exactly one ancestor to be in graph. Found {included}",
        )
def test_tree_search(self):
    """Check tree_search against a scan of each layer, plus two out-of-data queries."""
    np.random.seed(42)
    data, _ = datasets.line()
    manifold = Manifold(data, 'euclidean')
    manifold.build_tree(criterion.MinPoints(10), criterion.MaxDepth(5))

    # Finding points that are in data.
    for depth, layer in enumerate(manifold.layers):
        for cluster in layer.clusters:
            # Clusters of this layer that overlap the query ball, found by scanning.
            linear = {
                c for c in layer
                if c.overlaps(cluster.medoid, cluster.radius)
            }

            start = next(iter(manifold.layers[0]))
            hits = start.tree_search(cluster.medoid, cluster.radius, cluster.depth)
            tree = set(hits.keys())
            # The tree search must not report anything the scan did not find.
            self.assertSetEqual(set(), tree - linear)

            for _ in range(depth, 0, -1):
                parents = {
                    manifold.select(hit.name[:-1]) for hit in linear
                }
                for parent in parents:
                    results = parent.tree_search(
                        cluster.medoid,
                        cluster.radius,
                        parent.depth,
                    )
                    self.assertIn(
                        parent,
                        results,
                        msg=f'\n{parent.name} not in {[c.name for c in results]}. '
                            f'got {len(results)} hits.',
                    )

    # Attempting to find points that *may* be in the data
    results = manifold.root.tree_search(point=np.asarray([0, 1]), radius=0., depth=-1)
    self.assertEqual(0, len(results))

    with self.assertRaises(ValueError):
        _ = manifold.root.tree_search(point=np.asarray([0, 1]), radius=0., depth=-5)
def test_replace(self):
    """Swap sampled graph clusters for their children and check the cached partitions.

    Up to one hundred rounds are run. Each round samples a tenth of the
    graph's clusters, replaces those that have children with their children
    (recomputing probabilities), and then checks that the subsumed and
    walkable clusters held in the graph cache partition the graph's clusters
    and agree with the keys of the cached edge mappings.
    """
    self.manifold.build(
        criterion.MaxDepth(12),
        criterion.LFDRange(80, 20),
    )
    self.manifold.layers[-1].build_edges()

    for i in range(100):
        graph = self.manifold.graph

        indexed: Dict[int, Cluster] = dict(zip(range(graph.cardinality), graph.clusters))
        if len(indexed) < 10:
            # Too few clusters left to sample a tenth of them.
            break

        sample_size = len(indexed) // 10
        chosen = np.random.choice(graph.cardinality, size=sample_size, replace=False)
        samples: List[int] = [int(index) for index in chosen]

        # Only clusters that have children can be replaced by them.
        removals: Set[Cluster] = {indexed[s] for s in samples if indexed[s].children}
        additions: Set[Cluster] = set()
        for cluster in removals:
            additions.update(cluster.children)

        graph.replace_clusters(
            removals=removals,
            additions=additions,
            recompute_probabilities=True,
        )

        current: Set[Cluster] = set(graph.clusters)
        subsumed_clusters: Set[Cluster] = graph.cache['subsumed_clusters']
        subsumed_edges: Dict[Cluster, Set[Edge]] = graph.cache['subsumed_edges']
        walkable_clusters: Set[Cluster] = graph.cache['walkable_clusters']
        walkable_edges: Dict[Cluster, Set[Edge]] = graph.cache['walkable_edges']

        self.assertTrue(
            subsumed_clusters.issubset(current),
            f"\n1. subsumed clusters were not subset of clusters. iter: {i}",
        )
        self.assertTrue(
            walkable_clusters.issubset(current),
            f"\n2. walkable clusters were not subset of clusters. iter: {i}",
        )
        self.assertTrue(
            walkable_clusters.isdisjoint(subsumed_clusters),
            f"\n3. walkable clusters and subsumed clusters were not disjoint sets. iter: {i}",
        )
        self.assertSetEqual(
            current,
            subsumed_clusters.union(walkable_clusters),
            f"\n4. union of subsumed and walkable clusters was not the same as all clusters. iter: {i}",
        )
        self.assertSetEqual(
            current,
            set(subsumed_edges.keys()),
            f"\n5. keys in subsumed edges were not the same as clusters. iter: {i}",
        )
        self.assertSetEqual(
            walkable_clusters,
            set(walkable_edges.keys()),
            f"\n6. keys in walkable edges were not the same as walkable clusters. iter: {i}",
        )
def test_medoid_near_centroid(self):
    """Build the shared manifold with MedoidNearCentroid and MaxDepth(8); no assertions are made."""
    selection_criteria = (
        criterion.MedoidNearCentroid(),
        criterion.MaxDepth(8),
    )
    self.manifold.build(*selection_criteria)
    # self.plot()
def test_uniform_distribution(self):
    """Build the shared manifold with UniformDistribution and MaxDepth(8); no assertions are made."""
    selection_criteria = (
        criterion.UniformDistribution(),
        criterion.MaxDepth(8),
    )
    self.manifold.build(*selection_criteria)
    # self.plot()