def test_build(self):
    """Build to depth 1 and then depth 2, checking the layer count each time."""
    manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(1))
    self.assertEqual(2, len(manifold.layers))

    # Building again on the same manifold with a larger depth limit.
    manifold.build(criterion.MaxDepth(2))
    self.assertEqual(3, len(manifold.layers))

    # The first graph's population should match the number of data points.
    self.assertEqual(len(self.data), manifold.graphs[0].population)
def test_partition_backends(self):
    """The single and threaded partition backends should give equal results."""
    points = datasets.random(n=100, dimensions=5)[0]

    from_single = Manifold(points, 'euclidean')._partition_single(
        [criterion.MaxDepth(5)])
    from_threaded = Manifold(points, 'euclidean')._partition_threaded(
        [criterion.MaxDepth(5)])

    self.assertEqual(from_single, from_threaded)
def test_random_no_limits(self):
    """Build on random data without any criteria and check for singleton clusters."""
    # Random data, built with no constraints at all.
    points = np.random.randn(100, 3)
    built = Manifold(points, 'euclidean').build()

    # Without constraints every cluster is expected to hold a single point.
    self.assertEqual(points.shape[0], built.graph.population)

    hits_clusters = built.find_clusters(points[0], 0., -1)
    self.assertEqual(1, len(hits_clusters))

    hits_points = built.find_points(points[0], 0.)
    self.assertEqual(1, len(hits_points))

    self.assertEqual(points.shape[0], built.layers[-1].cardinality)
def test_eq(self):
    """Check manifold equality against itself and against differently-built manifolds."""
    self.assertEqual(self.manifold, self.manifold)

    # Same data and metric, but built from a 20% subsample of the points.
    subsampled = Manifold(self.data, 'euclidean', argpoints=0.2).build(
        criterion.MaxDepth(10),
        criterion.LFDRange(60, 50),
    )
    self.assertNotEqual(self.manifold, subsampled)
    self.assertEqual(subsampled, subsampled)

    # Same data, different metric, never built.
    cosine_manifold = Manifold(self.data, 'cosine')
    self.assertNotEqual(self.manifold, cosine_manifold)
def test_random_large(self):
    """Compare `find_points` against a brute-force linear search on random data.

    Ten query points are drawn at random from the data set; for each one the
    number of hits within radius 0.5 must agree between the manifold search
    and `linear_search`.
    """
    data = np.random.randn(1000, 3)
    manifold = Manifold(data, 'euclidean').build(
        criterion.MaxDepth(10),
        criterion.LFDRange(60, 50),
    )

    for _ in range(10):
        # Sample the query index from the number of points (rows), not from
        # the number of dimensions; the latter only ever exercised rows 0..2.
        point = int(np.random.choice(data.shape[0]))
        linear_results = linear_search(data[point], 0.5, data, manifold.metric)
        self.assertEqual(
            len(linear_results),
            len(manifold.find_points(data[point], 0.5)),
        )
def depth_distribution(shape: str, filename: str):
    """Collect the depth of every cluster in the deepest layer and hand it to `hist_plot`.

    :param shape: key into `SHAPES` selecting the data generator.
    :param filename: forwarded unchanged to `hist_plot`.
    """
    points = SHAPES[shape](num_points=10**3).T
    tree: Manifold = Manifold(points, 'euclidean').build()

    leaf_depths: List[int] = []
    for leaf in tree.layers[-1].clusters:
        leaf_depths.append(leaf.depth)

    hist_plot(
        leaf_depths,
        tree.depth,
        False,
        'depths',
        f'depths of leaves for {shape}',
        filename,
    )
def test_replace(self):
    """Repeatedly replace sampled clusters by their children and check graph invariants.

    In each of up to ten rounds, roughly a tenth of the graph's clusters are
    sampled; those that have children are removed and their children added.
    After each replacement the test checks that removed clusters are gone
    from the clusters and the edges, that the added clusters are present,
    and that the graph cache is empty.
    """
    manifold = Manifold(self.data, 'euclidean').build(
        criterion.MaxDepth(12),
        criterion.LFDRange(80, 20),
    )
    graph = manifold.layers[-1].build_edges()

    for i in range(10):
        indexed: Dict[int, Cluster] = {
            c: cluster
            for c, cluster in zip(range(graph.cardinality), graph.clusters)
        }
        # Too few clusters to draw a meaningful 10% sample from.
        if len(indexed) < 10:
            break

        sample_size = len(indexed) // 10
        samples: List[int] = list(map(
            int,
            np.random.choice(graph.cardinality, size=sample_size, replace=False),
        ))

        # Only clusters that actually have children can be replaced by them.
        removals: Set[Cluster] = {indexed[c] for c in samples if indexed[c].children}
        additions: Set[Cluster] = set()
        for cluster in removals:
            additions.update(cluster.children)

        graph.replace_clusters(
            removals=removals,
            additions=additions,
        )

        remaining: Set[Cluster] = set(graph.clusters)

        self.assertEqual(
            0, len(removals.intersection(remaining)),
            f'\n1. Some removals clusters were still in the graph. iter {i}')
        self.assertTrue(
            additions.issubset(remaining),
            f'\n2. Some additions clusters were not in the graph. iter {i}')

        removal_edges: Set[Edge] = {
            edge for cluster in removals for edge in graph.edges if cluster in edge
        }
        self.assertEqual(
            0, len(removal_edges),
            f'\n3. Some removals clusters were still found among edges. iter {i}')

        self.assertEqual(
            0, len(graph.cache),
            f'\n4. Graph cache had some elements. {list(graph.cache.keys())}. iter {i}')
def test_argsamples(self):
    """Check that the number of argsamples keeps up with the number of distinct rows."""
    # Start with 100 identical all-zero rows.
    points = np.zeros((100, 100))
    for value in range(10):
        # Append one constant row filled with `value`.
        extra_row = np.ones((1, 100)) * value
        points = np.concatenate([points, extra_row], axis=0)

        manifold = Manifold(points, 'euclidean')
        root = Cluster(manifold, manifold.argpoints, '')
        self.assertLessEqual(value + 1, len(root.argsamples))
def setUpClass(cls) -> None:
    """Generate random 3-d data once and build the manifold stored on the class."""
    generated = datasets.random(n=1000, dimensions=3)
    cls.data, cls.labels = generated

    cls.manifold = Manifold(cls.data, 'euclidean')
    build_criteria = (criterion.MaxDepth(8), criterion.LFDRange(60, 50))
    cls.manifold.build(*build_criteria)
def test_pruned(self):
    """Check the pruned graph against the graph it was derived from.

    The pruned graph must not have more clusters than the original, its
    clusters must be exactly the keys of the subsumed-clusters mapping, and
    no subsumed set may contain a cluster that is still in the pruned graph.
    """
    manifold: Manifold = Manifold(self.data, 'euclidean').build(
        criterion.MaxDepth(10),
        criterion.Layer(8),
    )
    graph = manifold.graphs[0]
    pruned_graph, subsumed_clusters = graph.pruned_graph

    self.assertLessEqual(pruned_graph.cardinality, graph.cardinality)
    self.assertSetEqual(set(pruned_graph.clusters), set(subsumed_clusters.keys()))

    for subsumed in subsumed_clusters.values():
        overlap = subsumed.intersection(set(pruned_graph.clusters))
        self.assertEqual(0, len(overlap))
def test_two_points_with_dups(self):
    """Two distinct locations, each duplicated 500 times, should yield two clusters."""
    # 500 copies of (-2, -2) followed by 500 copies of (2, 2).
    low_group = np.ones((500, 2)) * -2
    high_group = np.ones((500, 2)) * 2
    points = np.concatenate([low_group, high_group])

    built = Manifold(points, 'euclidean').build()

    # Building is expected to stop once the two groups are separated.
    found = built.graph.cardinality
    self.assertEqual(2, found, f'Expected 2 clusters, got {found}')
def test_init(self):
    """Exercise the `Manifold` constructor with valid and invalid `argpoints` values."""
    # A fresh manifold starts with a single layer.
    fresh = Manifold(self.data, 'euclidean')
    self.assertEqual(1, len(fresh.layers))

    # An explicit list of indices is kept as given.
    indexed = Manifold(self.data, 'euclidean', [1, 2, 3])
    self.assertListEqual([1, 2, 3], indexed.argpoints)

    # A float selects that fraction of the data.
    fraction = 0.2
    sampled = Manifold(self.data, 'euclidean', fraction)
    expected_count = int(len(self.data) * fraction)
    self.assertEqual(expected_count, len(sampled.argpoints))

    # Neither a list of strings nor a bare string is acceptable.
    for invalid in (['a', 'b', 'c'], 'apples'):
        with self.assertRaises(ValueError):
            # noinspection PyTypeChecker
            Manifold(self.data, 'euclidean', invalid)
def radii_distribution(shape: str, filename: str):
    """Collect every strictly positive cluster radius in the tree and hand it to `hist_plot`.

    :param shape: key into `SHAPES` selecting the data generator.
    :param filename: forwarded unchanged to `hist_plot`.
    """
    points = SHAPES[shape](num_points=10**3).T
    tree: Manifold = Manifold(points, 'euclidean').build()

    positive_radii: List[float] = []
    for layer in tree.layers:
        for cluster in layer.clusters:
            # Zero-radius clusters are left out of the histogram data.
            if cluster.radius > 0:
                positive_radii.append(cluster.radius)

    hist_plot(
        positive_radii,
        32,
        True,
        'radius',
        f'radii of clusters in tree for {shape}',
        filename,
    )
def setUpClass(cls) -> None:
    """Seed numpy, generate bullseye data, and build the manifold and graph stored on the class."""
    # Fixed seed so every run sees the same data.
    np.random.seed(42)

    bullseye = datasets.bullseye(n=1000, num_rings=2)
    cls.data = bullseye[0]

    cls.manifold: Manifold = Manifold(cls.data, 'euclidean')
    cls.manifold.build(criterion.MaxDepth(10), criterion.Layer(5))

    cls.graph: Graph = cls.manifold.graphs[0]
def test_neighbors(self):
    """Check graph edges against a brute-force neighbor computation.

    For every cluster in the first graph, another cluster counts as a true
    neighbor when the distance from this cluster to the other's medoid is at
    most the sum of the two radii. The neighbors reported by the graph's
    edges must match that set exactly: no extras and none missed.
    """
    for dataset in [
            datasets.bullseye,
    ]:  # datasets.spiral_2d, datasets.tori, datasets.skewer, datasets.line]:
        data, _ = dataset()
        manifold = Manifold(data, 'euclidean').build(
            criterion.MaxDepth(12),
            criterion.Layer(8),
        )
        graph = manifold.graphs[0]

        for cluster in graph.clusters:
            others: List[Cluster] = [
                c for c in graph.clusters if c.name != cluster.name
            ]
            medoids: List[int] = [c.argmedoid for c in others]
            distances: List[float] = list(cluster.distance_from(medoids))

            # Brute force: keep every candidate whose medoid is within the summed radii.
            true_neighbors = {}
            for candidate, distance in zip(others, distances):
                if distance <= cluster.radius + candidate.radius:
                    true_neighbors[candidate] = distance

            neighbors = {}
            for edge in graph.edges_from(cluster):
                neighbors[edge.neighbor(cluster)] = edge.distance

            extras = set(neighbors.keys()) - set(true_neighbors.keys())
            extras_report = "\n".join([
                f"{c.name}, {cluster.radius + c.radius:.6f}" for c in extras
            ])
            self.assertEqual(
                0,
                len(extras),
                msg=f'got extra neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n'
                + extras_report,
            )

            missed = set(true_neighbors.keys()) - set(neighbors.keys())
            missed_report = '\n'.join([
                f'{c.name}, {cluster.radius + c.radius:.6f}' for c in missed
            ])
            self.assertEqual(
                0,
                len(missed),
                msg=f'missed some neighbors: optimal, true {len(true_neighbors)}, actual {len(neighbors)}\n'
                + missed_report,
            )
def test_build_tree(self):
    """Grow the tree in stages and check the layer count after each stage."""
    tree = Manifold(self.data, 'euclidean')
    self.assertEqual(1, len(tree.layers))

    # Two added levels on top of the root layer.
    tree.build_tree(criterion.AddLevels(2))
    self.assertEqual(3, len(tree.layers))

    # MaxDepth shouldn't do anything in build_tree if we're beyond that depth already.
    tree.build_tree(criterion.MaxDepth(1))
    self.assertEqual(3, len(tree.layers))

    # With no criteria, the last layer should reach one cluster per point.
    tree.build_tree()
    deepest = tree.layers[-1]
    self.assertEqual(len(self.data), deepest.cardinality)
def get_persistent_components(data: np.array) -> np.array:
    """Record, per tree depth, which component each pair of points shares.

    A manifold is built on `data` with no criteria. The result has one row
    per layer and one column per unordered pair of points. For every
    component of a layer, each pair of points drawn from that component's
    clusters gets the component's 1-based index written at column
    `key(left, right)`; pairs that share no component at that depth stay 0.

    NOTE(review): `key` is defined outside this block; it is presumed to map
    a pair of point indices to a column in `[0, num_cells)` - confirm.

    :param data: the data set to build the manifold on.
    :return: integer array of shape `(manifold.depth + 1, n * (n - 1) // 2)`
        where `n = len(data)`.
    """
    manifold: Manifold = Manifold(data, 'euclidean').build()
    num_cells = (len(data) * (len(data) - 1)) // 2
    persistence_vectors: np.ndarray = np.zeros(
        shape=(manifold.depth + 1, num_cells),
        dtype=int,
    )

    for depth, layer in enumerate(manifold.layers):
        for i, component in enumerate(layer.components):
            # Flatten the point indices of every cluster in this component.
            points: List[int] = [
                point for cluster in component for point in cluster.argpoints
            ]
            # Visit each unordered pair of points exactly once.
            for j, left in enumerate(points):
                for right in points[j + 1:]:
                    persistence_vectors[depth, key(left, right)] = i + 1

    return persistence_vectors
def test_dot_file(self):
    """Round-trip a graph through its dot-string form and compare clusters and edges."""
    manifold: Manifold = Manifold(self.data, 'euclidean').build(
        criterion.MinRadius(0.2),
        criterion.Layer(7),
    )
    graph: Graph = manifold.graphs[0]

    # Snapshot the graph before serialising it.
    old_clusters: Set[Cluster] = set(graph.clusters)
    old_edges: Set[Edge] = set(graph.edges)

    dot_string = manifold.graphs[0].as_dot_string('bullseye_d7')
    graph = graph.from_dot_string(dot_string)

    # Snapshot the graph rebuilt from the dot string.
    new_clusters: Set[Cluster] = set(graph.clusters)
    new_edges: Set[Edge] = set(graph.edges)

    self.assertEqual(
        old_clusters, new_clusters,
        f'Found mismatch between old and new clusters.')
    self.assertEqual(
        old_edges, new_edges,
        f'Found mismatch between old and new edges.')
def test_jaccard(self):
    """Check jaccard indices between layers and between components of the last layer."""
    manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(5))

    # Every pair of layers, including a layer with itself, is expected to score 1.
    for row, first in enumerate(manifold.layers):
        self.assertEqual(
            1, first.jaccard(first),
            'identical graphs should have a jaccard index of 1.')
        for col, second in enumerate(manifold.layers):
            if row == col:
                continue
            self.assertEqual(
                1, first.jaccard(second),
                f'different layers should have a jaccard index of 1.')

    # Keep deepening until the last layer splits into at least two components.
    while len(manifold.layers[-1].components) < 2:
        manifold.build(criterion.MaxDepth(manifold.depth + 1))

    # A component scores 1 with itself and 0 with any other component.
    for row, first in enumerate(manifold.layers[-1].components):
        self.assertEqual(
            1, first.jaccard(first),
            'identical components should have a jaccard index of 1.')
        for col, second in enumerate(manifold.layers[-1].components):
            if row == col:
                continue
            self.assertEqual(
                0, first.jaccard(second),
                f'different components should have a jaccard index of 0.')
def create_barcodes(
        data: np.array,
        *,
        normalize: bool = True,
        merge: Optional[int] = 4,
) -> Dict[int, Barcodes]:
    """Compute (birth, death) barcodes for the clusters of a tree built on `data`.

    A tree is built to a maximum depth of 20. Starting from the root, `Code`
    entries are popped from a heap; the pop order is decided by how `Code`
    compares (the original note says the largest radius comes first -
    NOTE(review): confirm against `Code`).

    :param data: the data set to build the tree on.
    :param normalize: when true, pass the barcodes through `_normalize`
        together with the root radius.
    :param merge: when not None, pass the grouped barcodes through
        `_merge_high_cardinalities` with this value.
    :return: the barcodes as grouped by `_group_by_cardinality`.
    """
    manifold: Manifold = Manifold(data, 'euclidean').build_tree(
        criterion.MaxDepth(20))
    barcodes: Barcodes = {}

    # Seed the heap with the root cluster.
    heap = [Code(manifold.root, manifold.root.radius)]
    heapq.heapify(heap)

    while heap:
        current: Code = heapq.heappop(heap)

        if not current.cluster.children:
            # A leaf is born at zero radius.
            current.set_birth(0.)
        else:
            current.set_birth(current.radius)
            # Exactly two children are expected here.
            [left, right] = list(current.cluster.children)
            for child in (left, right):
                if child.radius >= current.radius:
                    # Still-born: the child is no smaller than its parent.
                    barcodes[child] = (current.radius, current.radius)
                else:
                    # Otherwise the child joins the living clusters.
                    heapq.heappush(heap, Code(child, current.radius))

        # Record the finished code for the cluster just processed.
        barcodes[current.cluster] = (current.birth, current.death)

    if normalize:
        barcodes = _normalize(manifold.root.radius, barcodes)

    grouped = _group_by_cardinality(barcodes)
    if merge is not None:
        grouped = _merge_high_cardinalities(merge, grouped)
    return grouped
def volume_ratios(data: np.ndarray, filename: str) -> pd.DataFrame:
    """Load cluster volume ratios from `filename`, or compute and save them.

    If `filename` exists it is read as CSV, missing values are replaced by
    empty strings, and the frame is returned without touching `data`.
    Otherwise a manifold is built to depth 16 and each cluster's volume is
    taken as `radius ** 3`. For every graph deeper than a cluster's own
    depth, the ratio of the cluster's volume to the summed volume of the
    clusters in that graph whose name starts with the cluster's name is
    written to a table, which is saved to `filename` with a `cluster_names`
    column.

    :param data: the data set to build the manifold on when no file exists.
    :param filename: path of the CSV file to read from or write to.
    :return: the data frame of volume ratios.
    """
    if os.path.exists(filename):
        cached = pd.read_csv(filename)
        cached.fillna('', inplace=True)
        return cached

    # Create manifold from data
    manifold = Manifold(data, 'euclidean').build(criterion.MaxDepth(16), )

    # Volumes of all clusters in the manifold.
    volumes: Dict[Cluster, float] = {}
    for layer in manifold.layers:
        for cluster in layer.clusters:
            volumes[cluster] = cluster.radius**3

    clusters: List[Cluster] = sorted(volumes)
    row_of: Dict[Cluster, int] = {c: i for i, c in enumerate(clusters)}

    # Each row starts with the cluster's own volume at its own depth.
    ratios = np.zeros(shape=(len(volumes), manifold.depth + 1), dtype=np.float32)
    for c, i in row_of.items():
        ratios[i][c.depth] = c.radius**3

    # Fill in the ratios for every deeper graph.
    for graph in manifold.graphs:
        for cluster in graph.clusters:
            row = ratios[row_of[cluster]]
            for g in manifold.graphs[cluster.depth + 1:]:
                descendants = [
                    c for c in g if cluster.name == c.name[:cluster.depth]
                ]
                # The epsilon keeps the division safe when the sum is zero.
                total_volume = sum(
                    (volumes[c] for c in descendants)) + np.finfo(np.float32).eps
                row[g.depth] = row[cluster.depth] / total_volume
            # The cluster's own volume was only needed as the numerator.
            row[cluster.depth] = 0.

    # Write a .csv of ratios.
    volumes_df = pd.DataFrame(data=ratios)
    volumes_df['cluster_names'] = [cluster.name for cluster in clusters]
    volumes_df.to_csv(filename, index=False)
    return volumes_df
def test_load(self):
    """Dump the manifold to a temporary file, load it back, and compare the two."""
    original = self.manifold
    with TemporaryFile() as handle:
        original.dump(handle)
        # Rewind so that loading starts from the beginning of the dump.
        handle.seek(0)
        loaded = Manifold.load(handle, self.data)

    self.assertEqual(original, loaded)
    self.assertEqual(set(original.layers[-1]), set(loaded.layers[-1]))
    self.assertEqual(original.graphs[0], loaded.graphs[0])

    # Every loaded cluster should come back with these cache entries present.
    expected_cache_keys = (
        'radius',
        'argradius',
        'argsamples',
        'argmedoid',
        'local_fractal_dimension',
    )
    for layer in loaded.layers:
        for cluster in layer:
            for cache_key in expected_cache_keys:
                self.assertIn(cache_key, cluster.cache)
def test_jaccard(self):
    """Check jaccard indices among the deepest clusters and against their parents."""
    manifold: Manifold = Manifold(self.data, 'euclidean').build(criterion.MaxDepth(4))

    for row, first in enumerate(manifold.layers[-1].clusters):
        self.assertEqual(
            1,
            first.jaccard(first),
            'identical clusters should have a jaccard index of 1.')

        for col, second in enumerate(manifold.layers[-1].clusters):
            if row == col:
                continue
            self.assertEqual(
                0,
                first.jaccard(second),
                f'different clusters should have a jaccard index of 0.'
            )

        # Against its parent, the expected index is the cardinality ratio.
        expected_ratio = first.cardinality / first.parent.cardinality
        self.assertEqual(
            expected_ratio,
            first.jaccard(first.parent),
            f'jaccard index with parent should be equal to child/parent cardinality ratio.',
        )
def test_all_same(self):
    """When every point is identical, the manifold should never grow past one layer."""
    # A bit simpler, every point is the same.
    points = np.ones((1000, 3))
    built = Manifold(points, 'euclidean').build()

    # There should only ever be one cluster here.
    self.assertEqual(1, len(built.layers))

    # Even after explicit deepen calls.
    built.build_tree()
    self.assertEqual(1, len(built.layers))

    query = np.asarray([1, 1, 1])
    matching_clusters = built.find_clusters(query, 0.0, -1)
    self.assertEqual(1, len(matching_clusters))

    # And, we should get all 1000 points back for any of the data.
    matching_points = built.find_points(points[0], 0.0)
    self.assertEqual(1000, len(matching_points))
def build_data(
        method_function: Callable[[Graph], Dict[Cluster, float]],
        graph: Graph,
        manifold: Manifold,
        labels: numpy.ndarray,
):
    """Assemble per-cluster training features and targets.

    `method_function(graph)` supplies a score per cluster. Each cluster gives
    one row: the features are `manifold.cluster_ratios(cluster)` (six values)
    and the target is one minus the mean squared difference between the
    score and the labels of the cluster's points, divided by the cluster's
    cardinality.

    :param method_function: maps a graph to a dict of cluster -> score.
    :param graph: the graph handed to `method_function`.
    :param manifold: provides `cluster_ratios` for each cluster.
    :param labels: array indexed by each cluster's `argpoints`.
    :return: `(train_x, train_y)` with shapes `(n, 6)` float32 and `(n,)`.
    """
    scored = list(method_function(graph).items())
    num_rows = len(scored)

    train_x = numpy.zeros(shape=(num_rows, 6), dtype=numpy.float32)
    train_y = numpy.zeros(shape=(num_rows,))

    for row, (cluster, score) in enumerate(scored):
        train_x[row] = manifold.cluster_ratios(cluster)

        y_true = numpy.asarray(labels[cluster.argpoints], dtype=numpy.float32)
        # TODO: Why was this numpy.sum instead of float?
        squared_error = numpy.square(score - y_true)
        loss = float(numpy.mean(squared_error)) / cluster.cardinality
        train_y[row] = 1. - loss

    return train_x, train_y
class TestCriterion(unittest.TestCase):
    """Tests for the selection and stopping criteria used when building a manifold.

    The bullseye data is generated once per class; every test starts from a
    fresh, unbuilt manifold over that data.
    """

    @classmethod
    def setUpClass(cls) -> None:
        cls.data = datasets.bullseye(n=500)[0]

    def setUp(self) -> None:
        self.manifold = Manifold(self.data, 'euclidean')

    def _assert_one_ancestor_in_graph(self) -> None:
        """Assert that each deepest-layer cluster has exactly one ancestor in the graph."""
        for leaf in self.manifold.layers[-1].clusters:
            ancestry = self.manifold.ancestry(leaf)
            included = sum(
                1 for ancestor in ancestry
                if ancestor in self.manifold.graph.clusters
            )
            self.assertEqual(
                1,
                included,
                f"expected exactly one ancestor to be in graph. Found {included}"
            )

    def test_min_radius(self):
        """Clusters with children exceed the minimum radius; childless ones do not."""
        min_radius = 0.1
        self.manifold.build(criterion.MinRadius(min_radius), )

        self.assertTrue(all(
            cluster.radius > min_radius
            for layer in self.manifold.layers
            for cluster in layer
            if cluster.children
        ))
        self.assertTrue(all(
            cluster.radius <= min_radius
            for layer in self.manifold.layers
            for cluster in layer
            if not cluster.children
        ))

    def test_combinations(self):
        """Clusters at either limit (MinPoints or MaxDepth) have at most one child."""
        min_points, max_depth = 10, 8
        self.manifold.build(criterion.MinPoints(min_points),
                            criterion.MaxDepth(max_depth))

        for layer in self.manifold.layers:
            for cluster in layer:
                if len(cluster.argpoints) <= min_points or cluster.depth >= max_depth:
                    self.assertLessEqual(len(cluster.children), 1)

    def test_medoid_near_centroid(self):
        """Smoke test: building with MedoidNearCentroid should not raise."""
        self.manifold.build(criterion.MedoidNearCentroid(),
                            criterion.MaxDepth(8))

    def test_uniform_distribution(self):
        """Smoke test: building with UniformDistribution should not raise."""
        self.manifold.build(criterion.UniformDistribution(),
                            criterion.MaxDepth(8))

    def test_lfd_range(self):
        """With LFDRange, each leaf has exactly one ancestor selected into the graph."""
        self.manifold.build(criterion.MaxDepth(12),
                            criterion.LFDRange(60, 50))
        self._assert_one_ancestor_in_graph()

    def test_minimize_subsumed(self):
        """Adding MinimizeSubsumed keeps exactly one ancestor per leaf in the graph."""
        fraction: float = 0.2
        self.manifold.build(
            criterion.MaxDepth(12),
            criterion.LFDRange(80, 20),
            criterion.MinimizeSubsumed(fraction),
        )
        self._assert_one_ancestor_in_graph()
def test_partition(self):
    """Partitioning the cluster selected by the empty name should give more than one child."""
    xor_points = datasets.xor()[0]
    root = Manifold(xor_points, 'euclidean').select('')

    parts = list(root.partition())
    self.assertGreater(len(parts), 1)
def setUpClass(cls) -> None:
    """Create 1,000 random 100-dimensional points and an unbuilt manifold on the class."""
    num_points, num_dimensions = 1_000, 100
    cls.data = np.random.randn(num_points, num_dimensions)
    cls.manifold = Manifold(cls.data, 'euclidean')
def setUp(self) -> None:
    """Give every test its own fresh, unbuilt euclidean manifold over `self.data`."""
    fresh_manifold = Manifold(self.data, 'euclidean')
    self.manifold = fresh_manifold