def test_mass_grid(self):
    """
    Verify the density levels produced by `define_density_mass_grid` for
    typical, over-specified, under-specified, negative, and uniform inputs.
    """
    ## Typical input: the grid should be the sorted density values.
    grid = utl.define_density_mass_grid(self.unique_density)
    expected = np.sort(self.unique_density)
    assert_array_equal(expected, grid)

    ## Asking for more levels than there are density values gives the same
    #  grid as the typical case.
    too_many = self.n * 2
    grid = utl.define_density_mass_grid(self.unique_density,
                                        num_levels=too_many)
    assert_array_equal(expected, grid)

    ## Asking for fewer levels than there are density values.
    grid = utl.define_density_mass_grid(self.unique_density, num_levels=2)
    assert_array_equal(np.array([1, 10]), grid)

    ## Input containing negative values.
    grid = utl.define_density_mass_grid(self.generic_array)
    expected = np.sort(self.generic_array)
    assert_array_equal(expected, grid)

    ## Uniform input collapses to a single level.
    grid = utl.define_density_mass_grid(self.uniform_density)
    self.assertItemsEqual(grid, [1.])
def construct_tree_from_graph(adjacency_list, density, prune_threshold=None,
                              num_levels=None, verbose=False):
    """
    Construct a level set tree from a similarity graph and a density estimate.

    Parameters
    ----------
    adjacency_list : list [list]
        Adjacency list of the k-nearest neighbors graph on the data. Each entry
        contains the indices of the `k` closest neighbors to the data point at
        the same row index.

    density : list [float] or numpy array
        Estimate of the density function, evaluated at the data points
        represented by the positions in `adjacency_list`. Any sequence that
        `numpy.asarray` can convert to a 1D numeric array is accepted.

    prune_threshold : int, optional
        Leaf nodes with fewer than this number of members are recursively
        merged into larger nodes. If 'None' (the default), then no pruning is
        performed.

    num_levels : int, optional
        Number of density levels in the constructed tree. The value is passed
        straight to `define_density_mass_grid`; if None (default), that
        function chooses the grid.

    verbose : bool, optional
        If True, a progress message is logged (at INFO level) at every 100th
        level of tree construction.

    Returns
    -------
    T : LevelSetTree
        See the LevelSetTree class for attributes and method definitions.

    See Also
    --------
    construct_tree, LevelSetTree

    Examples
    --------
    >>> X = numpy.random.rand(100, 2)
    >>> knn_graph, radii = debacl.utils.knn_graph(X, k=8)
    >>> density = debacl.utils.knn_density(radii, n=100, p=2, k=8)
    >>> tree = debacl.construct_tree_from_graph(knn_graph, density,
    ...                                         prune_threshold=5)
    >>> print tree
    +----+-------------+-----------+------------+----------+------+--------+----------+
    | id | start_level | end_level | start_mass | end_mass | size | parent | children |
    +----+-------------+-----------+------------+----------+------+--------+----------+
    | 0  |    0.000    |   0.768   |   0.000    |  0.390   | 100  |  None  |  [1, 2]  |
    | 1  |    0.768    |   1.494   |   0.390    |  0.790   |  30  |   0    |  [7, 8]  |
    | 2  |    0.768    |   4.812   |   0.390    |  1.000   |  31  |   0    |    []    |
    | 7  |    1.494    |   2.375   |   0.790    |  0.950   |  6   |   1    |    []    |
    | 8  |    1.494    |   2.308   |   0.790    |  0.940   |  5   |   1    |    []    |
    +----+-------------+-----------+------------+----------+------+--------+----------+
    """

    ## Initialize the graph and cluster tree
    levels = _utl.define_density_mass_grid(density, num_levels=num_levels)

    G = _nx.from_dict_of_lists(
        {i: neighbors for i, neighbors in enumerate(adjacency_list)})

    T = LevelSetTree(density, levels)

    # The level comparisons below must be element-wise. A plain Python list
    # compared against a Python float is not (it yields a single bool on
    # Python 2 and raises TypeError on Python 3), so work on an array view.
    # This is a no-op when `density` is already a numpy array.
    density_values = _np.asarray(density)

    ## Figure out roots of the tree
    cc0 = _nx.connected_components(G)

    for i, c in enumerate(cc0):  # c is only the vertex list, not the subgraph
        T._subgraphs[i] = G.subgraph(c)
        T.nodes[i] = ConnectedComponent(
            i, parent=None, children=[], start_level=0., end_level=None,
            start_mass=0., end_mass=None, members=c)

    # Loop through the removal grid
    previous_level = 0.
    n = float(len(adjacency_list))

    for i, level in enumerate(levels):
        if verbose and i % 100 == 0:
            _logging.info("iteration {}".format(i))

        ## figure out which points to remove, i.e. the background set.
        bg = _np.where((density_values > previous_level) &
                       (density_values <= level))[0]
        previous_level = level

        ## compute the mass after the current bg set is removed
        old_vcount = sum(x.number_of_nodes()
                         for x in T._subgraphs.itervalues())
        current_mass = 1. - ((old_vcount - len(bg)) / n)

        # loop through active components, i.e. subgraphs
        deactivate_keys = []  # subgraphs to deactivate at the iter end
        activate_subgraphs = {}  # new subgraphs to add at the end of the iter

        for (k, H) in T._subgraphs.iteritems():

            ## remove nodes at the current level
            H.remove_nodes_from(bg)

            ## check if subgraph has vanished
            if H.number_of_nodes() == 0:
                T.nodes[k].end_level = level
                T.nodes[k].end_mass = current_mass
                deactivate_keys.append(k)

            else:  # subgraph hasn't vanished

                ## check if subgraph now has multiple connected components
                # NOTE: this is *the* bottleneck
                if not _nx.is_connected(H):

                    ## deactivate the parent subgraph
                    T.nodes[k].end_level = level
                    T.nodes[k].end_mass = current_mass
                    deactivate_keys.append(k)

                    ## start a new subgraph & node for each child component
                    cc = _nx.connected_components(H)

                    for c in cc:
                        new_key = max(T.nodes.keys()) + 1
                        T.nodes[k].children.append(new_key)
                        activate_subgraphs[new_key] = H.subgraph(c)

                        T.nodes[new_key] = ConnectedComponent(
                            new_key, parent=k, children=[], start_level=level,
                            end_level=None, start_mass=current_mass,
                            end_mass=None, members=c)

        # update active components; deferred until here so the dictionary is
        # not modified while it is being iterated over above.
        for k in deactivate_keys:
            del T._subgraphs[k]

        T._subgraphs.update(activate_subgraphs)

    ## Prune the tree
    if prune_threshold is not None:
        T = T.prune(threshold=prune_threshold)

    return T