def highlights(comments, min_size=5, dist_cutoff=0.5):
    """
    This takes a set of comments, clusters them, and then returns one
    representative from each cluster above some threshold size.

    The comment bodies are stripped of tags, vectorized with the vectorizer
    loaded from `geiger_path`, and clustered with a freshly built `Hierarchy`.
    From every cluster with at least `min_size` members, the comment with the
    highest `score` is picked.

    Args:
        | comments      -- list of Commentables
        | min_size      -- int, minimum cluster size to consider
        | dist_cutoff   -- float, the density at which to snip the hierarchy for clusters

    Returns:
        | list of Commentables, one per sufficiently large cluster. Note that
        | the `replies` of each returned comment are reset to an empty list
        | in place. An empty `comments` input yields an empty list.

    Future improvements:
        - Persist hierarchies instead of rebuilding from scratch (using Hierarchy.load & Hierarchy.save)
        - Tweak min_size and dist_cutoff for the domain.
    """
    # Nothing to cluster; skip loading the vectorizer entirely.
    if not comments:
        return []

    vectorizer = joblib.load(geiger_path)
    vecs = vectorizer.vectorize([strip_tags(c.body) for c in comments], train=False)
    vecs = vecs.toarray()

    log.info('Clustering {0} comments...'.format(vecs.shape[0]))

    # Build the hierarchy.
    hierarchy = Hierarchy(metric='cosine', lower_limit_scale=0.9, upper_limit_scale=1.2)
    ids = hierarchy.fit(vecs)

    log.info('Processing resulting clusters...')

    # Build a map of hierarchy ids to comments.
    comments_by_id = {ids[i]: c for i, c in enumerate(comments)}

    # Generate the clusters.
    clusters = hierarchy.clusters(distance_threshold=dist_cutoff, with_labels=False)

    # Filter to clusters of at least some minimum size.
    clusters = [clus for clus in clusters if len(clus) >= min_size]

    # Get the clusters as comments.
    clusters = [[comments_by_id[node_id] for node_id in clus] for clus in clusters]

    # From each cluster, pick the comment with the highest score.
    representatives = [max(clus, key=lambda c: c.score) for clus in clusters]

    # Suppress replies, show only top-level.
    for comment in representatives:
        comment.replies = []

    log.info('Done.')

    return representatives
class HierarchyTest(unittest.TestCase):
    """
    Unit tests for the individual operations of `Hierarchy`.

    Every test starts from the fixture built in `setUp`: a euclidean
    hierarchy fitted on the two 1-d points [10] and [30]. The assertions
    below expect those points to be leaves 0 and 1 under cluster node 2.
    """

    def setUp(self):
        self.initial_vecs = [[10], [30]]
        self.h = Hierarchy(metric='euclidean', lower_limit_scale=0.1, upper_limit_scale=1.5)
        self.h.fit(self.initial_vecs)
        self.initial_leaves = [0, 1]
        self.initial_clus = 2

    def _build_cluster_node(self, num_children=2):
        """
        Builds a cluster node with `num_children` leaf node children
        (with vectors [0], [20], [40], ...) and returns the id of the
        new cluster node.
        """
        children = [self.h.create_node(vec=[i * 20]) for i in range(num_children)]
        return self.h.create_node(children=children)

    def _temp_save_path(self):
        """
        Returns a save path inside a fresh temporary directory, which is
        removed again once the test has finished. This avoids sharing a
        fixed path in /tmp with other tests (or other test runs).
        """
        import shutil
        import tempfile

        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        return os.path.join(tmp_dir, 'hierarchy.ihac')

    def test_init(self):
        # The dist and graph matrices are square (nxn).
        self.assertEqual(self.h.dists.shape, (3, 3))
        self.assertEqual(self.h.g.mx.shape, (3, 3))

        # The centers matrix is nxm.
        self.assertEqual(self.h.centers.shape, (3, 1))

    def test_fit_returns_uuids(self):
        vecs = [[20], [30], [40]]
        new_uuids = self.h.fit(vecs)
        self.assertEqual(new_uuids, [3, 4, 6])

    def test_save_and_load(self):
        save_path = self._temp_save_path()

        points = generate_random_points()
        self.h.fit(points)

        # Capture the state that is about to be saved.
        # NOTE(review): these are references, not copies -- the comparison
        # below assumes the later fit() does not mutate them in place; confirm.
        ids = self.h.ids
        graph = self.h.g.mx
        dists = self.h.dists
        ndists = self.h.ndists
        centers = self.h.centers
        avail = self.h.available_ids

        self.h.save(save_path)

        # Keep changing the in-memory hierarchy after saving; the loaded
        # hierarchy is compared against the state captured before the save.
        self.h.fit(points)

        h = Hierarchy.load(save_path)

        assert_array_equal(graph, h.g.mx)
        assert_array_equal(dists, h.dists)
        assert_array_equal(ids, h.ids)
        assert_array_equal(ndists, h.ndists)
        assert_array_equal(centers, h.centers)
        assert_array_equal(avail, h.available_ids)

    def test_create_node(self):
        node = self.h.create_node(vec=[20])

        expected_dists = np.array([[0., 20., 10., 10.],
                                   [20., 0., 10., 10.],
                                   [10., 10., 0., 0.],
                                   [10., 10., 0., 0.]])

        # Distance matrix should be reshaped.
        self.assertEqual(self.h.dists.shape, (4, 4))
        self.assertTrue((self.h.dists == expected_dists).all())
        self.assertEqual(self.h.nodes, self.initial_leaves + [self.initial_clus, node])

        # Id should properly be assigned.
        self.assertEqual(node, 3)

        # Params should be passed through.
        self.assertEqual(self.h.centers[node], [20])

    def test_delete_node(self):
        # Create a simple hierarchy to test.
        nodes = [self.h.create_node(vec=[i * 10]) for i in range(5)]
        children = nodes[:3]
        siblings = nodes[3:]

        n = self.h.create_node(children=children)
        parent = self.h.create_node(children=siblings + [n])

        assert_array_equal(self.h.g.get_siblings(n), siblings)
        assert_array_equal(self.h.g.get_parent(n), parent)
        assert_array_equal(self.h.g.get_children(n), children)

        old_ids = [n] + list(self.h.g.get_children(n))
        self.h.delete_node(n)

        # Node should be gone from the hierarchy.
        assert_array_equal(self.h.g.get_siblings(n), [])
        self.assertEqual(self.h.g.get_parent(n), None)
        self.assertNotIn(n, self.h.g.get_children(parent))
        for s in siblings:
            self.assertNotIn(n, self.h.g.get_siblings(s))

        # The node and its children's ids should be available for reuse.
        self.assertEqual(set(self.h.available_ids), set(old_ids))

        # Its children should also be deleted.
        for c in children:
            self.assertEqual(self.h.g.get_siblings(c), [])
            self.assertEqual(self.h.g.get_parent(c), None)

        # The ids should be reused.
        new_nodes = [self.h.create_node(vec=[i * 10]) for i in range(len(old_ids))]
        for nn in new_nodes:
            self.assertIn(nn, old_ids)

    def test_demote(self):
        # If N_i and N_j are sibling nodes,
        # DEMOTE(N_i, N_j)
        # will set N_j as a child to N_i.

        # Build three sibling cluster nodes.
        n_i = self._build_cluster_node()
        n_j = self._build_cluster_node()
        n_k = self._build_cluster_node()

        # Put them under a common parent; the parent's id is not needed here.
        self.h.create_node(children=[n_i, n_j, n_k])

        self.h.demote(n_i, n_j)

        self.assertEqual(n_i, self.h.g.get_parent(n_j))

    def test_demote_omits_clusters_with_only_childs(self):
        # If demoting causes a cluster node to have only one child, that node
        # should be removed and replaced by its only child node.

        # Build two sibling cluster nodes.
        n_i = self._build_cluster_node()
        n_j = self._build_cluster_node()
        parent = self.h.create_node(children=[n_i, n_j])

        self.h.demote(n_i, n_j)

        # The parent should be removed.
        self.assertNotIn(parent, self.h.nodes)

        # n_i should be the root now.
        self.assertTrue(self.h.g.is_root(n_i))
        self.assertIn(n_j, self.h.g.get_children(n_i))
        self.assertEqual(self.h.g.get_parent(n_j), n_i)

    def test_merge(self):
        # If N_i and N_j are sibling nodes under a parent N,
        # MERGE(N_i, N_j)
        # will create a new cluster node, N_k, with N_i and N_j
        # as its children and N as its parent.

        parent = self._build_cluster_node(num_children=3)
        n_i, n_j, n_p = self.h.g.get_children(parent)

        # All three nodes are siblings.
        self.assertEqual(self.h.g.get_siblings(n_p), [n_i, n_j])

        self.h.merge(n_i, n_j)

        # The old parent should be replaced.
        n_k = self.h.g.get_parent(n_i)
        self.assertNotEqual(n_k, parent)

        # Now n_i and n_j should be in their own cluster.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j])

        # And that new cluster should be a sibling to the remaining node.
        self.assertEqual(self.h.g.get_siblings(n_p), [n_k])

    def test_split(self):
        # If N_k is a cluster node with a set of children S_k,
        # SPLIT(θ, N_k)
        # will split N_k into two new nodes, N_i and N_j, each with a different
        # subset of S_k (S_i and S_j, respectively). S_k is split by disconnecting an
        # edge in N_k's minimum spanning tree (MST).

        n_i = self.h.to_iid(self.h.incorporate([10.05]))
        n_j = self.h.to_iid(self.h.incorporate([10.08]))
        n_k = self.h.to_iid(self.h.incorporate([10.10]))

        sibs = self.h.g.get_siblings(self.initial_leaves[0])
        parent = self.h.g.get_parent(self.initial_leaves[0])
        self.assertEqual(sibs, [n_i, n_j, n_k])

        self.h.split(parent)

        # n_i, n_j, and n_k are all closer to each other than they are to the initial leaf,
        # so we expect them to be in their own cluster.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j, n_k])
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]),
                         [self.initial_leaves[1], self.h.g.get_parent(n_i)])

    def test_restructure(self):
        # This isn't a comprehensive test, but a simple check.

        # As the only two nodes initially, these two are siblings.
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]), [self.initial_leaves[1]])

        # Add some new nodes very close to the first leaf node ([10]).
        # Their ids are not needed, only the effect on the hierarchy.
        for vec in ([10.05], [10.08], [10.10]):
            self.h.to_iid(self.h.incorporate(vec))

        # The second leaf node ([30]) should be different enough that it should
        # have moved to its own cluster.
        self.assertNotEqual(self.h.g.get_siblings(self.initial_leaves[0]), [self.initial_leaves[1]])

    def test_ins_hierarchy(self):
        # If N_i is a node in the hierarchy, child to node N,
        # and N_j is a node not yet in the hierarchy,
        # INS_HIERARCHY(N_i, N_j)
        # creates a new node N_k with children N_i and N_j and N as its parent.

        # Build a node with a parent and another node.
        n_i = self.h.g.get_children(self._build_cluster_node())[0]
        n_j = self.h.create_node(vec=[20])

        self.h.ins_hierarchy(n_i, n_j)

        # The nodes should be siblings now.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j])

    def test_incorporate_adds_to_existing_cluster_node(self):
        # A node this close should be added as a sibling.
        node_i = self.h.to_iid(self.h.incorporate([11]))
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]), [node_i])

    def test_incorporate_creates_new_cluster_node(self):
        # The cluster node and the new node should be siblings.
        node_i = self.h.to_iid(self.h.incorporate([90]))
        self.assertEqual(self.h.g.get_siblings(self.initial_clus), [node_i])

    def test_prune(self):
        self.h.fit([[20], [30]])
        self.assertEqual(self.h.available_ids, [])
        assert_array_equal(self.h.g.leaves, [0, 1, 3, 4])

        self.h.prune([5])

        # Pruning should free the pruned ids and leave only the remaining nodes.
        self.assertEqual(self.h.available_ids, [1, 4, 5])
        assert_array_equal(self.h.g.leaves, [0, 3])
        assert_array_equal(self.h.nodes, [0, 2, 3])

    def test_clusters(self):
        node_i = self.h.to_iid(self.h.incorporate([90]))
        node_j = self.h.to_iid(self.h.incorporate([40]))

        # At this threshold the far-away point ([90]) stays on its own.
        clusters = self.h.clusters(distance_threshold=14.0, with_labels=False)
        self.assertEqual(clusters, [self.initial_leaves + [node_j], [node_i]])

        # A large enough threshold puts everything into a single cluster.
        clusters = self.h.clusters(distance_threshold=71.0, with_labels=False)
        self.assertEqual(clusters, [self.initial_leaves + [node_j, node_i]])

        # A threshold of zero puts every leaf into its own cluster.
        clusters = self.h.clusters(distance_threshold=0.0, with_labels=False)
        self.assertEqual(clusters, [[self.initial_leaves[0]], [self.initial_leaves[1]], [node_j], [node_i]])
class ClusteringTest(unittest.TestCase):
    """
    Clustering tests run against a `Hierarchy` built with default settings.
    """

    def setUp(self):
        self.h = Hierarchy()

    def _temp_save_path(self):
        """
        Returns a save path inside a fresh temporary directory, which is
        removed again once the test has finished. This avoids sharing a
        fixed path in /tmp with other tests (or other test runs).
        """
        import shutil
        import tempfile

        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        return os.path.join(tmp_dir, 'hierarchy.ihac')

    def _single_child_clusters(self):
        """
        Returns the cluster nodes in the hierarchy that currently have
        one child or fewer.
        """
        return [
            n for n in self.h.nodes
            if self.h.g.is_cluster(n) and self.h.g.get_children(n).size <= 1
        ]

    def test_labels(self):
        points_1 = [np.array([p]) for p in np.arange(0.1, 1.0, 0.1)]
        points_2 = [np.array([p]) for p in np.arange(20.0, 21.0, 0.1)]
        points_3 = [np.array([p]) for p in np.arange(0.1, 1.0, 0.1)]

        self.h.fit(points_1)
        self.h.fit(points_2)
        self.h.fit(points_3)

        # Only the labels are of interest here, not the clusters themselves.
        _, labels = self.h.clusters(distance_threshold=0.5)

        # Expect that the labels preserve the input order.
        num_1 = len(points_1)
        num_2 = len(points_2)
        labels_1 = labels[:num_1]
        labels_2 = labels[num_1:num_1 + num_2]
        labels_3 = labels[num_1 + num_2:]

        # labels_1 and labels_3 are operating off the same data (points) so they should be equivalent.
        self.assertEqual(labels_1, labels_3)
        self.assertNotEqual(labels_1, labels_2)

    def test_no_cluster_nodes_with_single_cluster_child(self):
        points = [0.30, 0.40, 0.80, 2.70, 0.20, 2.40]
        points = [np.array([p]) for p in points]
        points_1, points_2 = points[:4], points[4:]

        # No such cluster nodes should exist after the first batch...
        self.h.fit(points_1)
        self.assertFalse(self._single_child_clusters())

        # ...nor after incrementally fitting the second one.
        self.h.fit(points_2)
        self.assertFalse(self._single_child_clusters())

    def test_many_points(self):
        """
        Test clustering with the points from generate_random_points()
        (160 points, according to the original note on this test).
        This should just execute without error.
        """
        points = generate_random_points()
        self.h.fit(points)

    def test_fit_uuids_are_unique(self):
        save_path = self._temp_save_path()

        points = generate_random_points()
        points_list = np.array_split(points, 10)

        uuids = []
        for group in points_list:
            uuids += self.h.fit(group)

            # Round-trip through save/load between batches, so that
            # uniqueness is also checked across persisted hierarchies.
            self.h.save(save_path)
            self.h = Hierarchy.load(save_path)

        num_uuids = len(uuids)
        num_u_uuids = len(set(uuids))
        self.assertEqual(num_uuids, num_u_uuids)
class ClusteringTest(unittest.TestCase):
    """
    Clustering tests run against a `Hierarchy` built with default settings.
    """

    def setUp(self):
        self.h = Hierarchy()

    def _temp_save_path(self):
        """
        Returns a save path inside a fresh temporary directory, which is
        removed again once the test has finished. This avoids sharing a
        fixed path in /tmp with other tests (or other test runs).
        """
        import shutil
        import tempfile

        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        return os.path.join(tmp_dir, 'hierarchy.ihac')

    def _single_child_clusters(self):
        """
        Returns the cluster nodes in the hierarchy that currently have
        one child or fewer.
        """
        return [
            n for n in self.h.nodes
            if self.h.g.is_cluster(n) and self.h.g.get_children(n).size <= 1
        ]

    def test_labels(self):
        points_1 = [np.array([p]) for p in np.arange(0.1, 1.0, 0.1)]
        points_2 = [np.array([p]) for p in np.arange(20.0, 21.0, 0.1)]
        points_3 = [np.array([p]) for p in np.arange(0.1, 1.0, 0.1)]

        self.h.fit(points_1)
        self.h.fit(points_2)
        self.h.fit(points_3)

        # Only the labels are of interest here, not the clusters themselves.
        _, labels = self.h.clusters(distance_threshold=0.5)

        # Expect that the labels preserve the input order.
        num_1 = len(points_1)
        num_2 = len(points_2)
        labels_1 = labels[:num_1]
        labels_2 = labels[num_1:num_1 + num_2]
        labels_3 = labels[num_1 + num_2:]

        # labels_1 and labels_3 are operating off the same data (points) so they should be equivalent.
        self.assertEqual(labels_1, labels_3)
        self.assertNotEqual(labels_1, labels_2)

    def test_no_cluster_nodes_with_single_cluster_child(self):
        points = [0.30, 0.40, 0.80, 2.70, 0.20, 2.40]
        points = [np.array([p]) for p in points]
        points_1, points_2 = points[:4], points[4:]

        # No such cluster nodes should exist after the first batch...
        self.h.fit(points_1)
        self.assertFalse(self._single_child_clusters())

        # ...nor after incrementally fitting the second one.
        self.h.fit(points_2)
        self.assertFalse(self._single_child_clusters())

    def test_many_points(self):
        """
        Test clustering with the points from generate_random_points()
        (160 points, according to the original note on this test).
        This should just execute without error.
        """
        points = generate_random_points()
        self.h.fit(points)

    def test_fit_uuids_are_unique(self):
        save_path = self._temp_save_path()

        points = generate_random_points()
        points_list = np.array_split(points, 10)

        uuids = []
        for group in points_list:
            uuids += self.h.fit(group)

            # Round-trip through save/load between batches, so that
            # uniqueness is also checked across persisted hierarchies.
            self.h.save(save_path)
            self.h = Hierarchy.load(save_path)

        num_uuids = len(uuids)
        num_u_uuids = len(set(uuids))
        self.assertEqual(num_uuids, num_u_uuids)
class HierarchyTest(unittest.TestCase):
    """
    Unit tests for the individual operations of `Hierarchy`.

    Every test starts from the fixture built in `setUp`: a euclidean
    hierarchy fitted on the two 1-d points [10] and [30]. The assertions
    below expect those points to be leaves 0 and 1 under cluster node 2.
    """

    def setUp(self):
        self.initial_vecs = [[10], [30]]
        self.h = Hierarchy(metric='euclidean',
                           lower_limit_scale=0.1,
                           upper_limit_scale=1.5)
        self.h.fit(self.initial_vecs)
        self.initial_leaves = [0, 1]
        self.initial_clus = 2

    def _build_cluster_node(self, num_children=2):
        """
        Builds a cluster node with `num_children` leaf node children
        (with vectors [0], [20], [40], ...) and returns the id of the
        new cluster node.
        """
        children = [
            self.h.create_node(vec=[i * 20]) for i in range(num_children)
        ]
        return self.h.create_node(children=children)

    def _temp_save_path(self):
        """
        Returns a save path inside a fresh temporary directory, which is
        removed again once the test has finished. This avoids sharing a
        fixed path in /tmp with other tests (or other test runs).
        """
        import shutil
        import tempfile

        tmp_dir = tempfile.mkdtemp()
        self.addCleanup(shutil.rmtree, tmp_dir, ignore_errors=True)
        return os.path.join(tmp_dir, 'hierarchy.ihac')

    def test_init(self):
        # The dist and graph matrices are square (nxn).
        self.assertEqual(self.h.dists.shape, (3, 3))
        self.assertEqual(self.h.g.mx.shape, (3, 3))

        # The centers matrix is nxm.
        self.assertEqual(self.h.centers.shape, (3, 1))

    def test_fit_returns_uuids(self):
        vecs = [[20], [30], [40]]
        new_uuids = self.h.fit(vecs)
        self.assertEqual(new_uuids, [3, 4, 6])

    def test_save_and_load(self):
        save_path = self._temp_save_path()

        points = generate_random_points()
        self.h.fit(points)

        # Capture the state that is about to be saved.
        # NOTE(review): these are references, not copies -- the comparison
        # below assumes the later fit() does not mutate them in place; confirm.
        ids = self.h.ids
        graph = self.h.g.mx
        dists = self.h.dists
        ndists = self.h.ndists
        centers = self.h.centers
        avail = self.h.available_ids

        self.h.save(save_path)

        # Keep changing the in-memory hierarchy after saving; the loaded
        # hierarchy is compared against the state captured before the save.
        self.h.fit(points)

        h = Hierarchy.load(save_path)

        assert_array_equal(graph, h.g.mx)
        assert_array_equal(dists, h.dists)
        assert_array_equal(ids, h.ids)
        assert_array_equal(ndists, h.ndists)
        assert_array_equal(centers, h.centers)
        assert_array_equal(avail, h.available_ids)

    def test_create_node(self):
        node = self.h.create_node(vec=[20])

        expected_dists = np.array([[0., 20., 10., 10.],
                                   [20., 0., 10., 10.],
                                   [10., 10., 0., 0.],
                                   [10., 10., 0., 0.]])

        # Distance matrix should be reshaped.
        self.assertEqual(self.h.dists.shape, (4, 4))
        self.assertTrue((self.h.dists == expected_dists).all())
        self.assertEqual(self.h.nodes,
                         self.initial_leaves + [self.initial_clus, node])

        # Id should properly be assigned.
        self.assertEqual(node, 3)

        # Params should be passed through.
        self.assertEqual(self.h.centers[node], [20])

    def test_delete_node(self):
        # Create a simple hierarchy to test.
        nodes = [self.h.create_node(vec=[i * 10]) for i in range(5)]
        children = nodes[:3]
        siblings = nodes[3:]

        n = self.h.create_node(children=children)
        parent = self.h.create_node(children=siblings + [n])

        assert_array_equal(self.h.g.get_siblings(n), siblings)
        assert_array_equal(self.h.g.get_parent(n), parent)
        assert_array_equal(self.h.g.get_children(n), children)

        old_ids = [n] + list(self.h.g.get_children(n))
        self.h.delete_node(n)

        # Node should be gone from the hierarchy.
        assert_array_equal(self.h.g.get_siblings(n), [])
        self.assertEqual(self.h.g.get_parent(n), None)
        self.assertNotIn(n, self.h.g.get_children(parent))
        for s in siblings:
            self.assertNotIn(n, self.h.g.get_siblings(s))

        # The node and its children's ids should be available for reuse.
        self.assertEqual(set(self.h.available_ids), set(old_ids))

        # Its children should also be deleted.
        for c in children:
            self.assertEqual(self.h.g.get_siblings(c), [])
            self.assertEqual(self.h.g.get_parent(c), None)

        # The ids should be reused.
        new_nodes = [
            self.h.create_node(vec=[i * 10]) for i in range(len(old_ids))
        ]
        for nn in new_nodes:
            self.assertIn(nn, old_ids)

    def test_demote(self):
        # If N_i and N_j are sibling nodes,
        # DEMOTE(N_i, N_j)
        # will set N_j as a child to N_i.

        # Build three sibling cluster nodes.
        n_i = self._build_cluster_node()
        n_j = self._build_cluster_node()
        n_k = self._build_cluster_node()

        # Put them under a common parent; the parent's id is not needed here.
        self.h.create_node(children=[n_i, n_j, n_k])

        self.h.demote(n_i, n_j)

        self.assertEqual(n_i, self.h.g.get_parent(n_j))

    def test_demote_omits_clusters_with_only_childs(self):
        # If demoting causes a cluster node to have only one child, that node
        # should be removed and replaced by its only child node.

        # Build two sibling cluster nodes.
        n_i = self._build_cluster_node()
        n_j = self._build_cluster_node()
        parent = self.h.create_node(children=[n_i, n_j])

        self.h.demote(n_i, n_j)

        # The parent should be removed.
        self.assertNotIn(parent, self.h.nodes)

        # n_i should be the root now.
        self.assertTrue(self.h.g.is_root(n_i))
        self.assertIn(n_j, self.h.g.get_children(n_i))
        self.assertEqual(self.h.g.get_parent(n_j), n_i)

    def test_merge(self):
        # If N_i and N_j are sibling nodes under a parent N,
        # MERGE(N_i, N_j)
        # will create a new cluster node, N_k, with N_i and N_j
        # as its children and N as its parent.

        parent = self._build_cluster_node(num_children=3)
        n_i, n_j, n_p = self.h.g.get_children(parent)

        # All three nodes are siblings.
        self.assertEqual(self.h.g.get_siblings(n_p), [n_i, n_j])

        self.h.merge(n_i, n_j)

        # The old parent should be replaced.
        n_k = self.h.g.get_parent(n_i)
        self.assertNotEqual(n_k, parent)

        # Now n_i and n_j should be in their own cluster.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j])

        # And that new cluster should be a sibling to the remaining node.
        self.assertEqual(self.h.g.get_siblings(n_p), [n_k])

    def test_split(self):
        # If N_k is a cluster node with a set of children S_k,
        # SPLIT(θ, N_k)
        # will split N_k into two new nodes, N_i and N_j, each with a different
        # subset of S_k (S_i and S_j, respectively). S_k is split by disconnecting an
        # edge in N_k's minimum spanning tree (MST).

        n_i = self.h.to_iid(self.h.incorporate([10.05]))
        n_j = self.h.to_iid(self.h.incorporate([10.08]))
        n_k = self.h.to_iid(self.h.incorporate([10.10]))

        sibs = self.h.g.get_siblings(self.initial_leaves[0])
        parent = self.h.g.get_parent(self.initial_leaves[0])
        self.assertEqual(sibs, [n_i, n_j, n_k])

        self.h.split(parent)

        # n_i, n_j, and n_k are all closer to each other than they are to the initial leaf,
        # so we expect them to be in their own cluster.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j, n_k])
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]),
                         [self.initial_leaves[1], self.h.g.get_parent(n_i)])

    def test_restructure(self):
        # This isn't a comprehensive test, but a simple check.

        # As the only two nodes initially, these two are siblings.
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]),
                         [self.initial_leaves[1]])

        # Add some new nodes very close to the first leaf node ([10]).
        # Their ids are not needed, only the effect on the hierarchy.
        for vec in ([10.05], [10.08], [10.10]):
            self.h.to_iid(self.h.incorporate(vec))

        # The second leaf node ([30]) should be different enough that it should
        # have moved to its own cluster.
        self.assertNotEqual(self.h.g.get_siblings(self.initial_leaves[0]),
                            [self.initial_leaves[1]])

    def test_ins_hierarchy(self):
        # If N_i is a node in the hierarchy, child to node N,
        # and N_j is a node not yet in the hierarchy,
        # INS_HIERARCHY(N_i, N_j)
        # creates a new node N_k with children N_i and N_j and N as its parent.

        # Build a node with a parent and another node.
        n_i = self.h.g.get_children(self._build_cluster_node())[0]
        n_j = self.h.create_node(vec=[20])

        self.h.ins_hierarchy(n_i, n_j)

        # The nodes should be siblings now.
        self.assertEqual(self.h.g.get_siblings(n_i), [n_j])

    def test_incorporate_adds_to_existing_cluster_node(self):
        # A node this close should be added as a sibling.
        node_i = self.h.to_iid(self.h.incorporate([11]))
        self.assertEqual(self.h.g.get_siblings(self.initial_leaves[0]),
                         [node_i])

    def test_incorporate_creates_new_cluster_node(self):
        # The cluster node and the new node should be siblings.
        node_i = self.h.to_iid(self.h.incorporate([90]))
        self.assertEqual(self.h.g.get_siblings(self.initial_clus), [node_i])

    def test_prune(self):
        self.h.fit([[20], [30]])
        self.assertEqual(self.h.available_ids, [])
        assert_array_equal(self.h.g.leaves, [0, 1, 3, 4])

        self.h.prune([5])

        # Pruning should free the pruned ids and leave only the remaining nodes.
        self.assertEqual(self.h.available_ids, [1, 4, 5])
        assert_array_equal(self.h.g.leaves, [0, 3])
        assert_array_equal(self.h.nodes, [0, 2, 3])

    def test_clusters(self):
        node_i = self.h.to_iid(self.h.incorporate([90]))
        node_j = self.h.to_iid(self.h.incorporate([40]))

        # At this threshold the far-away point ([90]) stays on its own.
        clusters = self.h.clusters(distance_threshold=14.0, with_labels=False)
        self.assertEqual(clusters, [self.initial_leaves + [node_j], [node_i]])

        # A large enough threshold puts everything into a single cluster.
        clusters = self.h.clusters(distance_threshold=71.0, with_labels=False)
        self.assertEqual(clusters, [self.initial_leaves + [node_j, node_i]])

        # A threshold of zero puts every leaf into its own cluster.
        clusters = self.h.clusters(distance_threshold=0.0, with_labels=False)
        self.assertEqual(clusters,
                         [[self.initial_leaves[0]], [self.initial_leaves[1]],
                          [node_j], [node_i]])