def setUp(self):
    """
    Keep it simple: 5 1-dimensional datapoints::

        [[1], [2], [4], [8], [12]]

    The child distance matrix will look like::

        [[  0.   1.   3.   7.  11.]
         [  1.   0.   2.   6.  10.]
         [  3.   2.   0.   4.   8.]
         [  7.   6.   4.   0.   4.]
         [ 11.  10.   8.   4.   0.]]
    """
    self.data = np.array([[1], [2], [4], [8], [12]])

    # Seed the hierarchy with the first two datapoints.
    self.h = Hierarchy()
    self.h.fit(self.data[:2])

    # Build (leaf) nodes for the remaining datapoints and keep them
    # alongside the two nodes the fit created.
    extra_leaves = [self.h.create_node(vec=v) for v in self.data[2:]]
    self.nodes = self.h.nodes[:2] + extra_leaves

    # The cluster node under test groups all of the above.
    self.c = self.h.create_node(children=self.nodes)
def setUp(self):
    """Fixture: a euclidean hierarchy fit on two 1-d vectors.

    After ``fit``, nodes 0 and 1 are the initial leaves and node 2 is
    the cluster created to hold them.
    """
    self.initial_vecs = [[10], [30]]
    self.h = Hierarchy(
        metric='euclidean',
        lower_limit_scale=0.1,
        upper_limit_scale=1.5,
    )
    self.h.fit(self.initial_vecs)

    # Ids assigned by fit: leaves first, then their parent cluster.
    self.initial_leaves = [0, 1]
    self.initial_clus = 2
def load_hierarchy():
    """Load the persisted hierarchy if one exists, else build a fresh one.

    The on-disk location and construction parameters both come from the
    module-level ``conf`` mapping.
    """
    path = os.path.expanduser(conf['hierarchy_path'])

    # Resume from the saved hierarchy when available.
    if os.path.exists(path):
        return Hierarchy.load(path)

    # Otherwise start from scratch with the configured parameters.
    return Hierarchy(
        metric=conf['metric'],
        lower_limit_scale=conf['lower_limit_scale'],
        upper_limit_scale=conf['upper_limit_scale'],
    )
def highlights(comments, min_size=5, dist_cutoff=0.5):
    """
    This takes a set of comments,
    clusters them, and then returns representatives from clusters above
    some threshold size.

    Args:
        | comments      -- list of Commentables
        | min_size      -- int, minimum cluster size to consider
        | dist_cutoff   -- float, the density at which to snip the hierarchy
                           for clusters

    Future improvements:
        - Persist hierarchies instead of rebuilding from scratch (using
          Hierarchy.load & Hierarchy.save)
        - Tweak min_size and dist_cutoff for the domain.
    """
    v = joblib.load(geiger_path)
    vecs = v.vectorize([strip_tags(c.body) for c in comments], train=False)
    vecs = vecs.toarray()

    log.info('Clustering {0} comments...'.format(vecs.shape[0]))

    # Build the hierarchy.
    h = Hierarchy(metric='cosine', lower_limit_scale=0.9, upper_limit_scale=1.2)
    ids = h.fit(vecs)

    log.info('Processing resulting clusters...')

    # Build a map of hierarchy ids to comments.
    # (Renamed from `map` so the builtin is not shadowed.)
    id_to_comment = {ids[i]: c for i, c in enumerate(comments)}

    # Generate the clusters.
    clusters = h.clusters(distance_threshold=dist_cutoff, with_labels=False)

    # Filter to clusters of at least some minimum size.
    clusters = [c for c in clusters if len(c) >= min_size]

    # Get the clusters as comments.
    clusters = [[id_to_comment[id] for id in clus] for clus in clusters]

    # From each cluster, pick the comment with the highest score.
    highlights = [max(clus, key=lambda c: c.score) for clus in clusters]

    # Suppress replies, show only top-level.
    # (Loop variable renamed from `h`, which shadowed the Hierarchy above.)
    for hl in highlights:
        hl.replies = []

    log.info('Done.')

    return highlights
def setUp(self):
    """Fixture: a hierarchy with extra leaves grafted under a new cluster.

    Nodes 0 and 1 come from fitting the first two vectors (cluster 2 is
    their parent); nodes 3 and 4 are leaves for the remaining vectors,
    grouped under a new cluster (node 5) attached beneath cluster 2.
    """
    self.vecs = [[10], [20], [0], [20]]
    self.initial_vecs = self.vecs[:2]

    self.h = Hierarchy(
        metric='euclidean',
        lower_limit_scale=0.1,
        upper_limit_scale=1.5,
    )
    self.h.fit(self.initial_vecs)

    # Wrap the remaining vectors in leaf nodes, cluster them, and hang
    # the new cluster off the root cluster (id 2).
    new_leaves = [self.h.create_node(vec=v) for v in self.vecs[2:]]
    grafted = self.h.create_node(children=new_leaves)
    self.h.g.add_child(2, grafted)

    self.leaves = [0, 1, 3, 4]
    self.clusters = [2, 5]
def setUp(self):
    """Fixture: a bare, unfit hierarchy with default parameters."""
    self.h = Hierarchy()