Example #1
0
    def setUp(self):
        """
        Build the fixture from five 1-dimensional datapoints::

            [[1],
             [2],
             [4],
             [8],
             [12]]

        The child distance matrix will look like::

            [[  0.   1.   3.   7.  11.]
             [  1.   0.   2.   6.  10.]
             [  3.   2.   0.   4.   8.]
             [  7.   6.   4.   0.   4.]
             [ 11.  10.   8.   4.   0.]]
        """
        self.data = points = np.array([[1], [2], [4], [8], [12]])

        # Seed the hierarchy by fitting only the first two datapoints.
        self.h = hierarchy = Hierarchy()
        hierarchy.fit(points[:2])

        # Take the first two nodes of the fitted hierarchy, then create
        # one (leaf) node for each remaining datapoint.
        seeded = hierarchy.nodes[:2]
        extra = []
        for vec in points[2:]:
            extra.append(hierarchy.create_node(vec=vec))
        self.nodes = seeded + extra

        # The cluster node under test uses every node collected above
        # as its children.
        self.c = hierarchy.create_node(children=self.nodes)
Example #2
0
 def setUp(self):
     """Fit a euclidean Hierarchy on the two vectors [10] and [30]."""
     seed_vecs = [[10], [30]]
     self.initial_vecs = seed_vecs

     options = {
         'metric': 'euclidean',
         'lower_limit_scale': 0.1,
         'upper_limit_scale': 1.5,
     }
     self.h = Hierarchy(**options)
     self.h.fit(seed_vecs)

     # NOTE(review): presumably the node ids fit() gives the two leaves
     # and their parent cluster -- confirm against Hierarchy.
     self.initial_leaves = [0, 1]
     self.initial_clus = 2
Example #3
0
def load_hierarchy():
    """
    Return the Hierarchy stored at ``conf['hierarchy_path']`` if that
    path exists; otherwise return a new Hierarchy built from the
    ``metric``, ``lower_limit_scale`` and ``upper_limit_scale`` entries
    of ``conf``.
    """
    saved = os.path.expanduser(conf['hierarchy_path'])
    if not os.path.exists(saved):
        # Nothing at the configured path: build a hierarchy from config.
        return Hierarchy(
            metric=conf['metric'],
            lower_limit_scale=conf['lower_limit_scale'],
            upper_limit_scale=conf['upper_limit_scale'],
        )
    return Hierarchy.load(saved)
Example #4
0
def highlights(comments, min_size=5, dist_cutoff=0.5):
    """
    This takes a set of comments,
    clusters them, and then returns representatives from clusters above
    some threshold size.

    Args:
        | comments      -- list of Commentables
        | min_size      -- int, minimum cluster size to consider
        | dist_cutoff   -- float, passed as the hierarchy's distance_threshold when snipping it into clusters

    Returns:
        A list holding, for each cluster with at least ``min_size``
        members, the member comment with the highest ``score``. The
        returned comments are modified in place: their ``replies``
        attribute is reset to an empty list. An empty ``comments``
        yields an empty list.

    Future improvements:
        - Persist hierarchies instead of rebuilding from scratch (using Hierarchy.load & Hierarchy.save)
        - Tweak min_size and dist_cutoff for the domain.
    """
    # Nothing to cluster: skip loading the vectorizer and fitting a
    # hierarchy on zero rows.
    if not comments:
        return []

    vectorizer = joblib.load(geiger_path)
    vecs = vectorizer.vectorize([strip_tags(c.body) for c in comments], train=False)
    vecs = vecs.toarray()

    log.info('Clustering {0} comments...'.format(vecs.shape[0]))

    # Build the hierarchy.
    hierarchy = Hierarchy(metric='cosine',
                          lower_limit_scale=0.9,
                          upper_limit_scale=1.2)
    ids = hierarchy.fit(vecs)

    log.info('Processing resulting clusters...')

    # Build a map of hierarchy ids to comments (ids[i] belongs to
    # comments[i]).
    comments_by_id = {ids[i]: c for i, c in enumerate(comments)}

    # Generate the clusters.
    clusters = hierarchy.clusters(distance_threshold=dist_cutoff, with_labels=False)

    # Filter to clusters of at least some minimum size.
    clusters = [clus for clus in clusters if len(clus) >= min_size]

    # Get the clusters as comments.
    clusters = [[comments_by_id[node_id] for node_id in clus] for clus in clusters]

    # From each cluster, pick the comment with the highest score.
    picks = [max(clus, key=lambda c: c.score) for clus in clusters]

    # Suppress replies, show only top-level.
    for comment in picks:
        comment.replies = []

    log.info('Done.')

    return picks
Example #5
0
    def setUp(self):
        """
        Fit a euclidean Hierarchy on the first two vectors, then create
        nodes for the remaining two, group them under a new node, and
        add that node as a child via ``self.h.g.add_child(2, ...)``.
        """
        all_vecs = [[10], [20], [0], [20]]
        self.vecs = all_vecs
        self.initial_vecs = all_vecs[:2]

        settings = {
            'metric': 'euclidean',
            'lower_limit_scale': 0.1,
            'upper_limit_scale': 1.5,
        }
        self.h = Hierarchy(**settings)
        self.h.fit(self.initial_vecs)

        # One node per remaining vector, grouped under a new parent node.
        extra_nodes = []
        for vec in all_vecs[2:]:
            extra_nodes.append(self.h.create_node(vec=vec))
        parent = self.h.create_node(children=extra_nodes)
        # NOTE(review): 2 is presumably the id of the cluster created by
        # fit() -- confirm against Hierarchy.
        self.h.g.add_child(2, parent)

        # NOTE(review): presumably the resulting leaf and cluster node
        # ids -- confirm against Hierarchy.
        self.leaves = [0, 1, 3, 4]
        self.clusters = [2, 5]
Example #6
0
 def setUp(self):
     """Create a Hierarchy with no constructor arguments and store it on ``self.h``."""
     self.h = Hierarchy()