Example #1
    def test_grouper(self):
        # without fill value (default should be None)
        a, b, c = iters.grouper(3, "ABCDEFG")
        self.assertEqual(["A", "B", "C"], list(a))
        self.assertEqual(["D", "E", "F"], list(b))
        self.assertEqual(["G", None, None], list(c))

        # with fill value
        a, b, c = iters.grouper(3, "ABCDEFG", "x")
        self.assertEqual(["A", "B", "C"], list(a))
        self.assertEqual(["D", "E", "F"], list(b))
        self.assertEqual(["G", "x", "x"], list(c))
Example #2
def churn_jaccard(drop=False, repeats=TAGS_REPEATS):
    """
    We'll be working with pre-created list of ShredsDistances documents - s_distances
    This way we don't spend CPU cycles & mallocs to create millions of
    objects to be thrown away.

    It would be even more efficient to not create these documents at all
    and insert raw data into mongo, but I haven't dug deep enough.
    """

    if drop:
        ShredsDistances.objects(distance_type='jaccard').delete()

    shreds_tags = fetch_normalized_shreds_tags(repeats=repeats)
    s_distances = [ShredsDistances() for _ in xrange(BULK_INSERT_SIZE)]

    for distances in grouper(BULK_INSERT_SIZE, jaccard_distances_iterator(shreds_tags)):
        for i, distance in enumerate(distances):
            if distance:
                # assign data to pre-created ShredsDistances document
                tag_a, tag_b, dist = distance
                s_d = s_distances[i]
                s_d.shreds_pair = [tag_a, tag_b]
                s_d.distance = dist
                s_d.distance_type = 'jaccard'
            else:
                # grouper pads the last batch with None; drop the unused
                # pre-created documents from the tail
                s_distances[i] = None
        ShredsDistances.objects.insert(filter(None, s_distances), load_bulk=False)
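
jaccard_distances_iterator isn't shown here. A minimal sketch of what it presumably yields, assuming shreds_tags maps a shred id to its set of tags and using the standard Jaccard distance 1 - |A ∩ B| / |A ∪ B|; the function body and the dict structure are assumptions inferred from the tuple unpacking above:

from itertools import combinations

def jaccard_distances_iterator(shreds_tags):
    # Yields (shred_a, shred_b, distance) for every unordered pair,
    # matching the unpacking in the loop above. Assumes shreds_tags
    # is a dict mapping shred id -> set of tags.
    for (id_a, tags_a), (id_b, tags_b) in combinations(shreds_tags.items(), 2):
        union = tags_a | tags_b
        if not union:
            continue  # both tag sets empty: distance undefined, skip
        distance = 1.0 - float(len(tags_a & tags_b)) / len(union)
        yield id_a, id_b, distance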
Example #3
    def churn(self, drop=False, repeats=TAGS_REPEATS):
        """Creates ShredsDistance entries for every pair of Clusters.

        Args:
            drop: If True, clears the current data before starting.
            repeats: Optional int; minimum number of times a tag must be
                mentioned to be considered.
        """
        if drop:
            # Too slow.
            #ShredsDistances.objects(distance_type=self.type_name).delete()
            ShredsDistances.drop_collection()

        shreds_tags = _fetch_normalized_shreds_tags(repeats=repeats)
        total_num_pairs = len(shreds_tags) * (len(shreds_tags) - 1) // 2

        distances = self._distances_iterator(shreds_tags)
        batches_of_distances = grouper(BULK_INSERT_SIZE, distances)
        insert_args = ((batch, self.type_name)
                       for batch in batches_of_distances)

        wrote_total = 0
        inserting_pool = multiprocessing.Pool()
        for _ in inserting_pool.imap_unordered(insert_batch, insert_args):
            wrote_total += BULK_INSERT_SIZE
            wrote_total = min(wrote_total, total_num_pairs)
            logging.info("Wrote another %d documents. Total complete: %d/%d %.2f%%",
                         BULK_INSERT_SIZE, wrote_total, total_num_pairs,
                         float(wrote_total)/total_num_pairs * 100)
        inserting_pool.close()
        inserting_pool.join()
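
insert_batch is defined elsewhere at module level (it has to be picklable for multiprocessing). A plausible sketch, assuming it mirrors the bulk-insert pattern from Example #2; imap_unordered passes a single argument, hence the packed (batch, type_name) tuple, and everything here except ShredsDistances, BULK_INSERT_SIZE, and load_bulk=False is an assumption:

def insert_batch(args):
    # Runs in a worker process; batch and type_name arrive packed in
    # a single tuple because imap_unordered passes one argument.
    batch, type_name = args
    documents = []
    for item in batch:
        if item is None:
            continue  # grouper pads the last batch with None
        tag_a, tag_b, dist = item
        s_d = ShredsDistances()
        s_d.shreds_pair = [tag_a, tag_b]
        s_d.distance = dist
        s_d.distance_type = type_name
        documents.append(s_d)
    ShredsDistances.objects.insert(documents, load_bulk=False)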