def test_grouper(self):
    # without fill value (default should be None)
    a, b, c = iters.grouper(3, "ABCDEFG")
    self.assertEqual(["A", "B", "C"], list(a))
    self.assertEqual(["D", "E", "F"], list(b))
    self.assertEqual(["G", None, None], list(c))
    # with fill value
    a, b, c = iters.grouper(3, "ABCDEFG", "x")
    self.assertEqual(["A", "B", "C"], list(a))
    self.assertEqual(["D", "E", "F"], list(b))
    self.assertEqual(["G", "x", "x"], list(c))
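# For reference, the behavior exercised above matches the classic itertools
# grouper recipe: split an iterable into fixed-size groups, padding the last
# group with fillvalue (None by default). A minimal Python 2 sketch, assuming
# that is indeed how iters.grouper is implemented:
from itertools import izip_longest


def grouper(n, iterable, fillvalue=None):
    """Collect data into fixed-length chunks, padding the last chunk with fillvalue."""
    args = [iter(iterable)] * n
    return izip_longest(fillvalue=fillvalue, *args)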
def churn_jaccard(drop=False, repeats=TAGS_REPEATS):
    """
    We work with a pre-created list of ShredsDistances documents (s_distances).
    This way we don't spend CPU cycles and mallocs creating millions of objects
    only to throw them away. It would be even more efficient not to create these
    documents at all and insert raw data into mongo, but I haven't dug deep enough.
    """
    if drop:
        ShredsDistances.objects(distance_type='jaccard').delete()
    shreds_tags = fetch_normalized_shreds_tags(repeats=repeats)
    s_distances = [ShredsDistances() for _ in xrange(BULK_INSERT_SIZE)]
    for distances in grouper(BULK_INSERT_SIZE, jaccard_distances_iterator(shreds_tags)):
        for i, distance in enumerate(distances):
            if distance:
                # Assign data to a pre-created ShredsDistances document.
                tag_a, tag_b, dist = distance
                s_d = s_distances[i]
                s_d.shreds_pair = [tag_a, tag_b]
                s_d.distance = dist
                s_d.distance_type = 'jaccard'
            else:
                # Cut the tail of pre-created documents from the last bulk set.
                s_distances[i] = None
        ShredsDistances.objects.insert(filter(None, s_distances), load_bulk=False)
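# jaccard_distances_iterator() is referenced above but not shown. A hypothetical
# sketch of its shape, assuming shreds_tags maps a shred/cluster key to a set of
# normalized tags and each yielded triple is (key_a, key_b, distance), matching
# how churn_jaccard() unpacks it; the real implementation may differ:
from itertools import combinations


def jaccard_distances_iterator(shreds_tags):
    # Pairwise Jaccard distance: 1 - |A & B| / |A | B| for every pair of keys.
    for (tag_a, tags_a), (tag_b, tags_b) in combinations(shreds_tags.items(), 2):
        union = tags_a | tags_b
        if not union:
            continue
        yield tag_a, tag_b, 1.0 - float(len(tags_a & tags_b)) / len(union)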
def churn(self, drop=False, repeats=TAGS_REPEATS):
    """Creates a ShredsDistances entry for every pair of Clusters.

    Args:
        drop: If True, clears the current data before starting.
        repeats: Optional minimum number of times a tag must be mentioned
            to be considered.
    """
    if drop:
        # Too slow:
        # ShredsDistances.objects(distance_type=self.type_name).delete()
        ShredsDistances.drop_collection()
    shreds_tags = _fetch_normalized_shreds_tags(repeats=repeats)
    total_num_pairs = len(shreds_tags) * (len(shreds_tags) - 1) / 2
    distances = self._distances_iterator(shreds_tags)
    batches_of_distances = grouper(BULK_INSERT_SIZE, distances)
    insert_args = ((batch, self.type_name) for batch in batches_of_distances)
    wrote_total = 0
    inserting_pool = multiprocessing.Pool()
    for _ in inserting_pool.imap_unordered(insert_batch, insert_args):
        wrote_total += BULK_INSERT_SIZE
        wrote_total = min(wrote_total, total_num_pairs)
        logging.info("Wrote another %d documents. Total complete: %d/%d %.2f%%",
                     BULK_INSERT_SIZE, wrote_total, total_num_pairs,
                     float(wrote_total) / total_num_pairs * 100)
    inserting_pool.close()
    inserting_pool.join()
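# insert_batch() runs in the worker processes but is not shown here. A plausible
# shape, inferred from churn_jaccard() above and from the (batch, type_name)
# tuples built in churn(); the actual implementation may differ:
def insert_batch(args):
    batch, type_name = args
    docs = []
    for item in batch:
        if item is None:
            # grouper() pads the last batch with None; skip the padding.
            continue
        tag_a, tag_b, dist = item
        doc = ShredsDistances()
        doc.shreds_pair = [tag_a, tag_b]
        doc.distance = dist
        doc.distance_type = type_name
        docs.append(doc)
    if docs:
        ShredsDistances.objects.insert(docs, load_bulk=False)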