def test_estimator_cardinality_sparse_mode(self):
   """Checks HllCardinality returns the exact count for small inputs.

   For each target count, a fresh length-1024 sketch receives that many
   distinct integers and the estimator's first output must equal the count
   exactly (the regime this test's name refers to as sparse mode).
   """
   cardinality_of = HllCardinality()
   for expected in (0, 1, 1024):
     sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
     for item in range(expected):
       sketch.add(item)
     estimates = cardinality_of([sketch])
     self.assertEqual(estimates[0], expected)
 def test_estimator_cardinality_dense_mode(self):
   """Checks HllCardinality is within 5% of the true count.

   Each length-1024 sketch receives more than 1024 distinct integers (the
   regime this test's name refers to as dense mode), so only an
   approximate match is required.
   """
   cardinality_of = HllCardinality()
   for expected in (1025, 2048):
     sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
     for item in range(expected):
       sketch.add(item)
     estimates = cardinality_of([sketch])
     # Tolerance scales with the true count: 5% relative error.
     tolerance = expected * 0.05
     self.assertAlmostEqual(estimates[0], expected, delta=tolerance)
  def insertion_test_helper(self, number_to_insert, acceptable_error=.05):
    """Inserts distinct integers and checks the estimate-to-truth ratio.

    Args:
      number_to_insert: how many distinct integers (0 up to
        number_to_insert - 1) are added to a fresh sketch. It is also the
        divisor of the ratio, so it must be nonzero.
      acceptable_error: largest allowed absolute deviation of
        estimate / number_to_insert from 1.0.
    """
    sketch = HyperLogLogPlusPlus(random_seed=137)
    for value in range(number_to_insert):
      sketch.add(value)

    estimate = sketch.estimate_cardinality()
    self.assertAlmostEqual(
        estimate / number_to_insert, 1.0, delta=acceptable_error)
  def test_insert_same(self):
    """Checks that re-adding the same element leaves the estimate as is."""
    sketch = HyperLogLogPlusPlus(random_seed=42)
    sketch.add(1)
    estimate_before = sketch.estimate_cardinality()

    # Insert the identical element a second time.
    sketch.add(1)
    estimate_after = sketch.estimate_cardinality()

    self.assertEqual(estimate_before, estimate_after)
  def estimator_tester_helper(self, number_of_hlls, acceptable_error=.05):
    """Estimates the union of many single-element sketches.

    Args:
      number_of_hlls: number of sketches to build; sketch k holds only the
        integer k, so this is also the true union cardinality. It is used
        as a divisor and therefore must be nonzero.
      acceptable_error: largest allowed absolute deviation of
        estimate / number_of_hlls from 1.0.
    """
    estimator = HllCardinality()

    def single_element_sketch(element):
      # All sketches share the same seed so they can be combined.
      sketch = HyperLogLogPlusPlus(random_seed=42)
      sketch.add(element)
      return sketch

    sketches = [single_element_sketch(k) for k in range(number_of_hlls)]

    estimate = estimator(sketches)[0]
    self.assertAlmostEqual(
        estimate / number_of_hlls, 1.0, delta=acceptable_error)
 def test_merge_sparse_with_sparse_to_sparse(self):
   """Merges two one-element sketches and expects a sparse, exact result.

   Both inputs hold the same single element, so the merge is expected to
   report sparse mode, share the first input's buckets, keep {1} as its
   temp set and estimate a cardinality of exactly 1.
   """
   sketches = []
   for _ in range(2):
     sketch = HyperLogLogPlusPlus(length=16, random_seed=234)
     sketch.add(1)
     sketches.append(sketch)
   left, right = sketches

   merged = left.merge(right)

   self.assertTrue(merged.sparse_mode,
                   'Merged sketch is not in sparse mode.')
   buckets_match = all(left.buckets == merged.buckets)
   self.assertTrue(buckets_match, 'Merged sketch is not correct.')
   self.assertSameElements(merged.temp_set, {1},
                           'Temp set is not correct.')
   self.assertEqual(merged.estimate_cardinality(), 1,
                    'Estimated cardinality is not correct.')
  def test_merge_dense_with_dense(self):
    """Merges two sketches of 97 disjoint elements each.

    The first sketch holds 0..96 and the second 100..196, i.e. 194 distinct
    elements overall. The merge is expected to be out of sparse mode, have
    an empty temp set, agree with the second input on at least one bucket
    and estimate 194 within 10%.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    elements_per_sketch = 16 * 6 + 1
    for value in range(elements_per_sketch):
      first.add(value)
      second.add(value + 100)

    merged = first.merge(second)

    self.assertFalse(merged.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    shared_bucket_count = sum(second.buckets == merged.buckets)
    self.assertGreater(shared_bucket_count, 0,
                       'Merged sketch is not correct.')
    self.assertSameElements(merged.temp_set, set(),
                            'Temp set is not correct.')
    expected_cardinality = 194
    self.assertAlmostEqual(
        merged.estimate_cardinality(), expected_cardinality,
        delta=expected_cardinality * 0.1)
  def test_merge_sparse_with_dense(self):
    """Merges a one-element sketch with a 97-element sketch.

    The merge is expected to be out of sparse mode, carry an empty temp
    set, differ from the larger input in exactly one bucket and report a
    larger estimate than the larger input alone.
    """
    small = HyperLogLogPlusPlus(length=16, random_seed=234)
    small.add(100)
    large = HyperLogLogPlusPlus(length=16, random_seed=234)
    for value in range(16 * 6 + 1):
      large.add(value)

    merged = small.merge(large)

    self.assertFalse(merged.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    # With this seed, the extra element alters exactly one of the 16 buckets.
    unchanged_bucket_count = sum(large.buckets == merged.buckets)
    self.assertEqual(unchanged_bucket_count, 16 - 1,
                     'Merged sketch is not correct.')
    self.assertSameElements(merged.temp_set, set(),
                            'Temp set is not correct.')
    merged_estimate = merged.estimate_cardinality()
    self.assertGreater(merged_estimate, large.estimate_cardinality())
  def test_single_correct_bucket_placement(self):
    """Checks that one insert updates exactly the expected bucket.

    For every (bucket prefix, suffix) pair from the fixture tables
    self.bucket_idx_to_bin_str and self.bin_str_to_leading_zeros, the
    concatenated bit string is parsed as an integer and added to a fresh
    sketch built with NoOpHasher as its hash class. The resulting bucket
    vector must equal, position by position, a zero vector whose entry at
    bucket_idx is num_leading_0s + 1.
    """
    for bucket_idx, bucket_bin_str in self.bucket_idx_to_bin_str.items():
      for leading_0_bin_str, num_leading_0s in (
          self.bin_str_to_leading_zeros.items()):
        hll = HyperLogLogPlusPlus(
            length=self.vector_length,
            random_seed=42,
            hash_class=NoOpHasher,
            num_integer_bits=self.num_integer_bits)

        total_bin_str = bucket_bin_str + leading_0_bin_str
        hll.add(int(total_bin_str, 2))

        # Size the expectation from the same fixture value used to build
        # the sketch instead of a hard-coded 16.
        expected_buckets = np.zeros(self.vector_length, dtype=np.int32)
        expected_buckets[bucket_idx] = num_leading_0s + 1

        # Compare element-wise: a set-style comparison (assertSameElements
        # ignores order and duplicates) would still pass if the value
        # landed in the wrong bucket, which is what this test must catch.
        np.testing.assert_array_equal(
            hll.buckets, expected_buckets,
            err_msg='Wrong buckets for input bits %s (bucket index %d).' %
            (total_bin_str, bucket_idx))
  def test_merge_sparse_with_sparse_to_dense(self):
    """Merges two sketches around the sparse-mode boundary.

    With 48 disjoint elements in each input, the merge is expected to stay
    in sparse mode and estimate exactly 96. After one more element is added
    to the first input, the merge is expected to leave sparse mode and
    estimate 97 within 5%.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    elements_per_sketch = int(16 * 6 / 2)
    for value in range(elements_per_sketch):
      first.add(value)
      second.add(value + 100)

    merged_at_limit = first.merge(second)
    self.assertTrue(merged_at_limit.sparse_mode,
                    'Merged sketch should be in sparse mode.')
    self.assertEqual(merged_at_limit.estimate_cardinality(), 96,
                     'Estimated cardinality not correct under sparse mode.')

    # One additional distinct element brings the union to 97 elements.
    first.add(1000)
    merged_past_limit = first.merge(second)
    self.assertFalse(merged_past_limit.sparse_mode,
                     'Merged sketch should not be in sparse mode.')
    expected_cardinality = 97
    self.assertAlmostEqual(
        merged_past_limit.estimate_cardinality(), expected_cardinality,
        delta=expected_cardinality * 0.05,
        msg='Estimated cardinality not correct under dense mode.'
    )