def test_estimator_cardinality_dense_mode(self):
   """Checks HllCardinality when more items are added than the sketch length.

   For each true cardinality (1025 and 2048), distinct integers are added to
   a length-1024 sketch and the estimate must be within 5% of the truth.
   """
   cardinality_estimator = HllCardinality()
   for expected_count in (1025, 2048):
     sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
     for item in range(expected_count):
       sketch.add(item)
     estimates = cardinality_estimator([sketch])
     tolerance = expected_count * 0.05
     self.assertAlmostEqual(estimates[0], expected_count, delta=tolerance)
  def insertion_test_helper(self, number_to_insert, acceptable_error=.05):
    """Adds the integers 0..number_to_insert-1 and checks the estimate.

    Args:
      number_to_insert: how many distinct integers to add to the sketch.
      acceptable_error: allowed absolute deviation of the ratio
        estimate / number_to_insert from 1.0.
    """
    sketch = HyperLogLogPlusPlus(random_seed=137)
    for value in range(number_to_insert):
      sketch.add(value)

    estimate = sketch.estimate_cardinality()
    ratio = estimate / number_to_insert
    self.assertAlmostEqual(ratio, 1.0, delta=acceptable_error)
 def test_estimator_cardinality_sparse_mode(self):
   """Checks HllCardinality is exact when at most `length` items are added.

   For each true cardinality (0, 1 and 1024), distinct integers are added to
   a length-1024 sketch and the estimate must equal the truth exactly.
   """
   cardinality_estimator = HllCardinality()
   for expected_count in (0, 1, 1024):
     sketch = HyperLogLogPlusPlus(random_seed=89, length=1024)
     for item in range(expected_count):
       sketch.add(item)
     estimates = cardinality_estimator([sketch])
     self.assertEqual(estimates[0], expected_count)
  def estimator_tester_helper(self, number_of_hlls, acceptable_error=.05):
    """Estimates the union of many single-item sketches and checks the result.

    Args:
      number_of_hlls: number of sketches to build; sketch ``i`` holds only
        the integer ``i``.
      acceptable_error: allowed absolute deviation of the ratio
        estimate / number_of_hlls from 1.0.
    """
    cardinality_estimator = HllCardinality()

    sketches = []
    for value in range(number_of_hlls):
      single_item_sketch = HyperLogLogPlusPlus(random_seed=42)
      single_item_sketch.add(value)
      sketches.append(single_item_sketch)

    union_estimate = cardinality_estimator(sketches)[0]
    ratio = union_estimate / number_of_hlls
    self.assertAlmostEqual(ratio, 1.0, delta=acceptable_error)
    def test_simple_estimate_smaller(self):
        """Checks alpha and the estimate when every bucket is set to 1."""
        length = self.vector_length
        hll = HyperLogLogPlusPlus(
            length=length,
            random_seed=42,
            num_integer_bits=self.num_integer_bits,
        )

        # Overwrite the buckets directly so the expected estimate is known.
        hll.buckets = np.ones(length)

        alpha_16 = 0.673
        # Same operation order as the reference formula, so the float result
        # is bit-identical.
        expected_estimate = alpha_16 * length**2 * 2 / length

        self.assertEqual(alpha_16, hll.alpha)
        self.assertEqual(hll.estimate_cardinality(), expected_estimate)
    def test_simple_estimate_larger(self):
        """Checks alpha and the estimate for 2**14 buckets all set to 30."""
        num_buckets = 2**14
        hll = HyperLogLogPlusPlus(
            length=num_buckets,
            random_seed=42,
            num_integer_bits=self.num_integer_bits,
        )

        # Overwrite the buckets directly so the expected estimate is known.
        hll.buckets = 30 * np.ones(num_buckets)

        expected_alpha = 0.7213 / (1 + 1.079 / num_buckets)
        # Same operation order as the reference formula, so the float result
        # is bit-identical.
        expected_estimate = expected_alpha * num_buckets**2 * 2**30 / num_buckets

        self.assertEqual(expected_alpha, hll.alpha)
        self.assertEqual(hll.estimate_cardinality(), expected_estimate)
  def test_insert_same(self):
    """Adding the same value twice must not change the estimate."""
    sketch = HyperLogLogPlusPlus(random_seed=42)

    sketch.add(1)
    estimate_after_first_add = sketch.estimate_cardinality()

    sketch.add(1)
    estimate_after_second_add = sketch.estimate_cardinality()

    self.assertEqual(estimate_after_first_add, estimate_after_second_add)
  def test_merge_sparse_with_dense(self):
    """Merges a one-item sketch into a 97-item sketch of length 16.

    The merged sketch must not be in sparse mode, must differ from the
    97-item sketch in exactly one bucket, must have an empty temp set, and
    must give a larger estimate than the 97-item sketch alone.
    """
    single_item_hll = HyperLogLogPlusPlus(length=16, random_seed=234)
    single_item_hll.add(100)

    many_item_hll = HyperLogLogPlusPlus(length=16, random_seed=234)
    for item in range(16 * 6 + 1):
      many_item_hll.add(item)

    merged = single_item_hll.merge(many_item_hll)

    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')

    # Should change one bucket value given this random seed.
    unchanged_bucket_count = sum(many_item_hll.buckets == merged.buckets)
    self.assertEqual(
        unchanged_bucket_count, 16 - 1, 'Merged sketch is not correct.')

    self.assertSameElements(
        merged.temp_set, set(), 'Temp set is not correct.')
    self.assertGreater(
        merged.estimate_cardinality(), many_item_hll.estimate_cardinality())
 def test_merge_sparse_with_sparse_to_sparse(self):
   """Merges two sketches that each hold only the value 1.

   The merged sketch must stay in sparse mode, keep the same buckets as the
   first input, hold {1} as its temp set, and estimate a cardinality of 1.
   """
   left = HyperLogLogPlusPlus(length=16, random_seed=234)
   left.add(1)
   right = HyperLogLogPlusPlus(length=16, random_seed=234)
   right.add(1)

   merged = left.merge(right)

   self.assertTrue(
       merged.sparse_mode, 'Merged sketch is not in sparse mode.')
   buckets_match = all(left.buckets == merged.buckets)
   self.assertTrue(buckets_match, 'Merged sketch is not correct.')
   self.assertSameElements(
       merged.temp_set, {1}, 'Temp set is not correct.')
   self.assertEqual(
       merged.estimate_cardinality(), 1,
       'Estimated cardinality is not correct.')
  def test_single_correct_bucket_placement(self):
    """Checks that one added value is recorded in the expected bucket.

    For every (bucket prefix, suffix) pair in the fixture tables
    ``self.bucket_idx_to_bin_str`` and ``self.bin_str_to_leading_zeros``, the
    concatenated bit string is added as an integer through ``NoOpHasher``.
    The resulting bucket vector must equal, position by position, a vector
    that is zero everywhere except ``num_leading_0s + 1`` at ``bucket_idx``.
    """
    leading_zero_cases = self.bin_str_to_leading_zeros.items()
    for bucket_idx, bucket_bin_str in self.bucket_idx_to_bin_str.items():
      for leading_0_bin_str, num_leading_0s in leading_zero_cases:

        hll = HyperLogLogPlusPlus(
            length=self.vector_length,
            random_seed=42,
            hash_class=NoOpHasher,
            num_integer_bits=self.num_integer_bits)

        total_bin_str = bucket_bin_str + leading_0_bin_str
        hll.add(int(total_bin_str, 2))

        # Size the expectation from the fixture rather than a hard-coded 16,
        # so it always matches the sketch constructed above.
        expected_buckets = np.zeros(self.vector_length, dtype=np.int32)
        expected_buckets[bucket_idx] = num_leading_0s + 1

        # assertSameElements ignores order and duplicates, so it would pass
        # even if the value landed in the wrong bucket. Compare element-wise
        # to actually verify placement.
        np.testing.assert_array_equal(
            hll.buckets, expected_buckets,
            err_msg=f'Wrong bucket contents for input bits {total_bin_str!r}.')
  def test_merge_dense_with_dense(self):
    """Merges two length-16 sketches that each hold 97 items.

    The first sketch holds 0..96 and the second 100..196. The merged sketch
    must not be in sparse mode, must share at least one bucket value with the
    second sketch, must have an empty temp set, and must estimate 194 within
    10%.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    for item in range(16 * 6 + 1):
      first.add(item)
      second.add(item + 100)

    merged = first.merge(second)

    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')
    shared_bucket_count = sum(second.buckets == merged.buckets)
    self.assertGreater(
        shared_bucket_count, 0, 'Merged sketch is not correct.')
    self.assertSameElements(
        merged.temp_set, set(), 'Temp set is not correct.')
    self.assertAlmostEqual(
        merged.estimate_cardinality(), 194, delta=194 * 0.1)
  def test_merge_sparse_with_sparse_to_dense(self):
    """Merges two 48-item sketches, then repeats after adding one more item.

    With 48 + 48 items the merged sketch must still be in sparse mode and
    estimate exactly 96. After one extra item is added to the first sketch,
    the merge must leave sparse mode and estimate 97 within 5%.
    """
    first = HyperLogLogPlusPlus(length=16, random_seed=234)
    second = HyperLogLogPlusPlus(length=16, random_seed=234)
    items_per_sketch = 16 * 6 // 2
    for item in range(items_per_sketch):
      first.add(item)
      second.add(item + 100)

    merged = first.merge(second)
    self.assertTrue(
        merged.sparse_mode, 'Merged sketch should be in sparse mode.')
    self.assertEqual(
        merged.estimate_cardinality(), 96,
        'Estimated cardinality not correct under sparse mode.')

    # One more distinct item makes the merged result leave sparse mode.
    first.add(1000)
    merged = first.merge(second)
    self.assertFalse(
        merged.sparse_mode, 'Merged sketch should not be in sparse mode.')
    self.assertAlmostEqual(
        merged.estimate_cardinality(),
        97,
        delta=97 * 0.05,
        msg='Estimated cardinality not correct under dense mode.')
Ejemplo n.º 13
0
  def setUp(self):
    """Sets simulation parameters and builds the estimator configs.

    Populates ``self.name_to_non_noised_estimator_config`` and
    ``self.name_to_noised_estimator_config``, both keyed by config name.
    """
    super(InteroperabilityTest, self).setUp()

    # Simulation and set-size parameters.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3

    # Set-generator options.
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2

    # Sketch and noiser parameters.
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Independent, fixed-seed random states.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Short local aliases for values reused in the configs below.
    size = self.sketch_size
    num_hashes = self.num_bloom_filter_hashes
    decay_rate = self.exponential_bloom_filter_decay_rate
    epsilon = self.noiser_epsilon
    flip_probability = self.noiser_flip_probability
    noise_rs = self.noise_random_state

    # non-noised estimators
    plain_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator())

    plain_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator())

    plain_log_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(method='log'))

    plain_exp_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    plain_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator())

    plain_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    plain_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(size),
        estimator=HllCardinality())

    plain_configs = (
        plain_exact,
        plain_cascading_legions,
        plain_bloom_filter,
        plain_log_bloom_filter,
        plain_exp_bloom_filter,
        plain_voc,
        plain_hll,
    )
    self.name_to_non_noised_estimator_config = {
        cfg.name: cfg for cfg in plain_configs
    }

    # noised estimators
    noised_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator(),
        sketch_noiser=Noiser(flip_probability))

    noised_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))

    noised_log_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))

    noised_exp_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))

    noised_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())

    noised_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, noise_rs))

    noised_configs = (
        noised_exact,
        noised_cascading_legions,
        noised_bloom_filter,
        noised_log_bloom_filter,
        noised_exp_bloom_filter,
        noised_voc,
    )
    self.name_to_noised_estimator_config = {
        cfg.name: cfg for cfg in noised_configs
    }
Ejemplo n.º 14
0
def main(argv):
  """Runs the simulator for every sketch/estimator config and prints results.

  One ``SketchEstimatorConfig`` is built per sketch type from the
  command-line flags. Each is then run through ``Simulator`` on sets produced
  by an ``IndependentSetGenerator`` factory, and its aggregate statistics are
  printed.

  Args:
    argv: command-line arguments; anything beyond the program name is
      rejected.

  Raises:
    app.UsageError: if more than one positional argument is supplied.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  estimator_config_cascading_legions = SketchEstimatorConfig(
      name='cascading-legions',
      sketch_factory=CascadingLegions.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.sketch_size),
      estimator=Estimator())

  estimator_config_bloom_filter = SketchEstimatorConfig(
      name='bloom_filter-union_estimator',
      sketch_factory=BloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
      estimator=UnionEstimator())

  estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
      name='log_bloom_filter-first_moment_log',
      sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
          FLAGS.sketch_size),
      estimator=FirstMomentEstimator(method='log'))

  estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
      name='exp_bloom_filter-first_moment_exp',
      sketch_factory=ExponentialBloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
      estimator=FirstMomentEstimator(method='exp'))

  estimator_config_voc = SketchEstimatorConfig(
      name='vector_of_counts-sequential',
      sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
      estimator=SequentialEstimator())

  estimator_config_hll = SketchEstimatorConfig(
      name='hll++',
      sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(FLAGS.sketch_size),
      estimator=HllCardinality())

  estimator_config_exact = SketchEstimatorConfig(
      name='exact_set-lossless',
      sketch_factory=ExactSet.get_sketch_factory(),
      estimator=LosslessEstimator())

  # Order here is the order in which results are printed.
  estimator_config_list = [
      estimator_config_bloom_filter,
      estimator_config_logarithmic_bloom_filter,
      estimator_config_exponential_bloom_filter,
      estimator_config_cascading_legions,
      estimator_config_exact,
      estimator_config_hll,
      estimator_config_voc,
  ]

  set_generator_factory = (
      set_generator.IndependentSetGenerator.
      get_generator_factory_with_num_and_size(
          universe_size=FLAGS.universe_size,
          num_sets=FLAGS.number_of_sets,
          set_size=FLAGS.set_size))

  for estimator_method_config in estimator_config_list:
    print(f'Calculations for {estimator_method_config.name}')
    # Fresh random states with the same seed for every estimator --
    # presumably so each one is evaluated on identical generated sets.
    set_rs = np.random.RandomState(1)
    sketch_rs = np.random.RandomState(1)
    simulator = Simulator(
        num_runs=FLAGS.number_of_trials,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=estimator_method_config,
        set_random_state=set_rs,
        sketch_random_state=sketch_rs)

    # Only the aggregated frame is reported; the first result is discarded.
    _, agg_data = simulator.run_all_and_aggregate()
    print(f'Aggregate Statistics for {estimator_method_config.name}')
    print(agg_data)
    def setUp(self):
        """Sets simulation parameters and builds the estimator configs.

        Populates ``self.name_to_non_noised_estimator_config`` and
        ``self.name_to_noised_estimator_config``, both keyed by a short
        sketch name.
        """
        super(InteroperabilityTest, self).setUp()

        # Simulation and set-size parameters.
        self.number_of_trials = 2
        self.universe_size = 2000
        self.set_size = 5
        self.large_set_size = 6
        self.small_set_size = 3
        self.sketch_size = 64
        self.number_of_sets = 2
        self.num_large_sets = 1
        self.num_small_sets = 3

        # Set-generator options.
        self.order = set_generator.ORDER_RANDOM
        self.user_activity_association = (
            set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
        self.shared_prop = 0.2

        # Sketch and noiser parameters.
        self.num_bloom_filter_hashes = 2
        self.exponential_bloom_filter_decay_rate = 10
        self.noiser_epsilon = np.log(3)
        self.noiser_flip_probability = .25

        # Independent, fixed-seed random states.
        self.set_random_state = np.random.RandomState(42)
        self.sketch_random_state = np.random.RandomState(137)
        self.noise_random_state = np.random.RandomState(3)

        # Short local aliases for values reused in the configs below.
        size = self.sketch_size
        num_hashes = self.num_bloom_filter_hashes
        decay_rate = self.exponential_bloom_filter_decay_rate
        epsilon = self.noiser_epsilon
        flip_probability = self.noiser_flip_probability
        noise_rs = self.noise_random_state

        # non-noised estimators
        plain_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(size, size),
            estimator=Estimator(),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_log_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
            estimator=FirstMomentEstimator(method='log'),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_exp_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                size, decay_rate),
            estimator=FirstMomentEstimator(method='exp'),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_geo_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(size),
            estimator=SequentialEstimator(),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        plain_hll = EstimatorConfig(
            sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(size),
            estimator=HllCardinality(),
            sketch_noiser=None,
            estimate_noiser=None,
        )

        self.name_to_non_noised_estimator_config = {
            'exact_set': plain_exact,
            'cascading_legions': plain_cascading_legions,
            'bloom_filter': plain_bloom_filter,
            'logarithmic_bloom_filter': plain_log_bloom_filter,
            'exponential_bloom_filter': plain_exp_bloom_filter,
            'geometric_bloom_filter': plain_geo_bloom_filter,
            'vector_of_counts': plain_voc,
            'hll': plain_hll,
        }

        # noised estimators
        noised_cascading_legions = EstimatorConfig(
            sketch_factory=CascadingLegions.get_sketch_factory(size, size),
            estimator=Estimator(),
            sketch_noiser=Noiser(flip_probability),
            estimate_noiser=None,
        )
        noised_bloom_filter = EstimatorConfig(
            sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
            estimator=UnionEstimator(),
            sketch_noiser=BlipNoiser(epsilon, noise_rs),
            estimate_noiser=None,
        )
        # NOTE(review): the log and exp configs below attach a denoiser to
        # the estimator but leave sketch_noiser=None, so the sketch itself is
        # never noised here -- confirm this is intended.
        noised_log_bloom_filter = EstimatorConfig(
            sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
            estimator=FirstMomentEstimator(
                method='log',
                denoiser=SurrealDenoiser(probability=flip_probability)),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        noised_exp_bloom_filter = EstimatorConfig(
            sketch_factory=ExponentialBloomFilter.get_sketch_factory(
                size, decay_rate),
            estimator=FirstMomentEstimator(
                method='exp',
                denoiser=SurrealDenoiser(probability=flip_probability)),
            sketch_noiser=None,
            estimate_noiser=None,
        )
        noised_geo_bloom_filter = EstimatorConfig(
            sketch_factory=GeometricBloomFilter.get_sketch_factory(size),
            estimator=GeometricUnionEstimator(),
            sketch_noiser=BlipNoiser(epsilon, noise_rs),
            estimate_noiser=None,
        )
        noised_voc = EstimatorConfig(
            sketch_factory=VectorOfCounts.get_sketch_factory(size),
            estimator=SequentialEstimator(),
            sketch_noiser=LaplaceNoiser(),
            estimate_noiser=None,
        )
        noised_exact = EstimatorConfig(
            sketch_factory=ExactSet.get_sketch_factory(),
            estimator=LosslessEstimator(),
            sketch_noiser=AddRandomElementsNoiser(1, noise_rs),
            estimate_noiser=None,
        )

        self.name_to_noised_estimator_config = {
            'exact_set': noised_exact,
            'cascading_legions': noised_cascading_legions,
            'bloom_filter': noised_bloom_filter,
            'logarithmic_bloom_filter': noised_log_bloom_filter,
            'exponential_bloom_filter': noised_exp_bloom_filter,
            'geometric_bloom_filter': noised_geo_bloom_filter,
            'vector_of_counts': noised_voc,
        }