Esempio n. 1
0
 def test_noiser(self):
   """Checks the noiser output has 5 elements while the input keeps 2."""
   original = ExactSet()
   original.add_ids([1, 2])
   rng = np.random.RandomState(1)
   noiser = AddRandomElementsNoiser(num_random_elements=3, random_state=rng)
   noised = noiser(original)
   # The input sketch must keep its size; only the returned one is larger.
   self.assertLen(original, 2)
   self.assertLen(noised, 5)
Esempio n. 2
0
 def test_less_one_estimator_multiple(self):
   """Expects LessOneEstimator to return 3 for ids [1, 2] and [1, 3, 4]."""
   first = ExactSet()
   first.add_ids([1, 2])
   second = ExactSet()
   second.add_ids([1, 3, 4])
   estimator = LessOneEstimator()
   estimate = estimator([first, second])
   self.assertEqual(estimate, 3)
Esempio n. 3
0
  def setUp(self):
    """Prepares parameters, random states and estimator configs for the tests.

    Two lookup tables are stored on the test case, both keyed by the config's
    own `name`: `name_to_non_noised_estimator_config` and
    `name_to_noised_estimator_config`. No noised HyperLogLog config is built.
    """
    super(InteroperabilityTest, self).setUp()

    # Simulation and set-generation parameters.
    self.number_of_trials = 2
    self.number_of_sets = 3
    self.universe_size = 2000
    self.sketch_size = 128
    self.set_size = 50
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.shared_prop = 0.2
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)

    # Sketch and noiser parameters.
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = 0.25

    # Independently seeded random states.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Short aliases for the values reused by the configs below.
    size = self.sketch_size
    num_hashes = self.num_bloom_filter_hashes
    decay_rate = self.exponential_bloom_filter_decay_rate
    flip_probability = self.noiser_flip_probability
    epsilon = self.noiser_epsilon
    noise_rs = self.noise_random_state

    # Configs without any noiser attached.
    plain_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator())
    plain_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator())
    plain_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(method='log'))
    plain_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(method='exp'))
    plain_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator())
    plain_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())
    plain_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(size),
        estimator=HllCardinality())

    self.name_to_non_noised_estimator_config = dict(
        (cfg.name, cfg) for cfg in (
            plain_exact,
            plain_legions,
            plain_bloom,
            plain_log_bloom,
            plain_exp_bloom,
            plain_voc,
            plain_hll,
        ))

    # Configs with a sketch noiser attached; each gets its own noiser object.
    noisy_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(size, size),
        estimator=Estimator(),
        sketch_noiser=Noiser(flip_probability))
    noisy_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(size, num_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noisy_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noisy_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            size, decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(probability=flip_probability)),
        sketch_noiser=BlipNoiser(epsilon, noise_rs))
    noisy_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())
    noisy_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, noise_rs))

    self.name_to_noised_estimator_config = dict(
        (cfg.name, cfg) for cfg in (
            noisy_exact,
            noisy_legions,
            noisy_bloom,
            noisy_log_bloom,
            noisy_exp_bloom,
            noisy_voc,
        ))
Esempio n. 4
0
def main(argv):
  """Runs a simulation for every sketch/estimator config and prints results.

  For each config a `Simulator` is built from the flag values, run via
  `run_all_and_aggregate()`, and the aggregate data is printed.

  Args:
    argv: Command-line argument list; must contain at most one entry.

  Raises:
    app.UsageError: If `argv` holds more than one entry.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  estimator_config_cascading_legions = SketchEstimatorConfig(
      name='cascading-legions',
      sketch_factory=CascadingLegions.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.sketch_size),
      estimator=Estimator())

  estimator_config_bloom_filter = SketchEstimatorConfig(
      name='bloom_filter-union_estimator',
      sketch_factory=BloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
      estimator=UnionEstimator())

  estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
      name='log_bloom_filter-first_moment_log',
      sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
          FLAGS.sketch_size),
      estimator=FirstMomentEstimator(method='log'))

  estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
      name='exp_bloom_filter-first_moment_exp',
      sketch_factory=ExponentialBloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
      estimator=FirstMomentEstimator(method='exp'))

  estimator_config_voc = SketchEstimatorConfig(
      name='vector_of_counts-sequential',
      sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
      estimator=SequentialEstimator())

  estimator_config_hll = SketchEstimatorConfig(
      name='hll++',
      sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(FLAGS.sketch_size),
      estimator=HllCardinality())

  estimator_config_exact = SketchEstimatorConfig(
      name='exact_set-lossless',
      sketch_factory=ExactSet.get_sketch_factory(),
      estimator=LosslessEstimator())

  # The simulations run in exactly this order.
  estimator_config_list = [
      estimator_config_bloom_filter,
      estimator_config_logarithmic_bloom_filter,
      estimator_config_exponential_bloom_filter,
      estimator_config_cascading_legions,
      estimator_config_exact,
      estimator_config_hll,
      estimator_config_voc,
  ]

  set_generator_factory = (
      set_generator.IndependentSetGenerator.
      get_generator_factory_with_num_and_size(
          universe_size=FLAGS.universe_size,
          num_sets=FLAGS.number_of_sets,
          set_size=FLAGS.set_size))

  for estimator_method_config in estimator_config_list:
    print(f'Calculations for {estimator_method_config.name}')
    # Fresh random states, both seeded with 1, are created for every config
    # so that each simulator starts from identically seeded states.
    set_rs = np.random.RandomState(1)
    sketch_rs = np.random.RandomState(1)
    simulator = Simulator(
        num_runs=FLAGS.number_of_trials,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=estimator_method_config,
        set_random_state=set_rs,
        sketch_random_state=sketch_rs)

    # Only the aggregated data is reported; the first return value is unused.
    _, agg_data = simulator.run_all_and_aggregate()
    print(f'Aggregate Statistics for {estimator_method_config.name}')
    print(agg_data)
0
 def test_lossless_estimator(self):
   """Expects LosslessEstimator to return 2 for one set with ids [1, 2]."""
   exact_set = ExactSet()
   exact_set.add_ids([1, 2])
   estimator = LosslessEstimator()
   estimate = estimator([exact_set])
   self.assertEqual(estimate, 2)
Esempio n. 6
0
 def test_sketch(self):
   """Checks length and membership of an ExactSet after adding ids 1 and 2."""
   exact_set = ExactSet()
   exact_set.add_ids([1, 2])
   self.assertLen(exact_set, 2)
   # An added id is reported as present, an id never added as absent.
   self.assertIn(2, exact_set)
   self.assertNotIn(3, exact_set)
    def setUp(self):
        """Prepares parameters, random states and estimator configs.

        Two lookup tables are stored on the test case, keyed by sketch name:
        `name_to_non_noised_estimator_config` and
        `name_to_noised_estimator_config`. Every config is created with
        `estimate_noiser=None`; no noised HyperLogLog config is built.
        """
        super(InteroperabilityTest, self).setUp()

        # Simulation and set-generation parameters.
        self.number_of_trials = 2
        self.number_of_sets = 2
        self.universe_size = 2000
        self.sketch_size = 64
        self.set_size = 5
        self.large_set_size = 6
        self.small_set_size = 3
        self.num_large_sets = 1
        self.num_small_sets = 3
        self.shared_prop = 0.2
        self.order = set_generator.ORDER_RANDOM
        self.user_activity_association = (
            set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)

        # Sketch and noiser parameters.
        self.num_bloom_filter_hashes = 2
        self.exponential_bloom_filter_decay_rate = 10
        self.noiser_epsilon = np.log(3)
        self.noiser_flip_probability = 0.25

        # Independently seeded random states.
        self.set_random_state = np.random.RandomState(42)
        self.sketch_random_state = np.random.RandomState(137)
        self.noise_random_state = np.random.RandomState(3)

        # Short aliases for the values reused by the configs below.
        size = self.sketch_size
        num_hashes = self.num_bloom_filter_hashes
        decay_rate = self.exponential_bloom_filter_decay_rate
        flip_probability = self.noiser_flip_probability
        epsilon = self.noiser_epsilon
        noise_rs = self.noise_random_state

        def make_config(sketch_factory, estimator, sketch_noiser=None):
            # Every config in this fixture leaves the estimate noiser unset.
            return EstimatorConfig(
                sketch_factory=sketch_factory,
                estimator=estimator,
                sketch_noiser=sketch_noiser,
                estimate_noiser=None)

        # Configs without any sketch noiser.
        plain_legions = make_config(
            CascadingLegions.get_sketch_factory(size, size),
            Estimator())
        plain_bloom = make_config(
            BloomFilter.get_sketch_factory(size, num_hashes),
            UnionEstimator())
        plain_log_bloom = make_config(
            LogarithmicBloomFilter.get_sketch_factory(size),
            FirstMomentEstimator(method='log'))
        plain_exp_bloom = make_config(
            ExponentialBloomFilter.get_sketch_factory(size, decay_rate),
            FirstMomentEstimator(method='exp'))
        plain_geo_bloom = make_config(
            GeometricBloomFilter.get_sketch_factory(size),
            GeometricUnionEstimator())
        plain_voc = make_config(
            VectorOfCounts.get_sketch_factory(size),
            SequentialEstimator())
        plain_exact = make_config(
            ExactSet.get_sketch_factory(),
            LosslessEstimator())
        plain_hll = make_config(
            HyperLogLogPlusPlus.get_sketch_factory(size),
            HllCardinality())

        self.name_to_non_noised_estimator_config = {
            'exact_set': plain_exact,
            'cascading_legions': plain_legions,
            'bloom_filter': plain_bloom,
            'logarithmic_bloom_filter': plain_log_bloom,
            'exponential_bloom_filter': plain_exp_bloom,
            'geometric_bloom_filter': plain_geo_bloom,
            'vector_of_counts': plain_voc,
            'hll': plain_hll,
        }

        # Configs for the noised table.
        noisy_legions = make_config(
            CascadingLegions.get_sketch_factory(size, size),
            Estimator(),
            Noiser(flip_probability))
        noisy_bloom = make_config(
            BloomFilter.get_sketch_factory(size, num_hashes),
            UnionEstimator(),
            BlipNoiser(epsilon, noise_rs))
        # NOTE(review): the logarithmic and exponential configs below carry a
        # denoiser but no sketch noiser -- confirm this is intended.
        noisy_log_bloom = make_config(
            LogarithmicBloomFilter.get_sketch_factory(size),
            FirstMomentEstimator(
                method='log',
                denoiser=SurrealDenoiser(probability=flip_probability)))
        noisy_exp_bloom = make_config(
            ExponentialBloomFilter.get_sketch_factory(size, decay_rate),
            FirstMomentEstimator(
                method='exp',
                denoiser=SurrealDenoiser(probability=flip_probability)))
        noisy_geo_bloom = make_config(
            GeometricBloomFilter.get_sketch_factory(size),
            GeometricUnionEstimator(),
            BlipNoiser(epsilon, noise_rs))
        noisy_voc = make_config(
            VectorOfCounts.get_sketch_factory(size),
            SequentialEstimator(),
            LaplaceNoiser())
        noisy_exact = make_config(
            ExactSet.get_sketch_factory(),
            LosslessEstimator(),
            AddRandomElementsNoiser(1, noise_rs))

        self.name_to_noised_estimator_config = {
            'exact_set': noisy_exact,
            'cascading_legions': noisy_legions,
            'bloom_filter': noisy_bloom,
            'logarithmic_bloom_filter': noisy_log_bloom,
            'exponential_bloom_filter': noisy_exp_bloom,
            'geometric_bloom_filter': noisy_geo_bloom,
            'vector_of_counts': noisy_voc,
        }