def test_simulator_run_all_and_aggregate_write_file(self):
        """Checks that the frames saved by the simulator match the returned ones.

        The simulator receives two in-memory text buffers as its raw and
        aggregate file handles. After it has been called, each buffer is
        rewound, parsed as CSV and compared with the corresponding data frame
        returned by the call.
        """
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator())
        generator_factory = (set_generator.IndependentSetGenerator.
                             get_generator_factory_with_num_and_size(
                                 universe_size=1, num_sets=1, set_size=1))

        raw_buffer = io.StringIO()
        agg_buffer = io.StringIO()
        simulator = Simulator(num_runs=5,
                              set_generator_factory=generator_factory,
                              sketch_estimator_config=config,
                              file_handle_raw=raw_buffer,
                              file_handle_agg=agg_buffer)
        returned_raw, returned_agg = simulator()

        # The raw results must survive a round trip through the CSV buffer.
        raw_buffer.seek(0)
        pd.testing.assert_frame_equal(returned_raw, pd.read_csv(raw_buffer))

        # The aggregated frame is read back with a two-row header and the
        # first column as index before it is compared.
        agg_buffer.seek(0)
        reloaded_agg = pd.read_csv(agg_buffer, header=[0, 1], index_col=0)
        pd.testing.assert_frame_equal(returned_agg, reloaded_agg)
 def test_simulator_run_one_with_estimate_noiser(self):
     """Checks a single run when an estimate noiser is configured.

     Expects one result row with an estimated cardinality of 10 and
     exactly one recorded call on the fake noiser.
     """
     noiser = FakeEstimateNoiser()
     config = SketchEstimatorConfig(
         name='exact_set-lossless',
         sketch_factory=ExactSet,
         estimator=LosslessEstimator(),
         estimate_noiser=noiser)
     result = get_simple_simulator(config).run_one()
     self.assertLen(result, 1)
     first_estimate = result['estimated_cardinality'].iloc[0]
     self.assertEqual(first_estimate, 10)
     # The fake exposes its call counter as `_calls`.
     self.assertEqual(noiser._calls, 1)
def get_simple_simulator(sketch_estimator_config=None):
    """Builds a single-run Simulator with fixed random seeds.

    Args:
        sketch_estimator_config: Optional SketchEstimatorConfig to simulate.
            When omitted (None), an 'exact_set-lossless' config made of
            ExactSet and LosslessEstimator is used.

    Returns:
        A Simulator with num_runs=1, an independent set generator factory
        built with universe_size=1, num_sets=1 and set_size=1, a sketch
        random state seeded with 1 and a set random state seeded with 2.
    """
    # Compare against None explicitly so that only an omitted argument
    # triggers the default, regardless of the truthiness of the config.
    if sketch_estimator_config is None:
        sketch_estimator_config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator())
    set_generator_factory = (set_generator.IndependentSetGenerator.
                             get_generator_factory_with_num_and_size(
                                 universe_size=1, num_sets=1, set_size=1))

    return Simulator(num_runs=1,
                     set_generator_factory=set_generator_factory,
                     sketch_estimator_config=sketch_estimator_config,
                     sketch_random_state=np.random.RandomState(1),
                     set_random_state=np.random.RandomState(2))
 def test_get_sketch_different_runs_different_random_state(self):
     """Checks that two runs produce different estimated cardinalities.

     NOTE(review): presumably the test sketch/estimator pair surfaces the
     random seed through the estimate, so differing estimates imply
     differing random states -- confirm against their definitions.
     """
     sketch_estimator_config = SketchEstimatorConfig(
         name='random_sketch-estimator_for_test_random_seed',
         sketch_factory=RandomSketchForTestRandomSeed,
         estimator=EstimatorForTestRandomSeed())
     set_generator_factory = (set_generator.IndependentSetGenerator.
                              get_generator_factory_with_num_and_size(
                                  universe_size=1, num_sets=1, set_size=1))
     sim = Simulator(num_runs=2,
                     set_generator_factory=set_generator_factory,
                     sketch_estimator_config=sketch_estimator_config)
     df, _ = sim()
     estimates = df['estimated_cardinality']
     # Compare plain lists rather than numpy arrays: asserting on arrays
     # depends on the truth value of an element-wise comparison, which
     # raises ValueError when a side holds more than one element.
     self.assertNotEqual(
         estimates[df['run_index'] == 0].values.tolist(),
         estimates[df['run_index'] == 1].values.tolist())
 def test_get_sketch_same_run_same_random_state(self):
     """Checks that, within one run, both set counts give equal estimates.

     NOTE(review): presumably the test sketch/estimator pair surfaces the
     random seed through the estimate, so equal estimates imply a shared
     random state within the run -- confirm against their definitions.
     """
     sketch_estimator_config = SketchEstimatorConfig(
         name='exact_set-lossless',
         sketch_factory=RandomSketchForTestRandomSeed,
         estimator=EstimatorForTestRandomSeed())
     set_generator_factory = (set_generator.IndependentSetGenerator.
                              get_generator_factory_with_num_and_size(
                                  universe_size=1, num_sets=2, set_size=1))
     sim = Simulator(num_runs=1,
                     set_generator_factory=set_generator_factory,
                     sketch_estimator_config=sketch_estimator_config)
     df, _ = sim()
     estimates = df['estimated_cardinality']
     # Compare plain lists rather than numpy arrays: asserting on arrays
     # depends on the truth value of an element-wise comparison, which
     # raises ValueError when a side holds more than one element.
     self.assertEqual(
         estimates[df['num_sets'] == 1].values.tolist(),
         estimates[df['num_sets'] == 2].values.tolist())
    def test_simulator_run_all_and_aggregate_with_noise(self):
        """Checks the results when a sketch noiser adds three random elements.

        Expects two returned frames; in the first one every `num_sets` value
        is 1 and the first row has an estimated cardinality of 4, a true
        cardinality of 1 and a relative error of 3.
        """
        noise_random_state = np.random.RandomState(3)
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator(),
            sketch_noiser=AddRandomElementsNoiser(
                num_random_elements=3, random_state=noise_random_state))
        simulator = get_simple_simulator(config)

        results = simulator.run_all_and_aggregate()
        self.assertLen(results, 2)
        raw_frame = results[0]
        for num_sets in raw_frame['num_sets']:
            self.assertEqual(num_sets, 1)
        # Rows are looked up by the label 0 of each column.
        self.assertEqual(raw_frame['estimated_cardinality'][0], 4)
        self.assertEqual(raw_frame['true_cardinality'][0], 1)
        self.assertEqual(raw_frame['relative_error'][0], 3)
    def test_simulator_run_all_and_aggregate_multiple_runs(self):
        """Checks the shape of the results when the simulator runs five times.

        Expects two returned frames, five rows in the first one, and a
        `num_sets` value of 1 in each of those rows.
        """
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator())
        generator_factory = (set_generator.IndependentSetGenerator.
                             get_generator_factory_with_num_and_size(
                                 universe_size=1, num_sets=1, set_size=1))
        simulator = Simulator(num_runs=5,
                              set_generator_factory=generator_factory,
                              sketch_estimator_config=config)

        results = simulator.run_all_and_aggregate()
        self.assertLen(results, 2)
        raw_frame = results[0]
        self.assertLen(raw_frame, 5)
        for num_sets in raw_frame['num_sets']:
            self.assertEqual(num_sets, 1)
Esempio n. 8
0
  def setUp(self):
    """Sets the shared test parameters and the estimator config lookups.

    Populates the scalar parameters and seeded random states on `self`,
    then builds two name -> SketchEstimatorConfig dictionaries: one for
    configs without a sketch noiser and one for configs with a noiser.
    """
    super(InteroperabilityTest, self).setUp()

    # Scalar parameters.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Seeded random states.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Configs without any sketch noiser.
    plain_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator())

    plain_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    plain_log_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    plain_exp_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    plain_vector_of_counts = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator())

    plain_exact_set = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    plain_hyper_log_log = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(self.sketch_size),
        estimator=HllCardinality())

    plain_configs = (
        plain_exact_set,
        plain_cascading_legions,
        plain_bloom_filter,
        plain_log_bloom_filter,
        plain_exp_bloom_filter,
        plain_vector_of_counts,
        plain_hyper_log_log,
    )
    self.name_to_non_noised_estimator_config = dict(
        (cfg.name, cfg) for cfg in plain_configs)

    # Configs with a sketch noiser. They reuse the names of the plain
    # configs and are kept in a separate dictionary.
    noisy_cascading_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator(),
        sketch_noiser=Noiser(self.noiser_flip_probability))

    noisy_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noisy_log_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noisy_exp_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(self.noiser_epsilon, self.noise_random_state))

    noisy_vector_of_counts = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())

    noisy_exact_set = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

    noisy_configs = (
        noisy_exact_set,
        noisy_cascading_legions,
        noisy_bloom_filter,
        noisy_log_bloom_filter,
        noisy_exp_bloom_filter,
        noisy_vector_of_counts,
    )
    self.name_to_noised_estimator_config = dict(
        (cfg.name, cfg) for cfg in noisy_configs)
Esempio n. 9
0
def main(argv):
  """Simulates every estimator config and prints its aggregate statistics.

  One Simulator is built per config from the command-line FLAGS, each with
  freshly created set and sketch random states seeded with 1.

  Args:
    argv: Command-line arguments; more than one element is rejected.

  Raises:
    app.UsageError: If `argv` holds more than one element.
  """
  if len(argv) > 1:
    raise app.UsageError('Too many command-line arguments.')

  estimator_config_cascading_legions = SketchEstimatorConfig(
      name='cascading-legions',
      sketch_factory=CascadingLegions.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.sketch_size),
      estimator=Estimator())

  estimator_config_bloom_filter = SketchEstimatorConfig(
      name='bloom_filter-union_estimator',
      sketch_factory=BloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
      estimator=UnionEstimator())

  estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
      name='log_bloom_filter-first_moment_log',
      sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
          FLAGS.sketch_size),
      estimator=FirstMomentEstimator(method='log'))

  estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
      name='exp_bloom_filter-first_moment_exp',
      sketch_factory=ExponentialBloomFilter.get_sketch_factory(
          FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
      estimator=FirstMomentEstimator(method='exp'))

  estimator_config_voc = SketchEstimatorConfig(
      name='vector_of_counts-sequential',
      sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
      estimator=SequentialEstimator())

  estimator_config_hll = SketchEstimatorConfig(
      name='hll++',
      sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(FLAGS.sketch_size),
      estimator=HllCardinality())

  estimator_config_exact = SketchEstimatorConfig(
      name='exact_set-lossless',
      sketch_factory=ExactSet.get_sketch_factory(),
      estimator=LosslessEstimator())

  # Configs are simulated in this order.
  estimator_config_list = [
      estimator_config_bloom_filter,
      estimator_config_logarithmic_bloom_filter,
      estimator_config_exponential_bloom_filter,
      estimator_config_cascading_legions,
      estimator_config_exact,
      estimator_config_hll,
      estimator_config_voc,
  ]

  set_generator_factory = (
      set_generator.IndependentSetGenerator.
      get_generator_factory_with_num_and_size(
          universe_size=FLAGS.universe_size,
          num_sets=FLAGS.number_of_sets,
          set_size=FLAGS.set_size))

  for estimator_method_config in estimator_config_list:
    print(f'Calculations for {estimator_method_config.name}')
    # New random states per config, so each simulation starts from seed 1.
    set_rs = np.random.RandomState(1)
    sketch_rs = np.random.RandomState(1)
    simulator = Simulator(
        num_runs=FLAGS.number_of_trials,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=estimator_method_config,
        set_random_state=set_rs,
        sketch_random_state=sketch_rs)

    # Only the second returned item (the aggregate data) is printed.
    _, agg_data = simulator.run_all_and_aggregate()
    print(f'Aggregate Statistics for {estimator_method_config.name}')
    print(agg_data)
    conf().name: conf for conf in EVALUATION_CONFIGS_TUPLE
}

# Tuple of the keys of NAME_TO_EVALUATION_CONFIGS.
EVALUATION_CONFIG_NAMES = tuple(NAME_TO_EVALUATION_CONFIGS)


# Document the estimators.
# The name attribute of the SketchEstimatorConfig should conform to
# name_of_sketch-param_of_sketch-epsilon_value-estimator_specification.
# For example, if a user wants to evaluate a Bloom Filter of length 10000 with
# epsilon 0.1 and the UnionEstimator, then the name could be:
# bloom_filter-1e4-0.1-union.
# Logarithmic Bloom filter of length 10**5, configured with a BlipNoiser
# (epsilon = ln 3) as sketch noiser and a FirstMomentEstimator using
# METHOD_LOG together with a SurrealDenoiser of probability 0.25.
# NOTE(review): 0.25 equals 1 / (1 + e^epsilon) for epsilon = ln 3; presumably
# the denoiser probability has to match the noiser's flip probability --
# confirm against bloom_filters.
LOG_BLOOM_FILTER_1E5_LN3_FIRST_MOMENT_LOG = SketchEstimatorConfig(
    name='log_bloom_filter-1e5-ln3-first_moment_log',
    sketch_factory=bloom_filters.LogarithmicBloomFilter.get_sketch_factory(
        length=10**5),
    estimator=bloom_filters.FirstMomentEstimator(
        method=bloom_filters.FirstMomentEstimator.METHOD_LOG,
        denoiser=bloom_filters.SurrealDenoiser(probability=0.25)),
    sketch_noiser=bloom_filters.BlipNoiser(epsilon=np.log(3)))

# Logarithmic Bloom filter of length 10**5 with a FirstMomentEstimator using
# METHOD_LOG; no sketch noiser and no denoiser are passed.
# NOTE(review): presumably 'infty' in the epsilon slot of the name denotes the
# absence of noise -- confirm.
LOG_BLOOM_FILTER_1E5_INFTY_FIRST_MOMENT_LOG = SketchEstimatorConfig(
    name='log_bloom_filter-1e5-infty-first_moment_log',
    sketch_factory=bloom_filters.LogarithmicBloomFilter.get_sketch_factory(
        length=10**5),
    estimator=bloom_filters.FirstMomentEstimator(
        method=bloom_filters.FirstMomentEstimator.METHOD_LOG))

EXP_BLOOM_FILTER_1E5_10_LN3_FIRST_MOMENT_LOG = SketchEstimatorConfig(
    name='exp_bloom_filter-1e5_10-ln3-first_moment_exp',
    sketch_factory=bloom_filters.ExponentialBloomFilter.get_sketch_factory(
        length=10**5, decay_rate=10),