def test_simulator_run_all_and_aggregate_write_file(self):
    """Checks that the frames written to the file handles match the returned ones."""
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet,
        estimator=LosslessEstimator())
    generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))
    raw_buffer = io.StringIO()
    agg_buffer = io.StringIO()
    simulator = Simulator(
        num_runs=5,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config,
        file_handle_raw=raw_buffer,
        file_handle_agg=agg_buffer)
    raw_frame, agg_frame = simulator()

    # Rewind each buffer and compare what was written against the frame the
    # simulator returned.
    raw_buffer.seek(0)
    pd.testing.assert_frame_equal(raw_frame, pd.read_csv(raw_buffer))

    agg_buffer.seek(0)
    pd.testing.assert_frame_equal(
        agg_frame,
        pd.read_csv(agg_buffer, header=[0, 1], index_col=0))
def test_simulator_run_one_with_estimate_noiser(self):
    """Runs a single simulation with a fake estimate noiser attached."""
    noiser = FakeEstimateNoiser()
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet,
        estimator=LosslessEstimator(),
        estimate_noiser=noiser)
    result = get_simple_simulator(config).run_one()

    self.assertLen(result, 1)
    self.assertEqual(result['estimated_cardinality'].iloc[0], 10)
    # The fake noiser counts its invocations; one run means one call.
    self.assertEqual(noiser._calls, 1)
def get_simple_simulator(sketch_estimator_config=None):
    """Builds a single-run Simulator with fixed random states.

    Args:
      sketch_estimator_config: the SketchEstimatorConfig to simulate. When
        None, a config using ExactSet with LosslessEstimator is created.

    Returns:
      A Simulator with num_runs=1 whose sets come from an
      IndependentSetGenerator factory (universe_size=1, num_sets=1,
      set_size=1), with sketch_random_state seeded with 1 and
      set_random_state seeded with 2.
    """
    # Compare against None explicitly so that only an omitted argument
    # triggers the default, not any config that happens to be falsy.
    if sketch_estimator_config is None:
        sketch_estimator_config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactSet,
            estimator=LosslessEstimator())
    set_generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))
    return Simulator(
        num_runs=1,
        set_generator_factory=set_generator_factory,
        sketch_estimator_config=sketch_estimator_config,
        sketch_random_state=np.random.RandomState(1),
        set_random_state=np.random.RandomState(2))
def test_get_sketch_different_runs_different_random_state(self):
    """Checks that two runs yield different estimates for the seed-test sketch."""
    config = SketchEstimatorConfig(
        name='random_sketch-estimator_for_test_random_seed',
        sketch_factory=RandomSketchForTestRandomSeed,
        estimator=EstimatorForTestRandomSeed())
    generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))
    simulator = Simulator(
        num_runs=2,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config)
    results, _ = simulator()

    # Pull out the estimates of each run before comparing them.
    first_run = results.loc[
        results['run_index'] == 0, 'estimated_cardinality'].values
    second_run = results.loc[
        results['run_index'] == 1, 'estimated_cardinality'].values
    self.assertNotEqual(first_run, second_run)
def test_get_sketch_same_run_same_random_state(self):
    """Checks that estimates within one run agree for the seed-test sketch."""
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=RandomSketchForTestRandomSeed,
        estimator=EstimatorForTestRandomSeed())
    generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=2, set_size=1))
    simulator = Simulator(
        num_runs=1,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config)
    results, _ = simulator()

    # Rows are selected by how many sets had been combined at that point.
    with_one_set = results.loc[
        results['num_sets'] == 1, 'estimated_cardinality'].values
    with_two_sets = results.loc[
        results['num_sets'] == 2, 'estimated_cardinality'].values
    self.assertEqual(with_one_set, with_two_sets)
def test_simulator_run_all_and_aggregate_with_noise(self):
    """Checks the raw results when a sketch noiser adds three random elements."""
    noise_random_state = np.random.RandomState(3)
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet,
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(
            num_random_elements=3, random_state=noise_random_state))
    simulator = get_simple_simulator(config)
    frames = simulator.run_all_and_aggregate()

    self.assertLen(frames, 2)
    for num_sets in frames[0]['num_sets']:
        self.assertEqual(num_sets, 1)
    # One true element plus the three added by the noiser.
    self.assertEqual(frames[0]['estimated_cardinality'][0], 4)
    self.assertEqual(frames[0]['true_cardinality'][0], 1)
    self.assertEqual(frames[0]['relative_error'][0], 3)
def test_simulator_run_all_and_aggregate_multiple_runs(self):
    """Checks that five runs produce five raw rows, each with num_sets == 1."""
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet,
        estimator=LosslessEstimator())
    generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))
    simulator = Simulator(
        num_runs=5,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config)
    frames = simulator.run_all_and_aggregate()

    self.assertLen(frames, 2)
    self.assertLen(frames[0], 5)
    for num_sets in frames[0]['num_sets']:
        self.assertEqual(num_sets, 1)
def setUp(self):
    """Sets shared test parameters and builds the estimator config lookups.

    Two name-keyed dictionaries are stored on the test instance:
    `name_to_non_noised_estimator_config` and
    `name_to_noised_estimator_config`.
    """
    super(InteroperabilityTest, self).setUp()

    # Scalar parameters shared by the tests.
    self.number_of_trials = 2
    self.universe_size = 2000
    self.set_size_list = [5, 7, 9]
    self.large_set_size = 6
    self.small_set_size = 3
    self.sketch_size = 128
    self.number_of_sets = 3
    self.set_size = 50
    self.num_large_sets = 1
    self.num_small_sets = 3
    self.order = set_generator.ORDER_RANDOM
    self.user_activity_association = (
        set_generator.USER_ACTIVITY_ASSOCIATION_INDEPENDENT)
    self.shared_prop = 0.2
    self.num_bloom_filter_hashes = 2
    self.exponential_bloom_filter_decay_rate = 10
    self.noiser_epsilon = np.log(3)
    self.noiser_flip_probability = .25

    # Seeded random states.
    self.set_random_state = np.random.RandomState(42)
    self.sketch_random_state = np.random.RandomState(137)
    self.noise_random_state = np.random.RandomState(3)

    # Configs without any sketch noiser.
    plain_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator())
    plain_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator())
    plain_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(method='log'))
    plain_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))
    plain_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator())
    plain_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())
    plain_hll = SketchEstimatorConfig(
        name='hyper_log_log',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
            self.sketch_size),
        estimator=HllCardinality())

    plain_configs = (
        plain_exact,
        plain_legions,
        plain_bloom,
        plain_log_bloom,
        plain_exp_bloom,
        plain_voc,
        plain_hll,
    )
    self.name_to_non_noised_estimator_config = {
        entry.name: entry for entry in plain_configs
    }

    # Configs with a sketch noiser attached; each gets its own noiser
    # instance.
    noisy_legions = SketchEstimatorConfig(
        name='cascading_legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            self.sketch_size, self.sketch_size),
        estimator=Estimator(),
        sketch_noiser=Noiser(self.noiser_flip_probability))
    noisy_bloom = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            self.sketch_size, self.num_bloom_filter_hashes),
        estimator=UnionEstimator(),
        sketch_noiser=BlipNoiser(
            self.noiser_epsilon, self.noise_random_state))
    noisy_log_bloom = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            self.sketch_size),
        estimator=FirstMomentEstimator(
            method='log',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(
            self.noiser_epsilon, self.noise_random_state))
    noisy_exp_bloom = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            self.sketch_size, self.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(
            method='exp',
            denoiser=SurrealDenoiser(
                probability=self.noiser_flip_probability)),
        sketch_noiser=BlipNoiser(
            self.noiser_epsilon, self.noise_random_state))
    noisy_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(self.sketch_size),
        estimator=SequentialEstimator(),
        sketch_noiser=LaplaceNoiser())
    noisy_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator(),
        sketch_noiser=AddRandomElementsNoiser(1, self.noise_random_state))

    noisy_configs = (
        noisy_exact,
        noisy_legions,
        noisy_bloom,
        noisy_log_bloom,
        noisy_exp_bloom,
        noisy_voc,
    )
    self.name_to_noised_estimator_config = {
        entry.name: entry for entry in noisy_configs
    }
def main(argv):
    """Runs the simulator for every estimator config and prints the results.

    For each sketch/estimator configuration a Simulator is created with
    freshly seeded random states, `run_all_and_aggregate` is called, and the
    aggregate statistics are printed.

    Args:
      argv: command-line arguments; only the program name is accepted.

    Raises:
      app.UsageError: if more than one command-line argument is given.
    """
    if len(argv) > 1:
        raise app.UsageError('Too many command-line arguments.')

    estimator_config_cascading_legions = SketchEstimatorConfig(
        name='cascading-legions',
        sketch_factory=CascadingLegions.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.sketch_size),
        estimator=Estimator())

    estimator_config_bloom_filter = SketchEstimatorConfig(
        name='bloom_filter-union_estimator',
        sketch_factory=BloomFilter.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.num_bloom_filter_hashes),
        estimator=UnionEstimator())

    estimator_config_logarithmic_bloom_filter = SketchEstimatorConfig(
        name='log_bloom_filter-first_moment_log',
        sketch_factory=LogarithmicBloomFilter.get_sketch_factory(
            FLAGS.sketch_size),
        estimator=FirstMomentEstimator(method='log'))

    estimator_config_exponential_bloom_filter = SketchEstimatorConfig(
        name='exp_bloom_filter-first_moment_exp',
        sketch_factory=ExponentialBloomFilter.get_sketch_factory(
            FLAGS.sketch_size, FLAGS.exponential_bloom_filter_decay_rate),
        estimator=FirstMomentEstimator(method='exp'))

    estimator_config_voc = SketchEstimatorConfig(
        name='vector_of_counts-sequential',
        sketch_factory=VectorOfCounts.get_sketch_factory(FLAGS.sketch_size),
        estimator=SequentialEstimator())

    estimator_config_hll = SketchEstimatorConfig(
        name='hll++',
        sketch_factory=HyperLogLogPlusPlus.get_sketch_factory(
            FLAGS.sketch_size),
        estimator=HllCardinality())

    estimator_config_exact = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactSet.get_sketch_factory(),
        estimator=LosslessEstimator())

    estimator_config_list = [
        estimator_config_bloom_filter,
        estimator_config_logarithmic_bloom_filter,
        estimator_config_exponential_bloom_filter,
        estimator_config_cascading_legions,
        estimator_config_exact,
        estimator_config_hll,
        estimator_config_voc,
    ]

    # The former name -> config lookup dict was never read, so it has been
    # dropped; every config in the list above is always evaluated.

    set_generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=FLAGS.universe_size,
            num_sets=FLAGS.number_of_sets,
            set_size=FLAGS.set_size))

    for estimator_method_config in estimator_config_list:
        print(f'Calculations for {estimator_method_config.name}')
        # Fresh, identically seeded random states for every configuration.
        set_rs = np.random.RandomState(1)
        sketch_rs = np.random.RandomState(1)
        simulator = Simulator(
            num_runs=FLAGS.number_of_trials,
            set_generator_factory=set_generator_factory,
            sketch_estimator_config=estimator_method_config,
            set_random_state=set_rs,
            sketch_random_state=sketch_rs)

        _, agg_data = simulator.run_all_and_aggregate()
        print(f'Aggregate Statistics for {estimator_method_config.name}')
        print(agg_data)
conf().name: conf for conf in EVALUATION_CONFIGS_TUPLE } EVALUATION_CONFIG_NAMES = tuple(NAME_TO_EVALUATION_CONFIGS.keys()) # Document the estimators. # The name attribute of the SketchEstimatorConfig should conform to # name_of_sketch-param_of_sketch-epsilon_value-estimator_specification. # For example, if a user want to evaluate Bloom Filter of length 1000 with # epsilon 0.1, and the UnionEstimator, then the name could be: # bloom_filter-1e4-0.1-union. LOG_BLOOM_FILTER_1E5_LN3_FIRST_MOMENT_LOG = SketchEstimatorConfig( name='log_bloom_filter-1e5-ln3-first_moment_log', sketch_factory=bloom_filters.LogarithmicBloomFilter.get_sketch_factory( length=10**5), estimator=bloom_filters.FirstMomentEstimator( method=bloom_filters.FirstMomentEstimator.METHOD_LOG, denoiser=bloom_filters.SurrealDenoiser(probability=0.25)), sketch_noiser=bloom_filters.BlipNoiser(epsilon=np.log(3))) LOG_BLOOM_FILTER_1E5_INFTY_FIRST_MOMENT_LOG = SketchEstimatorConfig( name='log_bloom_filter-1e5-infty-first_moment_log', sketch_factory=bloom_filters.LogarithmicBloomFilter.get_sketch_factory( length=10**5), estimator=bloom_filters.FirstMomentEstimator( method=bloom_filters.FirstMomentEstimator.METHOD_LOG)) EXP_BLOOM_FILTER_1E5_10_LN3_FIRST_MOMENT_LOG = SketchEstimatorConfig( name='exp_bloom_filter-1e5_10-ln3-first_moment_exp', sketch_factory=bloom_filters.ExponentialBloomFilter.get_sketch_factory( length=10**5, decay_rate=10),