def test_multiple_frequencies(self):
    """Compare the raw result frame for max_frequency=3 with a fixed table.

    Two fake sets are run once through an ExactMultiSet sketch paired with a
    LosslessEstimator, and the returned raw data frame must equal a
    hand-written frame holding one row per number of sets.
    """
    max_freq = 3
    config = SketchEstimatorConfig(
        name='exact-set-multiple-frequencies',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator(),
        max_frequency=max_freq)
    fake_sets = [[1, 1, 1, 2, 2, 3], [1, 1, 1, 3, 3, 4]]
    factory = FakeSetGenerator.get_generator_factory(fake_sets)
    sim = simulator.Simulator(
        num_runs=1,
        set_generator_factory=factory,
        sketch_estimator_config=config)
    raw_df, _ = sim()

    def per_frequency(basename):
        # One column name per frequency level: basename + '1' .. '3'.
        return [basename + str(freq) for freq in range(1, max_freq + 1)]

    expected_columns = (
        ['num_sets']
        + per_frequency(simulator.ESTIMATED_CARDINALITY_BASENAME)
        + per_frequency(simulator.TRUE_CARDINALITY_BASENAME)
        + [simulator.SHUFFLE_DISTANCE, 'run_index']
        + per_frequency(simulator.RELATIVE_ERROR_BASENAME))

    # Row layout follows expected_columns: num_sets, estimated 1..3,
    # true 1..3, shuffle distance, run index, relative error 1..3.
    row_one_set = [1, 3, 2, 1, 3, 2, 1, 0., 0, 0., 0., 0.]
    row_two_sets = [2, 4, 3, 2, 4, 3, 2, 0., 0, 0., 0., 0.]
    expected_df = pd.DataFrame(
        [row_one_set, row_two_sets], columns=expected_columns)

    pd.testing.assert_frame_equal(raw_df, expected_df)
def test_simulator_run_all_and_aggregate_write_file(self):
    """Check that the CSV written to each handle matches the returned frame.

    The simulator is given two in-memory text buffers as its raw and
    aggregated file handles; after the run, each buffer is read back with
    pandas and compared with the corresponding returned data frame.
    """
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator())
    generator_cls = set_generator.IndependentSetGenerator
    factory = generator_cls.get_generator_factory_with_num_and_size(
        universe_size=1, num_sets=1, set_size=1)

    raw_buffer = io.StringIO()
    agg_buffer = io.StringIO()
    sim = simulator.Simulator(
        num_runs=5,
        set_generator_factory=factory,
        sketch_estimator_config=config,
        file_handle_raw=raw_buffer,
        file_handle_agg=agg_buffer)
    returned_raw, returned_agg = sim()

    # Rewind each buffer before reading: the write left the position at the
    # end of the stream.
    raw_buffer.seek(0)
    pd.testing.assert_frame_equal(returned_raw, pd.read_csv(raw_buffer))

    # The aggregated frame is read back with a two-row header and the first
    # column as index.
    agg_buffer.seek(0)
    reloaded_agg = pd.read_csv(agg_buffer, header=[0, 1], index_col=0)
    pd.testing.assert_frame_equal(returned_agg, reloaded_agg)
def run_one_scenario(self, scenario_config, sketch_estimator_config):
    """Run evaluation for an estimator under a scenario.

    Pickles one example set generator into the scenario's output directory,
    then runs the simulator with its raw and aggregated results written to
    files in that same directory.

    Args:
      scenario_config: object providing `name` and `set_generator_factory`.
      sketch_estimator_config: object providing `name`; forwarded to the
        simulator.
    """
    logging.info('Scenario: %s', scenario_config.name)
    estimator_dirs = self.description_to_file_dir[sketch_estimator_config.name]
    out_dir = estimator_dirs[scenario_config.name]

    # Save an example of the scenario_config. The RandomState is unseeded,
    # so the pickled example generator is not tied to the simulation's
    # random state.
    example_generator = scenario_config.set_generator_factory(
        np.random.RandomState())
    with open(os.path.join(out_dir, SCENARIO_CONFIG_FILE), 'wb') as out:
        pickle.dump(example_generator, out)

    # Run simulations.
    raw_path = os.path.join(out_dir, RAW_RESULT_DF_FILENAME)
    agg_path = os.path.join(out_dir, AGG_RESULT_DF_FILENAME)
    with open(raw_path, 'w') as raw_handle, open(agg_path, 'w') as agg_handle:
        # Deep-copy the stored random state so the simulator does not
        # advance the copy kept in self.scenario_random_states.
        scenario_state = copy.deepcopy(
            self.scenario_random_states[scenario_config.name])
        # NOTE(review): the keyword below is `estimator_config`, while other
        # Simulator call sites in this file pass `sketch_estimator_config=`.
        # Confirm it matches simulator.Simulator's signature.
        sim = simulator.Simulator(
            num_runs=self.evaluation_config.num_runs,
            set_generator_factory=scenario_config.set_generator_factory,
            estimator_config=sketch_estimator_config,
            set_random_state=scenario_state,
            file_handle_raw=raw_handle,
            file_handle_agg=agg_handle)
        # The returned frames are not needed; results go to the two files.
        sim()
def get_simple_simulator(sketch_estimator_config=None):
    """Build a single-run Simulator with fixed random seeds.

    Args:
      sketch_estimator_config: configuration to simulate. When omitted (or
        otherwise falsy), an ExactMultiSet / LosslessEstimator configuration
        named 'exact_set-lossless' is used.

    Returns:
      A simulator.Simulator with num_runs=1, a one-set generator factory,
      sketch random state seeded with 1 and set random state seeded with 2.
    """
    config = sketch_estimator_config or SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator())
    generator_cls = set_generator.IndependentSetGenerator
    factory = generator_cls.get_generator_factory_with_num_and_size(
        universe_size=1, num_sets=1, set_size=1)
    return simulator.Simulator(
        num_runs=1,
        set_generator_factory=factory,
        sketch_estimator_config=config,
        sketch_random_state=np.random.RandomState(1),
        set_random_state=np.random.RandomState(2))
def test_get_sketch_different_runs_different_random_state(self):
    """Check that run 0 and run 1 report different frequency-1 estimates.

    Uses the RandomSketchForTestRandomSeed / EstimatorForTestRandomSeed
    pair; presumably the estimate reflects the random state handed to the
    sketch, so distinct values indicate distinct states per run.
    """
    config = SketchEstimatorConfig(
        name='random_sketch-estimator_for_test_random_seed',
        sketch_factory=RandomSketchForTestRandomSeed,
        estimator=EstimatorForTestRandomSeed())
    generator_cls = set_generator.IndependentSetGenerator
    factory = generator_cls.get_generator_factory_with_num_and_size(
        universe_size=1, num_sets=1, set_size=1)
    sim = simulator.Simulator(
        num_runs=2,
        set_generator_factory=factory,
        sketch_estimator_config=config)
    raw_df, _ = sim()

    first_run = raw_df.loc[
        raw_df['run_index'] == 0,
        simulator.ESTIMATED_CARDINALITY_BASENAME + '1'].values
    second_run = raw_df.loc[
        raw_df['run_index'] == 1,
        simulator.ESTIMATED_CARDINALITY_BASENAME + '1'].values
    self.assertNotEqual(first_run, second_run)
def test_shuffle_distance(self):
    """Exercise Simulator._shuffle_distance on rejected and accepted inputs.

    An empty second argument (with or without an empty first argument) must
    raise AssertionError; the remaining cases pin the returned distance.
    """

    def shuffle_distance(first, second):
        # A fresh simulator is built for every call; the zeros are
        # placeholder constructor arguments.
        return simulator.Simulator(0, 0, 0)._shuffle_distance(first, second)

    rejected_inputs = (
        ([], []),
        ([1], []),
    )
    for first, second in rejected_inputs:
        with self.assertRaises(AssertionError):
            shuffle_distance(first, second)

    # (first, second, expected distance)
    accepted_cases = (
        ([1], [1], 0.0),
        ([10], [10], 0.0),
        ([1, 1], [1], 1.0),
        ([1, 1], [1, 1], 0.0),
        ([2, 1, 0], [2, 2, 1], 0.5),
    )
    for first, second, expected in accepted_cases:
        self.assertEqual(shuffle_distance(first, second), expected)
def test_simulator_run_all_and_aggregate_multiple_runs(self):
    """Check the shape of run_all_and_aggregate() output for five runs.

    The call must return two items; the first must have length 5 and its
    'num_sets' column must contain only 1.
    """
    config = SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator())
    generator_cls = set_generator.IndependentSetGenerator
    factory = generator_cls.get_generator_factory_with_num_and_size(
        universe_size=1, num_sets=1, set_size=1)
    sim = simulator.Simulator(
        num_runs=5,
        set_generator_factory=factory,
        sketch_estimator_config=config)

    results = sim.run_all_and_aggregate()
    self.assertLen(results, 2)

    first_frame = results[0]
    self.assertLen(first_frame, 5)
    for num_sets in first_frame['num_sets']:
        self.assertEqual(num_sets, 1)