def test_multiple_frequencies(self):
        """Check the raw simulator frame for a config with max_frequency=3.

        Two fixed multisets are fed through an ExactMultiSet sketch with a
        LosslessEstimator for a single run. The returned raw data frame must
        equal a frame holding, per row, `num_sets`, the estimated and true
        cardinalities for frequencies 1-3, the shuffle distance, the run
        index, and the relative errors for frequencies 1-3.
        """
        config = SketchEstimatorConfig(
            name='exact-set-multiple-frequencies',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator(),
            max_frequency=3)
        fixed_sets = [[1, 1, 1, 2, 2, 3], [1, 1, 1, 3, 3, 4]]
        generator_factory = FakeSetGenerator.get_generator_factory(fixed_sets)
        run_simulation = simulator.Simulator(
            num_runs=1,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config)
        raw_df, _ = run_simulation()

        # Column names are each basename suffixed with the frequency level.
        frequency_suffixes = ('1', '2', '3')
        columns = ['num_sets']
        columns += [simulator.ESTIMATED_CARDINALITY_BASENAME + suffix
                    for suffix in frequency_suffixes]
        columns += [simulator.TRUE_CARDINALITY_BASENAME + suffix
                    for suffix in frequency_suffixes]
        columns += [simulator.SHUFFLE_DISTANCE, 'run_index']
        columns += [simulator.RELATIVE_ERROR_BASENAME + suffix
                    for suffix in frequency_suffixes]

        # One row per number of sets included (1, then 2).
        rows = [
            [1, 3, 2, 1, 3, 2, 1, 0., 0, 0., 0., 0.],
            [2, 4, 3, 2, 4, 3, 2, 0., 0, 0., 0., 0.],
        ]

        pd.testing.assert_frame_equal(
            raw_df, pd.DataFrame(rows, columns=columns))
    def test_simulator_run_all_and_aggregate_write_file(self):
        """Check that frames written to file handles match the returned ones.

        The simulator is given in-memory text buffers as its raw and
        aggregated file handles. After running, each buffer is parsed back
        as CSV and compared with the corresponding returned data frame.
        """
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator())
        generator_factory = (
            set_generator.IndependentSetGenerator
            .get_generator_factory_with_num_and_size(
                universe_size=1, num_sets=1, set_size=1))

        raw_buffer = io.StringIO()
        agg_buffer = io.StringIO()
        run_simulation = simulator.Simulator(
            num_runs=5,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config,
            file_handle_raw=raw_buffer,
            file_handle_agg=agg_buffer)
        raw_df, agg_df = run_simulation()

        # The raw frame read back from the buffer must equal the returned one.
        raw_buffer.seek(0)
        pd.testing.assert_frame_equal(raw_df, pd.read_csv(raw_buffer))

        # The aggregated frame is read back with a two-row column header and
        # its first column as the index before comparing.
        agg_buffer.seek(0)
        reloaded_agg = pd.read_csv(agg_buffer, header=[0, 1], index_col=0)
        pd.testing.assert_frame_equal(agg_df, reloaded_agg)
Beispiel #3
0
  def run_one_scenario(self, scenario_config, sketch_estimator_config):
    """Evaluate one estimator configuration under one scenario.

    A sample set generator built from the scenario is pickled into the
    scenario's output directory, then the simulator is run with its raw and
    aggregated results written to files in that same directory.

    Args:
      scenario_config: scenario to run; its `name` and
        `set_generator_factory` attributes are used.
      sketch_estimator_config: estimator configuration; its `name` selects
        the output directory and the object is handed to the simulator.
    """
    logging.info('Scenario: %s', scenario_config.name)

    estimator_dirs = self.description_to_file_dir[sketch_estimator_config.name]
    output_dir = estimator_dirs[scenario_config.name]

    # Persist an example generator so the scenario setup can be inspected.
    example_generator = scenario_config.set_generator_factory(
        np.random.RandomState())
    example_path = os.path.join(output_dir, SCENARIO_CONFIG_FILE)
    with open(example_path, 'wb') as example_handle:
      pickle.dump(example_generator, example_handle)

    # Run the simulations, streaming both result frames to disk.
    raw_path = os.path.join(output_dir, RAW_RESULT_DF_FILENAME)
    agg_path = os.path.join(output_dir, AGG_RESULT_DF_FILENAME)
    with open(raw_path, 'w') as raw_handle, open(agg_path, 'w') as agg_handle:
      # NOTE(review): the keyword here is `estimator_config`; confirm it
      # matches the Simulator constructor of the version in use.
      run_simulation = simulator.Simulator(
          num_runs=self.evaluation_config.num_runs,
          set_generator_factory=scenario_config.set_generator_factory,
          estimator_config=sketch_estimator_config,
          # A deep copy is passed so the stored random state is not the
          # object handed to the simulator.
          set_random_state=copy.deepcopy(
              self.scenario_random_states[scenario_config.name]),
          file_handle_raw=raw_handle,
          file_handle_agg=agg_handle)
      run_simulation()
def get_simple_simulator(sketch_estimator_config=None):
    """Build a single-run Simulator with fixed random states.

    Args:
        sketch_estimator_config: configuration to simulate. When falsy
            (including the default None), a config named
            'exact_set-lossless' using ExactMultiSet and LosslessEstimator
            is created instead.

    Returns:
        A simulator.Simulator with num_runs=1, an IndependentSetGenerator
        factory (universe_size=1, num_sets=1, set_size=1), and sketch / set
        random states seeded with 1 and 2 respectively.
    """
    config = sketch_estimator_config or SketchEstimatorConfig(
        name='exact_set-lossless',
        sketch_factory=ExactMultiSet,
        estimator=LosslessEstimator())
    generator_factory = (
        set_generator.IndependentSetGenerator
        .get_generator_factory_with_num_and_size(
            universe_size=1, num_sets=1, set_size=1))

    return simulator.Simulator(
        num_runs=1,
        set_generator_factory=generator_factory,
        sketch_estimator_config=config,
        sketch_random_state=np.random.RandomState(1),
        set_random_state=np.random.RandomState(2))
 def test_get_sketch_different_runs_different_random_state(self):
     """Check that two runs yield different frequency-1 estimates.

     Uses RandomSketchForTestRandomSeed with EstimatorForTestRandomSeed and
     compares the estimated-cardinality values recorded for run 0 against
     those recorded for run 1.
     """
     config = SketchEstimatorConfig(
         name='random_sketch-estimator_for_test_random_seed',
         sketch_factory=RandomSketchForTestRandomSeed,
         estimator=EstimatorForTestRandomSeed())
     generator_factory = (
         set_generator.IndependentSetGenerator
         .get_generator_factory_with_num_and_size(
             universe_size=1, num_sets=1, set_size=1))
     run_simulation = simulator.Simulator(
         num_runs=2,
         set_generator_factory=generator_factory,
         sketch_estimator_config=config)
     raw_df, _ = run_simulation()

     # Select the frequency-1 estimates of each run separately.
     first_run_estimates = raw_df.loc[
         raw_df['run_index'] == 0,
         simulator.ESTIMATED_CARDINALITY_BASENAME + '1'].values
     second_run_estimates = raw_df.loc[
         raw_df['run_index'] == 1,
         simulator.ESTIMATED_CARDINALITY_BASENAME + '1'].values
     self.assertNotEqual(first_run_estimates, second_run_estimates)
 def test_shuffle_distance(self):
     """Check `_shuffle_distance` on rejected inputs and known values."""

     def shuffle_distance(first, second):
         # Every case uses a freshly constructed simulator.
         return simulator.Simulator(0, 0, 0)._shuffle_distance(first, second)

     # Inputs for which an AssertionError is expected.
     rejected_inputs = (
         ([], []),
         ([1], []),
     )
     for first, second in rejected_inputs:
         with self.assertRaises(AssertionError):
             shuffle_distance(first, second)

     # (first, second, expected distance) triples.
     expected_distances = (
         ([1], [1], 0.0),
         ([10], [10], 0.0),
         ([1, 1], [1], 1.0),
         ([1, 1], [1, 1], 0.0),
         ([2, 1, 0], [2, 2, 1], 0.5),
     )
     for first, second, expected in expected_distances:
         self.assertEqual(shuffle_distance(first, second), expected)
    def test_simulator_run_all_and_aggregate_multiple_runs(self):
        """Check the result shape of `run_all_and_aggregate` over five runs.

        The call must return two items, the first of which has five entries,
        and every value in its 'num_sets' column must be 1.
        """
        config = SketchEstimatorConfig(
            name='exact_set-lossless',
            sketch_factory=ExactMultiSet,
            estimator=LosslessEstimator())
        generator_factory = (
            set_generator.IndependentSetGenerator
            .get_generator_factory_with_num_and_size(
                universe_size=1, num_sets=1, set_size=1))

        five_run_simulator = simulator.Simulator(
            num_runs=5,
            set_generator_factory=generator_factory,
            sketch_estimator_config=config)

        results = five_run_simulator.run_all_and_aggregate()
        self.assertLen(results, 2)

        raw_df = results[0]
        self.assertLen(raw_df, 5)
        for num_sets in raw_df['num_sets']:
            self.assertEqual(num_sets, 1)