Example 1
    def __init__(self,
                 out_dir,
                 evaluation_directory,
                 evaluation_run_name,
                 evaluation_name,
                 estimable_criteria_list,
                 plot_params=None):
        """Construct an analyzer.

        Args:
          out_dir: the output directory of analysis results.
          evaluation_directory: the output directory of evaluation results. The
            analyzer will read the evaluation results and output summary tables
            and plots.
          evaluation_run_name: the run name of the evaluation.
          evaluation_name: the name of the evaluation config.
          estimable_criteria_list: a list of (error_margin, proportion_of_runs)
            tuples. An error_margin is a positive number setting the upper
            bound of the error, and proportion_of_runs is a number between 0
            and 1 that specifies the desired proportion of runs within the
            error margin.
          plot_params: a dictionary of parameters for the plot functions. If
            not given, PLOT_PARAMS is used; see PLOT_PARAMS for its definition.
        """
        self.estimable_criteria_list = estimable_criteria_list
        if plot_params is None:
            self.plot_params = PLOT_PARAMS
        else:
            self.plot_params = plot_params

        # Get all the raw results.
        self.evaluation_file_dirs = evaluator.load_directory_tree(
            out_dir=evaluation_directory,
            run_name=evaluation_run_name,
            evaluation_name=evaluation_name)
        self.raw_df = (
            CardinalityEstimatorEvaluationAnalyzer.read_evaluation_results(
                self.evaluation_file_dirs))

        # Create the analysis directory.
        if out_dir is None:
            out_dir = os.getcwd()
        if out_dir != evaluation_directory:
            shutil.copytree(self.evaluation_file_dirs[evaluator.KEY_RUN_DIR],
                            os.path.join(out_dir, evaluation_run_name))
        self.analysis_file_dirs = evaluator.load_directory_tree(
            out_dir=out_dir,
            run_name=evaluation_run_name,
            evaluation_name=evaluation_name)
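
A minimal usage sketch of the constructor above; the directory paths, run and evaluation names, and criteria values below are placeholders, not taken from the original code.

# Hypothetical instantiation; all argument values are placeholders.
analysis = CardinalityEstimatorEvaluationAnalyzer(
    out_dir='analysis_results',
    evaluation_directory='evaluation_results',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test',
    # Require 95% of runs to fall within a 5% error margin.
    estimable_criteria_list=[(0.05, 0.95)],
    plot_params=None)  # None falls back to PLOT_PARAMS.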
Example 2
    def __init__(self, out_dir, analysis_out_dir, evaluation_run_name,
                 evaluation_name):
        """Read analysis results and generate HTML report.

    Args:
      out_dir: the output direcotry of the report.
      analysis_out_dir: the output folder of the analysis results.
      evaluation_run_name: the run name of the evaluation.
      evaluation_name: the name of the evaluation configuration. For example,
        'smoke_test'.
    """
        if out_dir is None:
            out_dir = os.getcwd()
        self.out_dir = out_dir

        # Copy the analysis results to the report output directory, so that
        # the HTML report renders correctly even if the original csv files,
        # plots, etc. are later moved.
        if out_dir != analysis_out_dir:
            analysis_file_dirs = evaluator.load_directory_tree(
                out_dir=analysis_out_dir,
                run_name=evaluation_run_name,
                evaluation_name=evaluation_name)
            shutil.copytree(analysis_file_dirs[evaluator.KEY_RUN_DIR],
                            os.path.join(out_dir, evaluation_run_name))

        self.analysis_results = analyzer.get_analysis_results(
            out_dir, evaluation_run_name, evaluation_name)

        self.analysis_results[KEY_NUM_ESTIMABLE_SETS_STATS_DF] = (
            ReportGenerator.add_parsed_sketch_estimator_name_cols(
                self.analysis_results[KEY_NUM_ESTIMABLE_SETS_STATS_DF],
                analyzer.SKETCH_ESTIMATOR_NAME))
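
A minimal usage sketch, assuming the class above is ReportGenerator (the name referenced in its own body); the directory values are placeholders.

# Hypothetical call; out_dir and analysis_out_dir are placeholder paths.
report_generator = ReportGenerator(
    out_dir='report',
    analysis_out_dir='analysis_results',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test')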
Example 3
def get_analysis_results(analysis_out_dir, evaluation_run_name,
                         evaluation_name):
    """Get analysis results.

  Args:
    analysis_out_dir: the output folder of the analysis results.
    evaluation_run_name: the run name of the evaluation.
    evaluation_name: the name of the evaluation configuration. For example,
      'smoke_test'.

  Returns:
    A dictionary of the analysis results, which include:
      description_to_file_dir: a dictionary of the analysis results file tree.
      num_estimable_sets_stats_df: a data frame containing the number
        of estimable sets of estimators under different scenarios, and also
        the relative error at the number of estimable sets.
      running_time_df: a data frame containing the running time of each
        sketch_estimator.
  """
    # Read analysis result file tree.
    description_to_file_dir = evaluator.load_directory_tree(
        out_dir=analysis_out_dir,
        run_name=evaluation_run_name,
        evaluation_name=evaluation_name)

    # Read number of estimable sets analysis results.
    filename = os.path.join(
        description_to_file_dir[evaluator.KEY_EVALUATION_DIR],
        NUM_ESTIMABLE_SETS_FILENAME)
    with open(filename, 'r') as f:
        num_estimable_sets_stats_df = pd.read_csv(f)

    # Read running time.
    # Collect one row per sketch estimator, then build the data frame once
    # (pandas.DataFrame.append has been removed in pandas 2.0).
    running_time_rows = []
    for name, directory in description_to_file_dir[
            evaluator.KEY_ESTIMATOR_DIRS].items():
        filename = os.path.join(directory, evaluator.EVALUATION_RUN_TIME_FILE)
        with open(filename, 'r') as f:
            running_time = float(f.readline())
        running_time_rows.append({
            SKETCH_ESTIMATOR_COLNAME: name,
            RUNNING_TIME_COLNAME: running_time / RUNNING_TIME_SCALE
        })
    running_time_df = pd.DataFrame(
        running_time_rows,
        columns=[SKETCH_ESTIMATOR_COLNAME, RUNNING_TIME_COLNAME])
    running_time_df = running_time_df.sort_values(SKETCH_ESTIMATOR_COLNAME)

    return {
        KEY_DESCRIPTION_TO_FILE_DIR: description_to_file_dir,
        KEY_NUM_ESTIMABLE_SETS_STATS_DF: num_estimable_sets_stats_df,
        KEY_RUNNING_TIME_DF: running_time_df
    }
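
A brief usage sketch of get_analysis_results; the arguments are placeholders, and the dictionary keys are the module constants used in the return statement above.

# Hypothetical call; the argument values are placeholders.
results = get_analysis_results(
    analysis_out_dir='analysis_results',
    evaluation_run_name='eval_adbf_result',
    evaluation_name='smoke_test')
# The returned dictionary is keyed by the module constants shown above.
num_estimable_sets_stats_df = results[KEY_NUM_ESTIMABLE_SETS_STATS_DF]
running_time_df = results[KEY_RUNNING_TIME_DF]
print(running_time_df.head())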
Example 4
  def test_load_directory_tree(self):
    # Create directory.
    out_dir = self.create_tempdir('test_load_directory_tree')
    created = evaluator._create_directory_tree(
        run_name=self.run_name,
        evaluation_config=self.evaluation_config,
        sketch_estimator_config_list=self.sketch_estimator_config_list,
        out_dir=out_dir,
        overwrite=False)
    # Load directory.
    loaded = evaluator.load_directory_tree(
        run_name=self.run_name,
        evaluation_name=self.evaluation_config.name,
        out_dir=out_dir)
    self.assertEqual(created, loaded)
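
As a rough sketch of what the round trip above returns, the loaded mapping can be inspected via the key constants used in the earlier examples; the exact layout is an assumption here.

# Hypothetical inspection of the loaded directory tree; key names are taken
# from the constants used in the examples above.
print(loaded[evaluator.KEY_RUN_DIR])         # path of the run directory
print(loaded[evaluator.KEY_EVALUATION_DIR])  # path of the evaluation directory
for name, directory in loaded[evaluator.KEY_ESTIMATOR_DIRS].items():
    print(name, directory)                   # one directory per sketch estimator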
Example 5
import numpy as np
import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.nonparametric.smoothers_lowess import lowess
from wfa_cardinality_estimation_evaluation_framework.common import plotting
from wfa_cardinality_estimation_evaluation_framework.evaluations import evaluator
from wfa_cardinality_estimation_evaluation_framework.simulations import simulator
from wfa_cardinality_estimation_evaluation_framework.evaluations import analyzer

# Get all the raw results.
evaluation_file_dirs = evaluator.load_directory_tree(
    out_dir=".", run_name="eval_adbf_result", evaluation_name="4_various")
raw_df = (analyzer.CardinalityEstimatorEvaluationAnalyzer.
          read_evaluation_results(evaluation_file_dirs))

raw_df.to_csv("raw_df.csv", index=False)

df = raw_df.groupby(["num_sets", "sketch_estimator", "scenario"])\
    .agg({'relative_error_1': ['mean', 'std']})
df.columns = ['re_mean', 're_std']
df = df.reset_index()
# df["re_std_sqrt"] = np.sqrt(df["re_std"])
df["re_std_sqrt_inv"] = 1 / np.sqrt(df["re_std"])
df["re_std_log"] = np.log(df["re_std"])
df["re_std_log_cens"] = df["re_std_log"]
df.loc[df["re_std_log_cens"] > 0, "re_std_log_cens"] = 0
df["universe_size"] = (1000000 * df["scenario"].astype(float)).astype(int)
Example 6

import pandas as pd
import re
import seaborn as sns
import matplotlib.pyplot as plt
from wfa_cardinality_estimation_evaluation_framework.common import plotting
from wfa_cardinality_estimation_evaluation_framework.evaluations import evaluator
from wfa_cardinality_estimation_evaluation_framework.simulations import simulator
from wfa_cardinality_estimation_evaluation_framework.evaluations import analyzer


## simulation 1
# Get all the raw results.
evaluation_file_dirs = evaluator.load_directory_tree(
    out_dir=".",
    run_name="eval_adbf_result",
    evaluation_name="1_vary_flip_prob")
raw_df = (
    analyzer.CardinalityEstimatorEvaluationAnalyzer
    .read_evaluation_results(evaluation_file_dirs))
raw_df["flipping probabaility"] = \
    raw_df["sketch_estimator"].str.replace(".*_", "", regex=True)
raw_df["bloom filter"] = pd.Categorical(
    raw_df["sketch_estimator"].str.replace("_.*", "", regex=True), 
    categories=["exp", "log", "geo"], ordered=False)

df = raw_df.query('num_sets == 10')
# print(df)
plt.figure(figsize=(6,4))
plt.hlines(0, -1, 4, colors="grey", linestyles="dashed")
sns.boxplot(