def run(self):

        self.vizgraph = VizGraph()
        with open(self.get_workflow_path()) as f:
            json_data = json.load(f)
            for s in json_data["setup"]:
                self.vizgraph.add_viz(s)

            for s in json_data["setup"]:
                self.vizgraph.apply_interaction(Operation(s))

            self.workflow_interactions = json_data["interactions"]

            self.operation_results = OrderedDict({ "args": vars(self.options), "results": OrderedDict() })
            self.current_interaction_index = 0
            self.current_vizrequest_index = 0
            self.benchmark_start_time = util.get_current_ms_time()

            try:
                logger.info("calling \"workflow_start\" on driver")
                self.driver.workflow_start()
            except AttributeError:
                pass

            global do_poll
            do_poll = True
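            # poll_results: background consumer that drains the shared result queue
            # and hands each completed request to deliver_viz_request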
            def poll_results(slf, queue):
                while do_poll:
                    try:
                        process_result = queue.get(timeout=1)
                    except Empty:
                        logger.info("result queue empty... trying again")
                        continue
                    if process_result is None:
                        continue
                    slf.deliver_viz_request([process_result])
                logger.info("stopped polling results")

                try:
                    while queue.get(timeout=0.01):
                        pass
                except Empty:
                    logger.info("result queue cleared")

            if not self.options.groundtruth:
                thread = Thread(target=poll_results, args=(self, IDEBench.result_queue))
                thread.start()
            
            interaction_index = 0
            while interaction_index < len(self.workflow_interactions):
                self.process_interaction(interaction_index)
                interaction_index += 1

            do_poll = False
            if not self.options.groundtruth:
                thread.join()
            self.end_run()
Example #2
    def run(self):

        try:
            self.driver.workflow_start()
        except AttributeError:
            pass

        with open(self.get_workflow_path()) as f:
            self.workflow_interactions = json.load(f)["interactions"]

        self.vizgraph = VizGraph()
        self.operation_results = {"args": vars(self.options), "results": {}}
        self.current_interaction_index = 0
        self.current_vizrequest_index = 0
        self.process_interaction(0)
Example #3
class IDEBench:

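    # queue shared with driver worker processes; drivers post completed viz requests here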
    result_queue = multiprocessing.Queue()

    def __init__(self):

        parser = OptionParser()
        parser.add_option("--driver-name",
                          dest="driver_name",
                          action="store",
                          help="Driver name")
        parser.add_option("--driver-create-storage",
                          dest="driver_create_storage",
                          action="store_true",
                          help="Calls create_storage on driver",
                          default=False)
        parser.add_option("--driver-clear-storage",
                          dest="driver_clear_storage",
                          action="store_true",
                          help="Calls clear_storage on driver",
                          default=False)
        parser.add_option("--driver-clear-cache",
                          dest="driver_clear_cache",
                          action="store_true",
                          help="Calls clear_cache on driver",
                          default=False)
        parser.add_option("--driver-args",
                          dest="driver_args",
                          action="store",
                          help="Arguments to pass to the driver",
                          default="")

        parser.add_option("--settings-normalized",
                          dest="settings_normalized",
                          action="store_true",
                          help="Whether joins should be used",
                          default=False)
        parser.add_option("--settings-dataset",
                          dest="settings_dataset",
                          action="store",
                          help="Name of the dataset")
        parser.add_option("--settings-size",
                          dest="settings_size",
                          default="",
                          action="store",
                          help="Number of rows in the dataset")
        parser.add_option("--settings-thinktime",
                          dest="settings_thinktime",
                          type="int",
                          action="store",
                          help="Think-time in seconds between two executions",
                          default=1000)
        parser.add_option("--settings-time-requirement",
                          dest="settings_time_requirement",
                          action="store",
                          help="The Time requirement to be used",
                          default=1000)
        parser.add_option("--settings-confidence-level",
                          dest="settings_confidence_level",
                          action="store",
                          help="The confidence level to be used",
                          default=95)
        parser.add_option("--settings-workflow",
                          dest="settings_workflow",
                          action="store",
                          help="The workflow file to be used")

        parser.add_option("--evaluate",
                          dest="evaluate",
                          action="store_true",
                          help="Evaluates the benchmark results for the current configuration",
                          default=False)
        parser.add_option(
            "--create--full-report",
            dest="create_report",
            action="store_true",
            help=
            "Merges all reports in the reports directory into a single file",
            default=False)
        parser.add_option("--run",
                          dest="run",
                          action="store_true",
                          help="Flag to run the benchmark without config file",
                          default=False)
        parser.add_option(
            "--run-config",
            dest="config",
            action="store",
            help="Flag to run the benchmark with the specified config file")
        parser.add_option(
            "--groundtruth",
            dest="groundtruth",
            action="store_true",
            help="If set computes the ground-truth for the specified workflow",
            default=False)

        (self.options, args) = parser.parse_args()

        if not self.options.config:

            if self.options.create_report:
                self.create_report()
                return

            if not self.options.driver_name:
                parser.error("No driver name specified.")

            if not self.options.settings_dataset:
                parser.error("No dataset specified.")

            if not self.options.settings_size:
                print("Warning: No dataset size specified.")

            if self.options.groundtruth or self.options.run:
                self.setup()

            if self.options.groundtruth:

                self.options.settings_thinktime = 1
                self.options.settings_time_requirement = 999999

                workflow_files = glob.glob("data/" +
                                           self.options.settings_dataset +
                                           "/workflows/*.json")

                for workflow_file in workflow_files:
                    self.options.settings_workflow = basename(
                        workflow_file).split(".")[0]
                    self.run()

            elif self.options.run:

                if not self.options.settings_workflow:
                    parser.error("No workflow specified.")

                self.run()
            elif self.options.evaluate:
                self.evaluate(self.get_config_hash())
        else:

            with open(self.options.config) as f:
                config = json.load(f)
                assure_path_exists("./results")
                for d in config["settings-datasets"]:
                    assure_path_exists("./data/%s/groundtruths" % d)

                # TODO: create pairs instead
                for dataset in config["settings-datasets"]:
                    self.options.settings_dataset = dataset

                    for driver_name in config["driver-names"]:
                        for driver_arg in config["driver-args"]:

                            self.options.driver_name = driver_name
                            self.setup(driver_arg)

                            for size in config["settings-sizes"]:
                                for workflow in config["settings-workflows"]:
                                    for thinktime in config[
                                            "settings-thinktimes"]:
                                        for time_requirement in config[
                                                "settings-time-requirements"]:
                                            for confidence_level in config[
                                                    "settings-confidence-levels"]:

                                                self.options.driver_name = driver_name

                                                self.options.settings_size = size
                                                self.options.settings_workflow = workflow
                                                self.options.settings_thinktime = thinktime
                                                self.options.settings_time_requirement = time_requirement
                                                self.options.settings_confidence_level = confidence_level
                                                self.options.settings_normalized = config[
                                                    "settings-normalized"]
                                                self.options.groundtruth = config[
                                                    "groundtruth"] if "groundtruth" in config else False
                                                self.options.run = config[
                                                    "run"] if "run" in config else True
                                                self.options.evaluate = config[
                                                    "evaluate"] if "evaluate" in config else True

                                                if self.options.run:
                                                    self.run()

                                                if self.options.evaluate:
                                                    self.evaluate(
                                                        self.get_config_hash())

    def setup(self, driver_arg=None):
        with open(self.get_schema_path()) as f:
            self.schema = Schema(json.load(f),
                                 self.options.settings_normalized)

        module = importlib.import_module("drivers." + self.options.driver_name)
        self.driver = getattr(module, "IDEBenchDriver")()

        try:
            self.driver.init(self.options, self.schema, driver_arg)
        except AttributeError:
            pass

    def run(self):

        try:
            self.driver.workflow_start()
        except AttributeError:
            pass

        with open(self.get_workflow_path()) as f:
            self.workflow_interactions = json.load(f)["interactions"]

        self.vizgraph = VizGraph()
        self.operation_results = {"args": vars(self.options), "results": {}}
        self.current_interaction_index = 0
        self.current_vizrequest_index = 0
        self.process_interaction(0)

    def end_run(self):

        try:
            self.driver.workflow_end()
        except AttributeError:
            pass

        path = "results/%s.json" % (self.get_config_hash())

        if not self.options.groundtruth:
            with open(path, "w") as fp:
                json.dump(self.operation_results, fp)

        if self.options.groundtruth:
            path = "data/%s/groundtruths/%s_%s.json" % (
                self.options.settings_dataset, self.options.settings_size,
                self.options.settings_workflow)
            with open(path, "w") as fp:
                json.dump(self.operation_results, fp)

    def process_interaction(self, interaction_index):
        print("processing interaction %i" % interaction_index)
        if interaction_index < 0 or interaction_index >= len(
                self.workflow_interactions):
            print("reached end of interactions")
            self.end_run()
            return

        print("thinking...")
        time.sleep(self.options.settings_thinktime / 1000)

        interaction = self.workflow_interactions[interaction_index]
        vizs_to_request = self.vizgraph.apply_interaction(
            Operation(interaction))

        viz_requests = []
        for viz in vizs_to_request:
            viz_requests.append(
                VizRequest(self.current_vizrequest_index,
                           self.current_interaction_index, viz))
            self.current_vizrequest_index += 1

        #if interaction_index == 0:
        #    self.result_queue = multiprocessing.Queue()

        # TODO: document this feature
        try:
            self.driver.before_requests(self.options, self.schema,
                                        IDEBench.result_queue)
        except AttributeError:
            pass

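        # fan out: one worker process per viz request, unless the driver opts into single-process mode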
        procs = []
        nprocs = len(viz_requests)
        if hasattr(self.driver,
                   "use_single_process") and self.driver.use_single_process:
            for viz_request in viz_requests:
                self.driver.process_request(viz_request, self.options,
                                            self.schema, IDEBench.result_queue)
        else:
            for viz_request in viz_requests:
                proc = multiprocessing.Process(
                    target=self.driver.process_request,
                    args=(viz_request, self.options, self.schema,
                          IDEBench.result_queue))
                procs.append(proc)
                proc.start()

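        # block until every worker has posted its result to the shared queue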
        resultlist = []
        for i in range(nprocs):
            resultlist.append(IDEBench.result_queue.get())

        for proc in procs:
            proc.join()

        self.deliver_viz_request(resultlist)
        self.current_interaction_index += 1
        self.process_interaction(self.current_interaction_index)

    def deliver_viz_request(self, viz_requests):

        for viz_request in viz_requests:

            operation_result = {}
            operation_result["id"] = viz_request.operation_id
            operation_result[
                "sql"] = viz_request.viz.get_computed_filter_as_sql(
                    self.schema)
            operation_result["viz_name"] = viz_request.viz.name
            operation_result[
                "parent_operation_id"] = viz_request.parent_operation_id
            operation_result["start_time"] = viz_request.start_time
            operation_result["end_time"] = viz_request.end_time
            operation_result["time_violated"] = viz_request.timedout
            operation_result["t_pause"] = viz_request.t_pause
            operation_result["t_start"] = viz_request.t_start
            operation_result["progress"] = viz_request.progress
            operation_result["output"] = viz_request.result
            operation_result["margins"] = viz_request.margins
            operation_result["num_binning_dimensions"] = len(
                viz_request.viz.binning)
            operation_result["num_aggregates_per_bin"] = len(
                viz_request.viz.per_bin_aggregates)

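            # a binning dimension that defines a "width" is quantitative; otherwise it is treated as nominal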
            bin_types = []
            for viz_bin in viz_request.viz.binning:
                if "width" in viz_bin:
                    bin_types.append("quantitative")
                else:
                    bin_types.append("nominal")
            operation_result["binning_type"] = "_".join(sorted(bin_types))

            agg_types = []
            for viz_agg in viz_request.viz.per_bin_aggregates:
                if viz_agg["type"] == "count":
                    agg_types.append("count")
                elif viz_agg["type"] == "avg":
                    agg_types.append("avg")
                else:
                    raise Exception()
            operation_result["aggregate_type"] = "_".join(sorted(agg_types))

            if viz_request.operation_id not in self.operation_results["results"]:
                self.operation_results["results"][
                    viz_request.operation_id] = operation_result

            viz_request.delivered = True

        #self.driver.request_vizs(self.viz_requests)

    def get_config_hash(self):
        o = self.options
        h = (o.driver_name, o.settings_dataset, o.settings_workflow,
             o.settings_size, o.settings_normalized,
             o.settings_confidence_level, o.settings_thinktime,
             o.settings_thinktime, o.settings_time_requirement)
        return hashlib.md5(str(h).encode('utf-8')).hexdigest()

    def get_schema_path(self):
        return "data/%s/sample.json" % (self.options.settings_dataset)

    def get_workflow_path(self):
        return "data/%s/workflows/%s.json" % (self.options.settings_dataset,
                                              self.options.settings_workflow)

    def compute_viz_similarity(self, viz_gt, viz):
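        # cosine similarity between ground-truth and delivered bin values:
        # 1 for identical results, 0 for empty or degenerate ones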

        if len(viz.keys()) == 0 and len(viz_gt.keys()) == 0:
            return 1

        if len(viz_gt.keys()) == 0 and len(viz.keys()) > 0:
            raise Exception()

        if len(viz_gt.keys()) > 0 and len(viz.keys()) == 0:
            return 0

        for gt_key in viz_gt.keys():
            if gt_key not in viz:
                viz[gt_key] = 0

        viz_gt_vals = []
        viz_vals = []
        for gt_key in viz_gt.keys():
            if isinstance(viz_gt[gt_key], list):
                viz_gt_vals.append(viz_gt[gt_key][0])
            else:
                viz_gt_vals.append(viz_gt[gt_key])

            if isinstance(viz[gt_key], list):
                viz_vals.append(viz[gt_key][0])
            else:
                viz_vals.append(viz[gt_key])

        viz_gt_vals = np.array(viz_gt_vals).astype(float)
        viz_vals = np.array(viz_vals).astype(float)

        #viz_gt_vals = self.normalize(viz_gt_vals)
        #viz_vals = self.normalize(viz_vals)

        if np.isnan(viz_gt_vals).any():
            raise Exception()

        if np.isnan(viz_vals).any():
            raise Exception()

        #score = np.dot(viz_gt_vals, viz_vals)/ ( np.sqrt(np.sum(np.square(viz_gt_vals))) * np.sqrt(np.sum(np.square(viz_vals))) )
        np.seterr(all='raise')
        try:
            score = 1 - spatial.distance.cosine(viz_gt_vals, viz_vals)
        except Exception:
            # e.g. a zero vector: np.seterr(all='raise') turns the division warning into an exception
            return 0
        return score if not np.isnan(score) else 0

    def normalize(self, v):
        norm = np.linalg.norm(v, ord=1)
        if norm == 0:
            norm = np.finfo(v.dtype).eps
        return v / norm

    def evaluate(self, config_hash):
        print("evaluate")
        result_json = None
        try:
            with open("results/%s.json" % config_hash, "r") as json_data:
                result_json = json.load(json_data)
        except Exception:
            print("couldn't load file %s" % ("results/%s.json" % config_hash))
            return

        workflow = result_json["args"]["settings_workflow"]
        dataset = result_json["args"]["settings_dataset"]
        size = result_json["args"]["settings_size"]
        time_requirement = result_json["args"]["settings_time_requirement"]

        with open(
                "data/%s/groundtruths/%s_%s.json" % (dataset, size, workflow),
                "r") as json_data:
            groundtruths = json.load(json_data)["results"]

        with open("reports/%s.csv" % config_hash, 'w') as fp:
            w = csv.DictWriter(fp, [
                "operation_id", "config_hash", "interaction_id", "dataset",
                "size", "viz_name", "interface", "think_time",
                "time_requirement", "t_start", "t_pause", "workflow",
                "start_time", "end_time", "duration", "progress",
                "time_violated", "num_binning_dimensions", "binning_type",
                "has_invalid_bins", "num_bins_out_of_margin",
                "num_bins_delivered", "num_bins_in_gt", "missing_bins",
                "dissimilarity", "num_aggregates_per_bin", "aggregate_type",
                "bias", "rel_error_avg", "rel_error_stdev", "rel_error_min",
                "rel_error_max", "margin_avg", "margin_stdev", "margin_min",
                "margin_max", "margin_ratio"
            ],
                               delimiter=",",
                               lineterminator="\n")
            w.writeheader()

            operations = result_json["results"]

            for op_number in operations.keys():

                gt_output = groundtruths[op_number]["output"]
                operation = operations[op_number]

                margins = []
                rel_errors = []
                forecast_values = []
                actual_values = []
                out_of_margin_count = 0

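                # compare each delivered bin against its ground truth: collect relative errors and margin violations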
                for gt_bin_identifier, gt_aggregate_results in gt_output.items():

                    if gt_bin_identifier in operation["output"]:

                        for agg_bin_result_index, agg_bin_result in enumerate(
                                operation["output"][gt_bin_identifier]):
                            rel_error = None
                            op_result = operation["output"][gt_bin_identifier][
                                agg_bin_result_index]
                            gt_result = gt_aggregate_results[
                                agg_bin_result_index]

                            if abs(gt_result) > 0:
                                rel_error = abs(op_result -
                                                gt_result) / abs(gt_result)
                                rel_errors.append(rel_error)
                            else:
                                print("ignoring zero in groundtruth")

                            forecast_values.append(op_result)
                            actual_values.append(gt_result)

                            if operation[
                                    "margins"] and gt_bin_identifier in operation[
                                        "margins"]:
                                op_margin = float(
                                    operation["margins"][gt_bin_identifier]
                                    [agg_bin_result_index])

                                if np.isnan(op_margin) or np.isinf(
                                        op_margin) or abs(op_margin) > 1000000:
                                    if os.path.exists("./margin_errors"):
                                        append_write = 'a'  # append if already exists
                                    else:
                                        append_write = 'w'  # make a new file if not
                                    with open("./margin_errors",
                                              append_write) as ffff:
                                        ffff.writelines(
                                            self.options.settings_workflow +
                                            "\n" +
                                            str(operation["margins"]
                                                [gt_bin_identifier]
                                                [agg_bin_result_index]) + "\n")

                                elif gt_result + 1e-6 < op_result - abs(
                                        op_result * op_margin
                                ) or gt_result - 1e-6 > op_result + abs(
                                        op_result * op_margin):
                                    out_of_margin_count += 1
                                    margins.append(abs(op_margin))
                                else:
                                    margins.append(abs(op_margin))

                    else:
                        pass
                        # add error as many times as a bin was expected!
                        #rel_errors.extend( [ 1 for n in range(len(gt_aggregate_results)) ] )

                # invalid bins test
                has_invalid_bins = False
                num_invalid = 0
                inv = []

                for kk in operation["output"].keys():
                    if kk not in gt_output:
                        has_invalid_bins = True
                        num_invalid += 1
                        inv.append(kk)

                        print(self.options.settings_workflow)
                        print(str(operation["id"]))
                        print("invalid key: " + kk)
                        print(operation["sql"])
                        print(operation["output"])
                        os._exit(0)

                args = result_json["args"]

                missing_bins = 1 - len(operation["output"].keys()) / len(
                    gt_output.keys()) if len(gt_output.keys()) > 0 else 0
                op_eval_result = {}
                op_eval_result["operation_id"] = operation["id"]
                op_eval_result["config_hash"] = self.get_config_hash()
                op_eval_result["interaction_id"] = operation[
                    "parent_operation_id"]
                op_eval_result["dataset"] = args["settings_dataset"]
                op_eval_result["size"] = args["settings_size"]
                op_eval_result["viz_name"] = operation["viz_name"]
                op_eval_result["think_time"] = args["settings_thinktime"]
                op_eval_result["time_requirement"] = args[
                    "settings_time_requirement"]
                op_eval_result["interface"] = args["driver_name"]
                op_eval_result["workflow"] = args["settings_workflow"]
                op_eval_result["start_time"] = operation["start_time"]
                op_eval_result["end_time"] = operation["end_time"]
                op_eval_result["t_pause"] = operation[
                    "t_pause"] if "t_pause" in operation else 0
                op_eval_result["t_start"] = operation[
                    "t_start"] if "t_start" in operation else 0
                op_eval_result["duration"] = operation["end_time"] - operation[
                    "start_time"]

                if "time_violated" in operation:
                    op_eval_result["time_violated"] = operation[
                        "time_violated"]
                elif "timedout" in operation:
                    op_eval_result["time_violated"] = operation["timedout"]
                else:
                    raise Exception()

                op_eval_result["has_invalid_bins"] = has_invalid_bins
                op_eval_result["binning_type"] = operation["binning_type"]
                op_eval_result["aggregate_type"] = operation["aggregate_type"]
                op_eval_result["num_bins_delivered"] = len(
                    operation["output"].keys())
                op_eval_result["num_bins_in_gt"] = len(gt_output.items())
                op_eval_result["missing_bins"] = "%.5f" % missing_bins

                op_eval_result["dissimilarity"] = "%.5f" % (
                    1 - self.compute_viz_similarity(gt_output,
                                                    operation["output"]))

                op_eval_result[
                    "num_bins_out_of_margin"] = "%i" % out_of_margin_count
                op_eval_result["num_aggregates_per_bin"] = operation[
                    "num_aggregates_per_bin"]
                op_eval_result["num_binning_dimensions"] = operation[
                    "num_binning_dimensions"]
                op_eval_result["progress"] = "%.5f" % operation["progress"]
                op_eval_result["bias"] = "%.5f" % (
                    sum(forecast_values) / sum(actual_values) -
                    1) if len(actual_values) > 0 else 0
                op_eval_result["rel_error_stdev"] = "%.5f" % statistics.stdev(
                    rel_errors) if len(rel_errors) > 1 else 0.0
                op_eval_result["rel_error_min"] = "%.5f" % min(
                    rel_errors) if len(rel_errors) > 0 else 0
                op_eval_result["rel_error_max"] = "%.5f" % max(
                    rel_errors) if len(rel_errors) > 0 else 0
                op_eval_result["rel_error_avg"] = "%.5f" % float(
                    sum(rel_errors) /
                    float(len(rel_errors))) if len(rel_errors) > 0 else 0
                op_eval_result["margin_stdev"] = "%.5f" % statistics.stdev(
                    margins) if len(margins) > 1 else 0.0
                op_eval_result["margin_min"] = "%.5f" % min(margins) if len(
                    margins) > 0 else 0.0
                op_eval_result["margin_max"] = "%.5f" % max(margins) if len(
                    margins) > 0 else 0.0
                op_eval_result["margin_avg"] = "%.5f" % float(
                    sum(margins) /
                    float(len(margins))) if len(margins) > 0 else 0.0
                op_eval_result["margin_ratio"] = "%.5f" % float(
                    len(operation["margins"]) /
                    len(operation["output"])) if operation["margins"] and len(
                        operation["output"]) > 0 else 1
                w.writerow(op_eval_result)

    def create_report(self):
        header_saved = False
        interesting_files = glob.glob("reports/*.csv")
        with open('./full_report.csv', 'w') as fout:
            for filename in interesting_files:
                print(filename)
                with open(filename) as fin:
                    header = next(fin)
                    if not header_saved:
                        print(header)
                        fout.write(header)
                        header_saved = True
                    for line in fin:
                        fout.write(line)
        print("saved report")
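
Going by the option definitions above, here is a minimal sketch of invoking this class programmatically; idebench.py, the sqlite driver, and workflow1 are illustrative assumptions, not names taken from the code:

    import sys

    # parse_args() reads sys.argv; --run triggers setup() followed by run()
    sys.argv = [
        "idebench.py",
        "--driver-name", "sqlite",          # hypothetical: expects drivers/sqlite.py to define IDEBenchDriver
        "--settings-dataset", "flights",    # expects data/flights/sample.json and data/flights/workflows/
        "--settings-size", "1000000",
        "--settings-workflow", "workflow1",
        "--run",
    ]
    IDEBench()

Example #4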
    def __init__(self):

        parser = OptionParser()
        parser.add_option("-r", "--seed", dest="seed", action="store", type=int, help="Random seed", default=25000)
        parser.add_option("-d", "--dataset", dest="data_folder", action="store", help="path to save the file", default="flights")
        parser.add_option("--debug", dest="debug", action="store_true", help="creates a debug file", default=False)
        parser.add_option("-n", "--num-operations", dest="num_operations", action="store", type=int, help="Number of operations to generate", default=20)
        parser.add_option("-c", "--workflow-type", dest="config", action="store", help="path to config file", default="data/flights/workflowtypes/sequential.json")
        parser.add_option("-p", "--output", dest="path", action="store", help="path to save the file", default="workflow.json")
        parser.add_option("-s", "--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=10000)
        (options, args) = parser.parse_args()
        self.options = options

        random.seed(options.seed)
        np.random.seed(seed=options.seed)

        print(options.config)
        with open(options.config, "r") as fp:
            self.config = json.load(fp)

        schema = None
        with open(self.get_schema_path()) as f:
            schema = Schema(json.load(f))

        print("reading csv...")
        # load sample data
        df = pd.read_csv("data/" + options.data_folder + "/sample.csv", nrows=options.numsamples, header=0)
        
        #schema = {"tables": [{ "name": "df", "dimensions": []}]}
        sample_json = None
        with open("data/" + options.data_folder + "/sample.json", "r") as f:
            sample_json = json.load(f)
        # print(sample_json)
        # for field in sample_json["tables"]["fact"]["fields"]:
        #     schema["tables"][0]["dimensions"].append({"name": field["field"]})


        #storage = Storage(schema)

        zero_qs_ratio = 100

        tries = -1
        while zero_qs_ratio > 0.15:
            tries += 1
            num_zeros_qs = 0
            num_qs = 0
            VizAction.VIZ_COUNTER = -1  
            LinkAction.FIRST_LINK = None
            LinkAction.LATEST_LINK = None
            LinkAction.LINKS = set()
            
            vizgraph = VizGraph()
            random.seed(options.seed + tries)
            root = VizAction(self.config, df, vizgraph, schema, sample_json)
            current = root
            states = []
            
            num_ops = 0
            
            debug_states = []
            while num_ops < options.num_operations:
                res = current.get_states()
                if res:                 
                    affected_vizs = vizgraph.apply_interaction(res)
                    if options.debug:
                        nodes_dict = vizgraph.get_nodes_dict()
                        states_dict = {}
                        for n in nodes_dict.keys():
                            states_dict[n] = {
                                "name":n,
                                "source" : nodes_dict[n].get_source(),
                                "binning":  nodes_dict[n].binning,
                                "agg": nodes_dict[n].per_bin_aggregates,
                                "selection": nodes_dict[n].get_selection(),
                                "filter": nodes_dict[n].get_filter(),
                                "computed_filter": nodes_dict[n].get_computed_filter_as_sql(schema),
                            }
                        debug_states.append(states_dict)
                    
                    for x in affected_vizs:
                        sql = x.get_computed_filter_as_sql(schema).replace("FLOOR", "ROUND").replace(schema.get_fact_table_name(), "df")
                        r = pandasql.sqldf(sql, locals())
                        num_qs += 1
                        if len(r.index) == 0:
                            num_zeros_qs += 1
                            #print("ZERO QUERY")

                    states.append(res.data)
                    #print(res.data)
                    #if "source" not in res:
                    num_ops += 1

                current = current.get_next()
                if current is None:
                    zero_qs_ratio = num_zeros_qs/num_qs
                    break
            zero_qs_ratio = num_zeros_qs/num_qs
        
        print("zero queries:")
        print(num_zeros_qs / num_qs)

        with open("data/" + options.data_folder +  "/workflows/" + options.path + ".json", "w") as fp:
            fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))

        print("done.")
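Example #5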
class IDEBench:

    result_queue = multiprocessing.Queue()
    def __init__(self):

        parser = OptionParser()
        parser.add_option("--driver-name", dest="driver_name", action="store", help="Driver name")
        parser.add_option("--driver-create-storage", dest="driver_create_storage", action="store_true", help="Calls create_storage on driver", default=False)
        parser.add_option("--driver-clear-storage", dest="driver_clear_storage", action="store_true", help="Calls clear_storage on driver", default=False)
        parser.add_option("--driver-clear-cache", dest="driver_clear_cache", action="store_true", help="Calls clear_cache on driver", default=False)
        parser.add_option("--driver-args", dest="driver_args", action="store", help="Arguments to pass to the driver", default="")

        parser.add_option("--settings-normalized", dest="settings_normalized", action="store_true", help="Whether joins should be used", default=False)
        parser.add_option("--settings-dataset", dest="settings_dataset", action="store", help="Name of the dataset")
        parser.add_option("--settings-size", dest="settings_size", default="", action="store", help="Number of rows in the dataset")
        parser.add_option("--settings-thinktime", dest="settings_thinktime", type="int", action="store", help="Think-time in seconds between two executions", default=1000)
        parser.add_option("--settings-time-requirement", dest="settings_time_requirement", action="store", help="The Time requirement to be used", default=1000)
        parser.add_option("--settings-confidence-level", dest="settings_confidence_level", action="store", help="The confidence level to be used", default=95)
        parser.add_option("--settings-workflow", dest="settings_workflow", action="store", help="The workflow file to be used")
        
        parser.add_option("--evaluate", dest="evaluate", action="store", help="The name of the workflow result to evaluate", default=None)
        parser.add_option("--create--full-report", dest="create_report", action="store_true", help="Merges all reports in the reports directory into a single file", default=False)
        parser.add_option("--run", dest="run", action="store_true", help="Flag to run the benchmark without config file", default=False)
        parser.add_option("--await-response", dest="await_response", action="store_true", help="Whether or not to wait for driver responses before proceeding with the next request (only works when multi-threaded)", default=False)
        parser.add_option("--run-config", dest="config", action="store", help="Flag to run the benchmark with the specified config file")
        parser.add_option("--groundtruth", dest="groundtruth", action="store_true", help="If set computes the ground-truth for the specified workflow", default=False)
        parser.add_option("--gt-folder", dest="gt_folder", action="store", help="The path to the groundtruth", default=None)
        parser.add_option("--gt-for", dest="gt_for", action="store", help="If set only computes the ground-truth for results found in this file", default=None)

        (self.options, args) = parser.parse_args()
        
        self.workflow_start_time = -1
        self.counter = 0
        self.evaluator = Evaluator(self.options)

        if not self.options.config:
            
            if self.options.evaluate:
                
                self.evaluator.evaluate(self.options.evaluate)
                return

            if self.options.create_report:
                self.evaluator.create_report()
                return

            if not self.options.driver_name:
                parser.error("No driver name specified.")

            if not self.options.settings_dataset:
                parser.error("No dataset specified.")

            if not self.options.settings_size:
                print("Warning: No dataset size specified.")

            if self.options.groundtruth or self.options.run:
                self.setup()

            if self.options.groundtruth:
            
                self.options.settings_thinktime = 1
                self.options.settings_time_requirement = 999999

                if self.options.gt_for:
                
                    with open(self.options.gt_for) as f:
                        self.gt_for_result = json.load(f)
                        workflow_files = ["data/%s/workflows/%s.json" % (self.options.settings_dataset, self.gt_for_result["args"]["settings_workflow"])]

                else:
                    workflow_files = glob.glob("data/" + self.options.settings_dataset + "/workflows/*.json") 

                for workflow_file in workflow_files:
                    self.options.settings_workflow = basename(workflow_file).split(".")[0]
                    self.run()
            
            elif self.options.run:
                
                if not self.options.settings_workflow:
                    parser.error("No workflow specified.")
                
                self.run()
        else:

            with open(self.options.config) as f:
                config = json.load(f)
                assure_path_exists("./results")
                for d in config["settings-datasets"]:
                    assure_path_exists("./data/%s/groundtruths" % d)

                # TODO: create pairs instead
                for dataset in config["settings-datasets"]:
                    self.options.settings_dataset = dataset
                    
                    for driver_name in config["driver-names"]:
                        for driver_arg in config["driver-args"]:                        

                            self.options.driver_name = driver_name
                            self.setup(driver_arg)                        

                            for size in config["settings-sizes"]:
                                for workflow in config["settings-workflows"]:
                                    for thinktime in config["settings-thinktimes"]:
                                        for time_requirement in config["settings-time-requirements"]:
                                            for confidence_level in config["settings-confidence-levels"]:
                                            
                                                self.options.driver_name = driver_name
                                                
                                                self.options.settings_size = size
                                                self.options.settings_workflow = workflow
                                                self.options.settings_thinktime = thinktime
                                                self.options.settings_time_requirement = time_requirement
                                                self.options.settings_confidence_level = confidence_level
                                                self.options.settings_normalized = config["settings-normalized"]
                                                self.options.groundtruth = config["groundtruth"] if "groundtruth" in config else False
                                                self.options.run = config["run"] if "run" in config else True
                                                self.options.evaluate = config["evaluate"] if "evaluate" in config else True

                                                if self.options.run:
                                                    self.run()

                                                if self.options.evaluate:
                                                    self.evaluator.evaluate(self.options.evaluate)

    def setup(self, driver_arg=None):
        logger.info("loading schema")
        with open(self.get_schema_path()) as f:
            self.schema = Schema(json.load(f), self.options.settings_normalized)

        logger.info("loading driver")
        module = importlib.import_module("drivers." +  self.options.driver_name)
        self.driver = getattr(module, "IDEBenchDriver")()

        logger.info("initializing %s driver" % self.options.driver_name)
        try:
            self.driver.init(self.options, self.schema, driver_arg)
        except AttributeError:
            pass

    def run(self):

        self.vizgraph = VizGraph()
        with open(self.get_workflow_path()) as f:
            json_data = json.load(f)
            for s in json_data["setup"]:
                self.vizgraph.add_viz(s)

            for s in json_data["setup"]:
                self.vizgraph.apply_interaction(Operation(s))

            self.workflow_interactions = json_data["interactions"]

            self.operation_results = OrderedDict({ "args": vars(self.options), "results": OrderedDict() })
            self.current_interaction_index = 0
            self.current_vizrequest_index = 0
            self.benchmark_start_time = util.get_current_ms_time()

            try:
                logger.info("calling \"workflow_start\" on driver")
                self.driver.workflow_start()
            except AttributeError:
                pass

            global do_poll
            do_poll = True
            def poll_results(slf, queue):
                while do_poll:
                    try:
                        process_result = queue.get(timeout=1)
                    except Empty:
                        logger.info("result queue empty... trying again")
                        continue
                    if process_result is None:
                        continue
                    slf.deliver_viz_request([process_result])
                logger.info("stopped polling results")

                try:
                    while queue.get(timeout=0.01):
                        pass
                except Empty:
                    logger.info("result queue cleared")

            if not self.options.groundtruth:
                thread = Thread(target=poll_results, args=(self, IDEBench.result_queue))
                thread.start()
            
            interaction_index = 0
            while interaction_index < len(self.workflow_interactions):
                self.process_interaction(interaction_index)
                interaction_index += 1

            do_poll = False
            if not self.options.groundtruth:
                thread.join()
            self.end_run()


    def end_run(self):
        logger.info("done processing interactions")
        try:
            logger.info("calling \"workflow_end\" on driver")
            self.driver.workflow_end()
        except AttributeError:
            pass

        # adding un-answered queries, if necessary

        non_delivered_count = 0
        delivered_count = 0
        operation_id = 0
        event_id = 0
        if not self.options.groundtruth:
            for interaction in self.workflow_interactions:
                vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction))
                expected_start_time = interaction["time"]
                for viz in vizs_to_request:
                    if operation_id not in self.operation_results["results"]:
                        non_delivered_count += 1
                        self.deliver_viz_request([VizRequest(operation_id, event_id, expected_start_time, viz, True)])
                    else:
                        delivered_count += 1
                    expected_start_time = interaction["time"]
                    operation_id += 1
                    event_id += 1
        
        if non_delivered_count > 0:
            logger.info("added %s non-delivered results to final result" % non_delivered_count)

        path = "results/%s.json" % (self.get_config_hash())
        
        if not self.options.groundtruth:
            logger.info("saving results to %s" % path)
            with open(path, "w") as fp:
                json.dump(self.operation_results, fp, indent=4)

        if self.options.groundtruth:
            logger.info("saving groundtruth to %s" % self.get_groundtruth_path())
            assure_path_exists(self.get_groundtruth_path())
            with open(self.get_groundtruth_path(), "w") as fp:
                json.dump(self.operation_results, fp, indent=4)
    
    def process_interaction(self, interaction_index):
        logger.info("interaction %i" % interaction_index)
        interaction = self.workflow_interactions[interaction_index]
        next_interaction = self.workflow_interactions[interaction_index + 1] if interaction_index +1 < len(self.workflow_interactions) else None
        vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction))
        expected_start_time = interaction["time"]
        
        viz_requests = []
        for viz in vizs_to_request:
            viz_requests.append(VizRequest(self.current_vizrequest_index, self.current_interaction_index, expected_start_time, viz))
            self.current_vizrequest_index += 1

        # TODO: document this feature
        try:
            self.driver.before_requests(self.options, self.schema, IDEBench.result_queue)
        except AttributeError:
            pass

        procs = []
        nprocs = len(viz_requests)
        if self.options.groundtruth or (hasattr(self.driver, "use_single_process") and self.driver.use_single_process):
            for viz_request in viz_requests:
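                # only recompute operations that produced output in the --gt-for result file
                # (assumes self.gt_for_result was loaded in __init__)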
                op_id = str(viz_request.operation_id)
                if op_id in self.gt_for_result["results"] and self.gt_for_result["results"][op_id]["output"]:
                    self.driver.process_request(viz_request, self.options, self.schema, IDEBench.result_queue)
                    self.deliver_viz_request([IDEBench.result_queue.get()])
                    self.counter += 1
        else:
            for viz_request in viz_requests:
                thread = Thread(target=self.driver.process_request, args=(viz_request, self.options, self.schema, IDEBench.result_queue))
                procs.append(thread)
                thread.start()
                time.sleep(0.002) # so the request threads do not overwhelm some of the drivers (particularly verdictdb)
 
 
        resultlist = []

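        # pace the workflow: derive the think time from the gap between this interaction's
        # scheduled time and the next one's, adjusted for how far behind schedule we are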
        delay = 0
        think_time = 0
        if "time" in interaction and next_interaction:
            original_think_time = next_interaction["time"] - interaction["time"]
            delay = min(0, next_interaction["time"] - (util.get_current_ms_time() - self.benchmark_start_time))
            think_time = max(0, delay + original_think_time)
        else:
            think_time = self.options.settings_thinktime

        if not self.options.groundtruth:
            time.sleep(think_time / 1000)
            
        self.current_interaction_index += 1

    def deliver_viz_request(self, viz_requests):

        if len(self.operation_results["results"]) == 0:
            self.workflow_start_time = sorted(viz_requests, key=lambda x: x.operation_id)[0].start_time
            
        for viz_request in viz_requests:            
            operation_result = {}
            operation_result["id"] = viz_request.operation_id
            operation_result["sql"] = viz_request.viz.get_computed_filter_as_sql(self.schema)
            operation_result["viz_name"] = viz_request.viz.name
            operation_result["event_id"] = viz_request.parent_operation_id
            operation_result["expected_start_time"] = viz_request.expected_start_time
            operation_result["start_time"] = viz_request.start_time - self.workflow_start_time
            operation_result["end_time"] = viz_request.end_time - self.workflow_start_time
            operation_result["time_violated"] = viz_request.timedout
            operation_result["dropped"] = viz_request.dropped
            #operation_result["t_pause"] = viz_request.t_pause
            #operation_result["t_start"] = viz_request.t_start
            operation_result["progress"] = viz_request.progress
            operation_result["output"] = viz_request.result
            operation_result["margins"] = viz_request.margins
            operation_result["num_binning_dimensions"] = len(viz_request.viz.binning)
            operation_result["num_aggregates_per_bin"] = len(viz_request.viz.per_bin_aggregates)
  
            bin_types = []
            for viz_bin in viz_request.viz.binning:
                if "width" in viz_bin:
                    bin_types.append("quantitative")
                else:
                    bin_types.append("nominal")
            operation_result["binning_type"] = "_".join(sorted(bin_types))

            agg_types = []
            for viz_agg in viz_request.viz.per_bin_aggregates:
                if viz_agg["type"] == "count":
                    agg_types.append("count")
                elif viz_agg["type"] == "avg":
                    agg_types.append("avg")
                else:
                    raise Exception()
            operation_result["aggregate_type"] = "_".join(sorted(agg_types))

            if viz_request.operation_id not in self.operation_results["results"]:
                self.operation_results["results"][viz_request.operation_id] = operation_result
            
            viz_request.delivered = True
    
    def get_config_hash(self):
        o = self.options
        h = (o.driver_name, o.settings_dataset, o.settings_workflow, o.settings_size, o.settings_normalized, o.settings_confidence_level, o.settings_thinktime, o.settings_thinktime, o.settings_time_requirement)
        return hashlib.md5(str(h).encode("utf-8")).hexdigest()

    def get_schema_path(self):
        return "data/%s/sample.json" % (self.options.settings_dataset)

    def get_workflow_path(self):
        return "data/%s/workflows/%s.json" % (self.options.settings_dataset, self.options.settings_workflow)   

    def get_groundtruth_path(self):        
        return os.path.join(self.get_groundtruth_folder(), self.options.settings_workflow + ".json")

    def get_groundtruth_folder(self):
        return os.path.join(self.options.gt_folder, self.gt_for_result["args"]["driver_name"], self.options.settings_dataset, self.options.settings_size)
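
For reference, a minimal driver skeleton consistent with the calls made by the classes above; every method name and signature below appears in the code, but the body logic is a placeholder rather than a real driver:

    class IDEBenchDriver:

        # checked via hasattr() before use; forces requests to run in-process
        use_single_process = True

        def init(self, options, schema, driver_arg):
            # called once from IDEBench.setup(); drivers may omit any of these
            # methods, since the benchmark swallows the resulting AttributeError
            pass

        def workflow_start(self):
            pass  # same for workflow_end() and before_requests()

        def process_request(self, viz_request, options, schema, result_queue):
            # fill in the fields deliver_viz_request() reads, then post the request back
            viz_request.result = {}   # bin identifier -> list of per-bin aggregate values
            viz_request.margins = {}
            result_queue.put(viz_request)

        def workflow_end(self):
            pass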