import csv
import glob
import hashlib
import importlib
import json
import logging
import multiprocessing
import os
import random
import statistics
import time
from collections import OrderedDict
from optparse import OptionParser
from os.path import basename
from queue import Empty
from threading import Thread

import numpy as np
import pandas as pd
import pandasql
from scipy import spatial

# NOTE: the project-local names used below (VizGraph, VizRequest, VizAction,
# LinkAction, Operation, Schema, Evaluator, util, assure_path_exists) come from
# IDEBench's own modules; their exact import paths are not shown here.
logger = logging.getLogger(__name__)
class IDEBench:

    result_queue = multiprocessing.Queue()

    def __init__(self):
        parser = OptionParser()
        parser.add_option("--driver-name", dest="driver_name", action="store", help="Driver name")
        parser.add_option("--driver-create-storage", dest="driver_create_storage", action="store_true", help="Calls create_storage on driver", default=False)
        parser.add_option("--driver-clear-storage", dest="driver_clear_storage", action="store_true", help="Calls clear_storage on driver", default=False)
        parser.add_option("--driver-clear-cache", dest="driver_clear_cache", action="store_true", help="Calls clear_cache on driver", default=False)
        parser.add_option("--driver-args", dest="driver_args", action="store", help="Arguments to pass to the driver", default="")
        parser.add_option("--settings-normalized", dest="settings_normalized", action="store_true", help="Whether joins should be used", default=False)
        parser.add_option("--settings-dataset", dest="settings_dataset", action="store", help="Name of the dataset")
        parser.add_option("--settings-size", dest="settings_size", default="", action="store", help="Number of rows in the dataset")
        parser.add_option("--settings-thinktime", dest="settings_thinktime", type="int", action="store", help="Think-time in milliseconds between two executions", default=1000)
        parser.add_option("--settings-time-requirement", dest="settings_time_requirement", action="store", help="The time requirement to be used", default=1000)
        parser.add_option("--settings-confidence-level", dest="settings_confidence_level", action="store", help="The confidence level to be used", default=95)
        parser.add_option("--settings-workflow", dest="settings_workflow", action="store", help="The workflow file to be used")
        parser.add_option("--evaluate", dest="evaluate", action="store_true", help="Evaluates the results of the current configuration", default=False)
        parser.add_option("--create-full-report", dest="create_report", action="store_true", help="Merges all reports in the reports directory into a single file", default=False)
        parser.add_option("--run", dest="run", action="store_true", help="Flag to run the benchmark without a config file", default=False)
        parser.add_option("--run-config", dest="config", action="store", help="Flag to run the benchmark with the specified config file")
        parser.add_option("--groundtruth", dest="groundtruth", action="store_true", help="If set, computes the ground truth for the specified workflow", default=False)
        (self.options, args) = parser.parse_args()

        if not self.options.config:
            if self.options.create_report:
                self.create_report()
                return

            if not self.options.driver_name:
                parser.error("No driver name specified.")

            if not self.options.settings_dataset:
                parser.error("No dataset specified.")

            if not self.options.settings_size:
                print("Warning: No dataset size specified.")

            if self.options.groundtruth or self.options.run:
                self.setup()

            if self.options.groundtruth:
                self.options.settings_thinktime = 1
                self.options.settings_time_requirement = 999999
                workflow_files = glob.glob("data/" + self.options.settings_dataset + "/workflows/*.json")
                for workflow_file in workflow_files:
                    self.options.settings_workflow = basename(workflow_file).split(".")[0]
                    self.run()
            elif self.options.run:
                if not self.options.settings_workflow:
                    parser.error("No workflow specified.")
                self.run()
            elif self.options.evaluate:
                self.evaluate(self.get_config_hash())
        else:
            with open(self.options.config) as f:
                config = json.load(f)

            assure_path_exists("./results")
            for d in config["settings-datasets"]:
                assure_path_exists("./data/%s/groundtruths" % d)

            # TODO: create pairs instead
            for dataset in config["settings-datasets"]:
                self.options.settings_dataset = dataset
                for driver_name in config["driver-names"]:
                    for driver_arg in config["driver-args"]:
                        self.options.driver_name = driver_name
                        self.setup(driver_arg)
                        for size in config["settings-sizes"]:
                            for workflow in config["settings-workflows"]:
                                for thinktime in config["settings-thinktimes"]:
                                    for time_requirement in config["settings-time-requirements"]:
                                        for confidence_level in config["settings-confidence-levels"]:
                                            self.options.driver_name = driver_name
                                            self.options.settings_size = size
                                            self.options.settings_workflow = workflow
                                            self.options.settings_thinktime = thinktime
                                            self.options.settings_time_requirement = time_requirement
                                            self.options.settings_confidence_level = confidence_level
                                            self.options.settings_normalized = config["settings-normalized"]
                                            self.options.groundtruth = config["groundtruth"] if "groundtruth" in config else False
                                            self.options.run = config["run"] if "run" in config else True
                                            self.options.evaluate = config["evaluate"] if "evaluate" in config else True
                                            if self.options.run:
                                                self.run()
                                            if self.options.evaluate:
                                                self.evaluate(self.get_config_hash())

    def setup(self, driver_arg=None):
        with open(self.get_schema_path()) as f:
            self.schema = Schema(json.load(f), self.options.settings_normalized)
        module = importlib.import_module("drivers." + self.options.driver_name)
        self.driver = getattr(module, "IDEBenchDriver")()
        try:
            self.driver.init(self.options, self.schema, driver_arg)
        except AttributeError:
            pass

    def run(self):
        try:
            self.driver.workflow_start()
        except AttributeError:
            pass
        with open(self.get_workflow_path()) as f:
            self.workflow_interactions = json.load(f)["interactions"]
        self.vizgraph = VizGraph()
        self.operation_results = {"args": vars(self.options), "results": {}}
        self.current_interaction_index = 0
        self.current_vizrequest_index = 0
        self.process_interaction(0)

    def end_run(self):
        try:
            self.driver.workflow_end()
        except AttributeError:
            pass
        path = "results/%s.json" % (self.get_config_hash())
        if not self.options.groundtruth:
            with open(path, "w") as fp:
                json.dump(self.operation_results, fp)
        if self.options.groundtruth:
            path = "data/%s/groundtruths/%s_%s.json" % (self.options.settings_dataset, self.options.settings_size, self.options.settings_workflow)
            with open(path, "w") as fp:
                json.dump(self.operation_results, fp)

    def process_interaction(self, interaction_index):
        print("processing!")
        if interaction_index < 0 or interaction_index >= len(self.workflow_interactions):
            print("reached end of interactions")
            self.end_run()
            return

        print("thinking...")
        time.sleep(self.options.settings_thinktime / 1000)

        interaction = self.workflow_interactions[interaction_index]
        vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction))

        viz_requests = []
        for viz in vizs_to_request:
            viz_requests.append(VizRequest(self.current_vizrequest_index, self.current_interaction_index, viz))
            self.current_vizrequest_index += 1

        #if interaction_index == 0:
        #    self.result_queue = multiprocessing.Queue()

        # TODO: document this feature
        try:
            self.driver.before_requests(self.options, self.schema, IDEBench.result_queue)
        except AttributeError:
            pass

        procs = []
        nprocs = len(viz_requests)
        if hasattr(self.driver, "use_single_process") and self.driver.use_single_process:
            for viz_request in viz_requests:
                self.driver.process_request(viz_request, self.options, self.schema, IDEBench.result_queue)
        else:
            for viz_request in viz_requests:
                proc = multiprocessing.Process(target=self.driver.process_request, args=(viz_request, self.options, self.schema, IDEBench.result_queue))
                procs.append(proc)
                proc.start()

        resultlist = []
        for i in range(nprocs):
            resultlist.append(IDEBench.result_queue.get())

        for proc in procs:
            proc.join()

        self.deliver_viz_request(resultlist)
        self.current_interaction_index += 1
        self.process_interaction(self.current_interaction_index)
    def deliver_viz_request(self, viz_requests):
        for viz_request in viz_requests:
            operation_result = {}
            operation_result["id"] = viz_request.operation_id
            operation_result["sql"] = viz_request.viz.get_computed_filter_as_sql(self.schema)
            operation_result["viz_name"] = viz_request.viz.name
            operation_result["parent_operation_id"] = viz_request.parent_operation_id
            operation_result["start_time"] = viz_request.start_time
            operation_result["end_time"] = viz_request.end_time
            operation_result["time_violated"] = viz_request.timedout
            operation_result["t_pause"] = viz_request.t_pause
            operation_result["t_start"] = viz_request.t_start
            operation_result["progress"] = viz_request.progress
            operation_result["output"] = viz_request.result
            operation_result["margins"] = viz_request.margins
            operation_result["num_binning_dimensions"] = len(viz_request.viz.binning)
            operation_result["num_aggregates_per_bin"] = len(viz_request.viz.per_bin_aggregates)

            bin_types = []
            for viz_bin in viz_request.viz.binning:
                if "width" in viz_bin:
                    bin_types.append("quantitative")
                else:
                    bin_types.append("nominal")
            operation_result["binning_type"] = "_".join(sorted(bin_types))

            agg_types = []
            for viz_agg in viz_request.viz.per_bin_aggregates:
                if viz_agg["type"] == "count":
                    agg_types.append("count")
                elif viz_agg["type"] == "avg":
                    agg_types.append("avg")
                else:
                    raise Exception()
            operation_result["aggregate_type"] = "_".join(sorted(agg_types))

            # later deliveries for the same operation overwrite earlier (partial) results
            self.operation_results["results"][viz_request.operation_id] = operation_result
            viz_request.delivered = True
        #self.driver.request_vizs(self.viz_requests)

    def get_config_hash(self):
        o = self.options
        h = (o.driver_name, o.settings_dataset, o.settings_workflow, o.settings_size, o.settings_normalized,
             o.settings_confidence_level, o.settings_thinktime, o.settings_time_requirement)
        return hashlib.md5(str(h).encode("utf-8")).hexdigest()

    def get_schema_path(self):
        return "data/%s/sample.json" % (self.options.settings_dataset)

    def get_workflow_path(self):
        return "data/%s/workflows/%s.json" % (self.options.settings_dataset, self.options.settings_workflow)

    def compute_viz_similarity(self, viz_gt, viz):
        if len(viz.keys()) == 0 and len(viz_gt.keys()) == 0:
            return 1
        if len(viz_gt.keys()) == 0 and len(viz.keys()) > 0:
            raise Exception()
        if len(viz_gt.keys()) > 0 and len(viz.keys()) == 0:
            return 0

        # treat bins missing from the delivered result as zeros
        for gt_key in viz_gt.keys():
            if gt_key not in viz:
                viz[gt_key] = 0

        viz_gt_vals = []
        viz_vals = []
        for gt_key in viz_gt.keys():
            if isinstance(viz_gt[gt_key], list):
                viz_gt_vals.append(viz_gt[gt_key][0])
            else:
                viz_gt_vals.append(viz_gt[gt_key])
            if isinstance(viz[gt_key], list):
                viz_vals.append(viz[gt_key][0])
            else:
                viz_vals.append(viz[gt_key])

        viz_gt_vals = np.array(viz_gt_vals).astype(float)
        viz_vals = np.array(viz_vals).astype(float)
        #viz_gt_vals = self.normalize(viz_gt_vals)
        #viz_vals = self.normalize(viz_vals)

        if np.isnan(viz_gt_vals).any():
            raise Exception()
        if np.isnan(viz_vals).any():
            raise Exception()

        #score = np.dot(viz_gt_vals, viz_vals) / (np.sqrt(np.sum(np.square(viz_gt_vals))) * np.sqrt(np.sum(np.square(viz_vals))))
        np.seterr(all="raise")
        try:
            score = 1 - spatial.distance.cosine(viz_gt_vals, viz_vals)
        except Exception:
            return 0
        return score if not np.isnan(score) else 0

    def normalize(self, v):
        norm = np.linalg.norm(v, ord=1)
        if norm == 0:
            norm = np.finfo(v.dtype).eps
        return v / norm

    def evaluate(self, config_hash):
        print("evaluate")
        result_json = None
        try:
            with open("results/%s.json" % config_hash, "r") as json_data:
                result_json = json.load(json_data)
        except Exception:
            print("couldn't load file %s" % ("results/%s.json" % config_hash))
            return

        workflow = result_json["args"]["settings_workflow"]
        dataset = result_json["args"]["settings_dataset"]
        size = result_json["args"]["settings_size"]
        time_requirement = result_json["args"]["settings_time_requirement"]

        with open("data/%s/groundtruths/%s_%s.json" % (dataset, size, workflow), "r") as json_data:
            groundtruths = json.load(json_data)["results"]

        with open("reports/%s.csv" % config_hash, "w") as fp:
            w = csv.DictWriter(fp, ["operation_id", "config_hash", "interaction_id", "dataset", "size", "viz_name",
                                    "interface", "think_time", "time_requirement", "t_start", "t_pause", "workflow",
                                    "start_time", "end_time", "duration", "progress", "time_violated",
                                    "num_binning_dimensions", "binning_type", "has_invalid_bins",
                                    "num_bins_out_of_margin", "num_bins_delivered", "num_bins_in_gt", "missing_bins",
                                    "dissimilarity", "num_aggregates_per_bin", "aggregate_type", "bias",
                                    "rel_error_avg", "rel_error_stdev", "rel_error_min", "rel_error_max",
                                    "margin_avg", "margin_stdev", "margin_min", "margin_max", "margin_ratio"],
                               delimiter=",", lineterminator="\n")
            w.writeheader()

            operations = result_json["results"]
            for op_number in operations.keys():
                gt_output = groundtruths[op_number]["output"]
                operation = operations[op_number]

                margins = []
                rel_errors = []
                forecast_values = []
                actual_values = []
                out_of_margin_count = 0

                for gt_bin_identifier, gt_aggregate_results in gt_output.items():
                    if gt_bin_identifier in operation["output"]:
                        for agg_bin_result_index, agg_bin_result in enumerate(operation["output"][gt_bin_identifier]):
                            rel_error = None
                            op_result = operation["output"][gt_bin_identifier][agg_bin_result_index]
                            gt_result = gt_aggregate_results[agg_bin_result_index]

                            if abs(gt_result) > 0:
                                rel_error = abs(op_result - gt_result) / abs(gt_result)
                                rel_errors.append(rel_error)
                            else:
                                print("ignoring zero in groundtruth")

                            forecast_values.append(op_result)
                            actual_values.append(gt_result)

                            if operation["margins"] and gt_bin_identifier in operation["margins"]:
                                op_margin = float(operation["margins"][gt_bin_identifier][agg_bin_result_index])
                                if np.isnan(op_margin) or np.isinf(op_margin) or abs(op_margin) > 1000000:
                                    append_write = "a" if os.path.exists("./margin_errors") else "w"
                                    with open("./margin_errors", append_write) as ffff:
                                        ffff.writelines(self.options.settings_workflow + "\n" + str(operation["margins"][gt_bin_identifier][agg_bin_result_index]) + "\n")
                                elif gt_result + 1e-6 < op_result - abs(op_result * op_margin) or gt_result - 1e-6 > op_result + abs(op_result * op_margin):
                                    out_of_margin_count += 1
                                    margins.append(abs(op_margin))
                                else:
                                    margins.append(abs(op_margin))
                    else:
                        pass
                        # add the error as many times as a bin was expected!
                        #rel_errors.extend([1 for n in range(len(gt_aggregate_results))])

                # invalid bins test
                has_invalid_bins = False
                num_invalid = 0
                inv = []
                for kk in operation["output"].keys():
                    if kk not in gt_output:
                        has_invalid_bins = True
                        num_invalid += 1
                        inv.append(kk)
                        print(self.options.settings_workflow)
                        print(str(operation["id"]))
                        print("invalid key:" + kk)
                        print(operation["sql"])
                        print(operation["output"])
                        os._exit(0)

                args = result_json["args"]
                missing_bins = 1 - len(operation["output"].keys()) / len(gt_output.keys()) if len(gt_output.keys()) > 0 else 0

                op_eval_result = {}
                op_eval_result["operation_id"] = operation["id"]
                op_eval_result["config_hash"] = self.get_config_hash()
                op_eval_result["interaction_id"] = operation["parent_operation_id"]
                op_eval_result["dataset"] = args["settings_dataset"]
                op_eval_result["size"] = args["settings_size"]
                op_eval_result["viz_name"] = operation["viz_name"]
                op_eval_result["think_time"] = args["settings_thinktime"]
                op_eval_result["time_requirement"] = args["settings_time_requirement"]
                op_eval_result["interface"] = args["driver_name"]
                op_eval_result["workflow"] = args["settings_workflow"]
                op_eval_result["start_time"] = operation["start_time"]
                op_eval_result["end_time"] = operation["end_time"]
                op_eval_result["t_pause"] = operation["t_pause"] if "t_pause" in operation else 0
                op_eval_result["t_start"] = operation["t_start"] if "t_start" in operation else 0
                op_eval_result["duration"] = operation["end_time"] - operation["start_time"]

                if "time_violated" in operation:
                    op_eval_result["time_violated"] = operation["time_violated"]
                elif "timedout" in operation:
                    op_eval_result["time_violated"] = operation["timedout"]
                else:
                    raise Exception()

                op_eval_result["has_invalid_bins"] = has_invalid_bins
                op_eval_result["binning_type"] = operation["binning_type"]
                op_eval_result["aggregate_type"] = operation["aggregate_type"]
                op_eval_result["num_bins_delivered"] = len(operation["output"].keys())
                op_eval_result["num_bins_in_gt"] = len(gt_output.items())
                op_eval_result["missing_bins"] = "%.5f" % missing_bins
                op_eval_result["dissimilarity"] = "%.5f" % (1 - self.compute_viz_similarity(gt_output, operation["output"]))
                op_eval_result["num_bins_out_of_margin"] = "%i" % out_of_margin_count
                op_eval_result["num_aggregates_per_bin"] = operation["num_aggregates_per_bin"]
                op_eval_result["num_binning_dimensions"] = operation["num_binning_dimensions"]
                op_eval_result["progress"] = "%.5f" % operation["progress"]
                op_eval_result["bias"] = "%.5f" % (sum(forecast_values) / sum(actual_values) - 1) if len(actual_values) > 0 else 0
                op_eval_result["rel_error_stdev"] = "%.5f" % statistics.stdev(rel_errors) if len(rel_errors) > 1 else 0.0
                op_eval_result["rel_error_min"] = "%.5f" % min(rel_errors) if len(rel_errors) > 0 else 0
                op_eval_result["rel_error_max"] = "%.5f" % max(rel_errors) if len(rel_errors) > 0 else 0
                op_eval_result["rel_error_avg"] = "%.5f" % float(sum(rel_errors) / float(len(rel_errors))) if len(rel_errors) > 0 else 0
                op_eval_result["margin_stdev"] = "%.5f" % statistics.stdev(margins) if len(margins) > 1 else 0.0
                op_eval_result["margin_min"] = "%.5f" % min(margins) if len(margins) > 0 else 0.0
                op_eval_result["margin_max"] = "%.5f" % max(margins) if len(margins) > 0 else 0.0
                op_eval_result["margin_avg"] = "%.5f" % float(sum(margins) / float(len(margins))) if len(margins) > 0 else 0.0
                op_eval_result["margin_ratio"] = "%.5f" % float(len(operation["margins"]) / len(operation["output"])) if operation["margins"] and len(operation["output"]) > 0 else 1

                w.writerow(op_eval_result)
    def create_report(self):
        header_saved = False
        interesting_files = glob.glob("reports/*.csv")
        with open("./full_report.csv", "w") as fout:
            for filename in interesting_files:
                print(filename)
                with open(filename) as fin:
                    header = next(fin)
                    if not header_saved:
                        print(header)
                        fout.write(header)
                        header_saved = True
                    for line in fin:
                        fout.write(line)
        print("saved report")
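# A minimal driver sketch for reference. IDEBench resolves
# "drivers.<driver-name>.IDEBenchDriver" via importlib, and the benchmark wraps
# every lifecycle call in try/except AttributeError, so all hooks except
# process_request are effectively optional. Everything below is an illustrative
# assumption (a hypothetical drivers/example.py), not IDEBench's own code; the
# VizRequest fields assigned here are the ones deliver_viz_request reads.
class IDEBenchDriver:

    # if True, the benchmark calls process_request sequentially
    # instead of spawning one process (or thread) per request
    use_single_process = True

    def init(self, options, schema, driver_arg):
        pass  # e.g. connect to the backend holding options.settings_dataset

    def workflow_start(self):
        pass

    def process_request(self, viz_request, options, schema, result_queue):
        viz_request.start_time = util.get_current_ms_time()  # assumes the util helper used elsewhere in this file
        sql = viz_request.viz.get_computed_filter_as_sql(schema)
        # ... run `sql` against the backend and bin/aggregate the answer ...
        viz_request.result = {}    # bin identifier -> list of per-bin aggregate values
        viz_request.margins = {}   # bin identifier -> list of error margins (may stay empty)
        viz_request.progress = 1.0
        viz_request.timedout = False
        viz_request.end_time = util.get_current_ms_time()
        result_queue.put(viz_request)  # the benchmark collects results from this queue

    def workflow_end(self):
        pass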
def __init__(self):
    parser = OptionParser()
    parser.add_option("-r", "--seed", dest="seed", action="store", type=int, help="Random seed", default=25000)
    parser.add_option("-d", "--dataset", dest="data_folder", action="store", help="Name of the dataset folder", default="flights")
    parser.add_option("--debug", dest="debug", action="store_true", help="Creates a debug file", default=False)
    parser.add_option("-n", "--num-operations", dest="num_operations", action="store", type=int, help="Number of operations to generate", default=20)
    parser.add_option("-c", "--workflow-type", dest="config", action="store", help="Name of the workflow-type config file", default="sequential.json")
    parser.add_option("-p", "--output", dest="path", action="store", help="Name of the output workflow file (without extension)", default="workflow")
    parser.add_option("-s", "--num-samples", dest="numsamples", action="store", type=int, help="Number of samples to draw from the original dataset", default=10000)
    (options, args) = parser.parse_args()
    self.options = options

    random.seed(options.seed)
    np.random.seed(seed=options.seed)

    print("data/" + options.data_folder + "/workflowtypes/" + options.config)
    with open("data/" + options.data_folder + "/workflowtypes/" + options.config, "r") as fp:
        self.config = json.load(fp)

    schema = None
    with open(self.get_schema_path()) as f:
        schema = Schema(json.load(f))

    print("reading csv...")
    # load sample data
    df = pd.read_csv("data/" + options.data_folder + "/sample.csv", nrows=options.numsamples, header=0)
    #schema = {"tables": [{"name": "df", "dimensions": []}]}

    sample_json = None
    with open("data/" + options.data_folder + "/sample.json", "r") as f:
        sample_json = json.load(f)
    #for field in sample_json["tables"]["fact"]["fields"]:
    #    schema["tables"][0]["dimensions"].append({"name": field["field"]})
    #storage = Storage(schema)

    # keep regenerating (with a shifted seed) until at most 15% of the
    # generated queries return an empty result on the sample
    zero_qs_ratio = 100
    tries = -1
    while zero_qs_ratio > 0.15:
        tries += 1
        num_zeros_qs = 0
        num_qs = 0
        VizAction.VIZ_COUNTER = -1
        LinkAction.FIRST_LINK = None
        LinkAction.LATEST_LINK = None
        LinkAction.LINKS = set()
        vizgraph = VizGraph()
        random.seed(options.seed + tries)
        root = VizAction(self.config, df, vizgraph, schema, sample_json)
        current = root
        states = []
        num_ops = 0
        debug_states = []
        while num_ops < options.num_operations:
            res = current.get_states()
            if res:
                affected_vizs = vizgraph.apply_interaction(res)
                if options.debug:
                    nodes_dict = vizgraph.get_nodes_dict()
                    states_dict = {}
                    for n in nodes_dict.keys():
                        states_dict[n] = {
                            "name": n,
                            "source": nodes_dict[n].get_source(),
                            "binning": nodes_dict[n].binning,
                            "agg": nodes_dict[n].per_bin_aggregates,
                            "selection": nodes_dict[n].get_selection(),
                            "filter": nodes_dict[n].get_filter(),
                            "computed_filter": nodes_dict[n].get_computed_filter_as_sql(schema),
                        }
                    debug_states.append(states_dict)

                for x in affected_vizs:
                    sql = x.get_computed_filter_as_sql(schema).replace("FLOOR", "ROUND").replace(schema.get_fact_table_name(), "df")
                    r = pandasql.sqldf(sql, locals())
                    num_qs += 1
                    if len(r.index) == 0:
                        num_zeros_qs += 1
                        #print("ZERO QUERY")

                states.append(res.data)
                num_ops += 1
            current = current.get_next()
            if current is None:
                zero_qs_ratio = num_zeros_qs / num_qs
                break
        zero_qs_ratio = num_zeros_qs / num_qs

    print("zero queries:")
    print(num_zeros_qs / num_qs)

    with open("data/" + options.data_folder + "/workflows/" + options.path + ".json", "w") as fp:
        fp.write(json.dumps({"name": "generated", "dataset": options.data_folder, "seed": options.seed, "config": options.config, "interactions": states}))
    print("done.")
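# For reference, the workflow file written above has the shape consumed by
# IDEBench.run() from data/<dataset>/workflows/<name>.json. A sketch with
# illustrative values (the runner below additionally expects a "setup" list of
# initial viz definitions, and each interaction carries a "time" stamp in
# milliseconds that process_interaction uses for pacing):
EXAMPLE_WORKFLOW = {
    "name": "generated",
    "dataset": "flights",
    "seed": 25000,
    "config": "sequential.json",
    "interactions": [
        # one entry per operation, i.e. the res.data payloads collected above
    ],
}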
class IDEBench:

    result_queue = multiprocessing.Queue()

    def __init__(self):
        parser = OptionParser()
        parser.add_option("--driver-name", dest="driver_name", action="store", help="Driver name")
        parser.add_option("--driver-create-storage", dest="driver_create_storage", action="store_true", help="Calls create_storage on driver", default=False)
        parser.add_option("--driver-clear-storage", dest="driver_clear_storage", action="store_true", help="Calls clear_storage on driver", default=False)
        parser.add_option("--driver-clear-cache", dest="driver_clear_cache", action="store_true", help="Calls clear_cache on driver", default=False)
        parser.add_option("--driver-args", dest="driver_args", action="store", help="Arguments to pass to the driver", default="")
        parser.add_option("--settings-normalized", dest="settings_normalized", action="store_true", help="Whether joins should be used", default=False)
        parser.add_option("--settings-dataset", dest="settings_dataset", action="store", help="Name of the dataset")
        parser.add_option("--settings-size", dest="settings_size", default="", action="store", help="Number of rows in the dataset")
        parser.add_option("--settings-thinktime", dest="settings_thinktime", type="int", action="store", help="Think-time in milliseconds between two executions", default=1000)
        parser.add_option("--settings-time-requirement", dest="settings_time_requirement", action="store", help="The time requirement to be used", default=1000)
        parser.add_option("--settings-confidence-level", dest="settings_confidence_level", action="store", help="The confidence level to be used", default=95)
        parser.add_option("--settings-workflow", dest="settings_workflow", action="store", help="The workflow file to be used")
        parser.add_option("--evaluate", dest="evaluate", action="store", help="The name of the workflow result to evaluate", default=None)
        parser.add_option("--create-full-report", dest="create_report", action="store_true", help="Merges all reports in the reports directory into a single file", default=False)
        parser.add_option("--run", dest="run", action="store_true", help="Flag to run the benchmark without a config file", default=False)
        parser.add_option("--await-response", dest="await_response", action="store_true", help="Whether or not to wait for driver responses before proceeding with the next request (only works when multi-threaded)", default=False)
        parser.add_option("--run-config", dest="config", action="store", help="Flag to run the benchmark with the specified config file")
        parser.add_option("--groundtruth", dest="groundtruth", action="store_true", help="If set, computes the ground truth for the specified workflow", default=False)
        parser.add_option("--gt-folder", dest="gt_folder", action="store", help="The path to the ground truth", default=None)
        parser.add_option("--gt-for", dest="gt_for", action="store", help="If set, only computes the ground truth for results found in this file", default=None)
        (self.options, args) = parser.parse_args()

        self.workflow_start_time = -1
        self.counter = 0
        self.evaluator = Evaluator(self.options)

        if not self.options.config:
            if self.options.evaluate:
                self.evaluator.evaluate(self.options.evaluate)
                return

            if self.options.create_report:
                self.evaluator.create_report()
                return

            if not self.options.driver_name:
                parser.error("No driver name specified.")

            if not self.options.settings_dataset:
                parser.error("No dataset specified.")

            if not self.options.settings_size:
                print("Warning: No dataset size specified.")

            if self.options.groundtruth or self.options.run:
                self.setup()

            if self.options.groundtruth:
                self.options.settings_thinktime = 1
                self.options.settings_time_requirement = 999999
                if self.options.gt_for:
                    with open(self.options.gt_for) as f:
                        self.gt_for_result = json.load(f)
                    workflow_files = ["data/%s/workflows/%s.json" % (self.options.settings_dataset, self.gt_for_result["args"]["settings_workflow"])]
                else:
                    workflow_files = glob.glob("data/" + self.options.settings_dataset + "/workflows/*.json")
                for workflow_file in workflow_files:
                    self.options.settings_workflow = basename(workflow_file).split(".")[0]
                    self.run()
            elif self.options.run:
                if not self.options.settings_workflow:
                    parser.error("No workflow specified.")
                self.run()
        else:
            with open(self.options.config) as f:
                config = json.load(f)

            assure_path_exists("./results")
            for d in config["settings-datasets"]:
                assure_path_exists("./data/%s/groundtruths" % d)

            # TODO: create pairs instead
            for dataset in config["settings-datasets"]:
                self.options.settings_dataset = dataset
                for driver_name in config["driver-names"]:
                    for driver_arg in config["driver-args"]:
                        self.options.driver_name = driver_name
                        self.setup(driver_arg)
                        for size in config["settings-sizes"]:
                            for workflow in config["settings-workflows"]:
                                for thinktime in config["settings-thinktimes"]:
                                    for time_requirement in config["settings-time-requirements"]:
                                        for confidence_level in config["settings-confidence-levels"]:
                                            self.options.driver_name = driver_name
                                            self.options.settings_size = size
                                            self.options.settings_workflow = workflow
                                            self.options.settings_thinktime = thinktime
                                            self.options.settings_time_requirement = time_requirement
                                            self.options.settings_confidence_level = confidence_level
                                            self.options.settings_normalized = config["settings-normalized"]
                                            self.options.groundtruth = config["groundtruth"] if "groundtruth" in config else False
                                            self.options.run = config["run"] if "run" in config else True
                                            self.options.evaluate = config["evaluate"] if "evaluate" in config else True
                                            if self.options.run:
                                                self.run()
                                            if self.options.evaluate:
                                                self.evaluator.evaluate(self.options.evaluate)

    def setup(self, driver_arg=None):
        logger.info("loading schema")
        with open(self.get_schema_path()) as f:
            self.schema = Schema(json.load(f), self.options.settings_normalized)
        logger.info("loading driver")
        module = importlib.import_module("drivers." + self.options.driver_name)
        self.driver = getattr(module, "IDEBenchDriver")()
        logger.info("initializing %s driver" % self.options.driver_name)
        try:
            self.driver.init(self.options, self.schema, driver_arg)
        except AttributeError:
            pass

    def run(self):
        self.vizgraph = VizGraph()
        with open(self.get_workflow_path()) as f:
            json_data = json.load(f)
        for s in json_data["setup"]:
            self.vizgraph.add_viz(s)
        for s in json_data["setup"]:
            self.vizgraph.apply_interaction(Operation(s))
        self.workflow_interactions = json_data["interactions"]

        self.operation_results = OrderedDict({"args": vars(self.options), "results": OrderedDict()})
        self.current_interaction_index = 0
        self.current_vizrequest_index = 0
        self.benchmark_start_time = util.get_current_ms_time()

        try:
            logger.info("calling \"workflow_start\" on driver")
            self.driver.workflow_start()
        except AttributeError:
            pass

        global do_poll
        do_poll = True

        def poll_results(slf, queue):
            while do_poll:
                try:
                    process_result = queue.get(timeout=1)
                except Empty:
                    logger.info("result queue empty... trying again")
                    continue
                if process_result is None:
                    continue
                slf.deliver_viz_request([process_result])
            logger.info("stopped polling results")
            # drain anything left in the queue
            try:
                while queue.get(timeout=0.01):
                    pass
            except Empty:
                logger.info("result queue cleared")

        if not self.options.groundtruth:
            thread = Thread(target=poll_results, args=(self, IDEBench.result_queue))
            thread.start()

        interaction_index = 0
        while interaction_index < len(self.workflow_interactions):
            self.process_interaction(interaction_index)
            interaction_index += 1

        do_poll = False
        if not self.options.groundtruth:
            thread.join()
        self.end_run()
trying again") continue if process_result is None: continue slf.deliver_viz_request([process_result]) logger.info("stopped polling results") try: while queue.get(timeout=0.01): pass except Empty: logger.info("result queue cleard") if not self.options.groundtruth: thread = Thread(target = poll_results, args = (self, IDEBench.result_queue)) thread.start() interaction_index = 0 while interaction_index < len(self.workflow_interactions): self.process_interaction(interaction_index) interaction_index +=1 do_poll = False if not self.options.groundtruth: thread.join() self.end_run() def end_run(self): logger.info("done processing interactions") try: logger.info("calling \"workflow_end\" on driver") self.driver.workflow_end() except AttributeError: pass # adding un-answered queries, if necessary non_delivered_count = 0 delivered_count = 0 operation_id = 0 event_id = 0 if not self.options.groundtruth: for interaction in self.workflow_interactions: vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction)) expected_start_time = interaction["time"] for viz in vizs_to_request: if not operation_id in self.operation_results["results"]: non_delivered_count += 1 self.deliver_viz_request([VizRequest(operation_id, event_id, expected_start_time, viz, True)]) else: delivered_count += 1 expected_start_time = interaction["time"] operation_id += 1 event_id += 1 if non_delivered_count > 0: logger.info("added %s non-delivered results to final result" % non_delivered_count) path = "results/%s.json" % (self.get_config_hash()) if not self.options.groundtruth: logger.info("saving results to %s" % path) with open(path, "w") as fp: json.dump(self.operation_results, fp, indent=4) if self.options.groundtruth: logger.info("saving groundtruth to %s" % self.get_groundtruth_path()) assure_path_exists(self.get_groundtruth_path()) with open(self.get_groundtruth_path(), "w") as fp: json.dump(self.operation_results, fp, indent=4) def process_interaction(self, interaction_index): logger.info("interaction %i" % interaction_index) interaction = self.workflow_interactions[interaction_index] next_interaction = self.workflow_interactions[interaction_index + 1] if interaction_index +1 < len(self.workflow_interactions) else None vizs_to_request = self.vizgraph.apply_interaction(Operation(interaction)) expected_start_time = interaction["time"] viz_requests = [] for viz in vizs_to_request: viz_requests.append(VizRequest(self.current_vizrequest_index, self.current_interaction_index, expected_start_time, viz)) self.current_vizrequest_index += 1 # TODO: document this feature try: self.driver.before_requests(self.options, self.schema, IDEBench.result_queue) except AttributeError: pass procs = [] nprocs = len(viz_requests) if self.options.groundtruth or (hasattr(self.driver, "use_single_process") and self.driver.use_single_process): for viz_request in viz_requests: op_id = str(viz_request.operation_id) if op_id in self.gt_for_result["results"] and self.gt_for_result["results"][op_id]["output"]: self.driver.process_request(viz_request, self.options, self.schema, IDEBench.result_queue) self.deliver_viz_request([IDEBench.result_queue.get()]) self.counter += 1 else: for viz_request in viz_requests: thread = Thread(target = self.driver.process_request, args = (viz_request, self.options, self.schema, IDEBench.result_queue )) procs.append(thread) thread.start() time.sleep(0.002) # so the request threads do not overwhelm some of the drivers (particularly verdictdb) resultlist = [] delay = 0 think_time = 0 if "time" in interaction and 
next_interaction: original_think_time = next_interaction["time"] - interaction["time"] delay = min(0, next_interaction["time"] - (util.get_current_ms_time() - self.benchmark_start_time)) think_time = max(0, delay + original_think_time) else: think_time = self.options.settings_thinktime if not self.options.groundtruth: time.sleep(think_time / 1000) self.current_interaction_index += 1 def deliver_viz_request(self, viz_requests): if len(self.operation_results["results"]) == 0 : self.workflow_start_time = sorted(viz_requests, key=lambda x: x.operation_id)[0].start_time for viz_request in viz_requests: operation_result = {} operation_result["id"] = viz_request.operation_id operation_result["sql"] = viz_request.viz.get_computed_filter_as_sql(self.schema) operation_result["viz_name"] = viz_request.viz.name operation_result["event_id"] = viz_request.parent_operation_id operation_result["expected_start_time"] = viz_request.expected_start_time operation_result["start_time"] = viz_request.start_time - self.workflow_start_time operation_result["end_time"] = viz_request.end_time - self.workflow_start_time operation_result["time_violated"] = viz_request.timedout operation_result["dropped"] = viz_request.dropped #operation_result["t_pause"] = viz_request.t_pause #operation_result["t_start"] = viz_request.t_start operation_result["progress"] = viz_request.progress operation_result["output"] = viz_request.result operation_result["margins"] = viz_request.margins operation_result["num_binning_dimensions"] = len(viz_request.viz.binning) operation_result["num_aggregates_per_bin"] = len(viz_request.viz.per_bin_aggregates) bin_types = [] for viz_bin in viz_request.viz.binning: if "width" in viz_bin: bin_types.append("quantitative") else: bin_types.append("nominal") operation_result["binning_type"] = "_".join(sorted(bin_types)) agg_types = [] for viz_agg in viz_request.viz.per_bin_aggregates: if viz_agg["type"] == "count": agg_types.append("count") elif viz_agg["type"] == "avg": agg_types.append("avg") else: raise Exception() operation_result["aggregate_type"] = "_".join(sorted(agg_types)) if not viz_request.operation_id in self.operation_results: self.operation_results["results"][viz_request.operation_id] = operation_result viz_request.delivered = True def get_config_hash(self): o = self.options h = (o.driver_name, o.settings_dataset, o.settings_workflow, o.settings_size, o.settings_normalized, o.settings_confidence_level, o.settings_thinktime, o.settings_thinktime, o.settings_time_requirement) return hashlib.md5(str(h).encode("utf-8")).hexdigest() def get_schema_path(self): return "data/%s/sample.json" % (self.options.settings_dataset) def get_workflow_path(self): return "data/%s/workflows/%s.json" % (self.options.settings_dataset, self.options.settings_workflow) def get_groundtruth_path(self): return os.path.join(self.get_groundtruth_folder(), self.options.settings_workflow + ".json") def get_groundtruth_folder(self): return os.path.join(self.options.gt_folder, self.gt_for_result["args"]["driver_name"], self.options.settings_dataset, self.options.settings_size)
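# For reference, a sketch of the JSON expected by --run-config, inferred from
# the nested loops in __init__ above; the keys are the exact strings the code
# reads, the values are illustrative:
EXAMPLE_RUN_CONFIG = {
    "settings-datasets": ["flights"],
    "driver-names": ["exampledriver"],  # resolved as drivers.exampledriver.IDEBenchDriver
    "driver-args": [""],
    "settings-sizes": ["1000000"],
    "settings-workflows": ["workflow1"],
    "settings-thinktimes": [1000],  # milliseconds
    "settings-time-requirements": [1000],
    "settings-confidence-levels": [95],
    "settings-normalized": False,
    "groundtruth": False,  # optional, defaults to False
    "run": True,           # optional, defaults to True
    "evaluate": False,     # optional, defaults to True; passed to Evaluator.evaluate()
}
# Results land in results/<config-hash>.json, where the hash comes from
# get_config_hash() above; ground truths follow get_groundtruth_folder().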