def _get_manifests_in_db(self, db_obj, trace_id):
    """Returns a list of present workflow types in a trace in the DB.

    Scans both the histogram and numeric-stats result tables for the
    trace, keeps only manifest-related result names, and extracts the
    manifest name (second "_"-separated token) from each.
    """
    hist_keys = _filter_non_man(
        Histogram().get_list_of_results(db_obj, trace_id))
    stats_keys = _filter_non_man(
        NumericStats().get_list_of_results(db_obj, trace_id))
    manifests = set()
    for key in hist_keys + stats_keys:
        manifests.add(key.split("_")[1])
    return list(manifests)
def test_load_job_results_per_manifest(self):
    """Per-manifest workflow results stored in the DB are re-loadable.

    Computes per-manifest results from a synthetic job list, stores
    them, loads them through a fresh extractor, and compares every
    stored field against the originally computed value.
    """
    db_obj = self._db
    self.addCleanup(self._del_table, "histograms")
    self.addCleanup(self._del_table, "numericStats")
    Histogram().create_table(db_obj)
    NumericStats().create_table(db_obj)

    extractor = WorkflowsExtractor()
    # Two workflows of manifest "manifest" (ids 2 and 3), one of
    # "manifest2" (id 4), plus one non-workflow job ("sim_job").
    job_list = {
        "job_name": [
            "wf_manifest-2_S0", "wf_manifest-2_S1_dS0",
            "wf_manifest-2_S2_dS0", "wf_manifest-2_S3_dS2",
            "wf_manifest-2_S4_dS3", "wf_manifest-2_S5_dS4-dS1",
            "wf_manifest-2_S6_dS0", "sim_job",
            "wf_manifest-3_S0", "wf_manifest-3_S1_dS0",
            "wf_manifest-3_S2_dS0", "wf_manifest-3_S3_dS1-dS2",
            "wf_manifest2-4_S0"
        ],
        "id_job": [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
        "time_start": [1, 15, 17, 22, 27, 42, 12, 20, 1, 15, 17, 22, 30],
        "time_end": [10, 20, 40, 25, 29, 50, 70, 30, 10, 20, 19, 25, 35],
        "time_submit": [1, 1, 1, 1, 1, 1, 1, 20, 2, 2, 2, 2, 3],
        "cpus_alloc": [1, 2, 3, 4, 5, 6, 7, 1, 1, 2, 3, 4, 33]
    }
    extractor.extract(job_list)
    extractor.do_processing()
    old_results = extractor.calculate_per_manifest_results(True, db_obj, 1)

    new_results = WorkflowsExtractor().load_per_manifest_results(db_obj, 1)
    self.assertEqual(sorted(list(new_results.keys())),
                     sorted(["manifest2", "manifest"]))

    base_fields = [
        "wf_runtime_cdf", "wf_runtime_stats",
        "wf_waittime_cdf", "wf_waittime_stats",
        "wf_turnaround_cdf", "wf_turnaround_stats",
        "wf_stretch_factor_cdf", "wf_stretch_factor_stats",
        "wf_jobs_runtime_cdf", "wf_jobs_runtime_stats",
        "wf_jobs_cores_cdf", "wf_jobs_cores_stats"
    ]
    for manifest in ["manifest2", "manifest"]:
        for base in base_fields:
            key = "m_" + manifest + "_" + base
            assertEqualResult(self, old_results[manifest][key],
                              new_results[manifest][key], key)
def del_results(self, db_obj):
    """Deletes all analysis results associated with this experiment.

    Removes this trace's rows from the histogram, utilization and
    numeric-stats result tables.
    """
    for table_name in (Histogram()._table_name,
                       ResultTrace()._get_utilization_result()._table_name,
                       NumericStats()._table_name):
        db_obj.delete_rows(table_name, "trace_id", self._trace_id)
def test_load_job_results_grouped_core_seconds(self):
    """Job results grouped by core-second buckets survive a DB round trip.

    Stores grouped results for three bucket edges and checks that a
    fresh trace loads a non-None value for every grouped field.
    """
    db_obj = self._db
    self.addCleanup(self._del_table, "histograms")
    self.addCleanup(self._del_table, "numericStats")
    Histogram().create_table(db_obj)
    NumericStats().create_table(db_obj)

    trace = ResultTrace()
    pbs_list = {
        "account": ["account1", "account2"],
        "cores_per_node": [24, 24, 24],
        "numnodes": [1, 1, 1],
        "wallclock_requested": [120, 368, 400],
        "class": ["queue1", "queue2", "queue3"],
        "created": [1000, 2000, 3000],
        "start": [1100, 2200, 3300],
        "completion": [1500, 2700, 4000],
        "jobname": ["name1", "name2", "name3"]
    }
    trace._lists_submit = trace._transform_pbs_to_slurm(pbs_list)
    bucket_edges = [0, 24 * 450, 24 * 550]
    trace.calculate_job_results_grouped_core_seconds(
        bucket_edges, True, db_obj, 1)

    loaded_trace = ResultTrace()
    loaded_trace.load_job_results_grouped_core_seconds(
        bucket_edges, self._db, 1)

    base_fields = [
        "jobs_runtime_cdf", "jobs_runtime_stats",
        "jobs_waittime_cdf", "jobs_waittime_stats",
        "jobs_turnaround_cdf", "jobs_turnaround_stats",
        "jobs_requested_wc_cdf", "jobs_requested_wc_stats",
        "jobs_cpus_alloc_cdf", "jobs_cpus_alloc_stats",
        "jobs_slowdown_cdf", "jobs_slowdown_stats"
    ]
    # Field names are prefixed with the bucket edge, e.g. "g0_...".
    for edge in bucket_edges:
        for base in base_fields:
            field = "g" + str(edge) + "_" + base
            self.assertNotEqual(loaded_trace.jobs_results[field], None)
def test_store_load(self):
    """Workflow delta results survive a store/load DB round trip.

    Builds two traces with matching workflows, computes and stores
    the deltas, then loads them with a fresh WorkflowDeltas object
    and compares field by field.
    """
    db_obj = self._db
    self.addCleanup(self._del_table, "histograms")
    self.addCleanup(self._del_table, "numericStats")
    Histogram().create_table(db_obj)
    NumericStats().create_table(db_obj)

    job_list_1 = {
        "job_name": [
            "wf_manifest-2_S0", "wf_manifest-2_S1_dS0",
            "wf_manifest-3_S0", "wf_manifest-3_S1_dS0"
        ],
        "id_job": [0, 1, 2, 3],
        "time_submit": [100, 100, 1100, 1100],
        "time_start": [110, 215, 1200, 1400],
        "time_end": [200, 250, 1300, 1500]
    }
    job_list_2 = {
        "job_name": ["wf_manifest-2_S0", "wf_manifest-3_S0"],
        "id_job": [0, 1],
        "time_submit": [100, 1100],
        "time_start": [110, 1200],
        "time_end": [615, 2000]
    }

    deltas = WorkflowDeltas()
    deltas._first_trace = ResultTrace()
    deltas._second_trace = ResultTrace()
    deltas._first_trace._lists_submit = job_list_1
    deltas._second_trace._lists_submit = job_list_2
    deltas._first_workflows = (
        deltas._first_trace.do_workflow_pre_processing())
    deltas._second_workflows = (
        deltas._second_trace.do_workflow_pre_processing())
    deltas.produce_deltas()
    stored_results = deltas.calculate_delta_results(True, db_obj, 1)

    loaded_results = WorkflowDeltas().load_delta_results(db_obj, 1)
    for field in list(stored_results.keys()):
        assertEqualResult(self, stored_results[field],
                          loaded_results[field], field)
def __init__(self, propStdDev, min, binWidth, values):
    """Initializes the sampler from a target histogram distribution.

    Args:
        propStdDev: standard deviation of the Gaussian proposal
            distribution.
        min: lower bound of the target histogram's support. (Parameter
            name kept for backward compatibility, although it shadows
            the builtin.)
        binWidth: width of each histogram bin.
        values: bin values used to initialize the target histogram.
    """
    self.targetDistr = Histogram.createInitialized(min, binWidth, values)
    # NOTE(review): attribute name keeps the original (misspelled)
    # "propsalDistr" spelling because external code may reference it.
    self.propsalDistr = GaussianRejectSampler(0, propStdDev)
    self.proposalMixture = False

    # Bootstrap sample: draw uniformly from the target's support.
    # Renamed the unpacked locals (was "(min, max)") so the builtins
    # are not shadowed inside this method.
    lo, hi = self.targetDistr.getMinMax()
    self.curSample = random.randint(lo, hi)
    self.curDistr = self.targetDistr.value(self.curSample)
    self.transCount = 0
def test(self, examples, print_level=1):
    """Computes the "area under the ROC curve". This is a way to measure
    the precision/recall WITHOUT choosing a cutoff-threshold. It is
    mathematically equivalent to: "the probability that a random positive
    example has a higher prob_output1 than a random negative case" (This
    equivalence is non-obvious). The algorithm below computes this average
    probability by effectively trying all combinations of
    positive-vs-negative examples, but does this in O(NlgN) instead of
    O(N^2)"""
    # Accept either a TrainingExamples wrapper or a plain list of
    # example dicts.
    if type(examples) is TrainingExamples:
        examples = examples.examples
    prob_stats = SummaryStats()
    prob_hist = Histogram()
    output1_scores = list()  # predicted probs for true-positive examples
    output0_scores = list()  # predicted probs for true-negative examples
    for example in examples:
        assert example["_OUTPUT"] in [0, 1]
        prob = self.prob_output1(example)
        prob_stats.add(prob)
        # Bucket the probability into a 0.1-wide histogram bin label,
        # e.g. 0.34 -> "0.3-0.4".
        prob_key = "%1.1f-%1.1f" % (int(prob * 10) / 10.0,
                                    (int(prob * 10) + 1) / 10.0)
        if prob == 1:
            prob_key = "0.9-1.0"  # don't create a 1.0-1.1 bucket
        prob_hist.add(prob_key)
        real_output = example["_OUTPUT"] == 1
        if real_output:
            output1_scores.append(prob)
        else:
            output0_scores.append(prob)
    # Sort both score lists (ascending) before reporting.
    output1_scores.sort()
    output0_scores.sort()
    # NOTE(review): Python 2 print statements — this block predates
    # Python 3. Visible body ends here; the AUC computation described
    # in the docstring presumably continues past this chunk.
    if print_level >= 2:
        print "%d output1 scores:" % len(output1_scores),
        print ["%2.2f" % i for i in output1_scores[0:5]],
        print " ... ",
def test_load_job_results(self):
    """Job results stored in the DB can be loaded by a fresh trace.

    Computes and stores job results from a synthetic PBS job list,
    loads them into a new ResultTrace, and checks every result field
    was populated.
    """
    db_obj = self._db
    self.addCleanup(self._del_table, "histograms")
    self.addCleanup(self._del_table, "numericStats")
    Histogram().create_table(db_obj)
    NumericStats().create_table(db_obj)

    rt = ResultTrace()
    pbs_list = {
        "account": ["account1", "account2"],
        "cores_per_node": [24, 48],
        "numnodes": [100, 200],
        "class": ["queue1", "queue2"],
        "wallclock_requested": [120, 368],
        "created": [1000, 2000],
        "start": [1100, 2200],
        "completion": [1500, 2700],
        "jobname": ["name1", "name2"]
    }
    rt._lists_submit = rt._transform_pbs_to_slurm(pbs_list)
    rt.calculate_job_results(True, db_obj, 1)

    new_rt = ResultTrace()
    new_rt.load_job_results(db_obj, 1)
    for field in [
        "jobs_runtime_cdf", "jobs_runtime_stats",
        "jobs_waittime_cdf", "jobs_waittime_stats",
        "jobs_turnaround_cdf", "jobs_turnaround_stats",
        "jobs_requested_wc_cdf", "jobs_requested_wc_stats",
        "jobs_cpus_alloc_cdf", "jobs_cpus_alloc_stats",
        "jobs_slowdown_cdf", "jobs_slowdown_stats"
    ]:
        # BUGFIX: the original asserted on rt (the trace that computed
        # the results) instead of new_rt (the trace that loaded them),
        # so a broken load_job_results would still pass. Assert on the
        # loaded trace, matching the grouped-core-seconds test.
        self.assertNotEqual(new_rt.jobs_results[field], None)
""" Creates the SQL schema for the workload databases. Env vars: - ANALYSIS_DB_HOST: hostname of the system hosting the database. - ANALYSIS_DB_NAME: database name to read from. - ANALYSIS_DB_USER: user to be used to access the database. - ANALYSIS_DB_PASS: password to be used to used to access the database. - ANALYSIS_DB_PORT: port on which the database runs. """ from orchestration import get_central_db from orchestration.definition import ExperimentDefinition from stats.trace import ResultTrace from stats import Histogram, NumericStats db_obj = get_central_db() ExperimentDefinition().create_table(db_obj) ResultTrace().create_trace_table(db_obj, ResultTrace()._table_name) Histogram().create_table(db_obj) ResultTrace()._get_utilization_result().create_table(db_obj) NumericStats().create_table(db_obj)
def setUp(self):
    """Creates the test DB connection, result tables and a stored trace.

    Registers a cleanup that drops each created table, then stores a
    two-job trace (submit and start views) under trace id 1.
    """
    self._db = DB(os.getenv("TEST_DB_HOST", "127.0.0.1"),
                  os.getenv("TEST_DB_NAME", "test"),
                  os.getenv("TEST_DB_USER", "root"),
                  os.getenv("TEST_DB_PASS", ""))

    histogram = Histogram()
    histogram.create_table(self._db)
    self.addCleanup(self._del_table, histogram._table_name)

    numeric_stats = NumericStats()
    numeric_stats.create_table(self._db)
    self.addCleanup(self._del_table, numeric_stats._table_name)

    usage_list = NumericList("usage_values", ["utilization", "waste"])
    usage_list.create_table(self._db)
    self.addCleanup(self._del_table, "usage_values")

    trace = ResultTrace()
    self.addCleanup(self._del_table, "import_table")
    trace.create_import_table(self._db, "import_table")
    self.addCleanup(self._del_table, "traces")
    trace.create_trace_table(self._db, "traces")

    trace = ResultTrace()
    trace._lists_submit = {
        "job_db_inx": [1, 2],
        "account": ["account1", "account2"],
        "cpus_req": [48, 96],
        "cpus_alloc": [48, 96],
        "job_name": ["jobName1", "jbname2"],
        "id_job": [1, 2],
        "id_qos": [2, 3],
        "id_resv": [3, 4],
        "id_user": [4, 5],
        "nodes_alloc": [2, 4],
        "partition": ["partition1", "partition2"],
        "priority": [99, 199],
        "state": [3, 2],
        "timelimit": [100, 200],
        "time_submit": [3000, 3001],
        "time_start": [3002, 3001],
        "time_end": [3002, 3005]
    }
    # Same two jobs, listed in reverse order, as the "start" view.
    trace._lists_start = {
        "job_db_inx": [2, 1],
        "account": ["account2", "account1"],
        "cpus_req": [96, 48],
        "cpus_alloc": [96, 48],
        "job_name": ["jobName2", "jobName1"],
        "id_job": [2, 1],
        "id_qos": [3, 2],
        "id_resv": [4, 3],
        "id_user": [5, 4],
        "nodes_alloc": [4, 2],
        "partition": ["partition2", "partition1"],
        "priority": [199, 99],
        "state": [2, 3],
        "timelimit": [200, 100],
        "time_submit": [3003, 3000],
        "time_start": [3001, 3002],
        "time_end": [3005, 3002]
    }
    trace.store_trace(self._db, 1)
    self._rt = trace
def test_calculate(self):
    """Histogram.calculate with automatic, clamped and explicit bins."""
    histogram = Histogram()

    # Automatic bins over the full data range.
    histogram.calculate([1, 2, 3, 3, 5], 1)
    bin_values, bin_edges = histogram.get_data()
    self.assertEqual(bin_edges, [1, 2, 3, 4, 5, 6])
    self.assertEqual(list(bin_values), [0.2, 0.2, 0.4, 0, 0.2])

    # Range clamped to (1, 3): values outside are dropped.
    histogram.calculate([1, 2, 3, 3, 5], 1, minmax=(1, 3))
    self.assertEqual(histogram._get("edges"), [1, 2, 3, 4])
    self.assertEqual(list(histogram._get("bins")), [0.25, 0.25, 0.5])

    # Explicit bin edges override the bin width.
    histogram.calculate([1, 2, 3, 3, 5], 1, minmax=(1, 3),
                        input_bins=[1, 6])
    self.assertEqual(histogram._get("edges"), [1, 6])
    self.assertEqual(list(histogram._get("bins")), [1.0])
def test_save_load(self):
    """A stored histogram can be re-loaded intact from the DB."""
    self.addCleanup(self._del_table, "histograms")

    original = Histogram()
    original.create_table(self._db)
    original.calculate([1, 2, 3, 3, 5], 1)
    original.store(self._db, 1, "MyHist")

    reloaded = Histogram()
    reloaded.load(self._db, 1, "MyHist")
    self.assertEqual(reloaded._get("edges"), [1, 2, 3, 4, 5, 6])
    self.assertEqual(list(reloaded._get("bins")),
                     [0.2, 0.2, 0.4, 0, 0.2])