    def test_write_results_to_disk(self, mock_platform,
                                   no_results_instantiation_check):
        config = Config("krun/tests/example.krun")
        mock_platform.num_cpus = 4
        mock_platform.num_per_core_measurements = mock_platform.num_cpus
        out_file = "krun/tests/example_results.json.bz2"

        results0 = Results(config, mock_platform)
        results0.audit = dict()
        results0.starting_temperatures = [4355, 9879]
        results0.wallclock_times = {u"dummy:Java:default-java": [[1.000726]]}
        results0.eta_estimates = {u"dummy:Java:default-java": [1.1]}
        results0.core_cycle_counts = {
            u"dummy:Java:default-java": [[[2], [3], [4], [5]]]
        }
        results0.aperf_counts = {
            u"dummy:Java:default-java": [[[3], [4], [5], [6]]]
        }
        results0.mperf_counts = {
            u"dummy:Java:default-java": [[[4], [5], [6], [7]]]
        }
        results0.reboots = 5
        results0.error_flag = False
        results0.write_to_file()

        results1 = Results(config, None, results_file=out_file)
        assert results0 == results1

        # Clean-up generated file.
        os.unlink(out_file)
def emulate_first_reboot(platform, config):
    platform.starting_temperatures = platform.take_temperature_readings()
    manifest = ManifestManager(config, platform, new_file=True)
    manifest.set_starting_temperatures(platform.starting_temperatures)
    results = Results(config, platform)
    results.write_to_file()
    return manifest
    def test_audit_differs0001(self, monkeypatch, mock_platform, caplog,
                               no_results_instantiation_check):
        """Check that if the audit differs, a crash occurs"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)
        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config)
        results_path = config.results_filename()

        # mutate the audit, so it won't match later
        results = Results(config, mock_platform, results_file=results_path)
        results.audit._audit["wibble"] = "wut"
        results.write_to_file()

        sched = ExecutionScheduler(config, mock_platform.mailer,
                                   mock_platform, dry_run=True)
        with pytest.raises(krun.util.FatalKrunError):
            sched.run()

        expect = "This is only valid if the machine you are using is identical"
        assert expect in caplog.text

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
    def test_audit_differs0001(self, monkeypatch, mock_platform, caplog):
        """Check that if the audit differs, a crash occurs"""

        make_reboot_raise(monkeypatch)
        no_envlogs(monkeypatch)
        config = Config(os.path.join(TEST_DIR, "example.krun"))
        krun.util.assign_platform(config, mock_platform)
        emulate_first_reboot(mock_platform, config, monkeypatch)
        results_path = config.results_filename()

        # mutate the audit, so it won't match later
        results = Results(config, mock_platform, results_file=results_path)
        results.audit._audit["wibble"] = "wut"
        results.write_to_file()

        sched = ExecutionScheduler(config, mock_platform.mailer,
                                   mock_platform, dry_run=True)
        with pytest.raises(krun.util.FatalKrunError):
            sched.run()

        expect = "This is only valid if the machine you are using is identical"
        assert expect in caplog.text()

        os.unlink(sched.manifest.path)
        os.unlink(results_path)
def emulate_first_reboot(platform, config, monkeypatch):
    no_results_instantiation_check(monkeypatch)
    platform.starting_temperatures = platform.take_temperature_readings()
    manifest = ManifestManager(config, platform, new_file=True)
    manifest.set_starting_temperatures(platform.starting_temperatures)
    results = Results(config, platform)
    results.write_to_file()
    return manifest
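# A minimal sketch (an assumption, not taken from the Krun sources shown in
# this section) of the `no_results_instantiation_check` helper the tests above
# rely on. The scheduler code later in this section guards Results()
# construction with a class-level `ok_to_instantiate` flag, so the helper
# presumably just disables that guard. Some tests call it directly with a
# monkeypatch (as in emulate_first_reboot() above); others request a conftest
# fixture of the same name that applies the same patch.
from krun.results import Results


def no_results_instantiation_check(monkeypatch):
    # Hypothetical: let tests build Results objects outside a real session.
    monkeypatch.setattr(Results, "ok_to_instantiate", True)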
def strip_results(config, key_spec):
    from krun.platform import detect_platform
    from krun.results import Results

    platform = detect_platform(None)
    results = Results(config, platform,
                      results_file=config.results_filename())
    n_removed = results.strip_results(key_spec)
    if n_removed > 0:
        results.write_to_file()
    info("Removed %d result keys" % n_removed)
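# Hypothetical usage of strip_results(). The key spec follows the
# "benchmark:vm:variant" shape seen in the results dictionaries above;
# whether wildcards are accepted is an assumption, not shown by this code.
#
#     config = Config("example.krun")
#     strip_results(config, "dummy:Java:default-java")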
    def test_write_results_to_disk(self, mock_platform):
        config = Config("krun/tests/example.krun")
        out_file = "krun/tests/example_results.json.bz2"

        results0 = Results(config, mock_platform)
        results0.audit = dict()
        results0.starting_temperatures = [4355, 9879]
        results0.data = {u"dummy:Java:default-java": [[1.000726]]}
        results0.eta_estimates = {u"dummy:Java:default-java": [1.1]}
        results0.reboots = 5
        results0.error_flag = False
        results0.write_to_file()

        results1 = Results(config, None, results_file=out_file)
        assert results0 == results1

        # Clean-up generated file.
        os.unlink(out_file)
    def run(self):
        """Benchmark execution starts here"""

        # In reboot mode, wait for the system to come up before we proceed.
        if self.platform.hardware_reboots:
            debug("Waiting %s seconds for the system to come up." %
                  str(STARTUP_WAIT_SECONDS))
            self.platform.sleep(STARTUP_WAIT_SECONDS)

        # Important that the dmesg is collected after the above startup wait.
        # Otherwise we get spurious dmesg changes.
        self.platform.collect_starting_dmesg()

        assert self.manifest.num_execs_left > 0
        self.platform.wait_for_temperature_sensors()

        bench, vm, variant = self.manifest.next_exec_key.split(":")
        key_pexec_idx = self.manifest.next_exec_key_index()
        job = ExecutionJob(self, vm, self.config.VMS[vm], bench, variant,
                           self.config.BENCHMARKS[bench], key_pexec_idx)

        # Default to error state. This is the value the finally block will see
        # if an exception is raised inside the try block, otherwise it is
        # re-assigned based on the result of running the benchmark.
        flag = 'E'

        # Run the pre-exec commands, the benchmark and the post-exec commands.
        # These are wrapped in a try/except, so that the post-exec commands
        # are always executed, even if an exception has occurred. We only
        # reboot /after/ the post-exec commands have completed.
        results = None
        try:
            # Run the user's pre-process-execution commands. We can't put an
            # ETA estimate in the environment for the pre-commands as we have
            # not (and should not) load the results file into memory yet.
            #
            # It might seem tempting to move this outside the try block, to
            # ensure that post-hooks are only run if pre-hooks ran. We don't,
            # thus avoiding the case where only *part* of the pre-hooks run,
            # but the post-hooks then don't run.
            util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,)

            # We collect rough execution times separate from real results. The
            # reason for this is that, even if a benchmark crashes it takes
            # time and we need to account for this when making estimates. A
            # crashing benchmark will give an empty list of iteration times,
            # meaning we can't use 'raw_exec_result' below for estimates.
            exec_start_time = time.time()
            measurements, instr_data, flag = job.run(self.mailer, self.dry_run)
            exec_end_time = time.time()

            # Only now is it OK to load the results file into memory.
            Results.ok_to_instantiate = True
            results = Results(self.config, self.platform,
                              results_file=self.config.results_filename())

            # Bail early if the process execution needs to be re-run.
            if flag == "O":
                util.run_shell_cmd_list(
                    self.config.POST_EXECUTION_CMDS,
                    extra_env=self._make_post_cmd_env(results)
                )
                info("Rebooting to re-run previous process execution")
                util.reboot(self.manifest, self.platform, update_count=False)
                # reboot() does not return
                raise RuntimeError("reached unreachable code!")

            # Store new result.
            results.append_exec_measurements(job.key, measurements, flag)

            # Store instrumentation data in a separate file
            if job.vm_info["vm_def"].instrument:
                key_exec_num = self.manifest.completed_exec_counts[job.key]
                util.dump_instr_json(job.key, key_exec_num, self.config,
                                     instr_data)

            eta_info = exec_end_time - exec_start_time
            if self.platform.hardware_reboots:
                # Add time taken to wait for system to come up if we are in
                # hardware-reboot mode.
                eta_info += STARTUP_WAIT_SECONDS
            results.eta_estimates[job.key].append(eta_info)
            self.manifest.update(flag)
        except Exception:
            raise
        finally:
            # Run the user's post-process-execution commands with updated
            # ETA estimates. Important that this happens *after* dumping
            # results, as the user is likely copying intermediate results to
            # another host.
            # _make_post_cmd_env() needs the results to make an ETA. If an
            # exception occurred in the above try block, there's a chance
            # that they have not been loaded.
            if results is None:
                Results.ok_to_instantiate = True
                results = Results(self.config, self.platform,
                                  results_file=self.config.results_filename())

            # If errors occurred, set the error flag in the results file.
            if self.platform.check_dmesg_for_changes(self.manifest) or \
                    flag == 'E':
                results.error_flag = True

            results.write_to_file()
            util.run_shell_cmd_list(
                self.config.POST_EXECUTION_CMDS,
                extra_env=self._make_post_cmd_env(results)
            )

        tfmt = self.get_overall_time_estimate_formatter(results)

        if self.manifest.eta_avail_idx == self.manifest.next_exec_idx:
            # We just found out roughly how long the session has left, mail
            # out.
            msg = "ETA for current session now known: %s" % tfmt.finish_str
            util.log_and_mail(self.mailer, debug,
                              "ETA for Current Session Available",
                              msg, bypass_limiter=True)

        info("{:<25s}: {} ({} from now)".format(
            "Estimated completion (whole session)", tfmt.finish_str,
            tfmt.delta_str))
        info("%d executions left in scheduler queue" %
             self.manifest.num_execs_left)

        if self.manifest.num_execs_left > 0 and \
                self.manifest.eta_avail_idx > self.manifest.next_exec_idx:
            info("Executions until ETA known: %s" %
                 (self.manifest.eta_avail_idx -
                  self.manifest.next_exec_idx))

        # Although it would have been nice to have checked this prior to
        # running the execution, it depends on the results file, which we
        # should not load prior to the process execution.
        util.check_audit_unchanged(results, self.platform)

        assert self.manifest.num_execs_left >= 0
        if self.manifest.num_execs_left > 0:
            # print info about the next job
            benchmark, vm_name, variant = \
                self.manifest.next_exec_key.split(":")
            info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                 (benchmark, self.config.BENCHMARKS[benchmark], variant,
                  vm_name))

            tfmt = self.get_exec_estimate_time_formatter(job.key, results)
            info("{:<35s}: {} ({} from now)".format(
                "Estimated completion (next execution)",
                tfmt.finish_str,
                tfmt.delta_str))

            info("Reboot in preparation for next execution")
            util.reboot(self.manifest, self.platform)
        elif self.manifest.num_execs_left == 0:
            self.platform.save_power()
            if self.config.ENABLE_PINNING:
                self.platform.clear_cpu_pinning()

            info("Done: Results dumped to %s" %
                 self.config.results_filename())
            err_msg = "Errors/warnings occurred -- read the log!"
            if results.error_flag:
                warn(err_msg)

            msg = "Session completed. Log file at: '%s'" % (self.log_path)
            if results.error_flag:
                msg += "\n\n%s" % err_msg

            msg += "\n\nDon't forget to disable Krun at boot."
            util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                              bypass_limiter=True)
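# A sketch (an assumption; the real helper lives in krun.util and is not shown
# in this section) of the check_audit_unchanged() call used near the end of
# run() above. The test_audit_differs0001 tests earlier expect it to abort
# with a FatalKrunError whose message contains the "only valid if the machine
# you are using is identical" warning when the stored audit no longer matches
# the current platform.
from krun.util import FatalKrunError


def check_audit_unchanged(results, platform):
    if results.audit != platform.audit:
        # Wording beyond the substring asserted in the tests is assumed.
        msg = ("The audit stored in the results file does not match this "
               "machine. This is only valid if the machine you are using is "
               "identical to the one the previous results were collected on.")
        raise FatalKrunError(msg)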
def inner_main(mailer, on_first_invocation, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal("Output file '%s' exists but is not a regular file" %
                   out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)

    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required.
    if on_first_invocation:
        # We only want to make a dir if >= 1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment.
    if platform.is_virtual():
        warn("This appears to be a virtualised host. The results will be "
             "flawed. Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # The output file must exist, due to the check above.
        assert out_file_exists

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)
        if manifest.num_execs_left == 0:
            # No executions, or all skipped.
            fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                  "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or networking
            # are taken down for process executions, then the initial
            # temperature reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS,)
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = \
                platform.take_temperature_readings()
            manifest.set_starting_temperatures(platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot Krun
            # will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        except:
            raise
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS,)

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer, debug, "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" % log_path,
                          bypass_limiter=True)
        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks.
    util.assign_platform(config, platform)
    sanity_checks(config, platform)

    # Build job queue -- each job is an execution.
    sched = ExecutionScheduler(config, mailer, platform,
                               dry_run=args.dry_run)
    sched.run()
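# A minimal sketch (an assumption, not the actual Krun command-line front end)
# of the argument namespace inner_main() expects. The attribute names are
# taken from the uses above (args.quick, args.no_user_change, and so on); the
# exact flag spellings and defaults are assumptions.
import argparse


def parse_args():
    parser = argparse.ArgumentParser(description="Krun benchmark runner")
    parser.add_argument("--quick", action="store_true")
    parser.add_argument("--no-user-change", action="store_true")
    parser.add_argument("--no-tickless-check", action="store_true")
    parser.add_argument("--no-pstate-check", action="store_true")
    parser.add_argument("--hardware-reboots", action="store_true")
    parser.add_argument("--daemonise", action="store_true")
    parser.add_argument("--dry-run", action="store_true")
    return parser.parse_args()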
class ExecutionScheduler(object):
    """Represents our entire benchmarking session"""

    def __init__(self, config, mailer, platform, resume=False, reboot=False,
                 dry_run=False, started_by_init=False):
        self.mailer = mailer
        self.config = config
        self.work_deque = deque()
        self.eta_avail = 0
        self.jobs_done = 0
        self.platform = platform
        self.resume = resume
        self.reboot = reboot
        self.dry_run = dry_run
        self.started_by_init = started_by_init

        if resume:
            self.results = Results(self.config, platform,
                                   results_file=self.config.results_filename())
        else:
            self.results = Results(self.config, platform)

        self.log_path = self.config.log_filename(self.resume)

    def set_eta_avail(self):
        """Call after adding the last job that must complete before the ETA
        becomes available"""
        self.eta_avail = len(self)

    def jobs_until_eta_known(self):
        return self.eta_avail - self.jobs_done

    def add_job(self, job):
        self.work_deque.append(job)

    def remove_job_by_key(self, key):
        for job in self.work_deque:
            if job.key == key:
                job_to_remove = job
                break
        else:
            raise JobMissingError(key)
        debug("Removed %s from schedule" % key)
        self.work_deque.remove(job_to_remove)

    def next_job(self, peek=False):
        try:
            job = self.work_deque.popleft()
            if peek:
                self.work_deque.appendleft(job)
            return job
        except IndexError:
            raise ScheduleEmpty()  # we are done

    def __len__(self):
        return len(self.work_deque)

    def get_estimated_exec_duration(self, key):
        previous_exec_times = self.results.eta_estimates.get(key)
        if previous_exec_times:
            return mean(previous_exec_times)
        else:
            return None  # we don't know

    def get_estimated_overall_duration(self):
        etas = [j.get_estimated_exec_duration() for j in self.work_deque]
        if None in etas:
            return None  # we don't know
        return sum(etas)

    def get_exec_estimate_time_formatter(self, key):
        return TimeEstimateFormatter(self.get_estimated_exec_duration(key))

    def get_overall_time_estimate_formatter(self):
        return TimeEstimateFormatter(self.get_estimated_overall_duration())

    def add_eta_info(self, key, exec_time):
        self.results.eta_estimates[key].append(exec_time)

    def build_schedule(self):
        """Builds a queue of process execution jobs.

        Returns two sets: non_skipped_keys, skipped_keys"""

        skipped_keys, non_skipped_keys = set(), set()

        one_exec_scheduled = False
        eta_avail_job = None
        for exec_n in xrange(self.config.N_EXECUTIONS):
            for vm_name, vm_info in self.config.VMS.items():
                for bmark, param in self.config.BENCHMARKS.items():
                    for variant in vm_info["variants"]:
                        job = ExecutionJob(self, vm_name, vm_info, bmark,
                                           variant, param)
                        if not self.config.should_skip(job.key):
                            non_skipped_keys |= set([job.key])
                            if one_exec_scheduled and not eta_avail_job:
                                # First job of the second round of executions;
                                # from here on the ETA becomes known.
                                eta_avail_job = job
                                self.set_eta_avail()
                            self.add_job(job)
                        else:
                            skipped_keys |= set([job.key])
                            if not one_exec_scheduled:
                                debug("%s is in skip list. Not scheduling." %
                                      job.key)
            one_exec_scheduled = True
        self.expected_reboots = len(self)
        # Resume mode: if previous results are available, remove the
        # jobs from the schedule which have already been executed, and
        # add the results to this object, ready to be saved to a Json file.
        if self.resume:
            self._remove_previous_execs_from_schedule()
            # Sanity check ETA estimates.
            for key, exec_data in self.results.data.iteritems():
                got_len = len(self.results.eta_estimates[key])
                expect_len = len(exec_data)
                if expect_len != got_len:
                    msg = "ETA estimates didn't tally with results: "
                    msg += "key=%s, expect_len=%d, got_len=%d " % \
                        (key, expect_len, got_len)
                    msg += "data[%s]=%s; " % (key, str(self.results.data[key]))
                    msg += "eta[%s]=%s" % \
                        (key, str(self.results.eta_estimates[key]))
                    util.log_and_mail(self.mailer, error,
                                      "Fatal Krun Error", msg,
                                      bypass_limiter=True, exit=True)
        return non_skipped_keys, skipped_keys

    def _remove_previous_execs_from_schedule(self):
        for key in self.results.data:
            num_completed_jobs = self.results.jobs_completed(key)
            if num_completed_jobs > 0:
                try:
                    debug("%s has already been run %d times." %
                          (key, num_completed_jobs))
                    for _ in range(num_completed_jobs):
                        self.remove_job_by_key(key)
                        self.jobs_done += 1
                except JobMissingError as excn:
                    tup = (excn.key, self.config.results_filename(),
                           self.config.filename)
                    msg = ("Failed to resume benchmarking session.\n"
                           "The execution %s appears in results "
                           "file: %s, but not in config file: %s." % tup)
                    util.fatal(msg)

    def _make_pre_post_cmd_env(self):
        """Prepare an environment dict for pre/post execution hooks"""

        jobs_until_eta_known = self.jobs_until_eta_known()
        if jobs_until_eta_known > 0:
            eta_val = "Unknown. Known in %d process executions." % \
                self.jobs_until_eta_known()
        else:
            eta_val = self.get_overall_time_estimate_formatter().finish_str

        return {
            "KRUN_RESULTS_FILE": self.config.results_filename(),
            "KRUN_LOG_FILE": self.config.log_filename(resume=True),
            "KRUN_ETA_DATUM": now_str(),
            "KRUN_ETA_VALUE": eta_val,
        }

    def run(self):
        """Benchmark execution starts here"""
        jobs_left = len(self)
        if jobs_left == 0:
            debug("Krun started with an empty queue of jobs")

        if not self.started_by_init:
            util.log_and_mail(self.mailer, debug, "Benchmarking started",
                              "Benchmarking started.\nLogging to %s" %
                              self.log_path, bypass_limiter=True)

        if self.reboot and not self.started_by_init:
            # Reboot before first benchmark (dumps results file).
            info("Reboot prior to first execution")
            self._reboot()

        if self.reboot and self.started_by_init and jobs_left > 0:
            debug("Waiting %s seconds for the system to come up." %
                  str(STARTUP_WAIT_SECONDS))
            if self.dry_run:
                info("SIMULATED: time.sleep (would have waited %s seconds)." %
                     STARTUP_WAIT_SECONDS)
            else:
                time.sleep(STARTUP_WAIT_SECONDS)

        # Important that the dmesg is collected after the above startup wait.
        # Otherwise we get spurious dmesg changes.
        self.platform.collect_starting_dmesg()

        while True:
            self.platform.wait_for_temperature_sensors()

            jobs_left = len(self)
            if jobs_left == 0:
                break

            # Run the user's pre-process-execution commands.
            util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,
                                    extra_env=self._make_pre_post_cmd_env())

            job = self.next_job()

            # We collect rough execution times separate from real results. The
            # reason for this is that, even if a benchmark crashes it takes
            # time and we need to account for this when making estimates. A
            # crashing benchmark will give an empty list of iteration times,
            # meaning we can't use 'raw_exec_result' below for estimates.
            exec_start_time = time.time()
            raw_exec_result = job.run(self.mailer, self.dry_run)
            exec_end_time = time.time()

            exec_result = util.format_raw_exec_results(raw_exec_result)

            if not exec_result and not self.dry_run:
                self.results.error_flag = True

            self.results.data[job.key].append(exec_result)

            eta_info = exec_end_time - exec_start_time
            if self.reboot:
                # Add time taken to wait for system to come up if we are in
                # reboot mode.
                eta_info += STARTUP_WAIT_SECONDS
            self.add_eta_info(job.key, eta_info)

            info("%d executions left in scheduler queue" % (jobs_left - 1))

            # We dump the json after each experiment so we can monitor the
            # json file mid-run. It is overwritten each time.
            self.results.write_to_file()
            self.jobs_done += 1

            # Run the user's post-process-execution commands with updated
            # ETA estimates. Important that this happens *after* dumping
            # results, as the user is likely copying intermediate results to
            # another host.
            util.run_shell_cmd_list(self.config.POST_EXECUTION_CMDS,
                                    extra_env=self._make_pre_post_cmd_env())

            tfmt = self.get_overall_time_estimate_formatter()

            if self.eta_avail == self.jobs_done:
                # We just found out roughly how long the session has left,
                # mail out.
                msg = "ETA for current session now known: %s" % \
                    tfmt.finish_str
                util.log_and_mail(self.mailer, debug,
                                  "ETA for Current Session Available",
                                  msg, bypass_limiter=True)

            info("{:<25s}: {} ({} from now)".format("Estimated completion",
                                                    tfmt.finish_str,
                                                    tfmt.delta_str))

            try:
                job = self.next_job(peek=True)
            except ScheduleEmpty:
                pass  # no next job
            else:
                # print info about the next job
                info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                     (job.benchmark, job.parameter, job.variant, job.vm_name))

                tfmt = self.get_exec_estimate_time_formatter(job.key)
                info("{:<35s}: {} ({} from now)".format(
                    "Estimated completion (next execution)",
                    tfmt.finish_str,
                    tfmt.delta_str))

            if (self.eta_avail is not None) and \
                    (self.jobs_done < self.eta_avail):
                info("Executions until ETA known: %s" %
                     self.jobs_until_eta_known())

            if self.platform.check_dmesg_for_changes():
                self.results.error_flag = True

            if self.reboot and len(self) > 0:
                info("Reboot in preparation for next execution")
                self._reboot()

        self.platform.save_power()

        info("Done: Results dumped to %s" % self.config.results_filename())
        err_msg = "Errors/warnings occurred -- read the log!"
        if self.results.error_flag:
            warn(err_msg)

        msg = "Session completed. Log file at: '%s'" % (self.log_path)

        if self.results.error_flag:
            msg += "\n\n%s" % err_msg

        if self.reboot:
            msg += "\n\nDon't forget to disable Krun at boot."

        util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                          bypass_limiter=True)

    def _reboot(self):
        self.results.reboots += 1
        debug("About to execute reboot: %g, expecting %g in total." %
              (self.results.reboots, self.expected_reboots))
        # Dump the results file. This may already have been done, but we
        # have changed self.nreboots, which needs to be written out.
        self.results.write_to_file()

        if self.results.reboots > self.expected_reboots:
            util.fatal(("HALTING now to prevent an infinite reboot loop: "
                        "INVARIANT num_reboots <= num_jobs violated. "
                        "Krun was about to execute reboot number: %g.\n"
                        "%g jobs have been completed, %g are left to go.") %
                       (self.results.reboots, self.jobs_done, len(self)))

        if self.dry_run:
            info("SIMULATED: reboot (restarting Krun in-place)")
            args = sys.argv
            if not self.started_by_init:
                args.extend(["--resume", "--started-by-init"])
            debug("Simulated reboot with args: " + " ".join(args))
            os.execv(args[0], args)  # replace myself
            assert False  # unreachable
        else:
            subprocess.call(self.platform.get_reboot_cmd())