def run(self, mailer, dry_run=False):
    """Runs this job (execution)"""

    entry_point = self.sched.config.VARIANTS[self.variant]
    vm_def = self.vm_info["vm_def"]
    vm_def.dry_run = dry_run

    # Set heap limit
    heap_limit_kb = self.sched.config.HEAP_LIMIT
    stack_limit_kb = self.sched.config.STACK_LIMIT

    stdout, stderr, rc = vm_def.run_exec(
        entry_point, self.benchmark, self.vm_info["n_iterations"],
        self.parameter, heap_limit_kb, stack_limit_kb)

    if not dry_run:
        try:
            iterations_results = util.check_and_parse_execution_results(
                stdout, stderr, rc)
        except util.ExecutionFailed as e:
            util.log_and_mail(mailer, error,
                              "Benchmark failure: %s" % self.key, e.message)
            iterations_results = []
    else:
        iterations_results = []

    # We print the status *after* benchmarking, so that I/O cannot be
    # committed during benchmarking. In production, we will be rebooting
    # before the next execution, so we are grand.
    info("Finished '%s(%d)' (%s variant) under '%s'" %
         (self.benchmark, self.parameter, self.variant, self.vm_name))

    return iterations_results
def test_log_and_mail():
    log_fn = lambda s: None
    log_and_mail(MockMailer(), log_fn, "subject", "msg",
                 exit=False, bypass_limiter=False)
    with pytest.raises(FatalKrunError):
        log_and_mail(MockMailer(), log_fn, "", "",
                     exit=True, bypass_limiter=False)
    assert True
def test_log_and_mail(mock_manifest, mock_mailer):
    log_fn = lambda s: None
    log_and_mail(mock_mailer, log_fn, "subject", "msg",
                 exit=False, bypass_limiter=False, manifest=mock_manifest)
    with pytest.raises(FatalKrunError):
        log_and_mail(MockMailer(), log_fn, "", "",
                     exit=True, bypass_limiter=False, manifest=mock_manifest)
    assert True
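# The two tests above construct a mailer test double. The class below is a
# minimal sketch of what such a MockMailer might look like: the class name
# comes from the tests, but the recorded `sent` list and the send()
# signature are assumptions for illustration, not Krun's actual test code.
class MockMailer(object):
    """Stand-in for Mailer: records messages instead of sending them, so
    log_and_mail() can be exercised without SMTP access."""

    def __init__(self, recipients=None, max_mails=5):
        self.recipients = recipients or []
        self.max_mails = max_mails
        self.sent = []  # (subject, body) tuples captured for assertions

    def send(self, subject, body, bypass_limiter=False, manifest=None):
        # Assumed signature: mirrors the keyword arguments that
        # log_and_mail() is seen forwarding in the snippets above.
        self.sent.append((subject, body))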
def build_schedule(self):
    """Builds a queue of process execution jobs.

    Returns two sets: non_skipped_keys, skipped_keys"""

    skipped_keys, non_skipped_keys = set(), set()

    one_exec_scheduled = False
    eta_avail_job = None
    for exec_n in xrange(self.config.N_EXECUTIONS):
        for vm_name, vm_info in self.config.VMS.items():
            for bmark, param in self.config.BENCHMARKS.items():
                for variant in vm_info["variants"]:
                    job = ExecutionJob(self, vm_name, vm_info, bmark,
                                       variant, param)
                    if not self.config.should_skip(job.key):
                        non_skipped_keys |= set([job.key])
                        if one_exec_scheduled and not eta_avail_job:
                            # The first job of the second round of
                            # executions; from this job onwards the ETA
                            # becomes known.
                            eta_avail_job = job
                            self.set_eta_avail()
                        self.add_job(job)
                    else:
                        skipped_keys |= set([job.key])
                        if not one_exec_scheduled:
                            debug("%s is in skip list. Not scheduling." %
                                  job.key)
        one_exec_scheduled = True

    self.expected_reboots = len(self)

    # Resume mode: if previous results are available, remove from the
    # schedule any jobs which have already been executed, and add their
    # results to this object, ready to be saved to a JSON file.
    if self.resume:
        self._remove_previous_execs_from_schedule()

    # Sanity check ETA estimates
    for key, exec_data in self.results.data.iteritems():
        got_len = len(self.results.eta_estimates[key])
        expect_len = len(exec_data)
        if expect_len != got_len:
            msg = "ETA estimates didn't tally with results: "
            msg += "key=%s, expect_len=%d, got_len=%d " % \
                (key, expect_len, got_len)
            msg += "data[%s]=%s; " % (key, str(self.results.data[key]))
            msg += "eta[%s]=%s" % \
                (key, str(self.results.eta_estimates[key]))
            util.log_and_mail(self.mailer, error, "Fatal Krun Error",
                              msg, bypass_limiter=True, exit=True)

    return non_skipped_keys, skipped_keys
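# The skip decision above hinges on Config.should_skip() matching a job key
# of the form "benchmark:vm:variant" against a user-supplied skip list. The
# helper below is a minimal sketch of one plausible implementation using
# shell-style wildcards; the pattern format is an assumption for
# illustration, not Krun's documented behaviour.
import fnmatch

def should_skip(key, skip_patterns):
    """Return True if `key` matches any wildcard pattern, e.g. "*:PyPy:*"
    would skip every benchmark/variant under the PyPy VM."""
    return any(fnmatch.fnmatch(key, pat) for pat in skip_patterns)

assert should_skip("fasta:PyPy:default-python", ["*:PyPy:*"])
assert not should_skip("fasta:CPython:default-python", ["*:PyPy:*"])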
def check_dmesg_for_changes(self):
    new_dmesg_time = localtime()
    new_dmesg = self._collect_dmesg_lines()

    old_fn = self._timestamp_to_str(self.last_dmesg_time)
    new_fn = self._timestamp_to_str(new_dmesg_time)
    lines = [x for x in difflib.unified_diff(
        self.last_dmesg, new_dmesg, old_fn, new_fn, lineterm="")]

    if lines:
        # dmesg changed!
        diff = "\n".join(lines)
        warn_s = "dmesg seems to have changed! Diff follows:\n" + diff
        log_and_mail(self.mailer, warn, "dmesg changed", warn_s)

        self.last_dmesg = new_dmesg
        self.last_dmesg_time = new_dmesg_time
        return True  # i.e. a (potential) error occurred
    return False
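# For reference, difflib.unified_diff() only yields lines (the ---/+++
# headers plus +/- hunks) when the two sequences differ, which is why an
# empty `lines` list above means dmesg is unchanged. A small standalone
# example with made-up dmesg content:
import difflib

old = ["usb 1-1: new high-speed USB device",
       "EXT4-fs (sda1): mounted filesystem"]
new = old + ["CPU0: Core temperature above threshold, cpu clock throttled"]

# The third and fourth arguments become the ---/+++ header labels;
# lineterm="" stops difflib appending newlines to each yielded line.
diff = list(difflib.unified_diff(old, new, "dmesg_before", "dmesg_after",
                                 lineterm=""))
assert diff  # non-empty: the kernel logged a throttling event
print("\n".join(diff))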
def run(self):
    """Benchmark execution starts here"""

    # In reboot mode, wait for the system to come up before we proceed
    if self.platform.hardware_reboots:
        debug("Waiting %s seconds for the system to come up." %
              str(STARTUP_WAIT_SECONDS))
        self.platform.sleep(STARTUP_WAIT_SECONDS)

    # Important that the dmesg is collected after the above startup wait.
    # Otherwise we get spurious dmesg changes.
    self.platform.collect_starting_dmesg()

    assert self.manifest.num_execs_left > 0
    self.platform.wait_for_temperature_sensors()

    bench, vm, variant = self.manifest.next_exec_key.split(":")
    key_pexec_idx = self.manifest.next_exec_key_index()
    job = ExecutionJob(self, vm, self.config.VMS[vm], bench, variant,
                       self.config.BENCHMARKS[bench], key_pexec_idx)

    # Default to error state. This is the value the finally block will see
    # if an exception is raised inside the try block, otherwise it is
    # re-assigned based on the result of running the benchmark.
    flag = 'E'

    # Run the pre-exec commands, the benchmark and the post-exec commands.
    # These are wrapped in a try/except, so that the post-exec commands
    # are always executed, even if an exception has occurred. We only
    # reboot /after/ the post-exec commands have completed.
    results = None
    try:
        # Run the user's pre-process-execution commands. We can't put an
        # ETA estimate in the environment for the pre-commands as we have
        # not (and should not) load the results file into memory yet.
        #
        # It might seem tempting to move this outside the try block, to
        # ensure that post-hooks are only run if pre-hooks ran. We don't,
        # thus avoiding the case where only *part* of the pre-hooks run,
        # but the post-hooks then don't run.
        util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,)

        # We collect rough execution times separate from real results. The
        # reason for this is that, even if a benchmark crashes it takes
        # time and we need to account for this when making estimates. A
        # crashing benchmark will give an empty list of iteration times,
        # meaning we can't use 'raw_exec_result' below for estimates.
        exec_start_time = time.time()
        measurements, instr_data, flag = job.run(self.mailer, self.dry_run)
        exec_end_time = time.time()

        # Only now is it OK to load the results file into memory.
        Results.ok_to_instantiate = True
        results = Results(self.config, self.platform,
                          results_file=self.config.results_filename())

        # Bail early if the process execution needs to be re-run.
        if flag == "O":
            util.run_shell_cmd_list(
                self.config.POST_EXECUTION_CMDS,
                extra_env=self._make_post_cmd_env(results)
            )
            info("Rebooting to re-run previous process execution")
            util.reboot(self.manifest, self.platform, update_count=False)
            # reboot() does not return
            raise RuntimeError("reached unreachable code!")

        # Store new result.
        results.append_exec_measurements(job.key, measurements, flag)

        # Store instrumentation data in a separate file
        if job.vm_info["vm_def"].instrument:
            key_exec_num = self.manifest.completed_exec_counts[job.key]
            util.dump_instr_json(job.key, key_exec_num, self.config,
                                 instr_data)

        eta_info = exec_end_time - exec_start_time
        if self.platform.hardware_reboots:
            # Add time taken to wait for system to come up if we are in
            # hardware-reboot mode.
            eta_info += STARTUP_WAIT_SECONDS
        results.eta_estimates[job.key].append(eta_info)
        self.manifest.update(flag)
    except Exception:
        raise
    finally:
        # Run the user's post-process-execution commands with updated
        # ETA estimates. Important that this happens *after* dumping
        # results, as the user is likely copying intermediate results to
        # another host.

        # _make_post_cmd_env() needs the results to make an ETA. If an
        # exception occurred in the above try block, there's a chance
        # that they have not been loaded.
        if results is None:
            Results.ok_to_instantiate = True
            results = Results(self.config, self.platform,
                              results_file=self.config.results_filename())

        # If errors occurred, set the error flag in the results file
        if self.platform.check_dmesg_for_changes(self.manifest) or \
                flag == 'E':
            results.error_flag = True

        results.write_to_file()
        util.run_shell_cmd_list(
            self.config.POST_EXECUTION_CMDS,
            extra_env=self._make_post_cmd_env(results)
        )

    tfmt = self.get_overall_time_estimate_formatter(results)

    if self.manifest.eta_avail_idx == self.manifest.next_exec_idx:
        # We just found out roughly how long the session has left, mail out.
        msg = "ETA for current session now known: %s" % tfmt.finish_str
        util.log_and_mail(self.mailer, debug,
                          "ETA for Current Session Available",
                          msg, bypass_limiter=True)

    info("{:<25s}: {} ({} from now)".format(
        "Estimated completion (whole session)", tfmt.finish_str,
        tfmt.delta_str))

    info("%d executions left in scheduler queue" %
         self.manifest.num_execs_left)

    if self.manifest.num_execs_left > 0 and \
            self.manifest.eta_avail_idx > self.manifest.next_exec_idx:
        info("Executions until ETA known: %s" %
             (self.manifest.eta_avail_idx - self.manifest.next_exec_idx))

    # Although it would have been nice to have checked this prior to
    # running the execution, it depends on the results file, which we
    # should not load prior to the process execution.
    util.check_audit_unchanged(results, self.platform)

    assert self.manifest.num_execs_left >= 0
    if self.manifest.num_execs_left > 0:
        # print info about the next job
        benchmark, vm_name, variant = \
            self.manifest.next_exec_key.split(":")
        info("Next execution is '%s(%d)' (%s variant) under '%s'" %
             (benchmark, self.config.BENCHMARKS[benchmark], variant,
              vm_name))

        tfmt = self.get_exec_estimate_time_formatter(job.key, results)
        info("{:<35s}: {} ({} from now)".format(
            "Estimated completion (next execution)",
            tfmt.finish_str,
            tfmt.delta_str))

        info("Reboot in preparation for next execution")
        util.reboot(self.manifest, self.platform)
    elif self.manifest.num_execs_left == 0:
        self.platform.save_power()
        if self.config.ENABLE_PINNING:
            self.platform.clear_cpu_pinning()

        info("Done: Results dumped to %s" % self.config.results_filename())
        err_msg = "Errors/warnings occurred -- read the log!"
        if results.error_flag:
            warn(err_msg)

        msg = "Session completed. Log file at: '%s'" % (self.log_path)
        if results.error_flag:
            msg += "\n\n%s" % err_msg

        msg += "\n\nDon't forget to disable Krun at boot."

        util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                          bypass_limiter=True)
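# The post-execution hooks above receive their context via extra_env, built
# by _make_post_cmd_env(). The helper below sketches the kind of mapping such
# a function might return; the variable names (KRUN_RESULTS_FILE and friends)
# and the log_filename(resume=True) call are illustrative assumptions, not a
# documented interface of Krun.
def make_post_cmd_env(config, manifest_path, eta_finish_str):
    """Environment variables handed to the user's post-execution shell
    commands (sketch only)."""
    return {
        "KRUN_RESULTS_FILE": config.results_filename(),
        "KRUN_LOG_FILE": config.log_filename(resume=True),
        "KRUN_MANIFEST_FILE": manifest_path,
        # Human-readable estimate of when the whole session will finish.
        "KRUN_ETA_VALUE": eta_finish_str,
    }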
def run(self, mailer, dry_run=False):
    """Runs this job (execution)"""

    flag = None

    entry_point = self.sched.config.VARIANTS[self.variant]
    vm_def = self.vm_info["vm_def"]
    vm_def.dry_run = dry_run

    # Set heap limit
    heap_limit_kb = self.sched.config.HEAP_LIMIT
    stack_limit_kb = self.sched.config.STACK_LIMIT

    in_proc_iters = self.vm_info["n_iterations"]

    if not dry_run:
        self.sched.platform.collect_starting_throttle_counts()

    stdout, stderr, rc, envlog_filename, timed_out = \
        vm_def.run_exec(entry_point, in_proc_iters, self.parameter,
                        heap_limit_kb, stack_limit_kb, self.key,
                        self.key_pexec_idx)

    if timed_out:
        measurements = self.empty_measurements
        instr_data = {}
        flag = "T"
    elif not dry_run:
        try:
            self.sched.platform.check_throttle_counts(self.sched.manifest)
            measurements = util.check_and_parse_execution_results(
                stdout, stderr, rc, self.sched.config, self.key,
                instrument=vm_def.instrument)
            flag = "C"
        except util.RerunExecution as e:
            subject = ("Benchmark needs to be re-run: %s (exec_idx=%s)" %
                       (self.key, self.sched.manifest.next_exec_idx))
            util.log_and_mail(mailer, warn, subject, e.message,
                              manifest=self.sched.manifest,
                              bypass_limiter=True)
            measurements = self.empty_measurements
            flag = "O"  # i.e. still outstanding
        except util.ExecutionFailed as e:
            util.log_and_mail(mailer, error,
                              "Benchmark failure: %s" % self.key, e.message,
                              manifest=self.sched.manifest)
            measurements = self.empty_measurements
            flag = "E"

        # Collect instrumentation data
        if vm_def.instrument and flag == "C":
            instr_data = vm_def.get_instr_data()
            for k, v in instr_data.iteritems():
                assert len(instr_data[k]) == in_proc_iters
        else:
            # The benchmark either failed, needs to be re-run, or had
            # instrumentation turned off.
            instr_data = {}
    else:
        measurements = self.empty_measurements
        instr_data = {}
        flag = "C"

    # We print the status *after* benchmarking, so that I/O cannot be
    # committed during benchmarking. In production, we will be rebooting
    # before the next execution, so we are grand.
    info("Finished '%s(%d)' (%s variant) under '%s'" %
         (self.benchmark, self.parameter, self.variant, self.vm_name))

    # Move the environment log out of /tmp.
    #
    # We don't do this for re-runs (O) as the log for the re-run pexec is
    # the one we want.
    #
    # We don't do this for timeouts (T) because the wrapper script is
    # killed upon timeout, and thus doesn't get a chance to log the
    # environment.
    if not dry_run and flag not in ("O", "T"):
        key_exec_num = self.sched.manifest.completed_exec_counts[self.key]
        util.stash_envlog(envlog_filename, self.sched.config,
                          self.sched.platform, self.key, key_exec_num)

    assert flag is not None
    return measurements, instr_data, flag
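# The single-character flags returned above drive the scheduler shown
# earlier: "O" triggers a reboot-and-rerun, "E" marks the results file as
# containing errors, and so on. A small reference mapping, summarising how
# the flags are used in these snippets (the prose descriptions are ours):
EXEC_FLAGS = {
    "C": "completed -- measurements parsed and ready to be stored",
    "E": "error -- the benchmark failed; empty measurements are recorded",
    "T": "timeout -- the wrapper was killed, so no envlog is available",
    "O": "outstanding -- the process execution must be re-run (reboot "
         "without advancing the manifest)",
}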
def inner_main(mailer, on_first_invocation, config, args):
    out_file = config.results_filename()
    out_file_exists = os.path.exists(out_file)

    instr_dir = util.get_instr_json_dir(config)
    instr_dir_exists = os.path.exists(instr_dir)

    envlog_dir = util.get_envlog_dir(config)
    envlog_dir_exists = os.path.exists(envlog_dir)

    if out_file_exists and not os.path.isfile(out_file):
        util.fatal("Output file '%s' exists but is not a regular file" %
                   out_file)

    if out_file_exists and on_first_invocation:
        util.fatal("Output results file '%s' already exists. "
                   "Move the file away before running Krun." % out_file)

    if instr_dir_exists and on_first_invocation:
        util.fatal("Instrumentation dir '%s' exists." % instr_dir)

    if envlog_dir_exists and on_first_invocation:
        util.fatal("Env log dir '%s' exists." % envlog_dir)

    if not out_file_exists and not on_first_invocation:
        util.fatal("No results file to resume. Expected '%s'" % out_file)

    # Initialise platform instance and assign to VM defs.
    # This needs to be done early, so VM sanity checks can run.
    platform = detect_platform(mailer, config)

    platform.quick_mode = args.quick
    platform.no_user_change = args.no_user_change
    platform.no_tickless_check = args.no_tickless_check
    platform.no_pstate_check = args.no_pstate_check
    platform.hardware_reboots = args.hardware_reboots

    # Create the instrumentation directory if required
    if on_first_invocation:
        # We only want to make a dir if >=1 VM is in instrumentation mode.
        for vm in config.VMS.itervalues():
            if vm['vm_def'].instrument:
                util.make_instr_dir(config)
                break

    debug("Checking platform preliminaries")
    platform.check_preliminaries()

    # Make a bit of noise if this is a virtualised environment
    if platform.is_virtual():
        warn("This appears to be a virtualised host. The results will be "
             "flawed. Use bare-metal for reliable results!")

    platform.collect_audit()

    # At this point the config file is OK, and on-disk state is consistent,
    # so let's daemonise (if requested).
    if args.daemonise:
        util.daemonise()

    if not on_first_invocation:
        # output file must exist, due to check above
        assert out_file_exists

        debug("Using pre-recorded initial temperature readings")
        manifest = ManifestManager(config, platform)

        platform_temps = {}
        for sensor, tup in manifest.starting_temperatures.iteritems():
            platform_temps[sensor] = tup[1]
        platform.starting_temperatures = platform_temps
    else:
        manifest = ManifestManager(config, platform, new_file=True)

        if manifest.num_execs_left == 0:
            # No executions, or all skipped
            fatal("Empty schedule!")

        try:
            info(("Wait %s secs to allow system to cool prior to "
                  "collecting initial temperature readings") %
                 config.TEMP_READ_PAUSE)

            # This part is wrapped in hooks, so that if daemons or
            # networking are taken down for process executions, then the
            # initial temperature reading gets the same treatment.
            util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS,)
            platform.sleep(config.TEMP_READ_PAUSE)

            debug("Taking fresh initial temperature readings")
            platform.starting_temperatures = \
                platform.take_temperature_readings()
            manifest.set_starting_temperatures(
                platform.starting_temperatures)

            # Write out an empty results file. After the initial reboot
            # Krun will expect this to exist.
            Results.ok_to_instantiate = True
            results = Results(config, platform)
            results.write_to_file()
        except:
            raise
        finally:
            util.run_shell_cmd_list(config.POST_EXECUTION_CMDS,)

        log_path = config.log_filename(resume=False)
        util.log_and_mail(mailer, debug, "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" % log_path,
                          bypass_limiter=True)
        util.reboot(manifest, platform)

    # Assign platform to VM defs -- needs to happen early for sanity checks
    util.assign_platform(config, platform)

    sanity_checks(config, platform)

    # Build job queue -- each job is an execution
    sched = ExecutionScheduler(config, mailer, platform,
                               dry_run=args.dry_run)
    sched.run()
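# inner_main() optionally detaches via util.daemonise(). The function below
# sketches the classic Unix double-fork technique that such a helper
# typically uses; it illustrates the general approach and is not claimed to
# be Krun's exact implementation.
import os
import sys

def daemonise():
    """Detach from the controlling terminal using a double fork."""
    if os.fork() > 0:
        sys.exit(0)           # first parent exits
    os.setsid()               # become session leader, drop the terminal
    if os.fork() > 0:
        sys.exit(0)           # first child exits; the grandchild carries on
    os.chdir("/")
    os.umask(0)
    # Redirect the standard streams to /dev/null so stray writes can't
    # block on a closed terminal.
    devnull = os.open(os.devnull, os.O_RDWR)
    for fd in (0, 1, 2):
        os.dup2(devnull, fd)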
def main(parser):
    args = parser.parse_args()

    if args.dump is not None:
        if not args.filename.endswith(".json.bz2"):
            usage(parser)
        else:
            Results.ok_to_instantiate = True
            results = Results(None, None, results_file=args.filename)
            text = results.dump(args.dump)

            # String data read in from JSON are unicode objects. This
            # matters for us as some data in the audit includes unicode
            # characters. If it does, a simple print no longer suffices if
            # the system locale is (e.g.) ASCII. In this case print will
            # raise an exception. The correct thing to do is to encode()
            # the unicode to the system locale.
            print(text.encode(locale.getpreferredencoding()))
            sys.exit(0)

    if not args.filename.endswith(".krun"):
        usage(parser)

    try:
        if os.stat(args.filename).st_size <= 0:
            util.fatal('Krun configuration file %s is empty.' %
                       args.filename)
    except OSError:
        util.fatal('Krun configuration file %s does not exist.' %
                   args.filename)

    config = Config(args.filename)

    if args.info:
        # Info mode doesn't run the experiment.
        # Just prints some metrics and exits.
        util.print_session_info(config)
        return

    manifest_filename = ManifestManager.get_filename(config)
    on_first_invocation = not (os.path.isfile(manifest_filename) and
                               os.stat(manifest_filename).st_size > 0)

    log_file = config.log_filename()
    if on_first_invocation and os.path.exists(log_file):
        util.fatal("Log file '%s' already exists. "
                   "Move the file away before running Krun." % log_file)

    attach_log_file(config, not on_first_invocation)
    debug("Krun invoked with arguments: %s" % sys.argv)

    mail_recipients = config.MAIL_TO
    if type(mail_recipients) is not list:
        util.fatal("MAIL_TO config should be a list")

    mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS)

    try:
        inner_main(mailer, on_first_invocation, config, args)
    except Exception as exn:
        error_info = sys.exc_info()
        subject = "Fatal Krun Exception"
        lines = ["Fatal Krun error: %s\n" % str(error_info[1])]
        for frame in traceback.format_tb(error_info[2]):
            lines.append(frame)
        msg = "".join(lines)
        util.log_and_mail(mailer, debug, subject, msg, bypass_limiter=True)
        raise exn
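# Besides the --dump path above, a .json.bz2 results file can be inspected
# directly with the standard library. A minimal sketch, assuming the archive
# holds a single JSON document; the key names in the usage comment are
# illustrative, as the exact schema is whatever Results.write_to_file()
# produces.
import bz2
import json

def load_results(path):
    """Read a Krun-style .json.bz2 results file into a plain dict."""
    with bz2.BZ2File(path, "r") as fh:
        return json.loads(fh.read())

# Usage sketch:
#   results = load_results("example_results.json.bz2")
#   print(sorted(results.keys()))  # e.g. 'data', 'audit', ... (assumed)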
def run(self):
    """Benchmark execution starts here"""

    jobs_left = len(self)
    if jobs_left == 0:
        debug("Krun started with an empty queue of jobs")

    if not self.started_by_init:
        util.log_and_mail(self.mailer, debug, "Benchmarking started",
                          "Benchmarking started.\nLogging to %s" %
                          self.log_path, bypass_limiter=True)

    if self.reboot and not self.started_by_init:
        # Reboot before first benchmark (dumps results file).
        info("Reboot prior to first execution")
        self._reboot()

    if self.reboot and self.started_by_init and jobs_left > 0:
        debug("Waiting %s seconds for the system to come up." %
              str(STARTUP_WAIT_SECONDS))
        if self.dry_run:
            info("SIMULATED: time.sleep (would have waited %s seconds)." %
                 STARTUP_WAIT_SECONDS)
        else:
            time.sleep(STARTUP_WAIT_SECONDS)

    # Important that the dmesg is collected after the above startup wait.
    # Otherwise we get spurious dmesg changes.
    self.platform.collect_starting_dmesg()

    while True:
        self.platform.wait_for_temperature_sensors()

        jobs_left = len(self)
        if jobs_left == 0:
            break

        # Run the user's pre-process-execution commands
        util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,
                                extra_env=self._make_pre_post_cmd_env())

        job = self.next_job()

        # We collect rough execution times separate from real results. The
        # reason for this is that, even if a benchmark crashes it takes
        # time and we need to account for this when making estimates. A
        # crashing benchmark will give an empty list of iteration times,
        # meaning we can't use 'raw_exec_result' below for estimates.
        exec_start_time = time.time()
        raw_exec_result = job.run(self.mailer, self.dry_run)
        exec_end_time = time.time()

        exec_result = util.format_raw_exec_results(raw_exec_result)

        if not exec_result and not self.dry_run:
            self.results.error_flag = True

        self.results.data[job.key].append(exec_result)

        eta_info = exec_end_time - exec_start_time
        if self.reboot:
            # Add time taken to wait for system to come up if we are in
            # reboot mode.
            eta_info += STARTUP_WAIT_SECONDS
        self.add_eta_info(job.key, eta_info)

        info("%d executions left in scheduler queue" % (jobs_left - 1))

        # We dump the json after each experiment so we can monitor the
        # json file mid-run. It is overwritten each time.
        self.results.write_to_file()
        self.jobs_done += 1

        # Run the user's post-process-execution commands with updated
        # ETA estimates. Important that this happens *after* dumping
        # results, as the user is likely copying intermediate results to
        # another host.
        util.run_shell_cmd_list(self.config.POST_EXECUTION_CMDS,
                                extra_env=self._make_pre_post_cmd_env())

        tfmt = self.get_overall_time_estimate_formatter()

        if self.eta_avail == self.jobs_done:
            # We just found out roughly how long the session has left,
            # mail out.
            msg = "ETA for current session now known: %s" % tfmt.finish_str
            util.log_and_mail(self.mailer, debug,
                              "ETA for Current Session Available",
                              msg, bypass_limiter=True)

        info("{:<25s}: {} ({} from now)".format(
            "Estimated completion", tfmt.finish_str, tfmt.delta_str))

        try:
            job = self.next_job(peek=True)
        except ScheduleEmpty:
            pass  # no next job
        else:
            # print info about the next job
            info("Next execution is '%s(%d)' (%s variant) under '%s'" %
                 (job.benchmark, job.parameter, job.variant, job.vm_name))

            tfmt = self.get_exec_estimate_time_formatter(job.key)
            info("{:<35s}: {} ({} from now)".format(
                "Estimated completion (next execution)",
                tfmt.finish_str,
                tfmt.delta_str))

        if (self.eta_avail is not None) and \
                (self.jobs_done < self.eta_avail):
            info("Executions until ETA known: %s" %
                 self.jobs_until_eta_known())

        if self.platform.check_dmesg_for_changes():
            self.results.error_flag = True

        if self.reboot and len(self) > 0:
            info("Reboot in preparation for next execution")
            self._reboot()

    self.platform.save_power()

    info("Done: Results dumped to %s" % self.config.results_filename())
    err_msg = "Errors/warnings occurred -- read the log!"
    if self.results.error_flag:
        warn(err_msg)

    msg = "Session completed. Log file at: '%s'" % (self.log_path)
    if self.results.error_flag:
        msg += "\n\n%s" % err_msg

    if self.reboot:
        msg += "\n\nDon't forget to disable Krun at boot."

    util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg,
                      bypass_limiter=True)