def test_audit_differs0001(self, monkeypatch, mock_platform, caplog, no_results_instantiation_check): """Check that if the audit differs, a crash occurs""" make_reboot_raise(monkeypatch) no_envlogs(monkeypatch) config = Config(os.path.join(TEST_DIR, "example.krun")) krun.util.assign_platform(config, mock_platform) emulate_first_reboot(mock_platform, config) results_path = config.results_filename() # mutate the audit, so it won't match later results = Results(config, mock_platform, results_file=results_path) results.audit._audit["wibble"] = "wut" results.write_to_file() sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform, dry_run=True) with pytest.raises(krun.util.FatalKrunError): sched.run() expect = "This is only valid if the machine you are using is identical" assert expect in caplog.text os.unlink(sched.manifest.path) os.unlink(results_path)
def __init__(self, config, mailer, platform, resume=False, reboot=False, dry_run=False, started_by_init=False): self.mailer = mailer self.config = config self.work_deque = deque() self.eta_avail = 0 self.jobs_done = 0 self.platform = platform self.resume = resume self.reboot = reboot self.dry_run = dry_run self.started_by_init = started_by_init if resume: self.results = Results(self.config, platform, results_file=self.config.results_filename()) else: self.results = Results(self.config, platform) self.log_path = self.config.log_filename(self.resume)
def emulate_first_reboot(platform, config): platform.starting_temperatures = platform.take_temperature_readings() manifest = ManifestManager(config, platform, new_file=True) manifest.set_starting_temperatures(platform.starting_temperatures) results = Results(config, platform) results.write_to_file() return manifest
def test_audit_differs0001(self, monkeypatch, mock_platform, caplog): """Check that if the audit differs, a crash occurs""" make_reboot_raise(monkeypatch) no_envlogs(monkeypatch) config = Config(os.path.join(TEST_DIR, "example.krun")) krun.util.assign_platform(config, mock_platform) emulate_first_reboot(mock_platform, config, monkeypatch) results_path = config.results_filename() # mutate the audit, so it won't match later results = Results(config, mock_platform, results_file=results_path) results.audit._audit["wibble"] = "wut" results.write_to_file() sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform, dry_run=True) with pytest.raises(krun.util.FatalKrunError): sched.run() expect = "This is only valid if the machine you are using is identical" assert expect in caplog.text() os.unlink(sched.manifest.path) os.unlink(results_path)
def main(parser): args = parser.parse_args() if args.dump is not None: if not args.filename.endswith(".json.bz2"): usage(parser) else: results = Results(None, None, results_file=args.filename) if args.dump == "config" or "audit": text = unicode(results.__getattribute__(args.dump)) else: text = json.dumps(results.__getattribute__(args.dump), sort_keys=True, indent=2) # String data read in from JSON are unicode objects. This matters # for us as some data in the audit includes unicode characters. # If it does, a simple print no longer suffices if the system # locale is (e.g.) ASCII. In this case print will raise an # exception. The correct thing to do is to encode() the unicode to # the system locale. print(text.encode(locale.getpreferredencoding())) sys.exit(0) if not args.filename.endswith(".krun"): usage(parser) try: if os.stat(args.filename).st_size <= 0: util.fatal('Krun configuration file %s is empty.' % args.filename) except OSError: util.fatal('Krun configuration file %s does not exist.' % args.filename) config = Config(args.filename) if args.info: # Info mode doesn't run the experiment. # Just prints some metrics and exits. util.print_session_info(config) return if args.strip_results: util.strip_results(config, args.strip_results) return attach_log_file(config, args.resume) debug("Krun invoked with arguments: %s" % sys.argv) mail_recipients = config.MAIL_TO if type(mail_recipients) is not list: util.fatal("MAIL_TO config should be a list") mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS) try: inner_main(mailer, config, args) except util.FatalKrunError as e: subject = "Fatal Krun Exception" mailer.send(subject, e.args[0], bypass_limiter=True) util.run_shell_cmd_list(config.POST_EXECUTION_CMDS) raise e
def test_eq(self, mock_platform): results = Results(None, None, results_file="krun/tests/quick_results.json.bz2") assert results == results assert not results == None assert not results == \ Results(Config("krun/tests/example.krun"), mock_platform)
def emulate_first_reboot(platform, config, monkeypatch): no_results_instantiation_check(monkeypatch) platform.starting_temperatures = platform.take_temperature_readings() manifest = ManifestManager(config, platform, new_file=True) manifest.set_starting_temperatures(platform.starting_temperatures) results = Results(config, platform) results.write_to_file() return manifest
def test_dump_config(self, no_results_instantiation_check): """Simulates krun.py --dump-config RESULTS_FILE.json.bz2 """ res_path = os.path.join(TEST_DIR, "quick_results.json.bz2") conf_path = os.path.join(TEST_DIR, "quick.krun") results = Results(None, None, results_file=res_path) with open(conf_path) as fp: config = fp.read() assert config == results.dump("config")
def strip_results(config, key_spec): from krun.platform import detect_platform from krun.results import Results platform = detect_platform(None) results = Results(config, platform, results_file=config.results_filename()) n_removed = results.strip_results(key_spec) if n_removed > 0: results.write_to_file() info("Removed %d result keys" % n_removed)
def fake_results(mock_platform, no_results_instantiation_check): results = Results(None, mock_platform) mock_platform.num_cpus = 2 mock_platform.num_per_core_measurements = 2 results.eta_estimates = {"bench:vm:variant": [1., 1.]} results.wallclock_times = {"bench:vm:variant": [[2., 2.], [2., 2.]]} results.core_cycle_counts = { "bench:vm:variant": [[[3., 3.], [ 3., 3., ]], [[3., 3.], [3., 3.]]] } results.aperf_counts = { "bench:vm:variant": [[[4., 4.], [ 4., 4., ]], [[4., 4.], [4., 4.]]] } results.mperf_counts = { "bench:vm:variant": [[[5., 5.], [ 5., 5., ]], [[5., 5.], [5., 5.]]] } results.pexec_flags = {"bench:vm:variant": ["C", "T"]} return results
def test_run_schedule0005(self, mock_platform, monkeypatch, no_results_instantiation_check): def dummy_execjob_run(self, mailer, dryrun=False): return self.empty_measurements, {}, "E" # pretend jobs fail monkeypatch.setattr(ExecutionJob, 'run', dummy_execjob_run) config = Config(os.path.join(TEST_DIR, "example.krun")) n_reboots, sched = run_with_captured_reboots(config, mock_platform, monkeypatch) assert n_reboots == 8 # 2 benchmarks, 2 vms, 2 execs results = Results(config, mock_platform, results_file=config.results_filename()) type_check_results(results) assert len(results.wallclock_times) == 4 # 2 benchmarks, 2 vms for key, execs in results.wallclock_times.iteritems(): assert len(execs) == 2 for _exec in execs: assert len(_exec) == 0 # due to error os.unlink(config.results_filename()) os.unlink(sched.manifest.path)
def test_write_results_to_disk(self, mock_platform): config = Config("krun/tests/example.krun") out_file = "krun/tests/example_results.json.bz2" results0 = Results(config, mock_platform) results0.audit = dict() results0.starting_temperatures = [4355, 9879] results0.data = {u"dummy:Java:default-java": [[1.000726]]} results0.eta_estimates = {u"dummy:Java:default-java": [1.1]} results0.reboots = 5 results0.error_flag = False results0.write_to_file() results1 = Results(config, None, results_file=out_file) assert results0 == results1 # Clean-up generated file. os.unlink(out_file)
def to_strip(): from krun.platform import detect_platform from krun.results import Results path = os.path.join(TEST_DIR, "quick.krun") config = Config(path) platform = detect_platform(None) results = Results(config, platform, results_file=config.results_filename()) return results
def test_error_flag_persists0002(self, monkeypatch, mock_platform, no_results_instantiation_check): """Check a changed dmesg will correctly set the error flag""" make_reboot_raise(monkeypatch) no_envlogs(monkeypatch) # pretend dmesg changes def dummy_check_for_dmesg_changes(self): return True monkeypatch.setattr(mock_platform, 'check_dmesg_for_changes', dummy_check_for_dmesg_changes) config = Config(os.path.join(TEST_DIR, "example.krun")) krun.util.assign_platform(config, mock_platform) emulate_first_reboot(mock_platform, config) results_path = config.results_filename() # To start, the error flag is not set results = Results(config, mock_platform, results_file=results_path) assert not results.error_flag # run an execution where the dmesg changes sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform, dry_run=True) try: sched.run() except _TestReboot: pass else: assert False # reload results and check the error flag is now set results = Results(config, mock_platform, results_file=results_path) assert results.error_flag os.unlink(sched.manifest.path) os.unlink(results_path)
def test_error_flag_persists0001(self, monkeypatch, mock_platform, no_results_instantiation_check): """Check a failing exec will correctly set the error flag""" make_reboot_raise(monkeypatch) no_envlogs(monkeypatch) # pretend exec fails def dummy_job_run(self, mailer, dry): measurements = self.make_empty_measurement() return measurements, None, 'E' # measurements, instr_data, flag monkeypatch.setattr(ExecutionJob, 'run', dummy_job_run) config = Config(os.path.join(TEST_DIR, "example.krun")) krun.util.assign_platform(config, mock_platform) emulate_first_reboot(mock_platform, config) results_path = config.results_filename() # To start, the error flag is not set results = Results(config, mock_platform, results_file=results_path) assert not results.error_flag # run a (failing) execution, which will dump the results file sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform, dry_run=True) try: sched.run() except _TestReboot: pass else: assert False # reload results and check the error flag is now set results = Results(config, mock_platform, results_file=results_path) assert results.error_flag os.unlink(sched.manifest.path) os.unlink(results_path)
def test_run_schedule0002(self, mock_platform, monkeypatch): config = Config(os.path.join(TEST_DIR, "example.krun")) n_reboots, sched = run_with_captured_reboots(config, mock_platform, monkeypatch) assert n_reboots == 8 # 2 benchmarks, 2 vms, 2 execs results = Results(config, mock_platform, results_file=config.results_filename()) type_check_results(results) assert len(results.wallclock_times) == 4 # 2 benchmarks, 2 vms for key, execs in results.wallclock_times.iteritems(): assert len(execs) == 2 for _exec in execs: assert len(_exec) == 0 # due to dry run os.unlink(config.results_filename()) os.unlink(sched.manifest.path)
def test_run_schedule(self, monkeypatch, mock_platform): json_file = os.path.join(TEST_DIR, "example_results.json.bz2") def dummy_shell_cmd(text): pass monkeypatch.setattr(subprocess, 'call', dummy_shell_cmd) monkeypatch.setattr(krun.util, 'run_shell_cmd', dummy_shell_cmd) config = Config(os.path.join(TEST_DIR, "example.krun")) krun.util.assign_platform(config, mock_platform) sched = ExecutionScheduler(config, mock_platform.mailer, mock_platform, resume=False, reboot=False, dry_run=True, started_by_init=False) sched.build_schedule() assert len(sched) == 8 sched.run() assert len(sched) == 0 results = Results(Config(os.path.join(TEST_DIR, "example.krun")), mock_platform, results_file=json_file) for k, execs in results.data.iteritems(): assert type(execs) is list for one_exec in execs: assert type(one_exec) is list assert all([type(x) is float for x in one_exec]) for k, execs in results.eta_estimates.iteritems(): assert type(execs) is list assert all([type(x) is float for x in execs]) assert type(results.starting_temperatures) is dict assert type(results.reboots) is int assert type(results.audit) is type(Audit(dict())) assert type(results.config) is type(Config()) assert type(results.error_flag) is bool os.unlink(json_file)
def fake_results(mock_platform, no_results_instantiation_check): results = Results(None, mock_platform) mock_platform.num_cpus = 2 mock_platform.num_per_core_measurements = 2 results.eta_estimates = {"bench:vm:variant": [1., 1.]} results.wallclock_times = {"bench:vm:variant": [[2., 2.], [2., 2.]]} results.core_cycle_counts = {"bench:vm:variant": [[[3., 3.], [3., 3.,]], [[3., 3.], [3., 3.]]]} results.aperf_counts = {"bench:vm:variant": [[[4., 4.], [4., 4.,]], [[4., 4.], [4., 4.]]]} results.mperf_counts = {"bench:vm:variant": [[[5., 5.], [5., 5.,]], [[5., 5.], [5., 5.]]]} return results
def test_run_schedule0001(self, monkeypatch, mock_platform): config = Config(os.path.join(TEST_DIR, "one_exec.krun")) n_reboots, sched = run_with_captured_reboots(config, mock_platform, monkeypatch) assert sched.manifest.total_num_execs == 1 assert sched.manifest.num_execs_left == 0 assert n_reboots == 1 results = Results(config, mock_platform, results_file=config.results_filename()) type_check_results(results) assert len(results.wallclock_times) == 1 # 1 benchmark, 1 vm for key, execs in results.wallclock_times.iteritems(): assert len(execs) == 1 for _exec in execs: assert len(_exec) == 0 # due to dry run os.unlink(config.results_filename()) os.unlink(sched.manifest.path)
def test_read_results_from_disk(self): config = Config("krun/tests/quick.krun") results = Results(None, None, results_file="krun/tests/quick_results.json.bz2") expected = { u'nbody:CPython:default-python': [[0.022256]], u'dummy:CPython:default-python': [[1.005115]], u'nbody:Java:default-java': [[26.002632]], u'dummy:Java:default-java': [[1.000941]] } assert results.config == config assert results.audit[u'uname'] == u'Linux' assert results.audit[u'debian_version'] == u'jessie/sid' assert results.data == expected assert results.starting_temperatures == {"x": 3333, "y": 4444} assert results.eta_estimates == \ { u'nbody:CPython:default-python': [0.022256], u'dummy:CPython:default-python': [1.005115], u'nbody:Java:default-java': [26.002632], u'dummy:Java:default-java': [1.000941] }
def test_run_schedule0004(self, mock_platform, monkeypatch, no_results_instantiation_check): config = Config(os.path.join(TEST_DIR, "example_skip_1vm.krun")) n_reboots, sched = run_with_captured_reboots(config, mock_platform, monkeypatch) assert n_reboots == 4 # 2 benchmarks, 2 vms, 2 execs, one VM skipped results = Results(config, mock_platform, results_file=config.results_filename()) type_check_results(results) assert len(results.wallclock_times) == 4 # 2 benchmarks, 2 vms for key, execs in results.wallclock_times.iteritems(): if "CPython" in key: assert len(execs) == 0 else: assert len(execs) == 2 for _exec in execs: assert len(_exec) == 0 # due to dry run os.unlink(config.results_filename()) os.unlink(sched.manifest.path)
def test_write_results_to_disk(self, mock_platform, no_results_instantiation_check): config = Config("krun/tests/example.krun") mock_platform.num_cpus = 4 mock_platform.num_per_core_measurements = mock_platform.num_cpus out_file = "krun/tests/example_results.json.bz2" results0 = Results(config, mock_platform) results0.audit = dict() results0.starting_temperatures = [4355, 9879] results0.wallclock_times = {u"dummy:Java:default-java": [[1.000726]]} results0.eta_estimates = {u"dummy:Java:default-java": [1.1]} results0.core_cycle_counts = { u"dummy:Java:default-java": [[[2], [3], [4], [5]]] } results0.aperf_counts = { u"dummy:Java:default-java": [[[3], [4], [5], [6]]] } results0.mperf_counts = { u"dummy:Java:default-java": [[[4], [5], [6], [7]]] } results0.reboots = 5 results0.error_flag = False results0.write_to_file() results1 = Results(config, None, results_file=out_file) assert results0 == results1 # Clean-up generated file. os.unlink(out_file)
def main(parser): args = parser.parse_args() if args.dump is not None: if not args.filename.endswith(".json.bz2"): usage(parser) else: Results.ok_to_instantiate = True results = Results(None, None, results_file=args.filename) text = results.dump(args.dump) # String data read in from JSON are unicode objects. This matters # for us as some data in the audit includes unicode characters. # If it does, a simple print no longer suffices if the system # locale is (e.g.) ASCII. In this case print will raise an # exception. The correct thing to do is to encode() the unicode to # the system locale. print(text.encode(locale.getpreferredencoding())) sys.exit(0) if not args.filename.endswith(".krun"): usage(parser) try: if os.stat(args.filename).st_size <= 0: util.fatal('Krun configuration file %s is empty.' % args.filename) except OSError: util.fatal('Krun configuration file %s does not exist.' % args.filename) config = Config(args.filename) if args.info: # Info mode doesn't run the experiment. # Just prints some metrics and exits. util.print_session_info(config) return manifest_filename = ManifestManager.get_filename(config) on_first_invocation = not (os.path.isfile(manifest_filename) and os.stat(manifest_filename).st_size > 0) log_file = config.log_filename() if on_first_invocation and os.path.exists(log_file): util.fatal("Log file '%s' already exists. " "Move the file away before running Krun." % log_file) attach_log_file(config, not on_first_invocation) debug("Krun invoked with arguments: %s" % sys.argv) mail_recipients = config.MAIL_TO if type(mail_recipients) is not list: util.fatal("MAIL_TO config should be a list") mailer = Mailer(mail_recipients, max_mails=config.MAX_MAILS) try: inner_main(mailer, on_first_invocation, config, args) except Exception as exn: error_info = sys.exc_info() subject = "Fatal Krun Exception" lines = ["Fatal Krun error: %s\n" % str(error_info[1])] for frame in traceback.format_tb(error_info[2]): lines.append(frame) msg = "".join(lines) util.log_and_mail(mailer, debug, subject, msg, bypass_limiter=True) raise exn
def test_write_results_to_disk(self, mock_platform, no_results_instantiation_check): config = Config("krun/tests/example.krun") mock_platform.num_cpus = 4 mock_platform.num_per_core_measurements = mock_platform.num_cpus out_file = "krun/tests/example_results.json.bz2" results0 = Results(config, mock_platform) results0.audit = dict() results0.starting_temperatures = [4355, 9879] results0.wallclock_times = {u"dummy:Java:default-java": [[1.000726]]} results0.eta_estimates = {u"dummy:Java:default-java": [1.1]} results0.core_cycle_counts = {u"dummy:Java:default-java": [[[2], [3], [4], [5]]]} results0.aperf_counts = {u"dummy:Java:default-java": [[[3], [4], [5], [6]]]} results0.mperf_counts = {u"dummy:Java:default-java": [[[4], [5], [6], [7]]]} results0.reboots = 5 results0.error_flag = False results0.write_to_file() results1 = Results(config, None, results_file=out_file) assert results0 == results1 # Clean-up generated file. os.unlink(out_file)
def inner_main(mailer, config, args): out_file = config.results_filename() out_file_exists = os.path.exists(out_file) if out_file_exists and not os.path.isfile(out_file): util.fatal( "Output file '%s' exists but is not a regular file" % out_file) if out_file_exists and not args.resume: util.fatal("Output file '%s' already exists. " "Either resume the session (--resume) or " "move the file away" % out_file) if not out_file_exists and args.resume: util.fatal("No results file to resume. Expected '%s'" % out_file) if args.started_by_init and not args.reboot: util.fatal("--started-by-init makes no sense without --reboot") if args.started_by_init and not args.resume: util.fatal("--started-by-init makes no sense without --resume") if args.develop: warn("Developer mode enabled. Results will not be reliable.") # Initialise platform instance and assign to VM defs. # This needs to be done early, so VM sanity checks can run. platform = detect_platform(mailer) if not args.develop: debug("Checking platform preliminaries") platform.check_preliminaries() else: # Needed to skip the use of certain tools and techniques. # E.g. switching user. warn("Not checking platform prerequisites due to developer mode") platform.developer_mode = True platform.collect_audit() # If the user has asked for resume-mode, the current platform must # be an identical machine to the current one. error_msg = ("You have asked Krun to resume an interrupted benchmark. " + "This is only valid if the machine you are using is " + "identical to the one on which the last results were " + "gathered, which is not the case.") current = None if args.resume: # output file must exist, due to check above assert(out_file_exists) current = Results(config, platform, results_file=out_file) from krun.audit import Audit if not Audit(platform.audit) == current.audit: util.fatal(error_msg) debug("Using pre-recorded initial temperature readings") platform.starting_temperatures = current.starting_temperatures else: # Touch the config file to update its mtime. This is required # by resume-mode which uses the mtime to determine the name of # the log file, should this benchmark be resumed. _, _, rc = util.run_shell_cmd("touch " + args.filename) if rc != 0: util.fatal("Could not touch config file: " + args.filename) info(("Wait %s secs to allow system to cool prior to " "collecting initial temperature readings") % config.TEMP_READ_PAUSE) if args.develop or args.dry_run: info("SIMULATED: time.sleep(%s)" % config.TEMP_READ_PAUSE) else: time.sleep(config.TEMP_READ_PAUSE) debug("Taking fresh initial temperature readings") platform.starting_temperatures = platform.take_temperature_readings() # Assign platform to VM defs -- needs to happen early for sanity checks util.assign_platform(config, platform) sanity_checks(config, platform) # Build job queue -- each job is an execution sched = ExecutionScheduler(config, mailer, platform, resume=args.resume, reboot=args.reboot, dry_run=args.dry_run, started_by_init=args.started_by_init) sched.build_schedule() sched.run()
class ExecutionScheduler(object): """Represents our entire benchmarking session""" def __init__(self, config, mailer, platform, resume=False, reboot=False, dry_run=False, started_by_init=False): self.mailer = mailer self.config = config self.work_deque = deque() self.eta_avail = 0 self.jobs_done = 0 self.platform = platform self.resume = resume self.reboot = reboot self.dry_run = dry_run self.started_by_init = started_by_init if resume: self.results = Results(self.config, platform, results_file=self.config.results_filename()) else: self.results = Results(self.config, platform) self.log_path = self.config.log_filename(self.resume) def set_eta_avail(self): """call after adding job before eta should become available""" self.eta_avail = len(self) def jobs_until_eta_known(self): return self.eta_avail - self.jobs_done def add_job(self, job): self.work_deque.append(job) def remove_job_by_key(self, key): for job in self.work_deque: if job.key == key: job_to_remove = job break else: raise JobMissingError(key) debug("Removed %s from schedule" % key) self.work_deque.remove(job_to_remove) def next_job(self, peek=False): try: job = self.work_deque.popleft() if peek: self.work_deque.appendleft(job) return job except IndexError: raise ScheduleEmpty() # we are done def __len__(self): return len(self.work_deque) def get_estimated_exec_duration(self, key): previous_exec_times = self.results.eta_estimates.get(key) if previous_exec_times: return mean(previous_exec_times) else: return None # we don't know def get_estimated_overall_duration(self): etas = [j.get_estimated_exec_duration() for j in self.work_deque] if None in etas: return None # we don't know return sum(etas) def get_exec_estimate_time_formatter(self, key): return TimeEstimateFormatter(self.get_estimated_exec_duration(key)) def get_overall_time_estimate_formatter(self): return TimeEstimateFormatter(self.get_estimated_overall_duration()) def add_eta_info(self, key, exec_time): self.results.eta_estimates[key].append(exec_time) def build_schedule(self): """Builds a queue of process execution jobs. Returns two sets: non_skipped_keys, skipped_keys""" skipped_keys, non_skipped_keys = set(), set() one_exec_scheduled = False eta_avail_job = None for exec_n in xrange(self.config.N_EXECUTIONS): for vm_name, vm_info in self.config.VMS.items(): for bmark, param in self.config.BENCHMARKS.items(): for variant in vm_info["variants"]: job = ExecutionJob(self, vm_name, vm_info, bmark, variant, param) if not self.config.should_skip(job.key): non_skipped_keys |= set([job.key]) if one_exec_scheduled and not eta_avail_job: # first job of second executions eta becomes known. eta_avail_job = job self.set_eta_avail() self.add_job(job) else: skipped_keys |= set([job.key]) if not one_exec_scheduled: debug("%s is in skip list. Not scheduling." % job.key) one_exec_scheduled = True self.expected_reboots = len(self) # Resume mode: if previous results are available, remove the # jobs from the schedule which have already been executed, and # add the results to this object, ready to be saved to a Json file. if self.resume: self._remove_previous_execs_from_schedule() # Sanity check ETA estimates for key, exec_data in self.results.data.iteritems(): got_len = len(self.results.eta_estimates[key]) expect_len = len(exec_data) if expect_len != got_len: msg = "ETA estimates didn't tally with results: " msg += "key=%s, expect_len=%d, got_len=%d " % \ (key, expect_len, got_len) msg += "data[%s]=%s; " % (key, str(self.results.data[key])) msg += "eta[%s]=%s" % \ (key, str(self.results.eta_estimates[key])) util.log_and_mail(self.mailer, error, "Fatal Krun Error", msg, bypass_limiter=True, exit=True) return non_skipped_keys, skipped_keys def _remove_previous_execs_from_schedule(self): for key in self.results.data: num_completed_jobs = self.results.jobs_completed(key) if num_completed_jobs > 0: try: debug("%s has already been run %d times." % (key, num_completed_jobs)) for _ in range(num_completed_jobs): self.remove_job_by_key(key) self.jobs_done += 1 except JobMissingError as excn: tup = (excn.key, self.config.filename, self.config.results_filename()) msg = ("Failed to resume benchmarking session\n." + "The execution %s appears in results " + "file: %s, but not in config file: %s." % tup) util.fatal(msg) def _make_pre_post_cmd_env(self): """Prepare an environment dict for pre/post execution hooks""" jobs_until_eta_known = self.jobs_until_eta_known() if jobs_until_eta_known > 0: eta_val = "Unknown. Known in %d process executions." % \ self.jobs_until_eta_known() else: eta_val = self.get_overall_time_estimate_formatter().finish_str return { "KRUN_RESULTS_FILE": self.config.results_filename(), "KRUN_LOG_FILE": self.config.log_filename(resume=True), "KRUN_ETA_DATUM": now_str(), "KRUN_ETA_VALUE": eta_val, } def run(self): """Benchmark execution starts here""" jobs_left = len(self) if jobs_left == 0: debug("Krun started with an empty queue of jobs") if not self.started_by_init: util.log_and_mail(self.mailer, debug, "Benchmarking started", "Benchmarking started.\nLogging to %s" % self.log_path, bypass_limiter=True) if self.reboot and not self.started_by_init: # Reboot before first benchmark (dumps results file). info("Reboot prior to first execution") self._reboot() if self.reboot and self.started_by_init and jobs_left > 0: debug("Waiting %s seconds for the system to come up." % str(STARTUP_WAIT_SECONDS)) if self.dry_run: info("SIMULATED: time.sleep (would have waited %s seconds)." % STARTUP_WAIT_SECONDS) else: time.sleep(STARTUP_WAIT_SECONDS) # Important that the dmesg is collected after the above startup wait. # Otherwise we get spurious dmesg changes. self.platform.collect_starting_dmesg() while True: self.platform.wait_for_temperature_sensors() jobs_left = len(self) if jobs_left == 0: break # Run the user's pre-process-execution commands util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS, extra_env=self._make_pre_post_cmd_env()) job = self.next_job() # We collect rough execution times separate from real results. The # reason for this is that, even if a benchmark crashes it takes # time and we need to account for this when making estimates. A # crashing benchmark will give an empty list of iteration times, # meaning we can't use 'raw_exec_result' below for estimates. exec_start_time = time.time() raw_exec_result = job.run(self.mailer, self.dry_run) exec_end_time = time.time() exec_result = util.format_raw_exec_results(raw_exec_result) if not exec_result and not self.dry_run: self.results.error_flag = True self.results.data[job.key].append(exec_result) eta_info = exec_end_time - exec_start_time if self.reboot: # Add time taken to wait for system to come up if we are in # reboot mode. eta_info += STARTUP_WAIT_SECONDS self.add_eta_info(job.key, eta_info) info("%d executions left in scheduler queue" % (jobs_left - 1)) # We dump the json after each experiment so we can monitor the # json file mid-run. It is overwritten each time. self.results.write_to_file() self.jobs_done += 1 # Run the user's post-process-execution commands with updated # ETA estimates. Important that this happens *after* dumping # results, as the user is likely copying intermediate results to # another host. util.run_shell_cmd_list(self.config.POST_EXECUTION_CMDS, extra_env=self._make_pre_post_cmd_env()) tfmt = self.get_overall_time_estimate_formatter() if self.eta_avail == self.jobs_done: # We just found out roughly how long the session has left, mail out. msg = "ETA for current session now known: %s" % tfmt.finish_str util.log_and_mail(self.mailer, debug, "ETA for Current Session Available", msg, bypass_limiter=True) info("{:<25s}: {} ({} from now)".format("Estimated completion", tfmt.finish_str, tfmt.delta_str)) try: job = self.next_job(peek=True) except ScheduleEmpty: pass # no next job else: # print info about the next job info("Next execution is '%s(%d)' (%s variant) under '%s'" % (job.benchmark, job.parameter, job.variant, job.vm_name)) tfmt = self.get_exec_estimate_time_formatter(job.key) info("{:<35s}: {} ({} from now)".format( "Estimated completion (next execution)", tfmt.finish_str, tfmt.delta_str)) if (self.eta_avail is not None) and (self.jobs_done < self.eta_avail): info("Executions until ETA known: %s" % self.jobs_until_eta_known()) if self.platform.check_dmesg_for_changes(): self.results.error_flag = True if self.reboot and len(self) > 0: info("Reboot in preparation for next execution") self._reboot() self.platform.save_power() info("Done: Results dumped to %s" % self.config.results_filename()) err_msg = "Errors/warnings occurred -- read the log!" if self.results.error_flag: warn(err_msg) msg = "Session completed. Log file at: '%s'" % (self.log_path) if self.results.error_flag: msg += "\n\n%s" % err_msg if self.reboot: msg += "\n\nDon't forget to disable Krun at boot." util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg, bypass_limiter=True) def _reboot(self): self.results.reboots += 1 debug("About to execute reboot: %g, expecting %g in total." % (self.results.reboots, self.expected_reboots)) # Dump the results file. This may already have been done, but we # have changed self.nreboots, which needs to be written out. self.results.write_to_file() if self.results.reboots > self.expected_reboots: util.fatal(("HALTING now to prevent an infinite reboot loop: " + "INVARIANT num_reboots <= num_jobs violated. " + "Krun was about to execute reboot number: %g. " + "%g jobs have been completed, %g are left to go.") % (self.results.reboots, self.jobs_done, len(self))) if self.dry_run: info("SIMULATED: reboot (restarting Krun in-place)") args = sys.argv if not self.started_by_init: args.extend(["--resume", "--started-by-init"]) debug("Simulated reboot with args: " + " ".join(args)) os.execv(args[0], args) # replace myself assert False # unreachable else: subprocess.call(self.platform.get_reboot_cmd())
def inner_main(mailer, on_first_invocation, config, args): out_file = config.results_filename() out_file_exists = os.path.exists(out_file) instr_dir = util.get_instr_json_dir(config) instr_dir_exists = os.path.exists(instr_dir) envlog_dir = util.get_envlog_dir(config) envlog_dir_exists = os.path.exists(envlog_dir) if out_file_exists and not os.path.isfile(out_file): util.fatal( "Output file '%s' exists but is not a regular file" % out_file) if out_file_exists and on_first_invocation: util.fatal("Output results file '%s' already exists. " "Move the file away before running Krun." % out_file) if instr_dir_exists and on_first_invocation: util.fatal("Instrumentation dir '%s' exists." % instr_dir) if envlog_dir_exists and on_first_invocation: util.fatal("Env log dir '%s' exists." % envlog_dir) if not out_file_exists and not on_first_invocation: util.fatal("No results file to resume. Expected '%s'" % out_file) # Initialise platform instance and assign to VM defs. # This needs to be done early, so VM sanity checks can run. platform = detect_platform(mailer, config) platform.quick_mode = args.quick platform.no_user_change = args.no_user_change platform.no_tickless_check = args.no_tickless_check platform.no_pstate_check = args.no_pstate_check platform.hardware_reboots = args.hardware_reboots # Create the instrumentation directory if required if on_first_invocation: # We only want make a dir if >=1 VM is in instrumentation mode. for vm in config.VMS.itervalues(): if vm['vm_def'].instrument: util.make_instr_dir(config) break debug("Checking platform preliminaries") platform.check_preliminaries() # Make a bit of noise if this is a virtualised environment if platform.is_virtual(): warn("This appears to be a virtualised host. The results will be flawed. " "Use bare-metal for reliable results!") platform.collect_audit() # At this point the config file is OK, and on-disk state is consistent, # so let's daemonise (if requested). if args.daemonise: util.daemonise() if not on_first_invocation: # output file must exist, due to check above assert(out_file_exists) debug("Using pre-recorded initial temperature readings") manifest = ManifestManager(config, platform) platform_temps = {} for sensor, tup in manifest.starting_temperatures.iteritems(): platform_temps[sensor] = tup[1] platform.starting_temperatures = platform_temps else: manifest = ManifestManager(config, platform, new_file=True) if manifest.num_execs_left == 0: # No executions, or all skipped fatal("Empty schedule!") try: info(("Wait %s secs to allow system to cool prior to " "collecting initial temperature readings") % config.TEMP_READ_PAUSE) # This part is wrapped in hooks, so that if daemons or networking are # taken down for process executions, then the initial temperature # reading gets the same treatment. util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS,) platform.sleep(config.TEMP_READ_PAUSE) debug("Taking fresh initial temperature readings") platform.starting_temperatures = platform.take_temperature_readings() manifest.set_starting_temperatures(platform.starting_temperatures) # Write out an empty results file. After the initial reboot Krun # will expect this to exist. Results.ok_to_instantiate = True results = Results(config, platform) results.write_to_file() except: raise finally: util.run_shell_cmd_list(config.POST_EXECUTION_CMDS,) log_path = config.log_filename(resume=False) util.log_and_mail(mailer, debug, "Benchmarking started", "Benchmarking started.\nLogging to %s" % log_path, bypass_limiter=True) util.reboot(manifest, platform) # Assign platform to VM defs -- needs to happen early for sanity checks util.assign_platform(config, platform) sanity_checks(config, platform) # Build job queue -- each job is an execution sched = ExecutionScheduler(config, mailer, platform, dry_run=args.dry_run) sched.run()
def inner_main(mailer, on_first_invocation, config, args): out_file = config.results_filename() out_file_exists = os.path.exists(out_file) instr_dir = util.get_instr_json_dir(config) instr_dir_exists = os.path.exists(instr_dir) envlog_dir = util.get_envlog_dir(config) envlog_dir_exists = os.path.exists(envlog_dir) if out_file_exists and not os.path.isfile(out_file): util.fatal("Output file '%s' exists but is not a regular file" % out_file) if out_file_exists and on_first_invocation: util.fatal("Output results file '%s' already exists. " "Move the file away before running Krun." % out_file) if instr_dir_exists and on_first_invocation: util.fatal("Instrumentation dir '%s' exists." % instr_dir) if envlog_dir_exists and on_first_invocation: util.fatal("Env log dir '%s' exists." % envlog_dir) if not out_file_exists and not on_first_invocation: util.fatal("No results file to resume. Expected '%s'" % out_file) # Initialise platform instance and assign to VM defs. # This needs to be done early, so VM sanity checks can run. platform = detect_platform(mailer, config) platform.quick_mode = args.quick platform.no_user_change = args.no_user_change platform.no_tickless_check = args.no_tickless_check platform.no_pstate_check = args.no_pstate_check platform.hardware_reboots = args.hardware_reboots # Create the instrumentation directory if required if on_first_invocation: # We only want make a dir if >=1 VM is in instrumentation mode. for vm in config.VMS.itervalues(): if vm['vm_def'].instrument: util.make_instr_dir(config) break debug("Checking platform preliminaries") platform.check_preliminaries() # Make a bit of noise if this is a virtualised environment if platform.is_virtual(): warn( "This appears to be a virtualised host. The results will be flawed. " "Use bare-metal for reliable results!") platform.collect_audit() # At this point the config file is OK, and on-disk state is consistent, # so let's daemonise (if requested). if args.daemonise: util.daemonise() if not on_first_invocation: # output file must exist, due to check above assert (out_file_exists) debug("Using pre-recorded initial temperature readings") manifest = ManifestManager(config, platform) platform_temps = {} for sensor, tup in manifest.starting_temperatures.iteritems(): platform_temps[sensor] = tup[1] platform.starting_temperatures = platform_temps else: manifest = ManifestManager(config, platform, new_file=True) if manifest.num_execs_left == 0: # No executions, or all skipped fatal("Empty schedule!") try: info(("Wait %s secs to allow system to cool prior to " "collecting initial temperature readings") % config.TEMP_READ_PAUSE) # This part is wrapped in hooks, so that if daemons or networking are # taken down for process executions, then the initial temperature # reading gets the same treatment. util.run_shell_cmd_list(config.PRE_EXECUTION_CMDS, ) platform.sleep(config.TEMP_READ_PAUSE) debug("Taking fresh initial temperature readings") platform.starting_temperatures = platform.take_temperature_readings( ) manifest.set_starting_temperatures(platform.starting_temperatures) # Write out an empty results file. After the initial reboot Krun # will expect this to exist. Results.ok_to_instantiate = True results = Results(config, platform) results.write_to_file() except: raise finally: util.run_shell_cmd_list(config.POST_EXECUTION_CMDS, ) log_path = config.log_filename(resume=False) util.log_and_mail(mailer, debug, "Benchmarking started", "Benchmarking started.\nLogging to %s" % log_path, bypass_limiter=True) util.reboot(manifest, platform) # Assign platform to VM defs -- needs to happen early for sanity checks util.assign_platform(config, platform) sanity_checks(config, platform) # Build job queue -- each job is an execution sched = ExecutionScheduler(config, mailer, platform, dry_run=args.dry_run) sched.run()
def run(self): """Benchmark execution starts here""" # In reboot mode, wait for the system to come up before we proceed if self.platform.hardware_reboots: debug("Waiting %s seconds for the system to come up." % str(STARTUP_WAIT_SECONDS)) self.platform.sleep(STARTUP_WAIT_SECONDS) # Important that the dmesg is collected after the above startup wait. # Otherwise we get spurious dmesg changes. self.platform.collect_starting_dmesg() assert self.manifest.num_execs_left > 0 self.platform.wait_for_temperature_sensors() bench, vm, variant = self.manifest.next_exec_key.split(":") key_pexec_idx = self.manifest.next_exec_key_index() job = ExecutionJob(self, vm, self.config.VMS[vm], bench, variant, self.config.BENCHMARKS[bench], key_pexec_idx) # Default to error state. This is the value the finally block will see # if an exception is raised inside the try block, otherwise it is # re-assigned based on the result of running the benchmark. flag = 'E' # Run the pre-exec commands, the benchmark and the post-exec commands. # These are wrapped in a try/except, so that the post-exec commands # are always executed, even if an exception has occurred. We only # reboot /after/ the post-exec commands have completed. results = None try: # Run the user's pre-process-execution commands. We can't put an # ETA estimate in the environment for the pre-commands as we have # not (and should not) load the results file into memory yet. # # It might seem tempting to move this outside the try block, to # ensure that post-hooks are only run if pre-hooks ran. We don't, # thus avoiding the case where only *part* of the pre-hooks run, # but the post-hooks then don't run. util.run_shell_cmd_list(self.config.PRE_EXECUTION_CMDS,) # We collect rough execution times separate from real results. The # reason for this is that, even if a benchmark crashes it takes # time and we need to account for this when making estimates. A # crashing benchmark will give an empty list of iteration times, # meaning we can't use 'raw_exec_result' below for estimates. exec_start_time = time.time() measurements, instr_data, flag = job.run(self.mailer, self.dry_run) exec_end_time = time.time() # Only now is it OK to load the results file into memory. Results.ok_to_instantiate = True results = Results(self.config, self.platform, results_file=self.config.results_filename()) # Bail early if the process execution needs to be re-run. if flag == "O": util.run_shell_cmd_list( self.config.POST_EXECUTION_CMDS, extra_env=self._make_post_cmd_env(results) ) info("Rebooting to re-run previous process execution") util.reboot(self.manifest, self.platform, update_count=False) # reboot() does not return raise RuntimeError("reached unreachable code!") # Store new result. results.append_exec_measurements(job.key, measurements, flag) # Store instrumentation data in a separate file if job.vm_info["vm_def"].instrument: key_exec_num = self.manifest.completed_exec_counts[job.key] util.dump_instr_json(job.key, key_exec_num, self.config, instr_data) eta_info = exec_end_time - exec_start_time if self.platform.hardware_reboots: # Add time taken to wait for system to come up if we are in # hardware-reboot mode. eta_info += STARTUP_WAIT_SECONDS results.eta_estimates[job.key].append(eta_info) self.manifest.update(flag) except Exception: raise finally: # Run the user's post-process-execution commands with updated # ETA estimates. Important that this happens *after* dumping # results, as the user is likely copying intermediate results to # another host. # _make_post_cmd_env() needs the results to make an ETA. If an # exception occurred in the above try block, there's a chance that # they have not have been loaded. if results is None: Results.ok_to_instantiate = True results = Results(self.config, self.platform, results_file=self.config.results_filename()) # If errors occured, set error flag in results file if self.platform.check_dmesg_for_changes(self.manifest) or \ flag == 'E': results.error_flag = True results.write_to_file() util.run_shell_cmd_list( self.config.POST_EXECUTION_CMDS, extra_env=self._make_post_cmd_env(results) ) tfmt = self.get_overall_time_estimate_formatter(results) if self.manifest.eta_avail_idx == self.manifest.next_exec_idx: # We just found out roughly how long the session has left, mail out. msg = "ETA for current session now known: %s" % tfmt.finish_str util.log_and_mail(self.mailer, debug, "ETA for Current Session Available", msg, bypass_limiter=True) info("{:<25s}: {} ({} from now)".format( "Estimated completion (whole session)", tfmt.finish_str, tfmt.delta_str)) info("%d executions left in scheduler queue" % self.manifest.num_execs_left) if self.manifest.num_execs_left > 0 and \ self.manifest.eta_avail_idx > self.manifest.next_exec_idx: info("Executions until ETA known: %s" % (self.manifest.eta_avail_idx - self.manifest.next_exec_idx)) # Although it would have been nice to have checked this prior to # running the execution, it depends on the results file, which we # should not load prior to the process execution. util.check_audit_unchanged(results, self.platform) assert self.manifest.num_execs_left >= 0 if self.manifest.num_execs_left > 0: # print info about the next job benchmark, vm_name, variant = \ self.manifest.next_exec_key.split(":") info("Next execution is '%s(%d)' (%s variant) under '%s'" % (benchmark, self.config.BENCHMARKS[benchmark], variant, vm_name)) tfmt = self.get_exec_estimate_time_formatter(job.key, results) info("{:<35s}: {} ({} from now)".format( "Estimated completion (next execution)", tfmt.finish_str, tfmt.delta_str)) info("Reboot in preparation for next execution") util.reboot(self.manifest, self.platform) elif self.manifest.num_execs_left == 0: self.platform.save_power() if self.config.ENABLE_PINNING: self.platform.clear_cpu_pinning() info("Done: Results dumped to %s" % self.config.results_filename()) err_msg = "Errors/warnings occurred -- read the log!" if results.error_flag: warn(err_msg) msg = "Session completed. Log file at: '%s'" % (self.log_path) if results.error_flag: msg += "\n\n%s" % err_msg msg += "\n\nDon't forget to disable Krun at boot." util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg, bypass_limiter=True)
class ExecutionScheduler(object): """Represents our entire benchmarking session""" def __init__(self, config, mailer, platform, resume=False, reboot=False, dry_run=False, started_by_init=False): self.mailer = mailer self.config = config self.work_deque = deque() self.eta_avail = 0 self.jobs_done = 0 self.platform = platform self.resume = resume self.reboot = reboot self.dry_run = dry_run self.started_by_init = started_by_init if resume: self.results = Results(self.config, platform, results_file=self.config.results_filename()) else: self.results = Results(self.config, platform) self.log_path = self.config.log_filename(self.resume) def set_eta_avail(self): """call after adding job before eta should become available""" self.eta_avail = len(self) def jobs_until_eta_known(self): return self.eta_avail - self.jobs_done def add_job(self, job): self.work_deque.append(job) def remove_job_by_key(self, key): for job in self.work_deque: if job.key == key: job_to_remove = job break else: raise JobMissingError(key) debug("Removed %s from schedule" % key) self.work_deque.remove(job_to_remove) def next_job(self, peek=False): try: job = self.work_deque.popleft() if peek: self.work_deque.appendleft(job) return job except IndexError: raise ScheduleEmpty() # we are done def __len__(self): return len(self.work_deque) def get_estimated_exec_duration(self, key): previous_exec_times = self.results.eta_estimates.get(key) if previous_exec_times: return mean(previous_exec_times) else: return None # we don't know def get_estimated_overall_duration(self): etas = [j.get_estimated_exec_duration() for j in self.work_deque] if None in etas: return None # we don't know return sum(etas) def get_exec_estimate_time_formatter(self, key): return TimeEstimateFormatter(self.get_estimated_exec_duration(key)) def get_overall_time_estimate_formatter(self): return TimeEstimateFormatter(self.get_estimated_overall_duration()) def add_eta_info(self, key, exec_time): self.results.eta_estimates[key].append(exec_time) def build_schedule(self): """Builds a queue of process execution jobs. Returns two sets: non_skipped_keys, skipped_keys""" skipped_keys, non_skipped_keys = set(), set() one_exec_scheduled = False eta_avail_job = None for exec_n in xrange(self.config.N_EXECUTIONS): for vm_name, vm_info in self.config.VMS.items(): for bmark, param in self.config.BENCHMARKS.items(): for variant in vm_info["variants"]: job = ExecutionJob(self, vm_name, vm_info, bmark, variant, param) if not self.config.should_skip(job.key): non_skipped_keys |= set([job.key]) if one_exec_scheduled and not eta_avail_job: # first job of second executions eta becomes known. eta_avail_job = job self.set_eta_avail() self.add_job(job) else: skipped_keys |= set([job.key]) if not one_exec_scheduled: debug("%s is in skip list. Not scheduling." % job.key) one_exec_scheduled = True self.expected_reboots = len(self) # Resume mode: if previous results are available, remove the # jobs from the schedule which have already been executed, and # add the results to this object, ready to be saved to a Json file. if self.resume: self._remove_previous_execs_from_schedule() # Sanity check ETA estimates for key, exec_data in self.results.data.iteritems(): got_len = len(self.results.eta_estimates[key]) expect_len = len(exec_data) if expect_len != got_len: msg = "ETA estimates didn't tally with results: " msg += "key=%s, expect_len=%d, got_len=%d " % \ (key, expect_len, got_len) msg += "data[%s]=%s; " % (key, str(self.results.data[key])) msg += "eta[%s]=%s" % \ (key, str(self.results.eta_estimates[key])) util.log_and_mail(self.mailer, error, "Fatal Krun Error", msg, bypass_limiter=True, exit=True) return non_skipped_keys, skipped_keys def _remove_previous_execs_from_schedule(self): for key in self.results.data: num_completed_jobs = self.results.jobs_completed(key) if num_completed_jobs > 0: try: debug("%s has already been run %d times." % (key, num_completed_jobs)) for _ in range(num_completed_jobs): self.remove_job_by_key(key) self.jobs_done += 1 except JobMissingError as excn: tup = (excn.key, self.config.filename, self.config.results_filename()) msg = ("Failed to resume benchmarking session\n." + "The execution %s appears in results " + "file: %s, but not in config file: %s." % tup) util.fatal(msg) def _make_pre_post_cmd_env(self): """Prepare an environment dict for pre/post execution hooks""" jobs_until_eta_known = self.jobs_until_eta_known() if jobs_until_eta_known > 0: eta_val = "Unknown. Known in %d process executions." % \ self.jobs_until_eta_known() else: eta_val = self.get_overall_time_estimate_formatter().finish_str return { "KRUN_RESULTS_FILE": self.config.results_filename(), "KRUN_LOG_FILE": self.config.log_filename(resume=True), "KRUN_ETA_DATUM": now_str(), "KRUN_ETA_VALUE": eta_val, } def run(self): """Benchmark execution starts here""" jobs_left = len(self) if jobs_left == 0: debug("Krun started with an empty queue of jobs") if not self.started_by_init: util.log_and_mail(self.mailer, debug, "Benchmarking started", "Benchmarking started.\nLogging to %s" % self.log_path, bypass_limiter=True) if self.reboot and not self.started_by_init: # Reboot before first benchmark (dumps results file). info("Reboot prior to first execution") self._reboot() if self.reboot and self.started_by_init and jobs_left > 0: debug("Waiting %s seconds for the system to come up." % str(STARTUP_WAIT_SECONDS)) if self.dry_run: info("SIMULATED: time.sleep (would have waited %s seconds)." % STARTUP_WAIT_SECONDS) else: time.sleep(STARTUP_WAIT_SECONDS) # Important that the dmesg is collected after the above startup wait. # Otherwise we get spurious dmesg changes. self.platform.collect_starting_dmesg() while True: self.platform.wait_for_temperature_sensors() jobs_left = len(self) if jobs_left == 0: break # Run the user's pre-process-execution commands util.run_shell_cmd_list( self.config.PRE_EXECUTION_CMDS, extra_env=self._make_pre_post_cmd_env() ) job = self.next_job() # We collect rough execution times separate from real results. The # reason for this is that, even if a benchmark crashes it takes # time and we need to account for this when making estimates. A # crashing benchmark will give an empty list of iteration times, # meaning we can't use 'raw_exec_result' below for estimates. exec_start_time = time.time() raw_exec_result = job.run(self.mailer, self.dry_run) exec_end_time = time.time() exec_result = util.format_raw_exec_results(raw_exec_result) if not exec_result and not self.dry_run: self.results.error_flag = True self.results.data[job.key].append(exec_result) eta_info = exec_end_time - exec_start_time if self.reboot: # Add time taken to wait for system to come up if we are in # reboot mode. eta_info += STARTUP_WAIT_SECONDS self.add_eta_info(job.key, eta_info) info("%d executions left in scheduler queue" % (jobs_left - 1)) # We dump the json after each experiment so we can monitor the # json file mid-run. It is overwritten each time. self.results.write_to_file() self.jobs_done += 1 # Run the user's post-process-execution commands with updated # ETA estimates. Important that this happens *after* dumping # results, as the user is likely copying intermediate results to # another host. util.run_shell_cmd_list( self.config.POST_EXECUTION_CMDS, extra_env=self._make_pre_post_cmd_env() ) tfmt = self.get_overall_time_estimate_formatter() if self.eta_avail == self.jobs_done: # We just found out roughly how long the session has left, mail out. msg = "ETA for current session now known: %s" % tfmt.finish_str util.log_and_mail(self.mailer, debug, "ETA for Current Session Available", msg, bypass_limiter=True) info("{:<25s}: {} ({} from now)".format( "Estimated completion", tfmt.finish_str, tfmt.delta_str)) try: job = self.next_job(peek=True) except ScheduleEmpty: pass # no next job else: # print info about the next job info("Next execution is '%s(%d)' (%s variant) under '%s'" % (job.benchmark, job.parameter, job.variant, job.vm_name)) tfmt = self.get_exec_estimate_time_formatter(job.key) info("{:<35s}: {} ({} from now)".format( "Estimated completion (next execution)", tfmt.finish_str, tfmt.delta_str)) if (self.eta_avail is not None) and (self.jobs_done < self.eta_avail): info("Executions until ETA known: %s" % self.jobs_until_eta_known()) if self.platform.check_dmesg_for_changes(): self.results.error_flag = True if self.reboot and len(self) > 0: info("Reboot in preparation for next execution") self._reboot() self.platform.save_power() info("Done: Results dumped to %s" % self.config.results_filename()) err_msg = "Errors/warnings occurred -- read the log!" if self.results.error_flag: warn(err_msg) msg = "Session completed. Log file at: '%s'" % (self.log_path) if self.results.error_flag: msg += "\n\n%s" % err_msg if self.reboot: msg += "\n\nDon't forget to disable Krun at boot." util.log_and_mail(self.mailer, info, "Benchmarks Complete", msg, bypass_limiter=True) def _reboot(self): self.results.reboots += 1 debug("About to execute reboot: %g, expecting %g in total." % (self.results.reboots, self.expected_reboots)) # Dump the results file. This may already have been done, but we # have changed self.nreboots, which needs to be written out. self.results.write_to_file() if self.results.reboots > self.expected_reboots: util.fatal(("HALTING now to prevent an infinite reboot loop: " + "INVARIANT num_reboots <= num_jobs violated. " + "Krun was about to execute reboot number: %g. " + "%g jobs have been completed, %g are left to go.") % (self.results.reboots, self.jobs_done, len(self))) if self.dry_run: info("SIMULATED: reboot (restarting Krun in-place)") args = sys.argv if not self.started_by_init: args.extend(["--resume", "--started-by-init"]) debug("Simulated reboot with args: " + " ".join(args)) os.execv(args[0], args) # replace myself assert False # unreachable else: subprocess.call(self.platform.get_reboot_cmd())