Ejemplo n.º 1
0
 def do_work(self, central_db_obj, sched_db_obj, trace_id=None):
     """
     Args:
     - central_db_obj: DB object configured to access the analysis database.
     - sched_db_obj: DB object configured to access the slurm database of
         an experiment worker. 
     - trace_id: If set to an experiment valid trace_id, it runs only the
         experiment identified by trace_id.
     """
     there_are_more=True
     while there_are_more:
         ed = ExperimentDefinition()
         if trace_id:
             ed.load(central_db_obj, trace_id)
             ed.mark_pre_simulating(central_db_obj)
         else:
             there_are_more = ed.load_fresh(central_db_obj)
         if there_are_more:
             print(("About to run exp({0}):{1}".format(
                             ed._trace_id, ed._name)))
             er = ExperimentRunner(ed)
             if(er.do_full_run(sched_db_obj, central_db_obj)):
                 print(("Exp({0}) Done".format(
                                              ed._trace_id)))
             else:
                 print(("Exp({0}) Error!".format(
                                              ed._trace_id)))
         if trace_id:
             break  
Ejemplo n.º 2
0
 def rescue_exp(self, central_db_obj, sched_db_obj, trace_id=None):
     """Retrieves the job trace from the database of an experiment worker and
     stores it in the central db.
     Args:
     - central_db_obj: DB object configured to access the analysis database.
     - sched_db_obj: DB object configured to access the slurm database of
         an experiment worker. 
     - trace_id: trace_id of the experiment to which the rescued trace
         corresponds.
     """
     there_are_more=True
     while there_are_more:
         ed = ExperimentDefinition()
         if trace_id:
             ed.load(central_db_obj, trace_id)
             ed.mark_simulation_done(central_db_obj)
         else:
             there_are_more = ed.load_next_state("simulation_failed",
                                                 "simulation_done")
         if there_are_more:
             print(("About to run resque({0}):{1}".format(
                             ed._trace_id, ed._name)))
             er = ExperimentRunner(ed)
             if(er.check_trace_and_store(sched_db_obj, central_db_obj)):
                 er.clean_trace_file()
                 print(("Exp({0}) Done".format(
                                              ed._trace_id)))
             else:
                 print(("Exp({0}) Error!".format(
                                              ed._trace_id)))
         if trace_id:
             break  
Ejemplo n.º 3
0
    def test_conf(self):
        ExperimentRunner.configure("tmp/trace_folder",
                                   "tmp",
                                   True,
                                   "myhost",
                                   "myUser",
                                   local_conf_dir="local_file",
                                   scheduler_conf_dir="sched_conf_dir",
                                   scheduler_conf_file_base="conf.file",
                                   scheduler_folder="folder",
                                   scheduler_script="script",
                                   manifest_folder="man_folder")

        self.assertEqual(ExperimentRunner._trace_folder, "tmp/trace_folder")
        self.assertEqual(ExperimentRunner._trace_generation_folder, "tmp")
        self.assertEqual(ExperimentRunner._local, True)
        self.assertEqual(ExperimentRunner._run_hostname, "myhost")
        self.assertEqual(ExperimentRunner._run_user, "myUser")
        self.assertEqual(ExperimentRunner._local_conf_dir, "local_file")
        self.assertEqual(ExperimentRunner._scheduler_conf_dir,
                         "sched_conf_dir")
        self.assertEqual(ExperimentRunner._scheduler_conf_file_base,
                         "conf.file")
        self.assertEqual(ExperimentRunner._scheduler_folder, "folder")
        self.assertEqual(ExperimentRunner._scheduler_script, "script")
        self.assertEqual(ExperimentRunner._manifest_folder, "man_folder")
Ejemplo n.º 4
0
    def test_generate_trace_files(self):
        ExperimentRunner.configure("tmp/trace_folder",
                                   "tmp",
                                   True,
                                   "myhost",
                                   "myUser",
                                   drain_time=0)
        self.assertEqual(ExperimentRunner._trace_folder, "tmp/trace_folder")
        self.assertEqual(ExperimentRunner._trace_generation_folder, "tmp")
        self.assertEqual(ExperimentRunner._local, True)

        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=20,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=400)

        er = ExperimentRunner(ed)
        er._generate_trace_files(ed)
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O0.0"
                           "-sseeeed.trace"))
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O0.0"
                           "-sseeeed.qos"))
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O0.0"
                           "-sseeeed.users"))
        records = trace_gen.extract_records(
            file_name="tmp/edison-single-m1.0manifestSim.json"
            "-period-p5-0.0-single-t-0d-0d-O0.0"
            "-sseeeed.trace",
            list_trace_location="../bin/list_trace")
        man_count = 0
        self.assertGreater(
            int(records[-1]["SUBMIT"]) - int(records[0]["SUBMIT"]), 320)
        self.assertLess(
            int(records[-1]["SUBMIT"]) - int(records[0]["SUBMIT"]), 1500)
        for rec in records:
            if rec["WF"].split("-")[0] == "manifestSim.json":
                man_count += 1
        self.assertGreaterEqual(
            man_count, 64, "There should be at least 80"
            " workflows in the "
            "trace, found: {0}".format(man_count))
        self.assertLessEqual(
            man_count, 104, "There should be at least 80"
            " workflows in the "
            "trace, found: {0}".format(man_count))
Ejemplo n.º 5
0
    def test_generate_trace_files_special(self):
        ExperimentRunner.configure("tmp/trace_folder", "tmp", True, "myhost",
                                   "myUser")
        ed = ExperimentDefinition(
            seed="AAAA",
            machine="edison",
            trace_type="single",
            manifest_list=[],
            workflow_policy="sp-sat-p2-c24-r36000-t4-b100",
            workflow_period_s=0,
            workflow_handling="single",
            preload_time_s=0,
            start_date=datetime(2016, 1, 1),
            workload_duration_s=120,
            overload_target=1.2)
        er = ExperimentRunner(ed)
        er._generate_trace_files(ed)
        trace_file_route = ("tmp/{0}".format(ed.get_trace_file_name()))
        self.assertTrue(os.path.exists(trace_file_route))
        records = trace_gen.extract_records(
            file_name=trace_file_route,
            list_trace_location="../bin/list_trace")
        self.assertEqual(len(records), 8)
        submit_times = [0, 2, 4, 6, 100, 102, 104, 106]
        first_submit = int(records[0]["SUBMIT"])
        submit_times = [x + first_submit for x in submit_times]

        for (rec, submit_time) in zip(records, submit_times):
            self.assertEqual(int(rec["SUBMIT"]), submit_time)
            self.assertEqual(
                int(rec["NUM_TASKS"]) * int(rec["CORES_PER_TASK"]), 24)
            self.assertEqual(int(rec["DURATION"]), 36000)
            self.assertEqual(int(rec["WCLIMIT"]), 601)
Ejemplo n.º 6
0
    def test_sustained_levels(self):

        definition = ExperimentDefinition(seed="AAAAA",
                                          machine="edison",
                                          trace_type="single",
                                          manifest_list=[],
                                          workflow_policy="no",
                                          workflow_period_s=0,
                                          workflow_handling="single",
                                          preload_time_s=3600 * 4,
                                          workload_duration_s=3600 * 1,
                                          overload_target=1.5)

        ExperimentRunner.configure("tmp/trace_folder", "tmp", True, "myhost",
                                   "myUser")

        trace_generator = MyTraceGen()
        machine = definition.get_machine()
        er = ExperimentRunner(definition)
        er._generate_trace_files(definition, trace_generator=trace_generator)

        acc_cores, period = trace_generator.check_pressure(
            machine.get_total_cores(), 3600, 1.5, self, 1.0)
        total_pressure = float(acc_cores) / float(
            period * machine.get_total_cores())
        print(total_pressure)
        self.assertAlmostEqual(total_pressure, 1.5, delta=0.01)
        self.assertLess(total_pressure, 1.8)
Ejemplo n.º 7
0
 def test_is_it_running_failed_comms(self):
     ExperimentRunner.configure("/tmp/tests/tmp/dest",
                                "/tmp/tests/tmp/orig",
                                False,
                                "fakehost.fake.com",
                                "aUSer",
                                scheduler_conf_dir="tmp/conf",
                                local_conf_dir="tmp/conf_orig")
     ed = ExperimentDefinition(seed="seeeed",
                               machine="edison",
                               trace_type="single",
                               manifest_list=[{
                                   "share": 1.0,
                                   "manifest": "manifestSim.json"
                               }],
                               workflow_policy="period",
                               workflow_period_s=5,
                               workflow_handling="single",
                               preload_time_s=20,
                               start_date=datetime(2016, 1, 1),
                               workload_duration_s=41)
     er = ExperimentRunner(ed)
     self.assertRaises(SystemError, er.is_it_running, "python")
Ejemplo n.º 8
0
    def test_do_full_run(self):
        sched_db_obj = DB(self._vm_ip, "slurm_acct_db",
                          os.getenv("SLURMDB_USER", None),
                          os.getenv("SLURMDB_PASS", None))
        trace = ResultTrace()
        self.addCleanup(self._del_table, "traces")
        trace.create_trace_table(self._db, "traces")

        ExperimentRunner.configure(trace_folder="/tmp/",
                                   trace_generation_folder="tmp",
                                   local=False,
                                   run_hostname=self._vm_ip,
                                   run_user=None,
                                   scheduler_conf_dir="/scsf/slurm_conf",
                                   local_conf_dir="configs/",
                                   scheduler_folder="/scsf/",
                                   drain_time=100)
        ensureDir("tmp")
        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=60,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=1800)
        self.addCleanup(self._del_table, "experiment")
        ed.create_table(self._db)
        ed.store(self._db)

        er = ExperimentRunner(ed)
        self.assertTrue(er.do_full_run(sched_db_obj, self._db))
Ejemplo n.º 9
0
    def get_workflow_info(self, workflow_file):

        if not workflow_file in list(self.manifest_dics.keys()):
            from orchestration.running import ExperimentRunner
            manifest_route = path.join(ExperimentRunner.get_manifest_folder(),
                                       workflow_file)
            cores, runtime, tasks = WorkflowGeneratorMultijobs.parse_all_jobs(
                manifest_route)
            self.manifest_dics[workflow_file] = {
                "cores": cores,
                "runtime": runtime,
                "tasks": tasks
            }

        return self.manifest_dics[workflow_file]
Ejemplo n.º 10
0
    def test_generate_trace_files_overload(self):

        for seed_string in [
                "seeeed", "asdsa", "asdasdasd", "asdasdasdas", "asdasdlkjlkjl",
                "eworiuwioejrewk", "asjdlkasdlas"
        ]:
            ExperimentRunner.configure("tmp/trace_folder", "tmp", True,
                                       "myhost", "myUser")
            self.assertEqual(ExperimentRunner._trace_folder,
                             "tmp/trace_folder")
            self.assertEqual(ExperimentRunner._trace_generation_folder, "tmp")
            self.assertEqual(ExperimentRunner._local, True)

            workload_duration = 4 * 3600
            m = Edison2015()
            total_cores = m.get_total_cores()
            ed = ExperimentDefinition(seed=seed_string,
                                      machine="edison",
                                      trace_type="single",
                                      manifest_list=[],
                                      workflow_policy="no",
                                      workflow_period_s=0,
                                      workflow_handling="single",
                                      preload_time_s=0,
                                      start_date=datetime(2016, 1, 1),
                                      workload_duration_s=workload_duration,
                                      overload_target=1.2)

            er = ExperimentRunner(ed)
            er._generate_trace_files(ed)
            trace_file_route = ("tmp/{0}".format(ed.get_trace_file_name()))
            self.assertTrue(os.path.exists(trace_file_route))
            records = trace_gen.extract_records(
                file_name=trace_file_route,
                list_trace_location="../bin/list_trace")
            acc_core_hours = 0
            for rec in records:
                acc_core_hours += (int(rec["NUM_TASKS"]) *
                                   int(rec["CORES_PER_TASK"]) *
                                   int(rec["DURATION"]))

            print("pressure Index:", (float(acc_core_hours) /
                                      float(total_cores * workload_duration)))
            self.assertGreater(acc_core_hours,
                               1.1 * total_cores * workload_duration)
            self.assertLess(acc_core_hours,
                            1.5 * total_cores * workload_duration)
Ejemplo n.º 11
0
    def test_configure_slurm(self):
        ExperimentRunner.configure("/tmp/tests/tmp/dest",
                                   "/tmp/tests/tmp/orig",
                                   True,
                                   "locahost",
                                   None,
                                   scheduler_conf_dir="tmp/conf",
                                   local_conf_dir="tmp/conf_orig")
        ensureDir("tmp/conf")
        ensureDir("tmp/conf_orig")
        if os.path.exists("tmp/conf/slurm.conf"):
            os.remove("tmp/conf/slurm.conf")
        orig = open("tmp/conf_orig/slurm.conf.edison.regular", "w")
        orig.write("regular")
        orig.close()

        orig = open("tmp/conf_orig/slurm.conf.edsion.wfaware", "w")
        orig.write("aware")
        orig.close()

        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=20,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=41)
        er = ExperimentRunner(ed)
        er._configure_slurm()
        final = open("tmp/conf/slurm.conf")
        line = final.readline()
        self.assertEqual("regular", line)
        final.close()
Ejemplo n.º 12
0
 def test_is_it_running(self):
     ExperimentRunner.configure("/tmp/tests/tmp/dest",
                                "/tmp/tests/tmp/orig",
                                True,
                                "locahost",
                                None,
                                scheduler_conf_dir="tmp/conf",
                                local_conf_dir="tmp/conf_orig")
     ed = ExperimentDefinition(seed="seeeed",
                               machine="edison",
                               trace_type="single",
                               manifest_list=[{
                                   "share": 1.0,
                                   "manifest": "manifestSim.json"
                               }],
                               workflow_policy="period",
                               workflow_period_s=5,
                               workflow_handling="single",
                               preload_time_s=20,
                               start_date=datetime(2016, 1, 1),
                               workload_duration_s=41)
     er = ExperimentRunner(ed)
     self.assertTrue(er.is_it_running("python"))
     self.assertFalse(er.is_it_running("pythondd"))
Ejemplo n.º 13
0
 def __init__(self, manifest):
     self._manifest = manifest
     from orchestration.running import ExperimentRunner
     man_dir = os.getenv("MANIFEST_DIR",
                         ExperimentRunner.get_manifest_folder())
     self._manifest = os.path.join(man_dir, self._manifest)
Ejemplo n.º 14
0
 def setUp(self):
     ExperimentRunner.configure(manifest_folder="manifests")
     self._db = DB(os.getenv("TEST_DB_HOST", "127.0.0.1"),
                   os.getenv("TEST_DB_NAME", "test"),
                   os.getenv("TEST_DB_USER", "root"),
                   os.getenv("TEST_DB_PASS", ""))
Ejemplo n.º 15
0
    def test_run_simulation(self):
        ExperimentRunner.configure(trace_folder="/tmp/",
                                   trace_generation_folder="tmp",
                                   local=False,
                                   run_hostname=self._vm_ip,
                                   run_user=None,
                                   scheduler_conf_dir="/scsf/slurm_conf",
                                   local_conf_dir="configs/",
                                   scheduler_folder="/scsf/",
                                   drain_time=100)
        ensureDir("tmp")
        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=60,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=3600)
        er = ExperimentRunner(ed)
        er.create_trace_file()
        er._run_simulation()

        er.stop_simulation()
        self.assertTrue(er.is_simulation_done())
Ejemplo n.º 16
0
    def test_place_trace_files_remote_and_clean(self):
        ExperimentRunner.configure("/tmp/tests/tmp/dest",
                                   "/tmp/tests/tmp/orig",
                                   True,
                                   "locahost",
                                   None,
                                   scheduler_folder="/tmp/tests/tmp/sched",
                                   scheduler_conf_dir="/tmp/tests/tmp/conf",
                                   manifest_folder="manifests")
        self.assertEqual(ExperimentRunner._trace_folder, "/tmp/tests/tmp/dest")
        self.assertEqual(ExperimentRunner._trace_generation_folder,
                         "/tmp/tests/tmp/orig")
        self.assertEqual(ExperimentRunner._local, True)
        ensureDir("/tmp/tests/tmp/dest")
        ensureDir("/tmp/tests/tmp/orig")
        ensureDir("/tmp/tests/tmp/sched")
        ensureDir("/tmp/tests/tmp/conf")

        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=20,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=41,
                                  overload_target=1.1)
        er = ExperimentRunner(ed)
        filenames = er._generate_trace_files(ed)
        er._place_trace_file(filenames[0])
        er._place_users_file(filenames[2])
        self.assertTrue(
            os.path.exists(
                "/tmp/tests/tmp/dest/edison-single-m1.0manifestSim.json"
                "-period-p5-0.0-single-t-0d-0d-O1.1"
                "-sseeeed.trace"))
        self.assertTrue(os.path.exists("/tmp/tests/tmp/conf/users.sim"))
        self.assertFalse(
            os.path.exists(
                "/tmp/tests/tmp/orig/edison-single-m1.0manifestSim.json"
                "-period-p5-0.0-single-t-0d-0d-O1.1"
                "-sseeeed.trace"))

        er.clean_trace_file()
        self.assertFalse(
            os.path.exists(
                "/tmp/tests/tmp/dest/edison-single-m1.0manifestSim.json"
                "-period-p5-0.0-single-t-0d-0d-O1.1"
                "-sseeeed.trace"))

        self.assertFalse(
            os.path.exists(
                "/tmp/tests/tmp/dest/edison-single-m1.0manifestSim.json"
                "-period-p5-0.0-single-t-0d-0d-O1.1"
                "-sseeeed.users"))
Ejemplo n.º 17
0
Env vars:
- ANALYSIS_DB_HOST: hostname of the system hosting the database.
- ANALYSIS_DB_NAME: database name to read from.
- ANALYSIS_DB_USER: user to be used to access the database.
- ANALYSIS_DB_PASS: password to be used to used to access the database.
- ANALYSIS_DB_PORT: port on which the database runs. 
"""
from orchestration import AnalysisWorker
from orchestration import get_central_db
from orchestration.running import ExperimentRunner
import sys
ExperimentRunner.configure(
    trace_folder="/home/gonzalo/cscs14038bscVIII",
    trace_generation_folder="tmp",
    local=False,
    run_user=None,
    scheduler_conf_dir="/home/gonzalo/cscs14038bscVIII/slurm_conf",
    local_conf_dir="configs/",
    scheduler_folder="/home/gonzalo/cscs14038bscVIII",
    manifest_folder="manifests")

trace_id = None
if len(sys.argv) >= 2:
    trace_id = sys.argv[1]

central_db_obj = get_central_db()

ew = AnalysisWorker()

ew.do_work_grouped_second_pass(central_db_obj, pre_trace_id=trace_id)
Ejemplo n.º 18
0
    def test_generate_trace_files_first_job(self):
        ExperimentRunner.configure("tmp/trace_folder",
                                   "tmp",
                                   True,
                                   "myhost",
                                   "myUser",
                                   drain_time=0)
        self.assertEqual(ExperimentRunner._trace_folder, "tmp/trace_folder")
        self.assertEqual(ExperimentRunner._trace_generation_folder, "tmp")
        self.assertEqual(ExperimentRunner._local, True)

        ed = ExperimentDefinition(seed="seeeed",
                                  machine="edison",
                                  trace_type="single",
                                  manifest_list=[{
                                      "share": 1.0,
                                      "manifest": "manifestSim.json"
                                  }],
                                  workflow_policy="period",
                                  workflow_period_s=5,
                                  workflow_handling="single",
                                  preload_time_s=20,
                                  start_date=datetime(2016, 1, 1),
                                  workload_duration_s=400,
                                  overload_target=3600000)

        er = ExperimentRunner(ed)
        er._generate_trace_files(ed)
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O3600000"
                           "-sseeeed.trace"))
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O3600000"
                           "-sseeeed.qos"))
        self.assertTrue(
            os.path.exists("tmp/edison-single-m1.0manifestSim.json"
                           "-period-p5-0.0-single-t-0d-0d-O3600000"
                           "-sseeeed.users"))
        records = trace_gen.extract_records(
            file_name="tmp/edison-single-m1.0manifestSim.json"
            "-period-p5-0.0-single-t-0d-0d-O3600000"
            "-sseeeed.trace",
            list_trace_location="../bin/list_trace")
        man_count = 0
        self.assertGreater(
            int(records[-1]["SUBMIT"]) - int(records[0]["SUBMIT"]), 320)
        self.assertLess(
            int(records[-1]["SUBMIT"]) - int(records[0]["SUBMIT"]),
            1500 + 3720)
        for rec in records:
            if rec["WF"].split("-")[0] == "manifestSim.json":
                man_count += 1
        self.assertGreaterEqual(
            man_count, 64, "There should be at least 80"
            " workflows in the "
            "trace, found: {0}".format(man_count))
        self.assertLessEqual(
            man_count, 104, "There should be at least 80"
            " workflows in the "
            "trace, found: {0}".format(man_count))
        first_submit = TimeController.get_epoch(datetime(2016, 1,
                                                         1)) - 20 - 3600 - 120
        for i in range(360):
            self.assertEqual(int(records[i]["NUM_TASKS"]), 16 * 24)
            self.assertEqual(int(records[i]["DURATION"]), 7320)
            self.assertEqual(int(records[i]["WCLIMIT"]), 123)
            self.assertEqual(int(records[i]["SUBMIT"]), first_submit)
            first_submit += 10
        self.assertGreaterEqual(
            int(records[360]["SUBMIT"]),
            TimeController.get_epoch(datetime(2016, 1, 1)) - 20)
        self.assertNotEqual(int(records[360]["DURATION"]), 3600)