Example #1
class ExperimentManager(abc.ABC):
    def __init__(
            self,
            project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
            tmp_dockex_path="/tmp/dockex",
            initial_job_num=None,
            experiment_name_prefix=None,
            sleep_seconds=0.5,
            save_project=False,
    ):

        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        else:
            self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path

        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")
        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])

        self.docker_client = docker.from_env()

        manager_ip_address = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        if self.initial_job_num is not None:
            self.job_num = self.initial_job_num
        else:
            self.job_num = self.redis_client.get("manager_job_num")

        self.sleep_seconds = sleep_seconds

        self.job_list = []

        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        self.experiment_name_prefix = experiment_name_prefix
        self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = (
            f"/tmp/dockex/data/{self.csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = (
            f"/tmp/dockex/data/{self.trials_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.extra_output_pathnames = []
        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None

        self.trial_dict = dict()
        self.trials_list = []

    def send_to_output_saver(self, extra_output_pathname):
        self.extra_output_pathnames.append(extra_output_pathname)

    def generate_job_name(self, module_name):
        job_num = self.job_num
        job_name = f"{module_name}_{str(self.job_num)}"
        self.job_num += 1
        return job_name, job_num

    def add_job(
            self,
            module_path,
            params=None,
            input_pathnames=None,
            skip_job=False,
            skip_input_pathnames=False,
            skip_output_pathnames=False,
            cpu_credits=1,
            gpu_credits=0,
            save_outputs=False,
            params_nested_update=False,
            trial_tag=None,
            save_trial=False
    ):

        if cpu_credits == 0 and gpu_credits == 0:
            raise ValueError("Either cpu_credits or gpu_credits must be > 0")

        if params is None:
            params = dict()

        if input_pathnames is None:
            input_pathnames = dict()

        module_name = pathlib.PurePath(module_path).name
        config_pathname = f"{self.project_path}/{module_path}/{module_name}.json"

        with open(config_pathname, "r") as fp:
            config = json.load(fp)

        job_name, job_num = self.generate_job_name(module_name)

        config["name"] = job_name
        config["job_num"] = job_num
        config["path"] = module_path
        config["module_name"] = module_name

        config["params_nested_update"] = params_nested_update

        if "params" in config.keys():
            if params_nested_update:
                config["params"] = update(copy.deepcopy(config["params"]), params)
            else:
                config["params"].update(params)

        else:
            config["params"] = params

        if "input_pathnames" in config.keys():
            config["input_pathnames"].update(input_pathnames)
        else:
            config["input_pathnames"] = input_pathnames

        config["skip_job"] = skip_job
        config["skip_input_pathnames"] = skip_input_pathnames
        config["skip_output_pathnames"] = skip_output_pathnames
        config["cpu_credits"] = cpu_credits
        config["gpu_credits"] = gpu_credits
        config["save_outputs"] = save_outputs

        # ExperimentWorker takes care of building containers before the wrapper is launched
        config["skip_docker_wrapper_build"] = True

        config["experiment_job"] = True

        for params_key in config["params"].keys():
            if config["params"][params_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required parameter "{params_key}" for job name "{job_name}"'
                )

        for input_pathname_key in config["input_pathnames"].keys():
            if config["input_pathnames"][input_pathname_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required input pathname "{input_pathname_key}" for job name "{job_name}"'
                )

        for output_pathname_key in config["output_pathnames"].keys():
            config["output_pathnames"][output_pathname_key] = (
                f"{module_name}/{job_name}{config['output_pathnames'][output_pathname_key]}"
            )

        if skip_job is False:
            self.job_list.append(copy.deepcopy(config))

        if trial_tag is not None:
            self.trial_dict[trial_tag] = copy.deepcopy(config)
        
        if save_trial is True:
            self.trials_list.append(copy.deepcopy(self.trial_dict))

        return config["output_pathnames"]

    def archive_project(self):
        self.redis_client.set("status", "ARCHIVING PROJECT")
        self.project_archive_filename = (
            f"project_{self.experiment_name}.zip"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        self.project_archive_pathname = (
            f"/tmp/dockex/data/{self.project_archive_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        shutil.make_archive(
            self.project_archive_pathname.replace(".zip", ""),
            "zip",
            "/tmp/dockex/project",
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.redis_client.set("project_archive_filename", self.project_archive_filename)

    def wait_for_jobs_to_end(self):
        keep_waiting = True
        while keep_waiting:
            time.sleep(self.sleep_seconds)

            num_complete_jobs = self.redis_client.get("num_complete_jobs")
            num_total_jobs = self.redis_client.get("num_total_jobs")

            print_progress(num_complete_jobs, num_total_jobs)

            if num_complete_jobs == num_total_jobs:
                keep_waiting = False

    def wait_for_save_outputs(self):
        # make sure output_saver flag is True
        self.redis_client.set("output_saver_working_flag", True)

        # send an experiment done message to output_saver
        # it should set flag to False once it processes this message
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        # wait for OutputSaver to finish its business
        while self.redis_client.get("output_saver_working_flag") is True:
            pass

    def wait_for_experiment_to_finish(self):
        print("WAITING FOR EXPERIMENT TO FINISH")
        self.redis_client.set("status", "WAITING FOR EXPERIMENT TO FINISH")

        # store the job csv in the experiment zip file
        self.redis_client.rpush("output_saver", self.csv_filename)

        # if a trials csv exists, store it in the experiment zip file
        if os.path.isfile(self.trials_csv_pathname):
            self.redis_client.rpush("output_saver", self.trials_csv_filename)

        # send extra outputs to output_saver
        for extra_output_pathname in self.extra_output_pathnames:
            self.redis_client.rpush("output_saver", extra_output_pathname)

        if self.save_project:
            self.redis_client.rpush("output_saver", self.project_archive_filename)

        self.wait_for_jobs_to_end()

        # generate a csv of all the finished jobs and add it to the zip
        post_job_list = [
            json.loads(b) for b in self.job_lookup_db.mget(self.job_lookup_db.keys("*"))
        ]
        post_csv_filename = f"post_{self.csv_filename}"
        post_csv_pathname = (
            f"/tmp/dockex/data/{post_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        pd.DataFrame(post_job_list).sort_values(by="job_num", ascending=True).set_index(
            "name"
        ).to_csv(post_csv_pathname)
        self.redis_client.rpush("output_saver", post_csv_filename)

        self.wait_for_save_outputs()

        os.remove(post_csv_pathname)
        os.remove(self.csv_pathname)
        os.remove(self.project_archive_pathname)

    def initialize_experiment_variables(self):
        # set the global job num for future experiments
        self.redis_client.set("manager_job_num", self.job_num)

        # flush experiment dbs
        self.dependency_lookup_db.flushdb()
        self.dependency_counts_db.flushdb()
        self.job_lookup_db.flushdb()

        # initialize the overall experiment job counts
        self.redis_client.set("num_total_jobs", 0)
        self.redis_client.set("num_pending_jobs", 0)
        self.redis_client.set("num_ready_jobs", 0)
        self.redis_client.set("num_running_jobs", 0)
        self.redis_client.set("num_complete_jobs", 0)
        self.redis_client.set("num_error_jobs", 0)

        self.redis_client.strict_redis.delete("unique_module_paths")

        unique_module_names = self.redis_client.get_list("unique_module_names")
        for unique_module_name in unique_module_names:
            stats_keys = get_module_stats_keys(unique_module_name)

            for key in stats_keys.values():
                self.redis_client.strict_redis.delete(key)
        self.redis_client.strict_redis.delete("unique_module_names")

        ready_jobs_list_key_dicts = self.redis_client.smembers(
            "ready_jobs_list_key_dicts"
        )
        for ready_jobs_list_key_dict in ready_jobs_list_key_dicts:
            self.redis_client.strict_redis.delete(
                ready_jobs_list_key_dict["ready_jobs_list_key"]
            )
        self.redis_client.strict_redis.delete("ready_jobs_list_key_dicts")

        self.redis_client.set("experiment_name", self.experiment_name)

        # reset output_saver just in case a zip was left open
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        self.redis_client.strict_redis.delete("error_jobs")

    def stage_jobs(self):
        print("STAGING JOBS")
        self.redis_client.set("status", "STAGING JOBS")

        unique_module_names = []
        unique_module_paths = []
        for job in self.job_list:
            input_pathnames = job["input_pathnames"]
            module_name = job["module_name"]
            module_path = job["path"]
            name = job["name"]
            skip_input_pathnames = job["skip_input_pathnames"]

            if module_path not in unique_module_paths:
                unique_module_paths.append(module_path)
                self.redis_client.rpush("unique_module_paths", module_path)

            ready_jobs_list_dict = OrderedDict(
                [
                    ("cpu_credits", job["cpu_credits"]),
                    ("gpu_credits", job["gpu_credits"]),
                ]
            )

            # register the ready_jobs list that corresponds to this job's credits
            ready_jobs_list_key = ready_jobs_dict_to_key(ready_jobs_list_dict)

            ready_jobs_list_dict["ready_jobs_list_key"] = ready_jobs_list_key

            # this is an OrderedDict to guarantee the resulting json string is always in the same order;
            # we're using a redis set here and don't want duplicate entries if dict keys appear in a different order
            self.redis_client.sadd("ready_jobs_list_key_dicts", ready_jobs_list_dict)

            stats_keys = get_module_stats_keys(module_name)

            if module_name not in unique_module_names:
                unique_module_names.append(module_name)
                self.redis_client.rpush("unique_module_names", module_name)

                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.set(stats_keys["num_total_jobs"], 1)
                self.redis_client.set(stats_keys["num_pending_jobs"], 0)
                self.redis_client.set(stats_keys["num_ready_jobs"], 0)
                self.redis_client.set(stats_keys["num_running_jobs"], 0)
                self.redis_client.set(stats_keys["num_complete_jobs"], 0)
                self.redis_client.set(stats_keys["num_error_jobs"], 0)

            else:
                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.strict_redis.incr(stats_keys["num_total_jobs"])

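            # count this job's unresolved input dependencies: a job with zero
            # dependencies goes straight onto a ready_jobs list below, otherwise
            # it waits in dependency_counts_db until its inputs are produced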
            num_input_pathnames = 0
            if len(input_pathnames.keys()) > 0:
                for input_pathname_key in input_pathnames.keys():
                    input_pathname = input_pathnames[input_pathname_key]

                    if input_pathname is not None:
                        if (
                                skip_input_pathnames is False
                                or skip_input_pathnames is None
                        ):
                            self.dependency_lookup_db.sadd(input_pathname, name)
                            num_input_pathnames += 1

                        elif skip_input_pathnames is True:
                            pass

                        elif type(skip_input_pathnames) is list:
                            if input_pathname_key in skip_input_pathnames:
                                pass

                            else:
                                self.dependency_lookup_db.sadd(input_pathname, name)
                                num_input_pathnames += 1

            if num_input_pathnames > 0:
                self.dependency_counts_db.set(name, num_input_pathnames)
                self.redis_client.strict_redis.incr(stats_keys["num_pending_jobs"])
                self.redis_client.strict_redis.incr("num_pending_jobs")

            else:
                self.redis_client.rpush(ready_jobs_list_key, job)
                self.redis_client.strict_redis.incr(stats_keys["num_ready_jobs"])
                self.redis_client.strict_redis.incr("num_ready_jobs")

            self.redis_client.strict_redis.incr("num_total_jobs")

            # register the job on the backend
            self.job_lookup_db.set(name, json.dumps(job))

    def set_manager_flag(self):
        print("SETTING MANAGER FLAG")
        self.redis_client.set("status", "SETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", True)

    def unset_manager_flag(self):
        print("UNSETTING MANAGER FLAG")
        self.redis_client.set("status", "UNSETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", False)

    def generate_job_csv(self):
        print("GENERATING JOB CSV")
        pd.DataFrame(self.job_list).to_csv(self.csv_pathname)

    def generate_trial_csv(self):
        print("GENERATING TRIALS CSV")
        if len(self.trials_list) > 0:
            pd.DataFrame(self.trials_list).to_csv(self.trials_csv_pathname)

    def copy_project(self):
        print("COPYING PROJECT")
        self.redis_client.set("status", "COPYING PROJECT DIRECTORY")
        tmp_project_path = f"{self.tmp_dockex_path}/project"
        empty_make_directory(tmp_project_path)
        copy_tree(self.project_path, tmp_project_path)
        os.system(f"chown -R nonroot:nonroot {tmp_project_path}")

    def acquire_prevent_experiment_overlap_flag(self):
        print("ACQUIRING PREVENT EXPERIMENT OVERLAP FLAG")
        if self.redis_client.get("prevent_experiment_overlap_flag") is True:
            print("WAITING FOR PREVIOUS LOCAL EXPERIMENT TO FINISH")
            while self.redis_client.get("prevent_experiment_overlap_flag") is True:
                pass

        self.redis_client.set("prevent_experiment_overlap_flag", True)

        # TODO: also check and wait for remote machines to prevent overlapping experiments

    def release_prevent_experiment_overlap_flag(self):
        print("RELEASING PREVENT EXPERIMENT OVERLAP FLAG")
        self.redis_client.set("prevent_experiment_overlap_flag", False)

    def run(self, print_build_logs=False):
        print("RUNNING EXPERIMENT")
        self.redis_client.set("status", "RUNNING EXPERIMENT")
        self.generate_job_csv()
        self.generate_trial_csv()

        self.acquire_prevent_experiment_overlap_flag()

        start = time.time()

        try:
            self.initialize_experiment_variables()
            self.copy_project()
            self.stage_jobs()

            build_project_modules(
                self.docker_client,
                self.redis_client.get_list("unique_module_paths"),
                print_build_logs=print_build_logs,
                redis_client=self.redis_client,
            )

            self.archive_project()
            self.set_manager_flag()

            self.redis_client.set("status", "RUNNING EXPERIMENT")

            self.wait_for_experiment_to_finish()
            self.unset_manager_flag()

        except BaseException:  # run cleanup even on KeyboardInterrupt, then re-raise
            self.wait_for_save_outputs()
            self.release_prevent_experiment_overlap_flag()
            self.unset_manager_flag()
            self.redis_client.set("status", "EXPERIMENT FAILED")
            raise

        end = time.time()

        self.release_prevent_experiment_overlap_flag()
        self.redis_client.set("status", "EXPERIMENT COMPLETE")

        print(f"EXPERIMENT EXECUTION TIME: {round((end - start), 2)} seconds")
Example #2
class DockerWrapper:
    def __init__(self, input_args):
        super().__init__()

        self.json_pathname = input_args[1]
        self.redis_address = input_args[2]
        self.redis_client = DockexRedisClient(self.redis_address)

        self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")

        self.docker_client = docker.from_env()

        self.job_config = read_job_config(self.json_pathname)

        self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

        if "image_tag" in self.job_config.keys():
            self.image_tag = self.job_config["image_tag"]
        else:
            self.image_tag = module_path_to_image_tag(self.job_config["path"])

        self.command_args = self.generate_command_args()
        self.volumes = self.generate_volumes()
        self.network_mode = "host"

        self.environment = None
        if "include_json_pathname_env_variable" in self.job_config.keys():
            if self.job_config["include_json_pathname_env_variable"]:
                self.environment = {"JSON_PATHNAME": self.json_pathname}

        self.skip_build = False
        if "skip_docker_wrapper_build" in self.job_config.keys():
            if self.job_config["skip_docker_wrapper_build"] is True:
                self.skip_build = True

        # build path depends on whether the module lives in core/ or relative to /tmp/dockex/project
        if self.job_config["path"].startswith("core/"):
            self.build_path = "."
        else:
            self.build_path = "/tmp/dockex/project"

        if "experiment_job" in self.job_config.keys():
            self.experiment_job = self.job_config["experiment_job"]
        else:
            self.experiment_job = False

        if self.experiment_job is True:
            self.detach = False
        else:
            self.detach = True

        self.build_kwargs_dict = dict(
            path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
        )

        self.run_kwargs_dict = dict(
            image=self.image_tag,
            name=self.job_config["name"],
            command=self.command_args,
            detach=self.detach,
            network_mode=self.network_mode,
            volumes=self.volumes,
            environment=self.environment,
        )

        # check global gpus enable
        if self.redis_client.get("enable_gpus") is True:
            self.run_kwargs_dict["enable_gpus"] = True
        else:
            self.run_kwargs_dict["enable_gpus"] = False

        # allow module to override global gpus enable
        if "enable_gpus" in self.job_config.keys():
            if self.job_config["enable_gpus"] is True:
                self.run_kwargs_dict["enable_gpus"] = True
            else:
                self.run_kwargs_dict["enable_gpus"] = False

        self.good_to_launch = None
        self.experiment_manager_address = None
        self.experiment_manager = None
        self.dependency_lookup_db = None
        self.job_lookup_db = None
        self.stats_keys = None
        self.container_data_prefix = "/tmp/dockex/data/"

        self.sleep_seconds = 0.25

    def generate_command_args(self):
        command_args = f"{self.json_pathname}"

        if "omit_json_pathname_arg" in self.job_config.keys():
            if self.job_config["omit_json_pathname_arg"]:
                command_args = ""

        if "pass_redis_address_arg" in self.job_config.keys():
            if self.job_config["pass_redis_address_arg"]:
                if command_args == "":
                    command_args = f"{self.redis_address}"
                else:
                    command_args = f"{command_args} {self.redis_address}"

        if "command_args" in self.job_config.keys():
            if command_args == "":
                command_args = f"{self.job_config['command_args']}"
            else:
                command_args = f"{command_args} {self.job_config['command_args']}"

        return command_args

    def generate_volumes(self):
        volumes = {self.tmp_dockex_path: {"bind": "/tmp/dockex", "mode": "rw"}}

        if "bind_mount_docker_socket" in self.job_config.keys():
            if self.job_config["bind_mount_docker_socket"]:
                volumes["/var/run/docker.sock"] = {
                    "bind": "/var/run/docker.sock",
                    "mode": "rw",
                }

        if "volumes" in self.job_config.keys():
            for volume_key in self.job_config["volumes"].keys():
                volumes[volume_key] = {
                    "bind": self.job_config["volumes"][volume_key],
                    "mode": "rw",
                }

        return volumes

    def connect_to_experiment_manager(self):
        print("GETTING MANAGER REDIS ADDRESS")
        keep_trying = True
        while keep_trying:
            self.experiment_manager_address = self.redis_client.get(
                "manager_redis_address"
            )

            if self.experiment_manager_address is not None:
                keep_trying = False
                print("FOUND MANAGER REDIS ADDRESS")
            else:
                print("NO MANAGER FOUND, TRYING AGAIN")
                time.sleep(self.sleep_seconds)

        print("CONNECTING TO EXPERIMENT MANAGER")
        self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

        experiment_manager_ip_address = self.experiment_manager.get("ip_address")
        experiment_manager_port = self.experiment_manager.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=1
        )
        self.job_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=3
        )

    def prepare_input_pathnames(self):
        input_pathnames = self.job_config["input_pathnames"]

        if len(input_pathnames.keys()) > 0:
            # loop through the input pathnames; ftp_find_file keeps retrying the
            # ftp connection (in case workers take a while to spin up)
            for input_pathname_key in input_pathnames.keys():
                input_pathname = input_pathnames[input_pathname_key]

                if input_pathname is not None:
                    local_input_pathname = (
                        f"{self.container_data_prefix}{input_pathname}"
                    )

                    # if the file doesn't exist, go find it
                    print("CHECKING FOR FILE: " + local_input_pathname)
                    if not os.path.isfile(local_input_pathname):
                        print("GOING TO LOOK FOR FILE")
                        ftp_find_file(
                            self.experiment_manager.get_list("dockex_machines"),
                            self.redis_client.get("ip_address"),
                            f"data/{input_pathname}",
                            local_input_pathname,
                        )

                    # update input_pathnames with local path
                    input_pathnames[input_pathname_key] = local_input_pathname

            # assign local input pathnames to job config for job
            self.job_config["input_pathnames"] = input_pathnames

        # check that all input pathnames exist
        if len(self.job_config["input_pathnames"].values()) > 0:
            check_pathnames = [
                os.path.isfile(check_pathname)
                for check_pathname in self.job_config["input_pathnames"].values()
                if check_pathname is not None
            ]
            self.good_to_launch = all(check_pathnames)
        else:
            self.good_to_launch = True

    def prepare_output_pathnames(self):
        output_pathnames = self.job_config["output_pathnames"]
        if len(output_pathnames.keys()) > 0:
            for output_pathname_key in output_pathnames.keys():
                output_pathname = output_pathnames[output_pathname_key]
                if output_pathname is not None:
                    local_output_pathname = (
                        f"{self.container_data_prefix}{output_pathname}"
                    )

                    # if the file is inside a directory, make sure that directory exists
                    local_output_path = os.path.split(local_output_pathname)[0]
                    if local_output_path != "":
                        check_make_directory(local_output_path)

                        os.system(f"chown -R nonroot:nonroot {local_output_path}")

                    output_pathnames[output_pathname_key] = local_output_pathname

            self.job_config["output_pathnames"] = output_pathnames

    def launch_experiment_job(self):
        print("GOOD TO LAUNCH")
        # overwrite json file with local input/output pathnames
        write_job_config(self.json_pathname, self.job_config)

        # update the pending/ready/running counts for the experiment and this job's module
        # use a backend pipeline so the update is atomic
        # this is a job going from ready to running
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_running_jobs")
        update_pipeline.incr(self.stats_keys["num_running_jobs"])
        update_pipeline.execute()

        start_time = datetime.datetime.now()

        # launch the job
        try:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )

        except Exception as e:
            print("EXCEPTION WHILE RUNNING CONTAINER")
            print(e)

        end_time = datetime.datetime.now()

        self.job_config["start_time"] = str(start_time)
        self.job_config["end_time"] = str(end_time)
        self.job_config["execution_time"] = str(end_time - start_time)

        print("GOOD LAUNCH")

    def cleanup_job(self):
        # release the credits
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        skip_output_pathnames = self.job_config["skip_output_pathnames"]
        if type(skip_output_pathnames) is not list:
            if skip_output_pathnames is True:
                # skip existence checks for all of this job's output pathnames
                skip_output_pathnames = list(self.job_config["output_pathnames"].keys())
            else:
                skip_output_pathnames = []

        # check that the job's output_pathnames exist
        successful_job = True
        for local_output_pathname_key in self.job_config["output_pathnames"].keys():
            # local output_pathname contains the container_data_prefix
            local_output_pathname = self.job_config["output_pathnames"][
                local_output_pathname_key
            ]

            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )

            # if the output_pathname doesn't exist and we're not skipping that output_pathname, an error occurred
            if not os.path.isfile(local_output_pathname):
                if local_output_pathname_key not in skip_output_pathnames:
                    # set the flag
                    successful_job = False

            # if the file does exist, save the output if requested
            # NOTE: it's important to push to output_saver before updating num_complete_jobs,
            # NOTE: because ExperimentManager relies on it to determine when the experiment has ended
            else:
                if self.job_config["save_outputs"]:
                    self.experiment_manager.rpush("output_saver", output_pathname)

            self.check_dependencies(output_pathname)

        # update the progress counts on the ExperimentManager
        # this is a job going from running to complete
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_running_jobs")
        update_pipeline.decr(self.stats_keys["num_running_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

        if successful_job:
            self.job_config["status"] = "SUCCESS"

        else:
            self.job_config["status"] = "ERROR"

            # push to error_jobs list
            self.experiment_manager.rpush("error_jobs", self.job_config)

            # update progress counts
            update_pipeline = self.experiment_manager.strict_redis.pipeline()
            update_pipeline.incr("num_error_jobs")
            update_pipeline.incr(self.stats_keys["num_error_jobs"])
            update_pipeline.execute()

        job_config_json = json.dumps(self.job_config)

        # write job dict with status to backend
        self.job_lookup_db.set(self.job_config["name"], job_config_json)

    def check_dependencies(self, output_pathname):

        # get the job keys that depend on this output_pathname
        print("OUTPUT_PATHNAME: " + output_pathname)
        dependent_job_names = [
            b.decode("utf-8")
            for b in self.dependency_lookup_db.smembers(output_pathname)
        ]
        print("DEPENDENCY NAMES: " + str(dependent_job_names))

        for dependent_job_name in dependent_job_names:
            print("PROCESSING DEPENDENCY: " + dependent_job_name)
            self.experiment_manager.rpush("decrement_dependency", dependent_job_name)

    def failure_to_launch(self):
        # report error
        print("BAD LAUNCH")
        self.job_config["status"] = "ERROR"
        print(self.job_config)

        # ExperimentWorker checked out cpu/gpu credits before launching this wrapper;
        # since the job errored, check the credits back in
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        # push to error_jobs list
        self.experiment_manager.rpush("error_jobs", self.job_config)
        self.job_lookup_db.set(self.job_config["name"], json.dumps(self.job_config))

        # propagate error for dependent jobs
        for local_output_pathname in self.job_config["output_pathnames"].values():
            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )
            self.check_dependencies(output_pathname)

        # update progress counts
        # this is a job going from ready to error/complete
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_error_jobs")
        update_pipeline.incr(self.stats_keys["num_error_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

    def run(self):
        print(self.job_config)

        print("build kwargs:")
        print(self.build_kwargs_dict)

        print("run kwargs")
        print(self.run_kwargs_dict)

        if self.experiment_job is not True:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )

        else:
            print("RUNNING EXPERIMENT JOB")
            self.connect_to_experiment_manager()
            self.prepare_input_pathnames()
            self.prepare_output_pathnames()

            self.stats_keys = get_module_stats_keys(self.job_config["module_name"])

            if self.good_to_launch:
                self.launch_experiment_job()
                self.cleanup_job()
            else:
                self.failure_to_launch()

            # make sure there aren't any lingering root permission files
            os.system(f"chown -R nonroot:nonroot {self.container_data_prefix}")

            print("SUCCESS")
Example #3
class ExperimentWorker(PythonJobWithBackend):
    def __init__(self,
                 input_args,
                 checking_manager_sleep_seconds=0.5,
                 working_sleep_seconds=0.25):
        super().__init__(input_args)

        self.checking_manager_sleep_seconds = checking_manager_sleep_seconds
        self.working_sleep_seconds = working_sleep_seconds

        self.docker_client = docker.from_env()

        self.experiment_manager = None
        self.experiment_manager_dict = None

    def run_job(self):
        while True:
            # check if we're connected to a manager
            # if we're NOT connected to a manager
            if self.experiment_manager is None:
                # check if there are any managers available
                dockex_machines_df = pd.DataFrame(
                    self.redis_client.get_list("dockex_machines"))

                if len(dockex_machines_df) > 0:
                    manager_machines_df = dockex_machines_df.loc[
                        dockex_machines_df.manager_flag == True]

                    if len(manager_machines_df) > 0:
                        # if so, connect to the manager
                        self.experiment_manager_dict = manager_machines_df.iloc[
                            0].to_dict()
                        self.experiment_manager = DockexRedisClient(
                            self.experiment_manager_dict["redis_address"])
                        self.redis_client.set(
                            "manager_redis_address",
                            self.experiment_manager_dict["redis_address"],
                        )

                        # if the manager is not the local manager
                        if (self.experiment_manager_dict["redis_address"] !=
                                self.redis_address):
                            # empty the project and data directories
                            empty_directory("/tmp/dockex/project")
                            empty_directory("/tmp/dockex/data")

                            # need to copy project archive, unarchive it, and build module images
                            project_archive_filename = self.experiment_manager.get(
                                "project_archive_filename")
                            local_project_archive_filename = (
                                f"/tmp/dockex/data/{project_archive_filename}")

                            found_project_archive = ftp_find_file(
                                self.experiment_manager.get_list(
                                    "dockex_machines"),
                                self.redis_client.get("ip_address"),
                                f"data/{project_archive_filename}",
                                local_project_archive_filename,
                            )

                            if found_project_archive:
                                with zipfile.ZipFile(
                                        local_project_archive_filename,
                                        "r") as zip_file:
                                    zip_file.extractall("/tmp/dockex/project")

                                # build the module images
                                experiment_module_paths = self.experiment_manager.get_list(
                                    "unique_module_paths")
                                # TODO: need a way to signal to the experiment that a build failed
                                # TODO: maybe a flag on manager that the experiment continually checks
                                # TODO: or maybe manager needs to test build before setting manager flag?
                                # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                                # TODO: maybe a worker should track bad experiment names
                                self.redis_client.set(
                                    "status", "BUILDING PROJECT MODULES")
                                build_project_modules(self.docker_client,
                                                      experiment_module_paths)

                            else:
                                self.experiment_manager_dict = None
                                self.experiment_manager = None
                                self.redis_client.strict_redis.delete(
                                    "manager_redis_address")

                    else:
                        time.sleep(self.checking_manager_sleep_seconds)
                else:
                    time.sleep(self.checking_manager_sleep_seconds)

            # if we are connected to a manager
            else:
                # check if the manager is still a manager
                # if it is NOT still a manager
                if self.experiment_manager.get("manager_flag") is not True:
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete(
                        "manager_redis_address")

                # if it is still a manager
                else:
                    # check that the experiment name is the same
                    # if it is NOT the same, a new experiment has started
                    if (self.experiment_manager.get("experiment_name") !=
                            self.experiment_manager_dict["experiment_name"]):
                        # disconnect from the manager
                        self.experiment_manager_dict = None
                        self.experiment_manager = None
                        self.redis_client.strict_redis.delete(
                            "manager_redis_address")

                    # if the experiment name is the same
                    else:
                        # see if we can pull any work to do
                        # get the list of ready_jobs lists
                        ready_jobs_df = pd.DataFrame(
                            self.experiment_manager.smembers(
                                "ready_jobs_list_key_dicts"))

                        if len(ready_jobs_df) > 0:
                            # start with the jobs requiring the most credits
                            ready_jobs_df = ready_jobs_df.sort_values(
                                by=["gpu_credits", "cpu_credits"],
                                ascending=False)

                            num_open_cpu_credits = self.redis_client.get(
                                "cpu_credits_total") - self.redis_client.get(
                                    "cpu_credits_used")
                            num_open_gpu_credits = self.redis_client.get(
                                "gpu_credits_total") - self.redis_client.get(
                                    "gpu_credits_used")

                            if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                                for ready_jobs_df_ind in ready_jobs_df.index:
                                    num_open_cpu_credits = self.redis_client.get(
                                        "cpu_credits_total"
                                    ) - self.redis_client.get(
                                        "cpu_credits_used")
                                    num_open_gpu_credits = self.redis_client.get(
                                        "gpu_credits_total"
                                    ) - self.redis_client.get(
                                        "gpu_credits_used")

                                    required_cpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "cpu_credits"])
                                    required_gpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "gpu_credits"])
                                    ready_jobs_key = ready_jobs_df.loc[
                                        ready_jobs_df_ind,
                                        "ready_jobs_list_key"]

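                                    # how many copies of this job fit in the
                                    # open credits; take the min across the
                                    # resource types the job actually requires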
                                    slots_min_list = []
                                    if required_cpu_credits > 0:
                                        num_open_cpu_slots = int(
                                            np.floor(num_open_cpu_credits /
                                                     required_cpu_credits))
                                        slots_min_list.append(
                                            num_open_cpu_slots)

                                    if required_gpu_credits > 0:
                                        num_open_gpu_slots = int(
                                            np.floor(num_open_gpu_credits /
                                                     required_gpu_credits))
                                        slots_min_list.append(
                                            num_open_gpu_slots)

                                    num_open_slots = int(
                                        np.min(slots_min_list))

                                    if num_open_slots > 0:
                                        p = (self.experiment_manager.
                                             strict_redis.pipeline())
                                        p.lrange(
                                            ready_jobs_key, 0,
                                            (num_open_slots -
                                             1))  # lrange is inclusive, so - 1
                                        p.ltrim(ready_jobs_key, num_open_slots,
                                                -1)
                                        pop_job_dicts, _ = p.execute()

                                        if len(pop_job_dicts) > 0:
                                            for pop_job_dict in pop_job_dicts:
                                                pop_job_dict = json.loads(
                                                    pop_job_dict)
                                                print(pop_job_dict)

                                                # checkout the credits
                                                self.redis_client.strict_redis.incrby(
                                                    "cpu_credits_used",
                                                    required_cpu_credits,
                                                )
                                                self.redis_client.strict_redis.incrby(
                                                    "gpu_credits_used",
                                                    required_gpu_credits,
                                                )

                                                self.redis_client.redis_launch_job(
                                                    f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                    pop_job_dict,
                                                )

                        time.sleep(self.working_sleep_seconds)
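
A minimal launch sketch, assuming PythonJobWithBackend accepts the raw argv list the way DockerWrapper does:

import sys

if __name__ == "__main__":
    # the worker polls for a manager, pulls ready jobs whose credit
    # requirements fit this machine's open cpu/gpu credits, and launches them
    worker = ExperimentWorker(sys.argv, working_sleep_seconds=0.25)
    worker.run_job()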