Code Example #1
    def connect_to_experiment_manager(self):
        print("GETTING MANAGER REDIS ADDRESS")
        keep_trying = True
        while keep_trying:
            self.experiment_manager_address = self.redis_client.get(
                "manager_redis_address"
            )

            if self.experiment_manager_address is not None:
                keep_trying = False
                print("FOUND MANAGER REDIS ADDRESS")
            else:
                print("NO MANAGER FOUND, TRYING AGAIN")
                time.sleep(self.sleep_seconds)

        print("CONNECTING TO EXPERIMENT MANAGER")
        self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

        experiment_manager_ip_address = self.experiment_manager.get("ip_address")
        experiment_manager_port = self.experiment_manager.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=1
        )
        self.job_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=3
        )
Code Example #2
    def pop_callback(self, credits_update):
        if credits_update["mode"] == "incr":
            DockexRedisClient(credits_update["redis_address"]).strict_redis.incr(
                f"{credits_update['type']}_credits_total"
            )
        elif credits_update["mode"] == "decr":
            DockexRedisClient(credits_update["redis_address"]).strict_redis.decr(
                f"{credits_update['type']}_credits_total"
            )

        elif credits_update["mode"] == "set":
            DockexRedisClient(credits_update["redis_address"]).strict_redis.set(
                f"{credits_update['type']}_credits_total", credits_update["value"]
            )
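For reference, a minimal sketch of the update messages this callback consumes. The keys match those read above; the values are illustrative assumptions, since the producer of these dicts is not shown in this example.

# Hypothetical credits_update payloads (values are assumptions for illustration)
increment_update = {
    "mode": "incr",                            # one of "incr", "decr", "set"
    "type": "cpu",                             # credit type, e.g. "cpu" or "gpu"
    "redis_address": "http://127.0.0.1:6379",  # target machine's Dockex Redis
}

set_update = {
    "mode": "set",
    "type": "gpu",
    "redis_address": "http://127.0.0.1:6379",
    "value": 2,                                # only the "set" branch reads "value"
}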
Code Example #3
    def run_job(self):
        while True:
            try:
                discovered_machine_ips = self.redis_client.get_list(
                    "machines_on_network")

                if len(discovered_machine_ips) > 0:
                    for machine_ip_address in discovered_machine_ips:
                        # this assumes all cluster machines use the same port for Redis
                        check_redis_address = (
                            f"http://{machine_ip_address}:{self.redis_port}")

                        try:
                            if (DockexRedisClient(check_redis_address).get(
                                    "dockex_backend") is True):
                                self.redis_client.sadd(
                                    "dockex_redis_addresses",
                                    check_redis_address)

                        except (redis.exceptions.ConnectionError, TypeError):
                            self.redis_client.srem("dockex_redis_addresses",
                                                   check_redis_address)

                # machine discovery won't pick up the local machine when it uses 127.0.0.1,
                # so always make sure the local machine gets registered
                else:
                    self.redis_client.sadd("dockex_redis_addresses",
                                           self.redis_address)

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
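The dockex_backend key probed above is the flag that launch_redis (Code Example #4) sets right after flushing the database. A quick way to check a single address by hand, using DockexRedisClient exactly as run_job does (the address below is an illustrative assumption):

probe_address = "http://192.168.1.10:6379"  # hypothetical cluster machine
is_dockex_backend = DockexRedisClient(probe_address).get("dockex_backend") is True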
Code Example #4
    def launch_redis(self):
        print("BUILDING AND RUNNING REDIS")

        build_image_run_container(
            self.docker_client,
            dict(
                path=".",
                dockerfile="core/services/backend/dockex_redis/Dockerfile",
                tag="dockex_redis_image",
            ),
            dict(
                image="dockex_redis_image",
                name="dockex_redis",
                detach=True,
                network_mode="host",
                volumes={
                    self.config["tmp_dockex_path"]: {
                        "bind": "/tmp/dockex",
                        "mode": "rw",
                    }
                },
            ),
            print_build_logs=True,
        )

        # connect to redis and flush
        self.redis_client = DockexRedisClient(self.config["redis_address"])

        trying_to_connect = True
        while trying_to_connect:
            try:
                self.redis_client.flushdb()
                trying_to_connect = False

            except redis.exceptions.ConnectionError:
                pass

        # fill redis with dockex config values
        for key in self.config.keys():
            self.redis_client.set(key, self.config[key])

        # mark the redis instance as a dockex backend
        self.redis_client.set("dockex_backend", True)

        self.redis_client.set("status", "LAUNCHED REDIS")
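The loop above simply retries flushdb until the freshly launched container accepts connections. The same wait, written as a standalone sketch against plain redis-py (wait_for_redis is not part of Dockex; it only illustrates the pattern, with a sleep between attempts):

import time

import redis


def wait_for_redis(strict_redis, interval_seconds=0.25):
    # block until the Redis server answers PING
    while True:
        try:
            strict_redis.ping()
            return
        except redis.exceptions.ConnectionError:
            time.sleep(interval_seconds)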
Code Example #5
    def run_job(self):
        while True:
            try:
                dockex_redis_addresses = self.redis_client.smembers(
                    "dockex_redis_addresses"
                )

                p = self.redis_client.strict_redis.pipeline()
                p.delete("dockex_machines")

                for dockex_redis_address in dockex_redis_addresses:
                    try:
                        temp_client = DockexRedisClient(dockex_redis_address)

                        dockex_status_dict = dict(
                            machine_name=temp_client.get("machine_name"),
                            redis_address=dockex_redis_address,
                            manager_flag=temp_client.get("manager_flag"),
                            experiment_name=temp_client.get("experiment_name"),
                            ip_address=temp_client.get("ip_address"),
                            tmp_dockex_ftpd_port=temp_client.get(
                                "tmp_dockex_ftpd_port"
                            ),
                            tmp_dockex_ftpd_password=temp_client.get(
                                "tmp_dockex_ftpd_password"
                            ),
                        )

                        p.rpush("dockex_machines", json.dumps(dockex_status_dict))

                    except redis.exceptions.ConnectionError:
                        pass

                p.execute()

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
Code Example #6
    def __init__(self, input_args):
        super().__init__()

        self.json_pathname = input_args[1]
        self.redis_address = input_args[2]
        self.redis_client = DockexRedisClient(self.redis_address)

        self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")

        self.docker_client = docker.from_env()

        self.job_config = read_job_config(self.json_pathname)

        self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

        if "image_tag" in self.job_config.keys():
            self.image_tag = self.job_config["image_tag"]
        else:
            self.image_tag = module_path_to_image_tag(self.job_config["path"])

        self.command_args = self.generate_command_args()
        self.volumes = self.generate_volumes()
        self.network_mode = "host"

        self.environment = None
        if "include_json_pathname_env_variable" in self.job_config.keys():
            if self.job_config["include_json_pathname_env_variable"]:
                self.environment = {"JSON_PATHNAME": self.json_pathname}

        self.skip_build = False
        if "skip_docker_wrapper_build" in self.job_config.keys():
            if self.job_config["skip_docker_wrapper_build"] is True:
                self.skip_build = True

        # build path depends on whether the path is in core or relative to /tmp/dockex/project
        if self.job_config["path"].startswith("core/"):
            self.build_path = "."
        else:
            self.build_path = "/tmp/dockex/project"

        if "experiment_job" in self.job_config.keys():
            self.experiment_job = self.job_config["experiment_job"]
        else:
            self.experiment_job = False

        if self.experiment_job is True:
            self.detach = False
        else:
            self.detach = True

        self.build_kwargs_dict = dict(
            path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
        )

        self.run_kwargs_dict = dict(
            image=self.image_tag,
            name=self.job_config["name"],
            command=self.command_args,
            detach=self.detach,
            network_mode=self.network_mode,
            volumes=self.volumes,
            environment=self.environment,
        )

        # check global gpus enable
        if self.redis_client.get("enable_gpus") is True:
            self.run_kwargs_dict["enable_gpus"] = True
        else:
            self.run_kwargs_dict["enable_gpus"] = False

        # allow module to override global gpus enable
        if "enable_gpus" in self.job_config.keys():
            if self.job_config["enable_gpus"] is True:
                self.run_kwargs_dict["enable_gpus"] = True
            else:
                self.run_kwargs_dict["enable_gpus"] = False

        self.good_to_launch = None
        self.experiment_manager_address = None
        self.experiment_manager = None
        self.dependency_lookup_db = None
        self.job_lookup_db = None
        self.stats_keys = None
        self.container_data_prefix = "/tmp/dockex/data/"

        self.sleep_seconds = 0.25
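This constructor takes its two positional arguments from input_args, so the wrapper is evidently launched with a job JSON pathname followed by a Redis address. A minimal sketch of such an entry point, assuming the class is DockerWrapper as shown in Code Example #7 (the __main__ wiring itself is not part of this listing):

import sys

if __name__ == "__main__":
    # sys.argv[1]: pathname of the job's JSON config
    # sys.argv[2]: address of the local Dockex Redis, e.g. "http://127.0.0.1:6379"
    DockerWrapper(sys.argv).run()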
Code Example #7
class DockerWrapper:
    def __init__(self, input_args):
        super().__init__()

        self.json_pathname = input_args[1]
        self.redis_address = input_args[2]
        self.redis_client = DockexRedisClient(self.redis_address)

        self.tmp_dockex_path = self.redis_client.get("tmp_dockex_path")

        self.docker_client = docker.from_env()

        self.job_config = read_job_config(self.json_pathname)

        self.dockerfile_path = f"{self.job_config['path']}/Dockerfile"

        if "image_tag" in self.job_config.keys():
            self.image_tag = self.job_config["image_tag"]
        else:
            self.image_tag = module_path_to_image_tag(self.job_config["path"])

        self.command_args = self.generate_command_args()
        self.volumes = self.generate_volumes()
        self.network_mode = "host"

        self.environment = None
        if "include_json_pathname_env_variable" in self.job_config.keys():
            if self.job_config["include_json_pathname_env_variable"]:
                self.environment = {"JSON_PATHNAME": self.json_pathname}

        self.skip_build = False
        if "skip_docker_wrapper_build" in self.job_config.keys():
            if self.job_config["skip_docker_wrapper_build"] is True:
                self.skip_build = True

        # build path depends on whether the path is in core or relative to /tmp/dockex/project
        if self.job_config["path"].startswith("core/"):
            self.build_path = "."
        else:
            self.build_path = "/tmp/dockex/project"

        if "experiment_job" in self.job_config.keys():
            self.experiment_job = self.job_config["experiment_job"]
        else:
            self.experiment_job = False

        if self.experiment_job is True:
            self.detach = False
        else:
            self.detach = True

        self.build_kwargs_dict = dict(
            path=self.build_path, dockerfile=self.dockerfile_path, tag=self.image_tag
        )

        self.run_kwargs_dict = dict(
            image=self.image_tag,
            name=self.job_config["name"],
            command=self.command_args,
            detach=self.detach,
            network_mode=self.network_mode,
            volumes=self.volumes,
            environment=self.environment,
        )

        # check global gpus enable
        if self.redis_client.get("enable_gpus") is True:
            self.run_kwargs_dict["enable_gpus"] = True
        else:
            self.run_kwargs_dict["enable_gpus"] = False

        # allow module to override global gpus enable
        if "enable_gpus" in self.job_config.keys():
            if self.job_config["enable_gpus"] is True:
                self.run_kwargs_dict["enable_gpus"] = True
            else:
                self.run_kwargs_dict["enable_gpus"] = False

        self.good_to_launch = None
        self.experiment_manager_address = None
        self.experiment_manager = None
        self.dependency_lookup_db = None
        self.job_lookup_db = None
        self.stats_keys = None
        self.container_data_prefix = "/tmp/dockex/data/"

        self.sleep_seconds = 0.25

    def generate_command_args(self):
        command_args = f"{self.json_pathname}"

        if "omit_json_pathname_arg" in self.job_config.keys():
            if self.job_config["omit_json_pathname_arg"]:
                command_args = ""

        if "pass_redis_address_arg" in self.job_config.keys():
            if self.job_config["pass_redis_address_arg"]:
                if command_args == "":
                    command_args = f"{self.redis_address}"
                else:
                    command_args = f"{command_args} {self.redis_address}"

        if "command_args" in self.job_config.keys():
            if command_args == "":
                command_args = f"{self.job_config['command_args']}"
            else:
                command_args = f"{command_args} {self.job_config['command_args']}"

        return command_args

    def generate_volumes(self):
        volumes = {self.tmp_dockex_path: {"bind": "/tmp/dockex", "mode": "rw"}}

        if "bind_mount_docker_socket" in self.job_config.keys():
            if self.job_config["bind_mount_docker_socket"]:
                volumes["/var/run/docker.sock"] = {
                    "bind": "/var/run/docker.sock",
                    "mode": "rw",
                }

        if "volumes" in self.job_config.keys():
            for volume_key in self.job_config["volumes"].keys():
                volumes[volume_key] = {
                    "bind": self.job_config["volumes"][volume_key],
                    "mode": "rw",
                }

        return volumes

    def connect_to_experiment_manager(self):
        print("GETTING MANAGER REDIS ADDRESS")
        keep_trying = True
        while keep_trying:
            self.experiment_manager_address = self.redis_client.get(
                "manager_redis_address"
            )

            if self.experiment_manager_address is not None:
                keep_trying = False
                print("FOUND MANAGER REDIS ADDRESS")
            else:
                print("NO MANAGER FOUND, TRYING AGAIN")
                time.sleep(self.sleep_seconds)

        print("CONNECTING TO EXPERIMENT MANAGER")
        self.experiment_manager = DockexRedisClient(self.experiment_manager_address)

        experiment_manager_ip_address = self.experiment_manager.get("ip_address")
        experiment_manager_port = self.experiment_manager.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=1
        )
        self.job_lookup_db = redis.StrictRedis(
            host=experiment_manager_ip_address, port=experiment_manager_port, db=3
        )

    def prepare_input_pathnames(self):
        input_pathnames = self.job_config["input_pathnames"]

        if len(input_pathnames.keys()) > 0:
            # loop through ftp clients and connect, retrying until a connection succeeds
            # (in case workers take a while to spin up)
            for input_pathname_key in input_pathnames.keys():
                input_pathname = input_pathnames[input_pathname_key]

                if input_pathname is not None:
                    local_input_pathname = (
                        f"{self.container_data_prefix}{input_pathname}"
                    )

                    # if the file doesn't exist, go find it
                    print("CHECKING FOR FILE: " + local_input_pathname)
                    if not os.path.isfile(local_input_pathname):
                        print("GOING TO LOOK FOR FILE")
                        ftp_find_file(
                            self.experiment_manager.get_list("dockex_machines"),
                            self.redis_client.get("ip_address"),
                            f"data/{input_pathname}",
                            local_input_pathname,
                        )

                    # update input_pathnames with local path
                    input_pathnames[input_pathname_key] = local_input_pathname

            # assign local input pathnames to job config for job
            self.job_config["input_pathnames"] = input_pathnames

        # check that all input pathnames exist
        if len(self.job_config["input_pathnames"].values()) > 0:
            check_pathnames = [
                os.path.isfile(check_pathname)
                for check_pathname in self.job_config["input_pathnames"].values()
                if check_pathname is not None
            ]
            self.good_to_launch = all(check is True for check in check_pathnames)
        else:
            self.good_to_launch = True

    def prepare_output_pathnames(self):
        output_pathnames = self.job_config["output_pathnames"]
        if len(output_pathnames.keys()) > 0:
            for output_pathname_key in output_pathnames.keys():
                output_pathname = output_pathnames[output_pathname_key]
                if output_pathname is not None:
                    local_output_pathname = (
                        f"{self.container_data_prefix}{output_pathname}"
                    )

                    # if the file is inside a directory, make sure that directory exists
                    local_output_path = os.path.split(local_output_pathname)[0]
                    if local_output_path != "":
                        check_make_directory(local_output_path)

                        os.system(f"chown -R nonroot:nonroot {local_output_path}")

                    output_pathnames[output_pathname_key] = local_output_pathname

            self.job_config["output_pathnames"] = output_pathnames

    def launch_experiment_job(self):
        print("GOOD TO LAUNCH")
        # overwrite json file with local input/output pathnames
        write_job_config(self.json_pathname, self.job_config)

        # update pending ready running numbers for experiment and job_command
        # use a backend pipeline so it's all atomic
        # this is a job going from ready to running
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_running_jobs")
        update_pipeline.incr(self.stats_keys["num_running_jobs"])
        update_pipeline.execute()

        start_time = datetime.datetime.now()

        # launch the job
        try:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )

        except Exception as e:
            print("EXCEPTION WHILE RUNNING CONTAINER")
            print(e)

        end_time = datetime.datetime.now()

        self.job_config["start_time"] = str(start_time)
        self.job_config["end_time"] = str(end_time)
        self.job_config["execution_time"] = str(end_time - start_time)

        print("GOOD LAUNCH")

    def cleanup_job(self):
        # release the credits
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        skip_output_pathnames = self.job_config["skip_output_pathnames"]
        if type(skip_output_pathnames) is not list:
            if skip_output_pathnames is True:
                # skipping all outputs: treat every output pathname as skippable
                skip_output_pathnames = list(self.job_config["output_pathnames"].keys())
            else:
                skip_output_pathnames = []

        # check whether the job's output_pathnames exist
        successful_job = True
        for local_output_pathname_key in self.job_config["output_pathnames"].keys():
            # local output_pathname contains the container_data_prefix
            local_output_pathname = self.job_config["output_pathnames"][
                local_output_pathname_key
            ]

            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )

            # if the output_pathname doesn't exist and we're not skipping that output_pathname, an error occurred
            if not os.path.isfile(local_output_pathname):
                if local_output_pathname_key not in skip_output_pathnames:
                    # set the flag
                    successful_job = False

            # if the file does exist, save the output if requested
            # NOTE: it's important to push to output_saver before updating num_complete_jobs,
            # because ExperimentManager relies on that ordering to determine when the experiment has ended
            else:
                if self.job_config["save_outputs"]:
                    self.experiment_manager.rpush("output_saver", output_pathname)

            self.check_dependencies(output_pathname)

        # update the progress counts on ExperimentStager
        # this is a job going from running to complete
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_running_jobs")
        update_pipeline.decr(self.stats_keys["num_running_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

        if successful_job:
            self.job_config["status"] = "SUCCESS"

        else:
            self.job_config["status"] = "ERROR"

            # push to error_jobs list
            self.experiment_manager.rpush("error_jobs", self.job_config)

            # update progress counts
            update_pipeline = self.experiment_manager.strict_redis.pipeline()
            update_pipeline.incr("num_error_jobs")
            update_pipeline.incr(self.stats_keys["num_error_jobs"])
            update_pipeline.execute()

        job_config_json = json.dumps(self.job_config)

        # write job dict with status to backend
        self.job_lookup_db.set(self.job_config["name"], job_config_json)

    def check_dependencies(self, output_pathname):

        # get the job keys that depend on this output_pathname
        print("OUTPUT_PATHNAME: " + output_pathname)
        dependent_job_names = [
            b.decode("utf-8")
            for b in self.dependency_lookup_db.smembers(output_pathname)
        ]
        print("DEPENDENCY NAMES: " + str(dependent_job_names))

        for dependent_job_name in dependent_job_names:
            print("PROCESSING DEPENDENCY: " + dependent_job_name)
            self.experiment_manager.rpush("decrement_dependency", dependent_job_name)

    def failure_to_launch(self):
        # report error
        print("BAD LAUNCH")
        self.job_config["status"] = "ERROR"
        print(self.job_config)

        # ExperimentWorker checked out credits before launching the DockerWrapper;
        # since the job errored, check them back in
        self.redis_client.strict_redis.decrby(
            "cpu_credits_used", int(self.job_config["cpu_credits"])
        )
        self.redis_client.strict_redis.decrby(
            "gpu_credits_used", int(self.job_config["gpu_credits"])
        )

        # push to error_jobs list
        self.experiment_manager.rpush("error_jobs", self.job_config)
        self.job_lookup_db.set(self.job_config["name"], json.dumps(self.job_config))

        # propagate error for dependent jobs
        for local_output_pathname in self.job_config["output_pathnames"].values():
            # remove the local data_path prepend
            output_pathname = local_output_pathname.replace(
                self.container_data_prefix, ""
            )
            self.check_dependencies(output_pathname)

        # update progress counts
        # this is a job going from ready to complete/error
        update_pipeline = self.experiment_manager.strict_redis.pipeline()
        update_pipeline.decr("num_ready_jobs")
        update_pipeline.decr(self.stats_keys["num_ready_jobs"])
        update_pipeline.incr("num_error_jobs")
        update_pipeline.incr(self.stats_keys["num_error_jobs"])
        update_pipeline.incr("num_complete_jobs")
        update_pipeline.incr(self.stats_keys["num_complete_jobs"])
        update_pipeline.execute()

    def run(self):
        print(self.job_config)

        print("build kwargs:")
        print(self.build_kwargs_dict)

        print("run kwargs")
        print(self.run_kwargs_dict)

        if self.experiment_job is not True:
            build_image_run_container(
                self.docker_client,
                self.build_kwargs_dict,
                self.run_kwargs_dict,
                print_build_logs=True,
                skip_build=self.skip_build,
                native_run=True,
            )

        else:
            print("RUNNING EXPERIMENT JOB")
            self.connect_to_experiment_manager()
            self.prepare_input_pathnames()
            self.prepare_output_pathnames()

            self.stats_keys = get_module_stats_keys(self.job_config["module_name"])

            if self.good_to_launch:
                self.launch_experiment_job()
                self.cleanup_job()
            else:
                self.failure_to_launch()

            # make sure there aren't any lingering root permission files
            os.system(f"chown -R nonroot:nonroot {self.container_data_prefix}")

            print("SUCCESS")
Code Example #8
    def __init__(
            self,
            project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
            tmp_dockex_path="/tmp/dockex",
            initial_job_num=None,
            experiment_name_prefix=None,
            sleep_seconds=0.5,
            save_project=False,
    ):

        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        else:
            self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path

        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")
        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])

        self.docker_client = docker.from_env()

        manager_ip_address = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        if self.initial_job_num is not None:
            self.job_num = self.initial_job_num
        else:
            self.job_num = self.redis_client.get("manager_job_num")

        self.sleep_seconds = sleep_seconds

        self.job_list = []

        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        self.experiment_name_prefix = experiment_name_prefix
        self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = (
            f"/tmp/dockex/data/{self.csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = (
            f"/tmp/dockex/data/{self.trials_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.extra_output_pathnames = []
        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None

        self.trial_dict = dict()
        self.trials_list = []
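For a concrete sense of the experiment_name format above, a sketch of the same derivation with a fixed, arbitrary timestamp:

import datetime

ts = datetime.datetime(2024, 1, 2, 3, 4, 5, 678000)
name = f"dockex_{str(ts).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
# name == "dockex_2024_01_02_03_04_05"
# with experiment_name_prefix="demo" this becomes "demo_dockex_2024_01_02_03_04_05"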
Code Example #9
class ExperimentManager(abc.ABC):
    def __init__(
            self,
            project_path="/home/experiment/project",  # according to core/experiment/dockex_experiment
            tmp_dockex_path="/tmp/dockex",
            initial_job_num=None,
            experiment_name_prefix=None,
            sleep_seconds=0.5,
            save_project=False,
    ):

        super().__init__()

        if project_path is None:
            raise ValueError("A project_path must be provided.")
        else:
            self.project_path = os.path.expanduser(project_path)

        self.tmp_dockex_path = tmp_dockex_path

        self.dockex_config = read_job_config(tmp_dockex_path + "/dockex_config.json")
        self.redis_client = DockexRedisClient(self.dockex_config["redis_address"])

        self.docker_client = docker.from_env()

        manager_ip_address = self.redis_client.get("ip_address")
        manager_port = self.redis_client.get("redis_port")

        self.dependency_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=1
        )
        self.dependency_counts_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=2
        )
        self.job_lookup_db = redis.StrictRedis(
            host=manager_ip_address, port=manager_port, db=3
        )

        self.initial_job_num = initial_job_num
        if self.initial_job_num is not None:
            self.job_num = self.initial_job_num
        else:
            self.job_num = self.redis_client.get("manager_job_num")

        self.sleep_seconds = sleep_seconds

        self.job_list = []

        self.dockex_path_list = self.redis_client.get("dockex_path_list")

        self.experiment_name_prefix = experiment_name_prefix
        self.experiment_name = f"dockex_{str(datetime.datetime.now()).replace('-', '_').replace(' ', '_').replace(':', '_').split('.')[0]}"
        if self.experiment_name_prefix is not None:
            self.experiment_name = (
                f"{self.experiment_name_prefix}_{self.experiment_name}"
            )

        self.csv_filename = f"jobs_{self.experiment_name}.csv"
        self.csv_pathname = (
            f"/tmp/dockex/data/{self.csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.trials_csv_filename = f"trials_{self.experiment_name}.csv"
        self.trials_csv_pathname = (
            f"/tmp/dockex/data/{self.trials_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.extra_output_pathnames = []
        self.save_project = save_project
        self.project_archive_pathname = None
        self.project_archive_filename = None

        self.trial_dict = dict()
        self.trials_list = []

    def send_to_output_saver(self, extra_output_pathname):
        self.extra_output_pathnames.append(extra_output_pathname)

    def generate_job_name(self, module_name):
        job_num = self.job_num
        job_name = f"{module_name}_{str(self.job_num)}"
        self.job_num += 1
        return job_name, job_num

    def add_job(
            self,
            module_path,
            params=None,
            input_pathnames=None,
            skip_job=False,
            skip_input_pathnames=False,
            skip_output_pathnames=False,
            cpu_credits=1,
            gpu_credits=0,
            save_outputs=False,
            params_nested_update=False,
            trial_tag=None,
            save_trial=False
    ):

        if cpu_credits == 0 and gpu_credits == 0:
            raise ValueError("Either cpu_credits or gpu_credits must be > 0")

        if params is None:
            params = dict()

        if input_pathnames is None:
            input_pathnames = dict()

        module_name = pathlib.PurePath(module_path).name
        config_pathname = f"{self.project_path}/{module_path}/{module_name}.json"

        with open(config_pathname, "r") as fp:
            config = json.load(fp)

        job_name, job_num = self.generate_job_name(module_name)

        config["name"] = job_name
        config["job_num"] = job_num
        config["path"] = module_path
        config["module_name"] = module_name

        config["params_nested_update"] = params_nested_update

        if "params" in config.keys():
            if params_nested_update:
                config["params"] = update(copy.deepcopy(config["params"]), params)
            else:
                config["params"].update(params)

        else:
            config["params"] = params

        if "input_pathnames" in config.keys():
            config["input_pathnames"].update(input_pathnames)
        else:
            config["input_pathnames"] = input_pathnames

        config["skip_job"] = skip_job
        config["skip_input_pathnames"] = skip_input_pathnames
        config["skip_output_pathnames"] = skip_output_pathnames
        config["cpu_credits"] = cpu_credits
        config["gpu_credits"] = gpu_credits
        config["save_outputs"] = save_outputs

        # ExperimentWorker takes care of building containers before the wrapper is launched
        config["skip_docker_wrapper_build"] = True

        config["experiment_job"] = True

        for params_key in config["params"].keys():
            if config["params"][params_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required parameter "{params_key}" for job name "{job_name}"'
                )

        for input_pathname_key in config["input_pathnames"].keys():
            if config["input_pathnames"][input_pathname_key] == "DOCKEX_REQUIRED":
                raise ValueError(
                    f'Missing required input pathname "{input_pathname_key}" for job name "{job_name}"'
                )

        for output_pathname_key in config["output_pathnames"].keys():
            config["output_pathnames"][
                output_pathname_key
            ] = f"{module_name}/{job_name}{config['output_pathnames'][output_pathname_key]}"

        if skip_job is False:
            self.job_list.append(copy.deepcopy(config))

        if trial_tag is not None:
            self.trial_dict[trial_tag] = copy.deepcopy(config)
        
        if save_trial is True:
            self.trials_list.append(copy.deepcopy(self.trial_dict))

        return config["output_pathnames"]

    def archive_project(self):
        self.redis_client.set("status", "ARCHIVING PROJECT")
        self.project_archive_filename = (
            f"project_{self.experiment_name}.zip"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        self.project_archive_pathname = (
            f"/tmp/dockex/data/{self.project_archive_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        shutil.make_archive(
            self.project_archive_pathname.replace(".zip", ""),
            "zip",
            "/tmp/dockex/project",
        )  # this assumes we're running in a container or using /tmp/dockex locally

        self.redis_client.set("project_archive_filename", self.project_archive_filename)

    def wait_for_jobs_to_end(self):
        keep_waiting = True
        while keep_waiting:
            time.sleep(self.sleep_seconds)

            num_complete_jobs = self.redis_client.get("num_complete_jobs")
            num_total_jobs = self.redis_client.get("num_total_jobs")

            print_progress(num_complete_jobs, num_total_jobs)

            if num_complete_jobs == num_total_jobs:
                keep_waiting = False

    def wait_for_save_outputs(self):
        # make sure output_saver flag is True
        self.redis_client.set("output_saver_working_flag", True)

        # send an experiment done message to output_saver
        # it should set flag to False once it processes this message
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        # wait for OutputSaver to finish its business
        while self.redis_client.get("output_saver_working_flag") is True:
            pass

    def wait_for_experiment_to_finish(self):
        print("WAITING FOR EXPERIMENT TO FINISH")
        self.redis_client.set("status", "WAITING FOR EXPERIMENT TO FINISH")

        # store the job csv in the experiment zip file
        self.redis_client.rpush("output_saver", self.csv_filename)

        # if a trials csv exists, store it in the experiment zip file
        if os.path.isfile(self.trials_csv_pathname):
            self.redis_client.rpush("output_saver", self.trials_csv_filename)

        # send extra outputs to output_saver
        for extra_output_pathname in self.extra_output_pathnames:
            self.redis_client.rpush("output_saver", extra_output_pathname)

        if self.save_project:
            self.redis_client.rpush("output_saver", self.project_archive_filename)

        self.wait_for_jobs_to_end()

        # generate a csv of all the finished jobs and add it to the zip
        post_job_list = [
            json.loads(b) for b in self.job_lookup_db.mget(self.job_lookup_db.keys("*"))
        ]
        post_csv_filename = f"post_{self.csv_filename}"
        post_csv_pathname = (
            f"/tmp/dockex/data/{post_csv_filename}"
        )  # this assumes we're running in a container or using /tmp/dockex locally
        pd.DataFrame(post_job_list).sort_values(by="job_num", ascending=True).set_index(
            "name"
        ).to_csv(post_csv_pathname)
        self.redis_client.rpush("output_saver", post_csv_filename)

        self.wait_for_save_outputs()

        os.remove(post_csv_pathname)
        os.remove(self.csv_pathname)
        os.remove(self.project_archive_pathname)

    def initialize_experiment_variables(self):
        # set the global job num for future experiments
        self.redis_client.set("manager_job_num", self.job_num)

        # flush experiment dbs
        self.dependency_lookup_db.flushdb()
        self.dependency_counts_db.flushdb()
        self.job_lookup_db.flushdb()

        # initialize the overall experiment job counts
        self.redis_client.set("num_total_jobs", 0)
        self.redis_client.set("num_pending_jobs", 0)
        self.redis_client.set("num_ready_jobs", 0)
        self.redis_client.set("num_running_jobs", 0)
        self.redis_client.set("num_complete_jobs", 0)
        self.redis_client.set("num_error_jobs", 0)

        self.redis_client.strict_redis.delete("unique_module_paths")

        unique_module_names = self.redis_client.get_list("unique_module_names")
        for unique_module_name in unique_module_names:
            stats_keys = get_module_stats_keys(unique_module_name)

            for key in stats_keys.values():
                self.redis_client.strict_redis.delete(key)
        self.redis_client.strict_redis.delete("unique_module_names")

        ready_jobs_list_key_dicts = self.redis_client.smembers(
            "ready_jobs_list_key_dicts"
        )
        for ready_jobs_list_key_dict in ready_jobs_list_key_dicts:
            self.redis_client.strict_redis.delete(
                ready_jobs_list_key_dict["ready_jobs_list_key"]
            )
        self.redis_client.strict_redis.delete("ready_jobs_list_key_dicts")

        self.redis_client.set("experiment_name", self.experiment_name)

        # reset output_saver just in case a zip was left open
        self.redis_client.rpush("output_saver", CLOSE_ZIP_COMMAND)

        self.redis_client.strict_redis.delete("error_jobs")

    def stage_jobs(self):
        print("STAGING JOBS")
        self.redis_client.set("status", "STAGING JOBS")

        unique_module_names = []
        unique_module_paths = []
        for job in self.job_list:
            input_pathnames = job["input_pathnames"]
            module_name = job["module_name"]
            module_path = job["path"]
            name = job["name"]
            skip_input_pathnames = job["skip_input_pathnames"]

            if module_path not in unique_module_paths:
                unique_module_paths.append(module_path)
                self.redis_client.rpush("unique_module_paths", module_path)

            ready_jobs_list_dict = OrderedDict(
                [
                    ("cpu_credits", job["cpu_credits"]),
                    ("gpu_credits", job["gpu_credits"]),
                ]
            )

            # register the ready_jobs list that corresponds to this job's credits
            ready_jobs_list_key = ready_jobs_dict_to_key(ready_jobs_list_dict)

            ready_jobs_list_dict["ready_jobs_list_key"] = ready_jobs_list_key

            # this is an ordered dict to guarantee the resulting json string is always in the same order
            # we're using a redis set here, and don't want duplicate entries if dict keys are in different order
            self.redis_client.sadd("ready_jobs_list_key_dicts", ready_jobs_list_dict)

            stats_keys = get_module_stats_keys(module_name)

            if module_name not in unique_module_names:
                unique_module_names.append(module_name)
                self.redis_client.rpush("unique_module_names", module_name)

                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.set(stats_keys["num_total_jobs"], 1)
                self.redis_client.set(stats_keys["num_pending_jobs"], 0)
                self.redis_client.set(stats_keys["num_ready_jobs"], 0)
                self.redis_client.set(stats_keys["num_running_jobs"], 0)
                self.redis_client.set(stats_keys["num_complete_jobs"], 0)
                self.redis_client.set(stats_keys["num_error_jobs"], 0)

            else:
                # it's important that total_jobs is updated first for accurately detecting experiment completion
                self.redis_client.strict_redis.incr(stats_keys["num_total_jobs"])

            num_input_pathnames = 0
            if len(input_pathnames.keys()) > 0:
                for input_pathname_key in input_pathnames.keys():
                    input_pathname = input_pathnames[input_pathname_key]

                    if input_pathname is not None:
                        if (
                                skip_input_pathnames is False
                                or skip_input_pathnames is None
                        ):
                            self.dependency_lookup_db.sadd(input_pathname, name)
                            num_input_pathnames += 1

                        elif skip_input_pathnames is True:
                            pass

                        elif type(skip_input_pathnames) is list:
                            if input_pathname_key in skip_input_pathnames:
                                pass

                            else:
                                self.dependency_lookup_db.sadd(input_pathname, name)
                                num_input_pathnames += 1

            if num_input_pathnames > 0:
                self.dependency_counts_db.set(name, num_input_pathnames)
                self.redis_client.strict_redis.incr(stats_keys["num_pending_jobs"])
                self.redis_client.strict_redis.incr("num_pending_jobs")

            else:
                self.redis_client.rpush(ready_jobs_list_key, job)
                self.redis_client.strict_redis.incr(stats_keys["num_ready_jobs"])
                self.redis_client.strict_redis.incr("num_ready_jobs")

            self.redis_client.strict_redis.incr("num_total_jobs")

            # register the job on the backend
            self.job_lookup_db.set(name, json.dumps(job))

    def set_manager_flag(self):
        print("SETTING MANAGER FLAG")
        self.redis_client.set("status", "SETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", True)

    def unset_manager_flag(self):
        print("UNSETTING MANAGER FLAG")
        self.redis_client.set("status", "UNSETTING MANAGER FLAG")
        self.redis_client.set("manager_flag", False)

    def generate_job_csv(self):
        print("GENERATING JOB CSV")
        pd.DataFrame(self.job_list).to_csv(self.csv_pathname)

    def generate_trial_csv(self):
        print('GENERATING TRIALS CSV')
        if len(self.trials_list) > 0:
            pd.DataFrame(self.trials_list).to_csv(self.trials_csv_pathname)

    def copy_project(self):
        print("COPYING PROJECT")
        self.redis_client.set("status", "COPYING PROJECT DIRECTORY")
        tmp_project_path = f"{self.tmp_dockex_path}/project"
        empty_make_directory(tmp_project_path)
        copy_tree(self.project_path, tmp_project_path)
        os.system(f"chown -R nonroot:nonroot {tmp_project_path}")

    def acquire_prevent_experiment_overlap_flag(self):
        print("ACQUIRING PREVENT EXPERIMENT OVERLAP FLAG")
        if self.redis_client.get("prevent_experiment_overlap_flag") is True:
            print("WAITING FOR PREVIOUS LOCAL EXPERIMENT TO FINISH")
            while self.redis_client.get("prevent_experiment_overlap_flag") is True:
                pass

        self.redis_client.set("prevent_experiment_overlap_flag", True)

        # TODO: also check and wait for remote machines to prevent overlapping experiments

    def release_prevent_experiment_overlap_flag(self):
        print("RELEASING PREVENT EXPERIMENT OVERLAP FLAG")
        self.redis_client.set("prevent_experiment_overlap_flag", False)

    def run(self, print_build_logs=False):
        print("RUNNING EXPERIMENT")
        self.redis_client.set("status", "RUNNING EXPERIMENT")
        self.generate_job_csv()
        self.generate_trial_csv()

        self.acquire_prevent_experiment_overlap_flag()

        start = time.time()

        try:
            self.initialize_experiment_variables()
            self.copy_project()
            self.stage_jobs()

            build_project_modules(
                self.docker_client,
                self.redis_client.get_list("unique_module_paths"),
                print_build_logs=print_build_logs,
                redis_client=self.redis_client,
            )

            self.archive_project()
            self.set_manager_flag()

            self.redis_client.set("status", "RUNNING EXPERIMENT")

            self.wait_for_experiment_to_finish()
            self.unset_manager_flag()

        except:
            self.wait_for_save_outputs()
            self.release_prevent_experiment_overlap_flag()
            self.unset_manager_flag()
            self.redis_client.set("status", "EXPERIMENT FAILED")
            raise

        end = time.time()

        self.release_prevent_experiment_overlap_flag()
        self.redis_client.set("status", "EXPERIMENT COMPLETE")

        print(f"EXPERIMENT EXECUTION TIME: {round((end - start), 2)} seconds")
Code Example #10
    def __init__(self, input_args):
        super().__init__(input_args)

        self.redis_address = input_args[2]
        self.redis_client = DockexRedisClient(self.redis_address)
Code Example #11
    def run_job(self):
        while True:
            try:
                dockex_machines = self.redis_client.get_list("dockex_machines")

                cluster_cpu_list = []
                cluster_ram_total_list = []
                cluster_ram_used_list = []

                cluster_gpu_list = []
                cluster_gpu_memory_total_list = []
                cluster_gpu_memory_used_list = []

                cluster_cpu_credits_total = 0
                cluster_cpu_credits_used = 0
                cluster_gpu_credits_total = 0
                cluster_gpu_credits_used = 0

                p = self.redis_client.strict_redis.pipeline()
                p.delete("cluster_monitor")
                p.delete("cluster_stats")

                for dockex_machine in dockex_machines:
                    try:
                        temp_redis_client = DockexRedisClient(
                            dockex_machine["redis_address"])
                        dockex_machine[
                            "hardware_monitor"] = temp_redis_client.get(
                                "hardware_monitor")
                        dockex_machine[
                            "credits_monitor"] = temp_redis_client.get(
                                "credits_monitor")
                        dockex_machine["status"] = temp_redis_client.get(
                            "status")
                        dockex_machine["data_path"] = temp_redis_client.get(
                            "data_path")
                        dockex_machine["json_path"] = temp_redis_client.get(
                            "json_path")
                        dockex_machine[
                            "redis_address"] = temp_redis_client.get(
                                "redis_address")
                        dockex_machine[
                            "webdis_address"] = temp_redis_client.get(
                                "webdis_address")

                        p.rpush("cluster_monitor", json.dumps(dockex_machine))

                        cluster_cpu_list += dockex_machine["hardware_monitor"][
                            "cpu_percent_per_cpu"]
                        cluster_ram_total_list.append(
                            dockex_machine["hardware_monitor"]
                            ["virtual_memory_total"])
                        cluster_ram_used_list.append(
                            dockex_machine["hardware_monitor"]
                            ["virtual_memory_used"])

                        cluster_gpu_list += dockex_machine["hardware_monitor"][
                            "gpu_percent_per_gpu"]
                        cluster_gpu_memory_total_list.append(
                            dockex_machine["hardware_monitor"]
                            ["gpu_memory_total"])
                        cluster_gpu_memory_used_list.append(
                            dockex_machine["hardware_monitor"]
                            ["gpu_memory_used"])

                        cluster_cpu_credits_total += dockex_machine[
                            "credits_monitor"]["cpu_credits_total"]
                        cluster_cpu_credits_used += dockex_machine[
                            "credits_monitor"]["cpu_credits_used"]

                        cluster_gpu_credits_total += dockex_machine[
                            "credits_monitor"]["gpu_credits_total"]
                        cluster_gpu_credits_used += dockex_machine[
                            "credits_monitor"]["gpu_credits_used"]

                    except Exception as e:
                        print(e)

                cluster_num_cpus = len(cluster_cpu_list)
                if cluster_num_cpus > 0:
                    cluster_cpu_utilization = round(
                        sum(cluster_cpu_list) / float(cluster_num_cpus), 1)
                else:
                    cluster_cpu_utilization = 0.0

                cluster_num_gpus = len(cluster_gpu_list)
                if cluster_num_gpus > 0:
                    cluster_gpu_utilization = round(
                        sum(cluster_gpu_list) / float(cluster_num_gpus), 1)
                else:
                    cluster_gpu_utilization = 0.0

                virtual_memory_total = sum(cluster_ram_total_list)
                virtual_memory_used = sum(cluster_ram_used_list)
                if virtual_memory_total > 0.0:
                    virtual_memory_percent = round(
                        (virtual_memory_used * 100.0 / virtual_memory_total),
                        1)
                else:
                    virtual_memory_percent = 0.0

                gpu_memory_total = sum(cluster_gpu_memory_total_list)
                gpu_memory_used = sum(cluster_gpu_memory_used_list)
                if gpu_memory_total > 0.0:
                    gpu_memory_percent = round(
                        (gpu_memory_used * 100.0 / gpu_memory_total), 1)
                else:
                    gpu_memory_percent = 0.0

                cluster_stats = {
                    "machine_count": len(dockex_machines),
                    "cpu_count": cluster_num_cpus,
                    "cpu_percent": cluster_cpu_utilization,
                    "cpu_percent_per_cpu": cluster_cpu_list,
                    "virtual_memory_total": virtual_memory_total,
                    "virtual_memory_used": virtual_memory_used,
                    "virtual_memory_percent": virtual_memory_percent,
                    "gpu_count": cluster_num_gpus,
                    "gpu_percent": cluster_gpu_utilization,
                    "gpu_percent_per_gpu": cluster_gpu_list,
                    "gpu_memory_total": gpu_memory_total,
                    "gpu_memory_used": gpu_memory_used,
                    "gpu_memory_percent": gpu_memory_percent,
                    "cpu_credits_total": cluster_cpu_credits_total,
                    "cpu_credits_used": cluster_cpu_credits_used,
                    "gpu_credits_total": cluster_gpu_credits_total,
                    "gpu_credits_used": cluster_gpu_credits_used,
                }

                p.set("cluster_stats", json.dumps(cluster_stats))

                p.execute()

            except Exception as e:
                print(e)

            time.sleep(self.sleep_seconds)
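The aggregated stats end up as a JSON string under the cluster_stats key, so any consumer with access to the same Redis can read them back; a small redis-py sketch (host and port are illustrative assumptions):

import json

import redis

r = redis.StrictRedis(host="127.0.0.1", port=6379)
raw = r.get("cluster_stats")
cluster_stats = json.loads(raw) if raw is not None else None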
Code Example #12
class DockexStartup:
    def __init__(self):
        super().__init__()

        self.config = None
        self.redis_client = None
        self.docker_client = docker.from_env()

    @staticmethod
    def get_ip_address(ifname):
        s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
        return socket.inet_ntoa(
            fcntl.ioctl(
                s.fileno(),
                0x8915,  # SIOCGIFADDR
                struct.pack("256s", ifname[:15].encode("utf-8")),
            )[20:24])

    def generate_dockex_config(self):
        with open("base_config.json", "r") as f:
            config = json.load(f)

        config["machine_name"] = uuid.uuid4().hex

        if os.path.isfile("user_config.json"):
            with open("user_config.json", "r") as f:
                user_config = json.load(f)

            config.update(user_config)

        # TODO: disable distributed mode for now
        # if "network_interface" in config.keys():
        #     try:
        #         config["ip_address"] = self.get_ip_address(config["network_interface"])
        #
        #     except OSError as e:
        #         print("ERROR: bad network interface")
        #         raise e

        # TODO: disable distributed mode for now
        print(
            "Forcing IP to 127.0.0.1. Distributed mode will be enabled in a future release."
        )
        config["ip_address"] = "127.0.0.1"

        config["redis_address"] = ("http://" + config["ip_address"] + ":" +
                                   str(config["redis_port"]))
        config["webdis_address"] = ("http://" + config["ip_address"] + ":" +
                                    str(config["webdis_port"]))

        config["data_path"] = "/tmp/dockex/data"
        config["json_path"] = "/tmp/dockex/json"
        config["project_path"] = "/tmp/dockex/project"

        dockex_config_pathname = "/tmp/dockex/dockex_config.json"
        with open(dockex_config_pathname, "w") as f:
            json.dump(config, f, indent=4)

        self.config = config

    def create_dockex_directories(self):
        empty_make_directory(self.config["data_path"])
        empty_make_directory(self.config["json_path"])
        empty_make_directory(self.config["project_path"])

    def generate_app_env(self):
        with open("app/.base_env", "r") as f:
            env = f.read()

        env = env.replace(
            "REACT_APP_WEBDIS_ADDRESS=",
            ("REACT_APP_WEBDIS_ADDRESS='" +
             str(self.config["webdis_address"]) + "'"),
        )
        env = env.replace("PORT=", ("PORT=" + str(self.config["app_port"])))

        with open("/tmp/dockex/.env", "w") as f:
            f.write(env)

    def generate_redis_conf(self):
        base_redis_conf_pathname = "core/services/backend/dockex_redis/base_redis.conf"
        redis_conf_pathname = self.config["tmp_dockex_path"] + "/redis.conf"

        with open(base_redis_conf_pathname, "r") as f:
            data = f.read()

        # replace the ip address and port with dockex_config values
        data = data.replace("127.0.0.1", self.config["ip_address"])
        data = data.replace("6379", str(self.config["redis_port"]))

        # set the number of databases
        data = data.replace("databases 16", "databases 4")

        # turn off saving
        data = data.replace("save 900 1", "# save 900 1")
        data = data.replace("save 300 10", "# save 300 10")
        data = data.replace("save 60 10000", "# save 60 10000")

        # change working directory to tmp_dockex_path
        data = data.replace("dir ./",
                            ("dir " + self.config["tmp_dockex_path"]))

        with open(redis_conf_pathname, "w") as f:
            f.write(data)

    def generate_webdis_json(self):
        webdis_json_pathname = "/tmp/dockex/webdis.json"

        conf = {
            "redis_host": self.config["ip_address"],
            "redis_port": self.config["redis_port"],
            "redis_auth": None,
            "http_host": self.config["ip_address"],
            "http_port": self.config["webdis_port"],
            "threads": 5,
            "pool_size": 20,
            "daemonize": False,
            "websockets": False,
            "database": 0,
            "acl": [
                {"disabled": ["DEBUG"]},
                {"http_basic_auth": "user:password", "enabled": ["DEBUG"]},
            ],
            "verbosity": 6,
            "logfile": "webdis.log",
        }

        with open(webdis_json_pathname, "w") as f:
            json.dump(conf, f, indent=4)

    def initialize(self):
        print("INITIALIZING")
        self.generate_dockex_config()
        self.create_dockex_directories()
        self.generate_app_env()
        self.generate_redis_conf()
        self.generate_webdis_json()

        # change owner of dockex tmp directory to nonroot
        # startup must be root to launch sibling docker containers
        os.system("chown -R nonroot:nonroot /tmp/dockex")

    def launch_redis(self):
        print("BUILDING AND RUNNING REDIS")

        build_image_run_container(
            self.docker_client,
            dict(
                path=".",
                dockerfile="core/services/backend/dockex_redis/Dockerfile",
                tag="dockex_redis_image",
            ),
            dict(
                image="dockex_redis_image",
                name="dockex_redis",
                detach=True,
                network_mode="host",
                volumes={
                    self.config["tmp_dockex_path"]: {
                        "bind": "/tmp/dockex",
                        "mode": "rw",
                    }
                },
            ),
            print_build_logs=True,
        )

        # connect to redis and flush
        self.redis_client = DockexRedisClient(self.config["redis_address"])

        # the redis container may still be starting; retry until it responds
        trying_to_connect = True
        while trying_to_connect:
            try:
                self.redis_client.flushdb()
                trying_to_connect = False

            except redis.exceptions.ConnectionError:
                pass

        # fill redis with dockex config values
        for key in self.config.keys():
            self.redis_client.set(key, self.config[key])

        # mark the redis instance as a dockex backend
        self.redis_client.set("dockex_backend", True)

        self.redis_client.set("status", "LAUNCHED REDIS")

    def launch_webdis(self):
        print("LAUNCHING WEBDIS")
        build_image_run_container(
            self.docker_client,
            dict(
                path=".",
                dockerfile="core/services/backend/dockex_webdis/Dockerfile",
                tag="dockex_webdis_image",
            ),
            dict(
                image="dockex_webdis_image",
                name="dockex_webdis",
                detach=True,
                network_mode="host",
                volumes={
                    self.config["tmp_dockex_path"]: {
                        "bind": "/tmp/dockex",
                        "mode": "rw",
                    }
                },
            ),
            print_build_logs=True,
        )

        self.redis_client.set("status", "LAUNCHED WEBDIS")

    def initialize_experiment_variables(self):
        print("INITIALIZING EXPERIMENT VARIABLES")

        # initialize count for ExperimentManagers
        self.redis_client.set("manager_job_num", 0)

        # initialize the overall experiment job counts
        self.redis_client.set("num_total_jobs", 0)
        self.redis_client.set("num_pending_jobs", 0)
        self.redis_client.set("num_ready_jobs", 0)
        self.redis_client.set("num_running_jobs", 0)
        self.redis_client.set("num_complete_jobs", 0)
        self.redis_client.set("num_error_jobs", 0)

        # initialize machine as not a manager
        self.redis_client.set("manager_flag", False)

        # initialize credits
        self.redis_client.set("cpu_credits_total", 0)
        self.redis_client.set("cpu_credits_used", 0)
        self.redis_client.set("gpu_credits_total", 0)
        self.redis_client.set("gpu_credits_used", 0)

        self.redis_client.set("status", "INITIALIZED EXPERIMENT VARIABLES")

    def launch_json_launcher(self):
        print("LAUNCHING JSON LAUNCHER")

        build_image_run_container(
            self.docker_client,
            dict(
                path=".",
                dockerfile="core/services/launchers/json_launcher/Dockerfile",
                tag="json_launcher_image",
            ),
            dict(
                image="json_launcher_image",
                name="json_launcher",
                command=
                f"core/services/launchers/json_launcher/json_launcher.json {self.config['redis_address']}",
                detach=True,
                network_mode="host",
                volumes={
                    self.config["tmp_dockex_path"]: {
                        "bind": "/tmp/dockex",
                        "mode": "rw",
                    },
                    "/var/run/docker.sock": {
                        "bind": "/var/run/docker.sock",
                        "mode": "rw",
                    },
                },
            ),
            print_build_logs=True,
        )

        self.redis_client.set("status", "LAUNCHED DOCKER LAUNCHER")

    def launch_services(self):
        print("LAUNCHING SERVICES")

        self.redis_client.json_launch_job(
            "core/services/frontend/app_server/app_server.json")

        self.redis_client.json_launch_job(
            "core/services/launchers/redis_launcher/redis_launcher.json")

        self.redis_client.json_launch_job(
            "core/services/monitors/hardware_monitor/hardware_monitor.json")
        self.redis_client.json_launch_job(
            "core/services/monitors/credits_monitor/credits_monitor.json")
        self.redis_client.json_launch_job(
            "core/services/monitors/dockex_machine_monitor/dockex_machine_monitor.json"
        )
        self.redis_client.json_launch_job(
            "core/services/monitors/cluster_monitor/cluster_monitor.json")
        self.redis_client.json_launch_job(
            "core/services/monitors/progress_monitor/progress_monitor.json")

        self.redis_client.json_launch_job(
            "core/services/network/machine_discovery/machine_discovery.json")
        self.redis_client.json_launch_job(
            "core/services/network/dockex_machine_identifier/dockex_machine_identifier.json"
        )

        # TODO: disable distributed mode for now
        # only run ftpd if running distributed
        # if self.config['ip_address'] != "127.0.0.1":
        #     self.redis_client.json_launch_job("core/services/ftp/tmp_dockex_ftpd/tmp_dockex_ftpd.json")

        self.redis_client.json_launch_job(
            "core/services/experiment/output_saver/output_saver.json")
        self.redis_client.json_launch_job(
            "core/services/experiment/experiment_worker/experiment_worker.json"
        )
        self.redis_client.json_launch_job(
            "core/services/experiment/decrement_dependency/decrement_dependency.json"
        )
        self.redis_client.json_launch_job(
            "core/services/experiment/credits_updater/credits_updater.json")

    def build_dockex_experiment_image(self):
        print("Building dockex experiment image")
        build_image(
            self.docker_client,
            dict(
                path=".",
                dockerfile=f"core/experiment/dockex_experiment/Dockerfile",
                tag="dockex_experiment_image",
            ),
            print_build_logs=True,
        )

    def run(self):
        print("DOCKEX STARTUP")

        self.initialize()
        self.launch_redis()
        self.launch_webdis()
        self.initialize_experiment_variables()
        self.build_dockex_experiment_image()
        self.launch_json_launcher()
        self.launch_services()

        self.redis_client.set("status", "READY")
Code example #13
0
    def run_job(self):
        while True:
            # check if we're connected to a manager
            # if we're NOT connected to a manager
            if self.experiment_manager is None:
                # check if there are any managers available
                dockex_machines_df = pd.DataFrame(
                    self.redis_client.get_list("dockex_machines"))

                if len(dockex_machines_df) > 0:
                    manager_machines_df = dockex_machines_df.loc[
                        dockex_machines_df.manager_flag == True]

                    if len(manager_machines_df) > 0:
                        # if so, connect to the manager
                        self.experiment_manager_dict = manager_machines_df.iloc[
                            0].to_dict()
                        self.experiment_manager = DockexRedisClient(
                            self.experiment_manager_dict["redis_address"])
                        self.redis_client.set(
                            "manager_redis_address",
                            self.experiment_manager_dict["redis_address"],
                        )

                        # if the manager is not the local manager
                        if (self.experiment_manager_dict["redis_address"] !=
                                self.redis_address):
                            # empty the project directory
                            empty_directory("/tmp/dockex/project")
                            empty_directory("/tmp/dockex/data")

                            # need to copy project archive, unarchive it, and build module images
                            project_archive_filename = self.experiment_manager.get(
                                "project_archive_filename")
                            local_project_archive_filename = (
                                f"/tmp/dockex/data/{project_archive_filename}")

                            found_project_archive = ftp_find_file(
                                self.experiment_manager.get_list(
                                    "dockex_machines"),
                                self.redis_client.get("ip_address"),
                                f"data/{project_archive_filename}",
                                local_project_archive_filename,
                            )

                            if found_project_archive:
                                with zipfile.ZipFile(
                                        local_project_archive_filename,
                                        "r") as zip_file:
                                    zip_file.extractall("/tmp/dockex/project")

                                # build the module images
                                experiment_module_paths = self.experiment_manager.get_list(
                                    "unique_module_paths")
                                # TODO: need a way to signal to the experiment that a build failed
                                # TODO: maybe a flag on manager that the experiment continually checks
                                # TODO: or maybe manager needs to test build before setting manager flag?
                                # TODO: even then though, if a build fails on remote host, that host should NOT work on that experiment name
                                # TODO: maybe a worker should track bad experiment names
                                self.redis_client.set(
                                    "status", "BUILDING PROJECT MODULES")
                                build_project_modules(self.docker_client,
                                                      experiment_module_paths)

                            else:
                                self.experiment_manager_dict = None
                                self.experiment_manager = None
                                self.redis_client.strict_redis.delete(
                                    "manager_redis_address")

                    else:
                        time.sleep(self.checking_manager_sleep_seconds)
                else:
                    time.sleep(self.checking_manager_sleep_seconds)

            # if we are connected to a manager
            else:
                # check if the manager is still a manager
                # if it is NOT still a manager
                if self.experiment_manager.get("manager_flag") is not True:
                    # disconnect from the manager
                    self.experiment_manager_dict = None
                    self.experiment_manager = None
                    self.redis_client.strict_redis.delete(
                        "manager_redis_address")

                # if it is still a manager
                else:
                    # check that the experiment name is the same
                    # if it is NOT the same, a new experiment has started
                    if (self.experiment_manager.get("experiment_name") !=
                            self.experiment_manager_dict["experiment_name"]):
                        # disconnect from the manager
                        self.experiment_manager_dict = None
                        self.experiment_manager = None
                        self.redis_client.strict_redis.delete(
                            "manager_redis_address")

                    # if the experiment name is the same
                    else:
                        # see if we can pull any work to do
                        # get the list of ready_jobs lists
                        ready_jobs_df = pd.DataFrame(
                            self.experiment_manager.smembers(
                                "ready_jobs_list_key_dicts"))

                        if len(ready_jobs_df) > 0:
                            # start with the jobs requiring the most credits
                            ready_jobs_df = ready_jobs_df.sort_values(
                                by=["gpu_credits", "cpu_credits"],
                                ascending=False)

                            num_open_cpu_credits = self.redis_client.get(
                                "cpu_credits_total") - self.redis_client.get(
                                    "cpu_credits_used")
                            num_open_gpu_credits = self.redis_client.get(
                                "gpu_credits_total") - self.redis_client.get(
                                    "gpu_credits_used")

                            if num_open_cpu_credits > 0 or num_open_gpu_credits > 0:
                                for ready_jobs_df_ind in ready_jobs_df.index:
                                    num_open_cpu_credits = self.redis_client.get(
                                        "cpu_credits_total"
                                    ) - self.redis_client.get(
                                        "cpu_credits_used")
                                    num_open_gpu_credits = self.redis_client.get(
                                        "gpu_credits_total"
                                    ) - self.redis_client.get(
                                        "gpu_credits_used")

                                    required_cpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "cpu_credits"])
                                    required_gpu_credits = int(
                                        ready_jobs_df.loc[ready_jobs_df_ind,
                                                          "gpu_credits"])
                                    ready_jobs_key = ready_jobs_df.loc[
                                        ready_jobs_df_ind,
                                        "ready_jobs_list_key"]

                                    slots_min_list = []
                                    if required_cpu_credits > 0:
                                        num_open_cpu_slots = int(
                                            np.floor(num_open_cpu_credits /
                                                     required_cpu_credits))
                                        slots_min_list.append(
                                            num_open_cpu_slots)

                                    if required_gpu_credits > 0:
                                        num_open_gpu_slots = int(
                                            np.floor(num_open_gpu_credits /
                                                     required_gpu_credits))
                                        slots_min_list.append(
                                            num_open_gpu_slots)

                                    num_open_slots = int(
                                        np.min(slots_min_list))

                                    if num_open_slots > 0:
                                        p = (self.experiment_manager.
                                             strict_redis.pipeline())
                                        p.lrange(
                                            ready_jobs_key, 0,
                                            (num_open_slots -
                                             1))  # lrange is inclusive, so - 1
                                        p.ltrim(ready_jobs_key, num_open_slots,
                                                -1)
                                        pop_job_dicts, _ = p.execute()

                                        if len(pop_job_dicts) > 0:
                                            for pop_job_dict in pop_job_dicts:
                                                pop_job_dict = json.loads(
                                                    pop_job_dict)
                                                print(pop_job_dict)

                                                # checkout the credits
                                                self.redis_client.strict_redis.incrby(
                                                    "cpu_credits_used",
                                                    required_cpu_credits,
                                                )
                                                self.redis_client.strict_redis.incrby(
                                                    "gpu_credits_used",
                                                    required_gpu_credits,
                                                )

                                                self.redis_client.redis_launch_job(
                                                    f"/tmp/dockex/json/{pop_job_dict['name']}.json",
                                                    pop_job_dict,
                                                )

                        time.sleep(self.working_sleep_seconds)
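The slot arithmetic near the end decides how many copies of a ready job can be launched at once: for each credit type the job requires, floor-divide the open credits by the requirement and take the minimum across types. A standalone sketch of that calculation (the function name and example numbers are illustrative; the "else 0" guard is an addition, since the worker above assumes at least one credit type is required):

import numpy as np


def num_open_slots(open_cpu, open_gpu, required_cpu, required_gpu):
    # Only credit types the job actually requires constrain the slot count.
    slots = []
    if required_cpu > 0:
        slots.append(int(np.floor(open_cpu / required_cpu)))
    if required_gpu > 0:
        slots.append(int(np.floor(open_gpu / required_gpu)))
    return int(np.min(slots)) if slots else 0


# Example: 6 open CPU credits, 1 open GPU credit, job needs 2 CPU + 1 GPU -> 1 slot.
print(num_open_slots(6, 1, 2, 1))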