Example #1
0
    def submit(self, app: Application, mode: RunMode) -> str:
        """
        Starts one local subprocess per replica of every role in ``app`` and
        registers the resulting local application in the app cache.

        If anything fails while the replicas are being launched, the
        processes that were already started are killed and reaped before the
        original exception is re-raised, so that no untracked child
        processes are left behind.

        Arguments:
            app: the application to run, every role must have a container
            mode: the run mode that is set on the local application

        Returns:
            the app id, formatted as ``{app.name}_{n}`` where ``n`` is a
            counter kept per app name (starts at 0)

        Raises:
            IndexError: if the app cache is full and ``_evict_lru()`` did not
                free up an entry
        """
        if len(self._apps) == self._cache_size and not self._evict_lru():
            raise IndexError(
                f"App cache size ({self._cache_size}) exceeded. Increase the cache size"
            )

        # per-app-name running counter; first submission of a name gets 0
        seq = self._ids.get(app.name, -1) + 1
        self._ids[app.name] = seq
        app_id = f"{app.name}_{seq}"

        assert (
            app_id not in self._apps
        ), "no app_id collisons expected since incremental integer suffix is used"

        local_app = _LocalApplication(app.name)
        local_app.set_run_mode(mode)

        # processes started so far, needed to roll back on a partial failure
        spawned = []
        try:
            for role in app.roles:
                container = role.container
                assert (
                    container
                ), "all roles in a submitted app must have container association"

                img_root = self._image_fetcher.fetch(container.image)
                cmd = os.path.join(img_root, role.entrypoint)
                for replica_id in range(role.num_replicas):
                    args = [cmd] + macros.substitute(role.args, img_root,
                                                     app_id, str(replica_id))
                    log.info(f"Running {args} with env: {role.env}")
                    proc = subprocess.Popen(args, env=role.env)
                    spawned.append(proc)
                    local_app.add_process(role.name, proc)
        except BaseException:
            # local_app is not in self._apps yet, so nobody else could ever
            # stop these processes; clean them up here and propagate the
            # original error unchanged.
            for proc in spawned:
                try:
                    proc.kill()
                    proc.wait()
                except OSError:
                    # best effort: the process may already be gone
                    pass
            raise

        self._apps[app_id] = local_app
        return app_id
Example #2
0
    def _to_app_popen_args(self, app_id: str, roles: List[Role],
                           cfg: RunConfig):
        """
        returns the popen args for all processes that need to be created for
        the app, as a list with one dict per role (in the order of ``roles``).
        Each dict maps the role name to a list holding one param dict per
        replica. A role with zero replicas contributes an empty dict.

        Each replica param dict has ``args`` and ``env``; ``stdout`` and
        ``stderr`` (log file paths) are only present when
        ``_get_app_log_dir`` returns a non-empty value. ``env`` is the result
        of ``_default_role_envs()`` overlaid with ``role.env``.

        ::

         # for each role
         [
           { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]},
           { <role_name_2> : [{args: cmd, env: env, ... other popen args ...}, ...]},
           ...
         ]

         # example (app has 2 roles: master (1 replica), trainer (2 replicas))
         [
           {
             "master" : [
               {args: "master.par", env: env, ... other popen args ...}
              ]
           },
           {
             "trainer" : [
               {args: "trainer.par", env: env, ... other popen args ...},
               {args: "trainer.par", env: env, ... other popen args ...}
              ]
           },
         ]
        """
        params_per_role = []
        for role in roles:
            container = role.container
            assert (
                container
            ), "all roles in a submitted app must have container association"

            img_root = self._get_img_fetcher(cfg).fetch(container.image)
            entrypoint = os.path.join(img_root, role.entrypoint)

            # the role name key is only created once the first replica is seen
            by_role_name = {}
            for idx in range(role.num_replicas):
                replica = str(idx)
                cmd_line = [entrypoint] + macros.substitute(
                    role.args, img_root, app_id, replica)
                replica_list = by_role_name.setdefault(role.name, [])

                # role-specific env vars take precedence over the defaults
                env = dict(self._default_role_envs())
                env.update(role.env)
                popen_kwargs: Dict[str, Any] = {"args": cmd_line, "env": env}

                log_root = self._get_app_log_dir(app_id, cfg)
                if log_root:
                    replica_log_dir = os.path.join(log_root, role.name,
                                                   replica)
                    popen_kwargs["stdout"] = os.path.join(
                        replica_log_dir, "stdout.log")
                    popen_kwargs["stderr"] = os.path.join(
                        replica_log_dir, "stderr.log")

                replica_list.append(popen_kwargs)

            params_per_role.append(by_role_name)
        return params_per_role
Example #3
0
    def _to_popen_request(
        self,
        app: Application,
        cfg: RunConfig,
    ) -> PopenRequest:
        """
        Converts the application and cfg into a ``PopenRequest``.

        For every replica of every role this records the command line, the
        environment variables, the optional stdout/stderr log file paths and
        the replica log directory (``<app_log_dir>/<role name>/<replica id>``).
        The std streams are only pointed at log files when
        ``_get_app_log_dir`` reports that they should be redirected.
        """

        app_id = make_unique(app.name)
        image_fetcher = self._get_img_fetcher(cfg)
        app_log_dir, redirect_std = self._get_app_log_dir(app_id, cfg)

        role_params: Dict[str, List[ReplicaParam]] = {}
        role_log_dirs: Dict[str, List[str]] = {}
        for role in app.roles:
            params_for_role = role_params.setdefault(role.name, [])
            log_dirs_for_role = role_log_dirs.setdefault(role.name, [])

            img_root = image_fetcher.fetch(role.container.image)
            entrypoint = os.path.join(img_root, role.entrypoint)

            for idx in range(role.num_replicas):
                replica = str(idx)
                cmd_line = [entrypoint] + macros.substitute(
                    role.args, img_root, app_id, replica)
                log_dir = os.path.join(app_log_dir, role.name, replica)

                # this is the top level (agent if using elastic role) error file
                # a.k.a scheduler reply file; role.env may override it
                env = {
                    "TORCHELASTIC_ERROR_FILE":
                    os.path.join(log_dir, "error.json"),
                }
                env.update(role.env)

                out_path = (os.path.join(log_dir, "stdout.log")
                            if redirect_std else None)
                err_path = (os.path.join(log_dir, "stderr.log")
                            if redirect_std else None)

                params_for_role.append(
                    ReplicaParam(cmd_line, env, out_path, err_path))
                log_dirs_for_role.append(log_dir)

        return PopenRequest(app_id, app_log_dir, role_params, role_log_dirs)
Example #4
0
    def _to_app_popen_args(
        self,
        app_id: str,
        roles: List[Role],
        app_log_dir: str,
        redirect_std: bool,
        cfg: RunConfig,
        dryrun: bool = True,
    ):
        """
        returns the popen args for all processes that need to be created for
        the app, as a list with one dict per role (in the order of ``roles``).
        Each dict maps the role name to a list holding one param dict per
        replica. A role with zero replicas contributes an empty dict.

        Each replica gets the log dir ``<app_log_dir>/<role name>/<replica id>``,
        which is created on disk unless ``dryrun`` is set. ``stdout`` and
        ``stderr`` (log file paths in that dir) are only added to the params
        when ``redirect_std`` is set.

        ::

         # for each role
         [
           { <role_name_1> : [{args: cmd, env: env, ... other popen args ...}, ...]},
           { <role_name_2> : [{args: cmd, env: env, ... other popen args ...}, ...]},
           ...
         ]

         # example (app has 2 roles: master (1 replica), trainer (2 replicas))
         [
           {
             "master" : [
               {args: "master.par", env: env, ... other popen args ...}
              ]
           },
           {
             "trainer" : [
               {args: "trainer.par", env: env, ... other popen args ...},
               {args: "trainer.par", env: env, ... other popen args ...}
              ]
           },
         ]
        """
        params_per_role = []
        for role in roles:
            container = role.container
            assert (
                container
            ), "all roles in a submitted app must have container association"

            img_root = self._get_img_fetcher(cfg).fetch(container.image)
            entrypoint = os.path.join(img_root, role.entrypoint)

            # the role name key is only created once the first replica is seen
            by_role_name = {}
            for idx in range(role.num_replicas):
                replica = str(idx)
                cmd_line = [entrypoint] + macros.substitute(
                    role.args, img_root, app_id, replica)
                replica_list = by_role_name.setdefault(role.name, [])

                log_dir = os.path.join(app_log_dir, role.name, replica)
                if not dryrun:
                    # NOTE(review): no exist_ok, so an already existing dir
                    # raises FileExistsError - confirm this is intended
                    os.makedirs(log_dir)

                # this is the top level (agent if using elastic role) error file
                # a.k.a scheduler reply file; role.env may override it
                env = {
                    "TORCHELASTIC_ERROR_FILE":
                    os.path.join(log_dir, "error.json"),
                }
                env.update(role.env)

                popen_kwargs: Dict[str, Any] = {"args": cmd_line, "env": env}
                if redirect_std:
                    popen_kwargs["stdout"] = os.path.join(
                        log_dir, "stdout.log")
                    popen_kwargs["stderr"] = os.path.join(
                        log_dir, "stderr.log")

                replica_list.append(popen_kwargs)

            params_per_role.append(by_role_name)
        return params_per_role