Ejemplo n.º 1
0
    def get_client_cs(self, service_node_info):
        '''
        Build the connection info a client uses to reach the controller on
        the box recorded in service_node_info.

        Args:
            service_node_info: info that service maps to a compute node for a job
                (its "box_name" entry is read here)
        Returns:
            {"ip": value, "port": value, "box_name": value}
        Raises:
            Exception: when the resolved address contains no "." and is not
                the literal "localhost".
        '''
        box_name = service_node_info["box_name"]
        controller_port = constants.CONTROLLER_PORT

        # a name that is not a configured box but refers to this machine is
        # looked up under the "local" box entry instead
        if box_name not in self.config.get("boxes") and pc_utils.is_localhost(box_name):
            box_name = "local"

        # fall back to the box name itself when the box has no "address" entry
        box_addr = self.config.get(
            "boxes",
            box_name,
            dict_key="address",
            default_value=box_name,
            prop_error="box not defined in config file: " + box_name)

        if "@" in box_addr:
            # strip off the username
            _, box_addr = box_addr.split("@", 1)

        if "." not in box_addr and box_addr != "localhost":
            raise Exception(
                "box option must specify a machine by its IP address: " +
                str(box_addr))

        return {"ip": box_addr, "port": controller_port, "box_name": box_name}
Ejemplo n.º 2
0
    def get_psm_client(self, service_node_info):
        '''
        Return a PSM client for the box described by service_node_info.

        Args:
            service_node_info: dict whose "box_os" and "box_addr" entries
                are read here
        Returns:
            a LocalPsmClient when pc_utils.is_localhost() accepts the box
            address, otherwise a RemotePsmClient built from the address and
            a flag saying whether the box OS is "windows".
        '''
        os_name = service_node_info["box_os"]
        addr = service_node_info["box_addr"]

        if not pc_utils.is_localhost(box_addr=addr):
            return RemotePsmClient(addr, os_name == "windows")

        return LocalPsmClient()
Ejemplo n.º 3
0
    def run_job_on_box(self,
                       job_id,
                       run_data_list,
                       box_index,
                       box_info,
                       app_info,
                       pool_info,
                       resume_name=None,
                       repeat=None,
                       using_hp=None,
                       exper_name=None,
                       snapshot_dir=None,
                       args=None):
        '''
        Enqueue the first run of run_data_list on a single box through that
        box's PSM client.

        Only job_id, run_data_list, box_index and box_info are read by this
        method; the remaining parameters are accepted but not used here.

        Returns:
            dict with the keys "fn_entry", "box_addr", "box_os", "box_name",
            "job_id" and "run_name" describing the queued entry.
        '''
        name = box_info.box_name
        addr = box_info.address
        os_name = box_info.box_os

        # only the first run's name is used to label the queued entry
        first_run_name = run_data_list[0]["run_name"]

        # local boxes get the in-process client; anything else goes remote
        if pc_utils.is_localhost(box_addr=addr):
            client = LocalPsmClient()
        else:
            client = RemotePsmClient(addr, os_name == "windows")

        client.restart_psm_if_needed()

        team_name = self.config.get("general", "xt-team-name")
        node = utils.node_id(box_index)

        # the code zip is taken from the (user-expanded) CWD_DIR
        zip_path = file_utils.path_join(
            os.path.expanduser(constants.CWD_DIR), constants.CODE_ZIP_FN)

        entry = client.enqueue(team_name, job_id, first_run_name, node,
                               zip_path)

        fb.feedback("submitted", is_final=True)

        return {
            "fn_entry": entry,
            "box_addr": addr,
            "box_os": os_name,
            "box_name": name,
            "job_id": job_id,
            "run_name": first_run_name
        }
Ejemplo n.º 4
0
    def cancel_thru_os(self, box_name, show_progress=True):
        '''
        Kill the controller process on the specified local/remote box.

        Args:
            box_name: name of the box; passed to pc_utils.is_localhost() to
                choose between the local and the remote cancel path
            show_progress: when true, progress text goes to console.print,
                otherwise to console.diag
        Returns:
            whatever cancel_local_controller() / cancel_remote_controller()
            returns.
        '''
        if show_progress:
            report = console.print
        else:
            report = console.diag

        report("  checking running processes on: " + box_name)

        if pc_utils.is_localhost(box_name):
            return self.cancel_local_controller(report)

        return self.cancel_remote_controller(box_name, report)
Ejemplo n.º 5
0
    def keysend(self, box):
        '''
        Send the public key to a remote box.

        syntax: xt keysend <box name>

        Args:
            box: name or address of the remote box; a local box or
                "azure-batch" is rejected with a syntax error.
        '''
        box_name = box
        if not box_name:
            errors.syntax_error("must specify a box name/address")

        info = box_information.get_box_addr(self.config, box_name, self.store)
        box_addr = info["box_addr"]

        # keys can only be sent to a real remote machine
        if pc_utils.is_localhost(box_name,
                                 box_addr) or box_name == "azure-batch":
            errors.syntax_error(
                "must specify a remote box name or address (e.g., xt keysend username@192.168.1.10)"
            )

        console.print(
            "this will require 2 connections to the remote host, so you will be prompted for a password twice"
        )
        status = self.core.keysend(box_name)
        if status:
            console.print("public key successfully sent.")
Ejemplo n.º 6
0
    def adjust_run_commands(self, job_id, job_runs, using_hp, experiment,
                            service_type, snapshot_dir, args):
        '''
        Let the backend inject needed shell commands ahead of the user cmd.

        A wrapper script is generated once (from the first box's first run)
        via the parent class's wrap_user_command(), and the "cmd_parts" of
        the first run of every box is then replaced by a call to that script
        with the box's node id and run name as arguments.

        Args:
            job_runs: sequence of per-box run lists; only element 0 of each
                list is read and updated
            snapshot_dir: forwarded to wrap_user_command()
            args: dict; "data_local" and "model_local" are read here, and
                the whole dict is passed to get_action_args() and
                wrap_user_command()
            job_id, using_hp, experiment, service_type: not used here
        '''
        (store_data_dir, data_action, data_writable, store_model_dir,
         model_action, model_writable, storage_name,
         storage_key) = self.get_action_args(args)

        data_local = args["data_local"]
        model_local = args["model_local"]

        # local or POOL of vm's: the same generated script serves each box/job
        wrapped_script = None

        for box_index, box_runs in enumerate(job_runs):
            # only the FIRST RUN of each box gets its command wrapped
            first_run = box_runs[0]
            box_info = first_run["box_info"]
            box_name = box_info.box_name
            box_secret = first_run["box_secret"]
            actions = box_info.actions
            node_id = utils.node_id(box_index)

            box_is_windows = (box_info.box_os == "windows")

            run_specs = first_run["run_specs"]
            user_cmd_parts = run_specs["cmd_parts"]
            run_name = first_run["run_name"]

            if not wrapped_script:
                # generated a single time, using the first box/job's settings
                on_localhost = pc_utils.is_localhost(box_name,
                                                     box_info.address)

                if on_localhost:
                    # data_local overrides store_data_dir for LOCAL machine
                    if data_local:
                        store_data_dir = os.path.join(
                            os.path.expanduser(data_local), store_data_dir)
                        data_action = "use_local"
                        if "data" not in actions:
                            actions.append("data")

                    # model_local overrides store_model_dir for LOCAL machine
                    if model_local:
                        store_model_dir = os.path.join(
                            os.path.expanduser(model_local), store_model_dir)
                        model_action = "use_local"
                        if "model" not in actions:
                            actions.append("model")

                setup = self.config.get_setup_from_target_def(self.compute_def)

                env_vars = self.get_env_vars_for_box(box_name, box_info,
                                                     box_index, box_secret)

                # env vars become set/export lines passed as post-setup cmds
                setter = "@set" if box_is_windows else "export"
                env_cmds = [
                    "{} {}={}".format(setter, var_name, var_value)
                    for var_name, var_value in env_vars.items()
                ]

                wrapped_script = super().wrap_user_command(
                    user_cmd_parts,
                    snapshot_dir,
                    store_data_dir,
                    data_action,
                    data_writable,
                    store_model_dir,
                    model_action,
                    model_writable,
                    storage_name,
                    storage_key,
                    actions,
                    is_windows=box_is_windows,
                    sudo_available=False,
                    pip_freeze=False,
                    setup=setup,
                    post_setup_cmds=env_cmds,
                    args=args,
                    nonempty=True)

            # every box's command is pointed at the shared script
            script_part = "{} {} {}".format(
                os.path.basename(wrapped_script), node_id, run_name)

            # NOTE(review): this tests self.is_windows, not the per-box
            # box_os flag computed above -- confirm that is intended
            run_specs["cmd_parts"] = (
                [script_part] if self.is_windows
                else ['/bin/bash', '--login', script_part])