Example #1
0
    def get_box_run_status_inner(self,
                                 box_name,
                                 ws=None,
                                 run_name=None,
                                 stage_flags=""):
        '''
        build a text report describing the controller and runs on a box.
        (original note: requires that the desired box is the current box)

        box_name:    name of the box to report on
        ws:          workspace passed through to the client's jobs report
        run_name:    run name passed through to the client's jobs report
        stage_flags: text used in the report heading and passed through
                     to the client's jobs report

        returns the report as a single string.
        '''
        addr_info = box_information.get_box_addr(self.config, box_name,
                                                 self.store)
        address = addr_info["box_addr"]
        port = addr_info["controller_port"]

        # nothing else can be queried if the controller is down
        if not self.client.is_controller_running(box_name, address, port):
            return "box: " + box_name + "\n" + "  controller is NOT running\n"

        self.client.change_box(box_name, port=port)

        # sections are evaluated in report order: box status, heading, jobs
        sections = [
            "\n" + self.get_box_status(box_name=box_name) + "\n",
            "\n" + stage_flags + " runs on " + box_name.upper() + ":\n",
            self.client.jobs_report(ws=ws,
                                    run_name=run_name,
                                    stage_flags=stage_flags),
        ]
        return "".join(sections)
    def addr(self, box):
        '''
        print the address of the specified box; when the box has a
        controller port configured, also print its controller and
        tensorboard ports.

        box: name of the box to look up
        '''
        box_name = box

        info = box_information.get_box_addr(self.config, box_name, self.store)
        box_addr = info["box_addr"]
        controller_port = info["controller_port"]
        tb_port = info["tensorboard_port"]

        if controller_port:
            # the tensorboard placeholder was missing, so tb_port was
            # passed to format() but never shown
            console.print(
                "{} address: {}, controller port={}, tensorboard port={}".
                format(box_name, box_addr, controller_port, tb_port))
        else:
            console.print("{} address: {}".format(box_name, box_addr))
    def keysend(self, box):
        '''
        send the public key to a remote box.
        syntax: xt keysend <box name>

        box: name/address of the remote box; a syntax error is reported
             when it is empty, resolves to localhost, or is "azure-batch"
        '''
        if not box:
            errors.syntax_error("must specify a box name/address")

        addr_info = box_information.get_box_addr(self.config, box, self.store)
        address = addr_info["box_addr"]

        # only real remote boxes can receive a key
        not_remote = pc_utils.is_localhost(box, address) or box == "azure-batch"
        if not_remote:
            errors.syntax_error(
                "must specify a remote box name or address (e.g., xt keysend [email protected]"
            )

        console.print(
            "this will require 2 connections to the remote host, so you will be prompted for a password twice"
        )

        if self.core.keysend(box):
            console.print("public key successfully sent.")
    def scp(self, cmd):
        '''
        run an scp command after translating "boxname:path" arguments into
        "boxaddr:path" arguments.

        cmd: the scp argument string; it is split on single spaces

        prints the scp output, or a completion message when there is none.
        '''
        tokens = cmd.split(" ")

        for index, token in enumerate(tokens):
            if ":" not in token:
                continue

            # drop one pair of matching surrounding quotes, if present
            for quote in ('"', "'"):
                if token.startswith(quote) and token.endswith(quote):
                    token = token[1:-1]
                    break

            pieces = token.split(":")
            # exactly one colon and a prefix longer than one char: treat
            # the prefix as a box name
            if len(pieces) != 2 or len(pieces[0]) <= 1:
                continue

            box_name, remote_path = pieces
            addr_info = box_information.get_box_addr(self.config, box_name,
                                                     self.store)
            address = addr_info["box_addr"]

            # only rewrite the argument when an address was found
            if address:
                tokens[index] = address + ":" + remote_path

        # remove empty parts (produced by consecutive spaces)
        tokens = list(filter(None, tokens))

        exit_code, output = process_utils.run_scp_cmd(self,
                                                      tokens,
                                                      report_error=True)
        console.print(output if output else "SCP command completed")
Example #5
0
    def connect_to_controller(self, box_name=None, ip_addr=None, port=None):
        '''
        establish communication with the XT controller process on the specified box.
        return True if connection established, False otherwise.

        box_name: name of the box; used to look up the address, controller
                  port and box secret when ip_addr is not supplied
        ip_addr:  explicit address to connect to (skips the box lookup)
        port:     controller port; only used as given when ip_addr is supplied
        '''
        console.diag("init_controler: box_name={}".format(box_name))

        # NOTE(review): this compares the connection object with a box name,
        # so it also matches when both are None - confirm that is intended
        if self.conn == box_name:
            return True

        if not ip_addr:
            info = box_information.get_box_addr(self.config, box_name,
                                                self.store)
            box_addr = info["box_addr"]
            controller_port = info["controller_port"]
            self.token = info["box_secret"]

            ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
            port = controller_port if controller_port else constants.CONTROLLER_PORT

        # the controller should now be running - try to connect
        try:
            console.diag("  connecting to controller")
            self.connect(box_name, ip_addr, port=port)
            console.diag("  connection successful!")

            # magic step: allows our callback to work correctly!
            # this must always be executed (even if self.conn is already true)
            rpyc.BgServingThread(self.conn)
            console.diag("  now running BgServingThread")
        except Exception as ex:
            # most common reasons for failure: not yet running (backend service) or finished running.
            # only Exception is caught so that KeyboardInterrupt/SystemExit still
            # propagate; the reason is logged instead of being silently dropped.
            console.diag("  connection failed: {}".format(ex))
            return False

        return True
Example #6
0
    def connect_to_box_for_run(self, ws_name, run_name):
        '''
        connect to the controller of the box associated with the specified run.

        ws_name:  workspace name of the run
        run_name: name of the run

        returns the tuple (state, connected, box_name, job_id); box_name is
        replaced by "<addr>:<port>" when a controller port is known.
        '''
        box_name, job_id, node_index = self.get_run_info(ws_name, run_name)

        addr_info = box_information.get_box_addr(self.config, box_name,
                                                 self.store)
        ip_addr = addr_info["box_addr"]

        # fall back to this object's port when the box has none configured
        controller_port = addr_info["controller_port"] or self.port

        # NOTE(review): state is never assigned anything but None here, so
        # the "deallocated" branch below cannot be taken
        state = None

        if state == "deallocated":
            connected = False
        elif controller_port:
            connected = self.connect_to_controller(ip_addr=ip_addr,
                                                   port=controller_port)
        else:
            connected = self.connect_to_controller(box_name=box_name)

        if controller_port:
            box_name = "{}:{}".format(ip_addr, controller_port)

        return state, connected, box_name, job_id
Example #7
0
    def get_tensorboard_status(self, ws_name, run_name, box_name):
        '''
        query the controller for its tensorboard status and add box
        information to the result.

        ws_name:  workspace name (used together with run_name to locate the box)
        run_name: run name; when both ws_name and run_name are set, the
                  connection is made to the box associated with the run
        box_name: box to connect to when no run is specified

        returns the status mapping from the controller, extended with the
        "box_name", "ip_addr" and "tensorboard_port" entries.
        '''
        if ws_name and run_name:
            # connect_to_box_for_run() takes (ws_name, run_name); the
            # workspace argument was previously omitted
            self.connect_to_box_for_run(ws_name, run_name)
        else:
            self.change_box(box_name)

        # get running status from controller
        status = self.conn.root.get_tensorboard_status(self.token)

        # add other info to status
        if not box_name:
            box_name, _job_id, _node_index = self.get_run_info(ws_name, run_name)

        info = box_information.get_box_addr(self.config, box_name, self.store)

        status["box_name"] = box_name
        # ip_addr was previously referenced without being defined; report
        # the box address from the lookup
        status["ip_addr"] = info["box_addr"]
        status["tensorboard_port"] = (info["tensorboard_port"]
                                      or constants.TENSORBOARD_PORT)

        return status
Example #8
0
    def cancel_controller(self, box_name, os_call_only=False):
        '''
        stop the controller on the specified box.

        box_name:     name of the box whose controller should be stopped
        os_call_only: when True, skip the shutdown request and go straight
                      to the OS-level cancel

        unless os_call_only is set, a shutdown request is sent first if the
        controller is running; when no shutdown request was sent, the
        controller is cancelled thru the OS instead.
        '''
        was_shut_down = False

        if not os_call_only:
            try:
                # first try to cancel it thru a SHUTDOWN REQUEST
                self.ensure_token_is_set()

                addr_info = box_information.get_box_addr(
                    self.config, box_name, self.store)
                address = addr_info["box_addr"]

                if self.is_controller_running(box_name, address):
                    self.conn.root.shutdown(self.token)
                    was_shut_down = True
            except BaseException as ex:
                # NOTE(review): the error is re-raised, so the OS-level
                # fallback below is not reached when the request fails
                console.print("shutdown request result: ex={}".format(ex))
                raise ex

        if was_shut_down:
            return

        # no shutdown request was sent: kill the process if local or PEER
        self.cancel_thru_os(box_name)
    def ssh(self, name, cmd, workspace, output):
        '''
        run an ssh command (or session, when cmd is empty) on a box.

        name:      box name; names starting with "run" are treated as run
                   names, which are not currently supported
        cmd:       command to run; when set, its output is captured
        workspace: workspace of the run (unused while run support is disabled)
        output:    optional file path; when set, the captured output is
                   requested as bytes and written to this file
        '''
        capture_output = True if cmd else False

        if name.startswith("run"):
            # assume it's a RUN name.  the run-based implementation (which
            # resolved the ssh command thru the philly backend) is disabled,
            # so there is no ssh output to report; previously this path fell
            # thru and failed on the unassigned ssh_output variable.
            console.print(
                "ssh to a run is not currently supported: {}".format(name))
            return

        # assume it's a BOX name
        info = box_information.get_box_addr(self.config, name, self.store)
        ssh_ip = info["box_addr"]

        # output written to a file is captured as raw bytes
        capture_as_bytes = bool(output)

        _exit_code, ssh_output = process_utils.sync_run_ssh(
            self,
            ssh_ip,
            cmd,
            capture_output=capture_output,
            capture_as_bytes=capture_as_bytes)

        if output:
            # write as bytes
            with open(output, "wb") as outfile:
                outfile.write(ssh_output)
        elif capture_output:
            console.print(ssh_output)