Example #1
def wrapup_run_with_context(store, run, context_dict):
    context = utils.dict_to_object(context_dict)
    status = "cancelled"
    exit_code = 0
    node_id = utils.node_id(context.node_index)

    # use info from run, when possible (context is shared among all child runs)
    run_index = run["run_index"]
    run_name = run["run_name"]

    # these we don't have info for
    rundir = None  # unknown
    log = True
    capture = True

    store.wrapup_run(context.ws,
                     run_name,
                     context.aggregate_dest,
                     context.dest_name,
                     status,
                     exit_code,
                     context.primary_metric,
                     context.maximize_metric,
                     context.report_rollup,
                     rundir,
                     context.after_files_list,
                     log,
                     capture,
                     job_id=context.job_id,
                     node_id=node_id,
                     run_index=run_index)
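A minimal usage sketch. The field names below are assumptions inferred from the context attributes the function reads; a real context_dict comes from the XT controller's shared run context:

    # hypothetical call: wrap up a cancelled child run on node 0
    run = {"run_index": 3, "run_name": "run1000.4"}
    context_dict = {
        "ws": "ws1", "node_index": 0, "aggregate_dest": "job",
        "dest_name": "job1000", "primary_metric": "test-acc",
        "maximize_metric": True, "report_rollup": True,
        "after_files_list": ["output/**"], "job_id": "job1000",
    }
    wrapup_run_with_context(store, run, context_dict)   # store: an open XT store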
Example #2
def get_service_node_info(job_info, node_index):

    node_id = utils.node_id(node_index)
    service_info_by_node = job_info["service_info_by_node"]
    service_node_info = service_info_by_node[node_id]

    return service_node_info
Example #3
def get_client_cs(core, job_id, node_index):
    '''
    Instantiate the backend service that owns the specified job node and
    request its client connection string.
    '''
    cs = None
    box_secret = None

    filter = {"_id": job_id}
    jobs = core.store.mongo.get_info_for_jobs(filter, None)
    if not jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(
            node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    if compute and node_info:
        backend = core.create_backend(compute)
        cs = backend.get_client_cs(node_info)

    cs_plus = {"cs": cs, "box_secret": box_secret, "job": job}
    return cs_plus
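A hedged usage sketch (assumes core is an initialized XT core object whose store is backed by mongo, as the lookup above requires; connect_to_controller is a hypothetical helper):

    # hypothetical: fetch controller connection info for node 0 of a job
    cs_plus = get_client_cs(core, "job1000", node_index=0)
    if cs_plus["cs"]:
        connect_to_controller(cs_plus["cs"], cs_plus["box_secret"])
    else:
        print("no client connection string available for this node")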
Example #4
    def run_job_on_box(self,
                       job_id,
                       run_data_list,
                       box_index,
                       box_info,
                       app_info,
                       pool_info,
                       resume_name=None,
                       repeat=None,
                       using_hp=None,
                       exper_name=None,
                       snapshot_dir=None,
                       args=None):

        box_name = box_info.box_name
        box_addr = box_info.address
        box_os = box_info.box_os
        is_box_windows = (box_os == "windows")

        run_data = run_data_list[0]
        run_name = run_data["run_name"]

        if pc_utils.is_localhost(box_addr=box_addr):
            psm_client = LocalPsmClient()
        else:
            psm_client = RemotePsmClient(box_addr, is_box_windows)

        psm_client.restart_psm_if_needed()
        #print("psm created for box: " + box_addr)

        team = self.config.get("general", "xt-team-name")
        node_id = utils.node_id(box_index)

        cwd_dir = os.path.expanduser(constants.CWD_DIR)
        fn_src_zip = file_utils.path_join(cwd_dir, constants.CODE_ZIP_FN)

        fn_entry = psm_client.enqueue(team, job_id, run_name, node_id,
                                      fn_src_zip)

        service_node_info = {
            "fn_entry": fn_entry,
            "box_addr": box_addr,
            "box_os": box_os,
            "box_name": box_name,
            "job_id": job_id,
            "run_name": run_name
        }

        fb.feedback("submitted", is_final=True)

        return service_node_info
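The dict returned above is the per-node record that get_service_node_info (Example #2 above) later reads back out of the job info, keyed by node_id; a sketch of that round trip:

    # hypothetical round trip: stored at submit time, fetched at monitor time
    job_info = {"service_info_by_node": {utils.node_id(0): service_node_info}}
    info = get_service_node_info(job_info, node_index=0)
    assert info["fn_entry"] == service_node_info["fn_entry"]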
Example #5
    def close(self):
        if self.xt_logging and self.direct_run and self.store and self.context:
            context = self.context
            status = "completed"
            exit_code = 0
            rundir = "."
            node_id = utils.node_id(context.node_index)

            # wrap up the run (usually done by controller)
            self.store.wrapup_run(context.ws,
                                  self.run_name,
                                  context.aggregate_dest,
                                  context.dest_name,
                                  status=status,
                                  exit_code=exit_code,
                                  primary_metric=context.primary_metric,
                                  maximize_metric=context.maximize_metric,
                                  report_rollup=context.report_rollup,
                                  rundir=rundir,
                                  after_files_list=context.after_files_list,
                                  after_omit_list=context.after_omit_list,
                                  log_events=context.log,
                                  capture_files=context.after_upload,
                                  job_id=context.job_id,
                                  is_parent=True,
                                  node_id=node_id,
                                  run_index=None)

        if self.train_writer:
            self.train_writer.close()
            self.test_writer.close()

        if self.train_writer2:
            self.train_writer2.close()
            self.test_writer2.close()

        if self.is_aml and self.store:
            # partially log the end of the run

            # TODO: how to do this partial log for killed/error runs?
            status = "completed"
            exit_code = 0
            restarts = None
            hparams_dict = None
            metrics_rollup_dict = None
            end_time = utils.get_time()
            log_records = []

            self.store.end_run(self.ws_name,
                               self.run_name,
                               status,
                               exit_code,
                               hparams_dict,
                               metrics_rollup_dict,
                               end_time=None,
                               restarts=restarts,
                               aggregate_dest=None,
                               dest_name=None,
                               is_aml=True)

            self.store.update_mongo_run_at_end(self.ws_name, self.run_name,
                                               status, exit_code, restarts,
                                               end_time, log_records,
                                               hparams_dict,
                                               metrics_rollup_dict)
Example #6
    def process_run_command(self, args):
        self.args = args

        # ensure workspace exists
        workspace = args['workspace']
        dry_run = args['dry_run']
        fake_submit = args["fake_submit"]

        if not fake_submit:
            self.store.ensure_workspace_exists(workspace, flag_as_error=False)

        # PRE-PROCESS ARGS
        service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
            self.process_args(args)

        # create backend helper (pool, philly, batch, aml)
        cluster = utils.safe_value(compute_def, "cluster")
        vc = utils.safe_value(compute_def, "vc")
        self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

        # add conda_packages and pip_packages from SETUP to ARGS
        setup_def = self.config.get_setup_from_target_def(compute_def)

        conda_packages = utils.safe_value(setup_def, "conda-packages")
        pip_packages = utils.safe_value(setup_def, "pip-packages")

        args["conda_packages"] = conda_packages if conda_packages else []
        args["pip_packages"] = pip_packages if pip_packages else []

        self.adjust_pip_packages(args)

        snapshot_dir = self.temp_dir

        if fake_submit:
            script_dir = snapshot_dir
        else:
            # note: always create a snapshot dir for backends to add needed files
            file_utils.ensure_dir_deleted(snapshot_dir)
            script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

        self.script_dir = script_dir
        direct_run = args["direct_run"]

        # do we need to start the xt controller?
        use_controller = not direct_run
        adjustment_scripts = None

        # create a job_secret that can later be used to authenticate with the XT controller
        # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
        job_secret = str(uuid.uuid4())

        # do we need to build a "docker run" command?
        if not self.backend.provides_container_support():
            env = args["docker"]
            if not env:
                docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name     # for use in building run context info

        # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
        cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
            self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

        if dry_run:
            return

        # make new values available
        args["search_style"] = search_style
        args["total_run_count"] = total_run_count

        resume_name = args['resume_name']
        keep_name = False  # args['keep_name']
        experiment = args['experiment']
        is_distributed = args['distributed']
        direct_run = args["direct_run"]

        # CREATE JOB to hold all runs
        if fake_submit:
            # use lastrun/lastjob info to get a fast incremental fake job number
            xtd = xt_dict.read_xt_dict()
            fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
            xtd["fake_job_num"] = fake_job_num + 1
            xt_dict.write_xt_dict(xtd)
            job_id = "fake_job" + str(fake_job_num)
        else:
            job_id = self.store.create_job()
        fb.feedback(job_id)

        # start the feedback (by parts)
        fb.feedback("{}: {}".format("target", compute))

        # write hparams to FILES
        boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

        if sweeps_text and not fake_submit:
            self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

        # if num_boxes > 1 and service_type != "batch":
        #     fb.feedback("", is_final=True)

        parent_name = None

        # BUILD RUNS, by box
        job_runs = []
        run_count = 1 if is_distributed else len(boxes) 
        secrets_by_node = {}
        remote_control = args["remote_control"]

        for i in range(run_count):
            box_name = boxes[i]

            # generate a box secret for talking to XT controller for this node
            box_secret =  str(uuid.uuid4()) if remote_control else ""

            # build runs for box_name
            run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam, run_specs, job_id, 
                parent_name, cmds, pool_info, repeat_count, fake_submit, search_style, box_secret, args)

            # for now, adhere to the more general design of multiple runs per box
            box_runs = [run_data]      
            job_runs.append(box_runs)

            node_id = utils.node_id(i)            
            secrets_by_node[node_id] = box_secret

            # FEEDBACK 
            ptype = "single " if search_style == "single" else "parent "
            if is_distributed:
                ptype = "master "

            if run_count == 1:
                node_msg = "creating {}run".format(ptype)
            else:
                node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

            if service_type == "pool":
                node_msg += ", box: " + box_name

            fb.feedback(node_msg, id="node_msg")  # , add_seperator=is_last)
            last_msg = node_msg

            # run the job

        # build box: runs dict for job info file
        runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

        # now that we have static run names for all nodes, we can adjust cmds (and before files) for using the controller
        if use_controller:
            # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
            # this will also adjust commands for each node to run the XT controller
            adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        else:
            adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        # add env vars used by both controller and runs
        env_vars = args["env_vars"]

        # create a job guid to uniquely identify this job across all XT instances
        job_guid = str(uuid.uuid4())

        # we add with "node0" and "job_secret", but backend service will override for each node
        scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

        data_local = args["data_local"]
        if "$scriptdir" in data_local:
            data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
            args["data_local"] = data_local

        model_local = args["model_local"]
        if "$scriptdir" in model_local:
            model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
            args["model_local"] = model_local

        # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
        self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

        # upload CODE from snapshot_dir
        code_upload = args["code_upload"]
        code_omit = args["code_omit"]
        code_zip = args["code_zip"]
    
        if not fake_submit:
            if code_upload:
                self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

            # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
            data_upload = args["data_upload"]
            if data_upload:
                if not data_local:
                    errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

                data_omit = args["data_omit"]
                data_zip = "none"

                self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)
        
        # dispatch to BACKEND submitters
        '''
        Note: backend submitter functions are responsible for:
            - submitting the job (for each node, queue runs for that node)
            - return service job id (or list of them if per node)

        NOTE: there is a timing issue where the submitted job needs access to job info, but the final piece
        of job info (service info) is only returned after the job is submitted.  Therefore, we structure the steps as follows:

            - primary job info is logged
            - job is submitted thru backend
            - service info for job is logged
        '''

        # LOG PRIMARY JOB INFO
        dd = {}

        if not fake_submit:
            # mark runs as QUEUED
            for runs in runs_by_box.values():
                first_run = runs[0]
                self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"}) 

            # write the job info file (now that backend has had a chance to update it)
            job_num = int(job_id[3:])

            xt_cmd = args["xt_cmd"]
            schedule = args["schedule"]
            concurrent = args["concurrent"]

            # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
            dynamic_runs_remaining = None if search_style == "single" else total_run_count
            node_count = len(runs_by_box)

            # static_runs_by_node = None
            # if schedule == "static":
            #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
            #console.diag("static_runs_by_node=", static_runs_by_node)

            active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

            dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace, "exper_name": experiment, 
                "pool_info": compute_def, "runs_by_box": runs_by_box, 
                "primary_metric": args["primary_metric"], 
                "run_count": total_run_count, "repeat": repeat_count, "search_type": args["search_type"], 
                "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
                "job_status": "submitted", "running_nodes": 0, 
                "running_runs": 0, "error_runs": 0, "completed_runs": 0, "job_guid": job_guid, "job_secret": job_secret,
                "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,     
                "active_runs": active_runs,  "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,  
                "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
                "service_job_info": None, "service_info_by_node": None,
            }

            self.store.log_job_info(job_id, dd)

        # SUBMIT JOB 
        # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
        service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info, resume_name, 
            repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

        # POST SUBMIT processing

        # update job info 
        if not fake_submit:
            dd["service_job_info"] = service_job_info
            dd["service_info_by_node"] = service_info_by_node
            self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

        # return values for API support (X)
        return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id 
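The comment block above describes a timing constraint: service info only exists after submit, so the job info file is written twice. Distilled into a standalone sketch (the helper name and argument passing are illustrative, not XT's actual API surface beyond the calls shown above):

    def submit_with_service_info(store, backend, job_id, dd, *submit_args):
        # 1. primary job info is logged first (without service info)
        dd["service_job_info"] = None
        dd["service_info_by_node"] = None
        store.log_job_info(job_id, dd)

        # 2. job is submitted through the backend; service info comes back here
        service_job_info, service_info_by_node = backend.submit_job(job_id, *submit_args)

        # 3. job info is re-logged, now including the service info
        dd["service_job_info"] = service_job_info
        dd["service_info_by_node"] = service_info_by_node
        store.log_job_info(job_id, dd)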
Example #7
    def adjust_run_commands(self, job_id, job_runs, using_hp, experiment,
                            service_type, snapshot_dir, args):
        '''
        This method is called to allow the backend to inject needed shell commands before the user cmd.  At the
        time this is called, files can still be added to snapshot_dir.
        '''
        store_data_dir, data_action, data_writable, store_model_dir, model_action, model_writable,  \
            storage_name, storage_key = self.get_action_args(args)

        # local or POOL of vm's
        fn_wrapped = None  # we use same script for each box (but with different ARGS)
        username = args["username"]

        for i, box_runs in enumerate(job_runs):
            # wrap the user commands in FIRST RUN of each box (apply data/model actions)
            br = box_runs[0]
            box_info = br["box_info"]
            actions = ["data", "model"]
            run_name = br["run_name"]
            is_windows = False
            node_id = utils.node_id(i)

            run_specs = br["run_specs"]
            cmd_parts = run_specs["cmd_parts"]

            if not fn_wrapped:
                # just wrap the user cmd once (shared by all boxes/nodes)
                assert cmd_parts[0] == "python"
                assert cmd_parts[1] == "-u"
                assert len(cmd_parts[2]) > 0

                # update the target_fn (might have been switched to the xt controller)
                target_fn = cmd_parts[2]
                arg_parts = cmd_parts[3:]

                setup = self.config.get_setup_from_target_def(self.compute_def)

                # we only do this once (for the first box/job)
                fn_wrapped = super().wrap_user_command(cmd_parts,
                                                       snapshot_dir,
                                                       store_data_dir,
                                                       data_action,
                                                       data_writable,
                                                       store_model_dir,
                                                       model_action,
                                                       model_writable,
                                                       storage_name,
                                                       storage_key,
                                                       actions,
                                                       is_windows=is_windows,
                                                       sudo_available=False,
                                                       username=username,
                                                       use_username=False,
                                                       install_blobfuse=True,
                                                       setup=setup,
                                                       change_dir=False,
                                                       args=args)

                # AML wants a python script, so use our tiny python shim to run wrapped.sh
                fn_shim = "aml_shim.py"
                fn_from = file_utils.get_xtlib_dir() + "/backends/" + fn_shim
                fn_to = snapshot_dir + "/" + fn_shim
                shutil.copyfile(fn_from, fn_to)

                # copy to submit-logs
                utils.copy_to_submit_logs(args, fn_from)

            # we update each box's command (passing RUN_NAME as arg to wrapped.sh)
            script_part = "{} {} {}".format(os.path.basename(fn_wrapped),
                                            node_id, run_name)
            sh_parts = ['/bin/bash', '--login', script_part]

            # pass sh_parts as a single argument to avoid weird "arg": 1 problems with AML estimators
            wrapped_parts = ["python", "-u", fn_shim, " ".join(sh_parts)]
            run_specs["cmd_parts"] = wrapped_parts
Example #8
    def adjust_run_commands(self, job_id, job_runs, using_hp, experiment,
                            service_type, snapshot_dir, args):
        '''
        This method is called to allow the backend to inject needed shell commands before the user cmd.  This 
        base implementation does so by generating a new script file and adding it to the snapshot_dir.
        '''
        store_data_dir, data_action, data_writable, store_model_dir, model_action, model_writable,  \
            storage_name, storage_key = self.get_action_args(args)

        # local or POOL of vm's
        fn_wrapped = None  # we use same generated script on each box/job
        data_local = args["data_local"]
        model_local = args["model_local"]

        for i, box_runs in enumerate(job_runs):
            # wrap the user commands in FIRST RUN of each box (apply data/model actions)
            br = box_runs[0]
            box_info = br["box_info"]
            box_name = box_info.box_name
            box_secret = br["box_secret"]
            actions = box_info.actions
            node_id = utils.node_id(i)

            is_windows = box_info.box_os == "windows"

            run_specs = br["run_specs"]
            cmd_parts = run_specs["cmd_parts"]
            run_name = br["run_name"]

            if not fn_wrapped:

                # we only do this once (for the first box/job)
                using_localhost = pc_utils.is_localhost(
                    box_name, box_info.address)

                # data_local overrides store_data_dir for LOCAL machine
                if using_localhost and data_local:
                    store_data_dir = os.path.join(
                        os.path.expanduser(data_local), store_data_dir)
                    data_action = "use_local"
                    if not "data" in actions:
                        actions.append("data")

                # model_local overrides store_model_dir for LOCAL machine
                if using_localhost and model_local:
                    store_model_dir = os.path.join(
                        os.path.expanduser(model_local), store_model_dir)
                    model_action = "use_local"
                    if not "model" in actions:
                        actions.append("model")

                setup = self.config.get_setup_from_target_def(self.compute_def)

                env_vars = self.get_env_vars_for_box(box_name, box_info, i,
                                                     box_secret)
                post_cmds = []

                # add env vars to script
                setter = "@set" if is_windows else "export"

                for name, value in env_vars.items():
                    cmd = "{} {}={}".format(setter, name, value)
                    post_cmds.append(cmd)

                #"xt download before/code --job={} --unzip "

                fn_wrapped = super().wrap_user_command(
                    cmd_parts,
                    snapshot_dir,
                    store_data_dir,
                    data_action,
                    data_writable,
                    store_model_dir,
                    model_action,
                    model_writable,
                    storage_name,
                    storage_key,
                    actions,
                    is_windows=is_windows,
                    sudo_available=False,
                    pip_freeze=False,
                    setup=setup,
                    post_setup_cmds=post_cmds,
                    args=args,
                    nonempty=True)

            # we update each box's command
            script_part = "{} {} {}".format(os.path.basename(fn_wrapped),
                                            node_id, run_name)
            if self.is_windows:
                sh_parts = [script_part]
            else:
                sh_parts = ['/bin/bash', '--login', script_part]
            run_specs["cmd_parts"] = sh_parts
Example #9
    def build_data_frames(self):
        '''
        1. for each run, collect the reported metrics as metric sets (by reported col list)

        2. append to the dataframe for that col list
        '''
        # build "data_frames"
        no_metrics = []
        pp_run_names = []
        used_max = False
        data_frames_by_cols = {}
        got_columns = False

        for i, record in enumerate(self.run_log_records):
            # extract metrics for this run
            run = record["_id"]
            node = utils.node_id(record["node_index"])
            job = record["job_id"]
            experiment = record["exper_name"]
            workspace = record["ws"]
            search_style = utils.safe_value(record, "search_style")
            if search_style and search_style != "single":
                # parent run with children - skip it
                continue

            log_records = record["log_records"]

            metric_sets = run_helper.build_metrics_sets(log_records)
            if not metric_sets:
                no_metrics.append(run)
                continue

            if self.max_runs and len(pp_run_names) >= self.max_runs:
                used_max = True
                break

            if not got_columns:
                # set x and y columns
                explicit = qfe.get_explicit_options()
                if not "x" in explicit:
                    self.x_col = self.get_actual_x_column(
                        metric_sets, self.x_col, self.col_names)

                if not self.col_names:
                    # not specified by user, so build defaults
                    self.col_names = self.get_default_y_columns(
                        metric_sets, self.x_col)

                got_columns = True

            # merge metric sets into dfx
            for metric_set in metric_sets:

                # create a pandas DataFrame
                df = pd.DataFrame(metric_set["records"])
                cols = str(list(df.columns))

                # ensure this df has our x_col
                if self.x_col and not self.x_col in cols:
                    continue

                # ensure this df has at least 1 y_col
                found_y = False
                for y in self.col_names:
                    if y in cols:
                        found_y = True
                        break

                if not found_y:
                    continue

                # add run_name column
                df["run"] = [run] * df.shape[0]
                df["node"] = [node] * df.shape[0]
                df["job"] = [job] * df.shape[0]
                df["experiment"] = [experiment] * df.shape[0]
                df["workspace"] = [workspace] * df.shape[0]

                if not cols in data_frames_by_cols:
                    data_frames_by_cols[cols] = df
                else:
                    dfx = data_frames_by_cols[cols]
                    # DataFrame.append was removed in pandas 2.x; concat preserves the old behavior
                    data_frames_by_cols[cols] = pd.concat([dfx, df])

            pp_run_names.append(run)

        if no_metrics:
            console.print(
                "\nnote: following runs were skipped (currently have no logged metrics): \n    {}\n"
                .format(", ".join(no_metrics)))

        if used_max:
            console.print(
                "plotting first {} runs (use --max-runs to override)".format(
                    self.max_runs))
        else:
            console.print("plotting {} runs...".format(len(pp_run_names)))

        # update our list of run_names to process
        self.run_names = pp_run_names

        return data_frames_by_cols
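The returned dict maps a stringified column list to a single concatenated DataFrame covering every matching run; a hedged consumer sketch:

    # hypothetical consumer of build_data_frames() output
    for cols, df in data_frames_by_cols.items():
        # cols looks like "['step', 'train-acc']"; df gained run/node/job/
        # experiment/workspace columns above for grouping
        print("{} rows for columns {}".format(df.shape[0], cols))
        for run_name, run_df in df.groupby("run"):
            pass  # e.g. plot run_df[x_col] against each y column present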
Example #10
    def monitor_job_node(self, job_id, jupyter, sleep, node_index, log_name,
                         escape):

        if node_index is None:
            node_index = 0

        backend, job_info = job_helper.get_job_backend(self.store, self.core,
                                                       job_id)

        node_id = utils.node_id(node_index)
        service_info_by_node = job_info["service_info_by_node"]
        service_node_info = service_info_by_node[node_id]

        service_name = backend.get_name()
        node_count = len(job_info["service_info_by_node"])

        console.print("==> monitoring: {}, node{} [{}] (press escape or control-c to exit, +/- to change node)" \
            .format(job_id, node_index, service_name), flush=True)

        # the monitoring loop
        start_offset = 0
        simple_status = None
        service_status = None
        kb_sleep = .1  # be quick to respond to user's key presses
        sleeps_per_call = max(1, sleep // kb_sleep)
        sleep_count = 0
        ch = None
        first_call = True
        offset_by_node = {}
        node_id = utils.node_id(node_index)
        first_text_of_stream = True
        display_count = 0
        started = time.time()

        try:
            with KeyPressChecker() as checker:

                while simple_status != "completed":
                    ch = checker.getch_nowait()
                    #print(ch, end=" ")

                    if ch == constants.ESCAPE or ch == constants.CONTROL_C:
                        break

                    if ch in ["+", "-"] and node_count > 1:
                        # save current context
                        offset_by_node[node_id] = start_offset

                        # increment node_index
                        delta = 1 if ch == "+" else -1
                        node_index = (delta + node_index) % node_count
                        node_id = utils.node_id(node_index)

                        # set new context
                        service_node_info = service_info_by_node[node_id]
                        start_offset = offset_by_node.get(node_id, 0)
                        first_text_of_stream = not start_offset
                        console.print(
                            "==> switching to: node{}".format(node_index))

                    if ch == "q":
                        # diagnostic aid (undocumented, only for pool service)
                        console.print("{} service queue:".format(node_id),
                                      flush=True)

                        entries = backend.get_service_queue_entries(
                            service_node_info)
                        if entries is None:
                            console.print("  <not supported for this service")
                        else:
                            if entries:
                                for entry in entries:
                                    marker = "*" if entry["current"] else " "
                                    console.print("  {} {}".format(
                                        marker, entry["name"]))
                            else:
                                console.print("  <no entries>")

                    if sleep_count == sleeps_per_call:

                        # time to read the log file
                        result = backend.read_log_file(
                            service_node_info,
                            log_name=log_name,
                            start_offset=start_offset)

                        new_text = result["new_text"]
                        new_simple = result["simple_status"]
                        new_service = result["service_status"]
                        start_offset = result["next_offset"]
                        new_log = result["log_name"]

                        if new_log != log_name:
                            log_name = new_log
                            console.print(
                                "==> node{} streaming log: {}".format(
                                    node_index, log_name))
                            first_text_of_stream = True

                        if new_service != service_status:
                            service_status = new_service
                            simple_status = new_simple
                            console.print("==> node{} status: {} ({})".format(
                                node_index, service_status, simple_status))

                        if new_text:
                            if node_count > 1:
                                # prepend each new line with node_id
                                prefix = node_id + ": "
                                new_text = new_text.replace(
                                    "\n", "\n" + prefix)

                                if first_text_of_stream:
                                    new_text = prefix + new_text
                                    first_text_of_stream = False

                            console.print(new_text, end="")
                            display_count += 1

                        first_call = False
                        sleep_count = 0

                    time.sleep(kb_sleep)
                    sleep_count += 1

                    if escape:
                        # have we exceeded max time in monitoring?
                        elapsed = time.time() - started
                        if elapsed >= escape:
                            break

        except KeyboardInterrupt:
            ch = constants.CONTROL_C

        show_final_status = (display_count > 0)

        if ch == constants.ESCAPE:
            console.print("==> monitoring cancelled (escape key detected)")

        elif ch == constants.CONTROL_C:
            console.print("==> monitoring cancelled (control-c detected)")
            ch = single_char_input("do you want to cancel the job? (y/n): ")
            if ch == "y":
                self.cancel_job(job_id)
                show_final_status = False

        if show_final_status:
            console.print("==> node{} status: {} ({})".format(
                node_index, service_status, simple_status))
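The loop above implies a result contract for backend.read_log_file; the field names are taken from the code, while the sample values are assumptions:

    # implied shape of backend.read_log_file(...) results (sketch)
    result = {
        "new_text": "",              # text appended since start_offset
        "simple_status": "running",  # normalized status; loop ends on "completed"
        "service_status": "Running", # backend-specific status string
        "next_offset": 1024,         # pass back as start_offset on the next call
        "log_name": "stdout.txt",    # assumed name; can change mid-run
    }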