def wrapup_run_with_context(store, run, context_dict):
    '''
    Wrap up a single run with status "cancelled" and exit code 0.

    store: object whose wrapup_run() method does the actual wrap up.
    run: run record; its "run_index" and "run_name" entries are used.
    context_dict: dict form of the run context (converted to an object
        with utils.dict_to_object).
    '''
    ctx = utils.dict_to_object(context_dict)
    node_id = utils.node_id(ctx.node_index)

    # prefer values from the run itself (the context is shared among all child runs)
    run_index = run["run_index"]
    run_name = run["run_name"]

    # NOTE(review): the last two positional values below (log / capture) are
    # passed by position; close() passes after_omit_list, log_events and
    # capture_files by keyword. Confirm the positions against the
    # wrapup_run() signature.
    store.wrapup_run(
        ctx.ws,
        run_name,
        ctx.aggregate_dest,
        ctx.dest_name,
        "cancelled",              # status
        0,                        # exit_code
        ctx.primary_metric,
        ctx.maximize_metric,
        ctx.report_rollup,
        None,                     # rundir: unknown at this point
        ctx.after_files_list,
        True,                     # log (no run-specific info available)
        True,                     # capture (no run-specific info available)
        job_id=ctx.job_id,
        node_id=node_id,
        run_index=run_index)
def get_service_node_info(job_info, node_index):
    '''
    Return the entry of job_info["service_info_by_node"] that belongs to the
    node with the specified index.
    '''
    key = utils.node_id(node_index)
    return job_info["service_info_by_node"][key]
def get_client_cs(core, job_id, node_index):
    '''
    instantiate the backend service that owns the specified job node and
    request its client connection string

    returns a dict with the keys "cs" (None when the job has no compute or
    no service info for the node), "box_secret" and "job".
    '''
    job_filter = {"_id": job_id}
    matching_jobs = core.store.mongo.get_info_for_jobs(job_filter, None)
    if not matching_jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = matching_jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(
            node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    # the connection string can only be requested when we know both the
    # compute target and the service info of the node
    client_cs = None
    if compute and node_info:
        client_cs = core.create_backend(compute).get_client_cs(node_info)

    return {"cs": client_cs, "box_secret": box_secret, "job": job}
def run_job_on_box(self, job_id, run_data_list, box_index, box_info, app_info, pool_info, resume_name=None, repeat=None, using_hp=None, exper_name=None, snapshot_dir=None, args=None):
    '''
    Enqueue the first run of run_data_list on the PSM of the specified box.

    Returns the service node info dict for the box ("fn_entry", "box_addr",
    "box_os", "box_name", "job_id", "run_name").
    '''
    box_name = box_info.box_name
    box_addr = box_info.address
    box_os = box_info.box_os

    run_name = run_data_list[0]["run_name"]

    # talk to the PSM directly on this machine, otherwise thru the remote client
    psm_client = LocalPsmClient() if pc_utils.is_localhost(box_addr=box_addr) \
        else RemotePsmClient(box_addr, box_os == "windows")

    psm_client.restart_psm_if_needed()

    team = self.config.get("general", "xt-team-name")
    node_id = utils.node_id(box_index)

    # the code zip file is expected in the XT cwd dir
    fn_src_zip = file_utils.path_join(os.path.expanduser(constants.CWD_DIR), constants.CODE_ZIP_FN)

    fn_entry = psm_client.enqueue(team, job_id, run_name, node_id, fn_src_zip)

    service_node_info = {
        "fn_entry": fn_entry,
        "box_addr": box_addr,
        "box_os": box_os,
        "box_name": box_name,
        "job_id": job_id,
        "run_name": run_name,
    }

    fb.feedback("submitted", is_final=True)

    return service_node_info
def close(self):
    '''
    Finish the run: wrap it up in the store (direct runs only), close any
    open train/test writers, and log the end of the run for AML runs.
    '''
    if self.xt_logging and self.direct_run and self.store and self.context:
        ctx = self.context
        node_id = utils.node_id(ctx.node_index)

        # wrap up the run (usually done by controller)
        self.store.wrapup_run(
            ctx.ws, self.run_name, ctx.aggregate_dest, ctx.dest_name,
            status="completed",
            exit_code=0,
            primary_metric=ctx.primary_metric,
            maximize_metric=ctx.maximize_metric,
            report_rollup=ctx.report_rollup,
            rundir=".",
            after_files_list=ctx.after_files_list,
            after_omit_list=ctx.after_omit_list,
            log_events=ctx.log,
            capture_files=ctx.after_upload,
            job_id=ctx.job_id,
            is_parent=True,
            node_id=node_id,
            run_index=None)

    # each train writer is paired with a test writer; close them together
    writer_pairs = (("train_writer", "test_writer"), ("train_writer2", "test_writer2"))
    for train_attr, test_attr in writer_pairs:
        if getattr(self, train_attr):
            getattr(self, train_attr).close()
            getattr(self, test_attr).close()

    if not (self.is_aml and self.store):
        return

    # partially log the end of the run
    # TODO: how to do this partial log for killed/error runs?
    final_status = "completed"
    final_exit_code = 0
    no_restarts = None
    no_hparams = None
    no_metrics_rollup = None
    end_time = utils.get_time()

    # NOTE(review): end_run() is given end_time=None while the computed
    # end_time is passed to update_mongo_run_at_end() - confirm this is intended.
    self.store.end_run(self.ws_name, self.run_name, final_status, final_exit_code,
        no_hparams, no_metrics_rollup, end_time=None, restarts=no_restarts,
        aggregate_dest=None, dest_name=None, is_aml=True)

    self.store.update_mongo_run_at_end(self.ws_name, self.run_name, final_status,
        final_exit_code, no_restarts, end_time, [], no_hparams, no_metrics_rollup)
def process_run_command(self, args):
    '''
    Process the "run" command: prepare the code snapshot, build the run
    commands, create the job and its first run for each node, log the job
    info and submit the job thru the backend of the compute target.

    args: dict of (mostly validated) command options; several entries are
        updated in place (e.g. "conda_packages", "pip_packages", "docker",
        "search_style", "total_run_count", "data_local", "model_local").

    Returns None when args["dry_run"] is set; otherwise the tuple
    (cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id).
    '''
    self.args = args

    # ensure workspace exists
    workspace = args['workspace']
    dry_run = args['dry_run']
    fake_submit = args["fake_submit"]

    if not fake_submit:
        self.store.ensure_workspace_exists(workspace, flag_as_error=False)

    # PRE-PROCESS ARGS
    service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
        self.process_args(args)

    # create backend helper (pool, philly, batch, aml)
    cluster = utils.safe_value(compute_def, "cluster")
    vc = utils.safe_value(compute_def, "vc")
    self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

    # add conda_packages and pip_packages from SETUP to ARGS
    setup_def = self.config.get_setup_from_target_def(compute_def)

    conda_packages = utils.safe_value(setup_def, "conda-packages")
    pip_packages = utils.safe_value(setup_def, "pip-packages")

    args["conda_packages"] = conda_packages if conda_packages else []
    args["pip_packages"] = pip_packages if pip_packages else []

    self.adjust_pip_packages(args)

    snapshot_dir = self.temp_dir

    if fake_submit:
        script_dir = snapshot_dir
    else:
        # note: always create a snapshot dir for backends to add needed files
        file_utils.ensure_dir_deleted(snapshot_dir)
        script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

    self.script_dir = script_dir
    direct_run = args["direct_run"]

    # do we need to start the xt controller?
    use_controller = not direct_run
    adjustment_scripts = None

    # create a job_secret that can later be used to authenticate with the XT controller
    # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
    job_secret = str(uuid.uuid4())

    # do we need to build a "docker run" command?
    if not self.backend.provides_container_support():
        # an explicit docker arg wins; otherwise fall back to the compute definition.
        # (docker_name was previously only assigned when the arg was empty, so an
        # explicit docker arg left it unbound when it was tested below)
        docker_name = args["docker"]
        if not docker_name:
            docker_name = utils.safe_value(compute_def, "docker")

        if docker_name and docker_name != "none":
            cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
            args["docker"] = docker_name     # for use in building run context info

    # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
    cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
        self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

    if dry_run:
        return

    # make new values available
    args["search_style"] = search_style
    args["total_run_count"] = total_run_count

    resume_name = args['resume_name']
    experiment = args['experiment']
    is_distributed = args['distributed']

    # CREATE JOB to hold all runs
    if fake_submit:
        # use lastrun/lastjob info to get a fast incremental fake job number
        xtd = xt_dict.read_xt_dict()
        fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
        xtd["fake_job_num"] = fake_job_num + 1
        xt_dict.write_xt_dict(xtd)
        job_id = "fake_job" + str(fake_job_num)
    else:
        job_id = self.store.create_job()
    fb.feedback(job_id)

    # start the feedback (by parts)
    fb.feedback("{}: {}".format("target", compute))

    # write hparams to FILES
    boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

    if sweeps_text and not fake_submit:
        self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

    # if num_boxes > 1 and service_type != "batch":
    #     fb.feedback("", is_final=True)

    parent_name = None

    # BUILD RUNS, by box
    job_runs = []
    run_count = 1 if is_distributed else len(boxes)
    secrets_by_node = {}
    remote_control = args["remote_control"]

    for i in range(run_count):
        box_name = boxes[i]

        # generate a box secret for talking to XT controller for this node
        box_secret = str(uuid.uuid4()) if remote_control else ""

        # build runs for box_name
        run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam, run_specs, job_id,
            parent_name, cmds, pool_info, repeat_count, fake_submit, search_style, box_secret, args)

        # for now, adhere to the more general design of multiple runs per box
        box_runs = [run_data]
        job_runs.append(box_runs)

        node_id = utils.node_id(i)
        secrets_by_node[node_id] = box_secret

        # FEEDBACK
        ptype = "single " if search_style == "single" else "parent "
        if is_distributed:
            ptype = "master "

        if run_count == 1:
            node_msg = "creating {}run".format(ptype)
        else:
            node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

        if service_type == "pool":
            node_msg += ", box: " + box_name

        fb.feedback(node_msg, id="node_msg")  # , add_seperator=is_last)

    # run the job

    # build box: runs dict for job info file
    runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

    # now that we have run names for all static run names for all nodes, we can adjust cmds (and before files) for using the controller
    if use_controller:
        # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
        # this will also adjust commands for each node to run the XT controller
        adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir,
            search_style, args=args)
    else:
        adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir,
            search_style, args=args)

    # add env vars used by both controller and runs
    env_vars = args["env_vars"]

    # create a job guid to uniquely identify this job across all XT instances
    job_guid = str(uuid.uuid4())

    # we add with "node0" and "job_secret", but backend service will override for each node
    scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

    # expand "$scriptdir" in the local data/model paths
    data_local = args["data_local"]
    if "$scriptdir" in data_local:
        data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
        args["data_local"] = data_local

    model_local = args["model_local"]
    if "$scriptdir" in model_local:
        model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
        args["model_local"] = model_local

    # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
    self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

    # upload CODE from snapshot_dir
    code_upload = args["code_upload"]
    code_omit = args["code_omit"]
    code_zip = args["code_zip"]

    if not fake_submit:
        if code_upload:
            self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

        # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
        data_upload = args["data_upload"]
        if data_upload:
            if not data_local:
                errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

            data_omit = args["data_omit"]
            data_zip = "none"

            self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)

    # dispatch to BACKEND submitters
    #
    # Note: backend submitter functions are responsible for:
    #     - submitting the job (for each node, queue runs for that node)
    #     - return service job id (or list of them if per node)
    #
    # NOTE: there is a timing issue where submitted job needs access to job info, but final piece
    # of job info (service info) is only return after job is submitted.  Therefore, we structure
    # steps as follows:
    #     - primary job info is logged
    #     - job is submitted thru backend
    #     - service info for job is logged

    # LOG PRIMARY JOB INFO
    dd = {}

    if not fake_submit:
        # mark runs as QUEUED
        for runs in runs_by_box.values():
            first_run = runs[0]
            self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"})

        # write the job info file (now that backend has had a chance to update it)
        job_num = int(job_id[3:])

        xt_cmd = args["xt_cmd"]
        schedule = args["schedule"]
        concurrent = args["concurrent"]

        # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
        dynamic_runs_remaining = None if search_style == "single" else total_run_count
        node_count = len(runs_by_box)

        # static_runs_by_node = None
        # if schedule == "static":
        #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
        #console.diag("static_runs_by_node=", static_runs_by_node)

        active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

        dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace, "exper_name": experiment,
            "pool_info": compute_def, "runs_by_box": runs_by_box,
            "primary_metric": args["primary_metric"],
            "run_count": total_run_count, "repeat": repeat_count, "search_type": args["search_type"],
            "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
            "job_status": "submitted", "running_nodes": 0,
            "running_runs": 0, "error_runs": 0, "completed_runs": 0, "job_guid": job_guid, "job_secret": job_secret,
            "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,
            "active_runs": active_runs, "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,
            "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
            "service_job_info": None, "service_info_by_node": None,
        }

        self.store.log_job_info(job_id, dd)

    # SUBMIT JOB
    # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
    service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info, resume_name,
        repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

    # POST SUBMIT processing

    # update job info
    if not fake_submit:
        dd["service_job_info"] = service_job_info
        dd["service_info_by_node"] = service_info_by_node
        self.store.log_job_info(job_id, dd)

    # update lastrun/lastjob info
    xtd = xt_dict.read_xt_dict()
    xtd["last_run"] = last_run
    xtd["last_job"] = job_id
    xt_dict.write_xt_dict(xtd)

    # return values for API support (X)
    return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id
def adjust_run_commands(self, job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args):
    '''
    Called to let the backend inject needed shell commands before the user
    cmd.  Files can still be added to snapshot_dir when this is called.

    The user cmd is wrapped into a script once (shared by all nodes); the
    cmd_parts of the first run of each node are then replaced by a call of
    the python shim, which is passed the script name, node id and run name.
    '''
    store_data_dir, data_action, data_writable, store_model_dir, model_action, model_writable, \
        storage_name, storage_key = self.get_action_args(args)

    username = args["username"]

    # same script for each box (but with different ARGS)
    fn_wrapped = None

    for node_index, box_runs in enumerate(job_runs):
        # only the FIRST RUN of each box gets its command wrapped
        first_run = box_runs[0]
        box_info = first_run["box_info"]
        run_name = first_run["run_name"]
        node_id = utils.node_id(node_index)

        run_specs = first_run["run_specs"]
        cmd_parts = run_specs["cmd_parts"]

        if not fn_wrapped:
            # just wrap the user cmd once (shared by all boxes/nodes)
            assert cmd_parts[0] == "python"
            assert cmd_parts[1] == "-u"
            assert len(cmd_parts[2]) > 0

            # the target_fn might have been switched to the xt controller
            target_fn = cmd_parts[2]
            arg_parts = cmd_parts[3:]

            setup = self.config.get_setup_from_target_def(self.compute_def)

            fn_wrapped = super().wrap_user_command(
                cmd_parts, snapshot_dir, store_data_dir, data_action, data_writable,
                store_model_dir, model_action, model_writable, storage_name, storage_key,
                ["data", "model"],
                is_windows=False, sudo_available=False, username=username, use_username=False,
                install_blobfuse=True, setup=setup, change_dir=False, args=args)

            # AML wants a python script, so use our tiny python shim to run wrapped.sh
            fn_shim = "aml_shim.py"
            fn_from = file_utils.get_xtlib_dir() + "/backends/" + fn_shim
            shutil.copyfile(fn_from, snapshot_dir + "/" + fn_shim)

            # copy to submit-logs
            utils.copy_to_submit_logs(args, fn_from)

        # each box gets its own command (NODE_ID and RUN_NAME are passed to the wrapped script)
        script_part = "{} {} {}".format(os.path.basename(fn_wrapped), node_id, run_name)

        # the shell command is passed as a single argument to avoid wierd "arg": 1 problems with AML estimators
        shell_cmd = " ".join(['/bin/bash', '--login', script_part])
        run_specs["cmd_parts"] = ["python", "-u", fn_shim, shell_cmd]
def adjust_run_commands(self, job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args):
    '''
    Called to let the backend inject needed shell commands before the user
    cmd.  This base implementation does so by generating a new script file
    (added to snapshot_dir) and replacing the cmd_parts of the first run of
    each node with a call of that script.
    '''
    store_data_dir, data_action, data_writable, store_model_dir, model_action, model_writable, \
        storage_name, storage_key = self.get_action_args(args)

    data_local = args["data_local"]
    model_local = args["model_local"]

    # the generated script is shared by every box/job
    fn_wrapped = None

    for node_index, box_runs in enumerate(job_runs):
        # only the FIRST RUN of each box gets its command wrapped (data/model actions applied)
        first_run = box_runs[0]
        box_info = first_run["box_info"]
        box_name = box_info.box_name
        box_secret = first_run["box_secret"]
        actions = box_info.actions
        node_id = utils.node_id(node_index)
        is_windows = box_info.box_os == "windows"

        run_specs = first_run["run_specs"]
        cmd_parts = run_specs["cmd_parts"]
        run_name = first_run["run_name"]

        if not fn_wrapped:
            # we only do this once (for the first box/job)
            using_localhost = pc_utils.is_localhost(box_name, box_info.address)

            if using_localhost and data_local:
                # data_local overrides store_data_dir for LOCAL machine
                store_data_dir = os.path.join(os.path.expanduser(data_local), store_data_dir)
                data_action = "use_local"
                if "data" not in actions:
                    actions.append("data")

            if using_localhost and model_local:
                # model_local overrides store_model_dir for LOCAL machine
                store_model_dir = os.path.join(os.path.expanduser(model_local), store_model_dir)
                model_action = "use_local"
                if "model" not in actions:
                    actions.append("model")

            setup = self.config.get_setup_from_target_def(self.compute_def)

            env_vars = self.get_env_vars_for_box(box_name, box_info, node_index, box_secret)

            # the env vars are set by the script, right after the setup cmds
            setter = "@set" if is_windows else "export"
            post_cmds = ["{} {}={}".format(setter, name, value) for name, value in env_vars.items()]

            fn_wrapped = super().wrap_user_command(
                cmd_parts, snapshot_dir, store_data_dir, data_action, data_writable,
                store_model_dir, model_action, model_writable, storage_name, storage_key, actions,
                is_windows=is_windows, sudo_available=False, pip_freeze=False, setup=setup,
                post_setup_cmds=post_cmds, args=args, nonempty=True)

        # each box gets its own command (NODE_ID and RUN_NAME are passed to the script)
        script_part = "{} {} {}".format(os.path.basename(fn_wrapped), node_id, run_name)

        # NOTE(review): this tests self.is_windows (not the per-box is_windows above) - confirm intended
        if self.is_windows:
            run_specs["cmd_parts"] = [script_part]
        else:
            run_specs["cmd_parts"] = ['/bin/bash', '--login', script_part]
def build_data_frames(self):
    '''
    1. for each run, collect the reported metrics as metric sets (by reported col list)
    2. append to the dataframe for that col list

    Parent runs (search_style other than "single") and runs without logged
    metrics are skipped; at most self.max_runs runs are used (when set).

    Side effects: may set self.x_col and self.col_names (from the first run
    with metrics) and sets self.run_names to the runs that were used.

    Returns a dict that maps str(list of column names) to the DataFrame
    holding the rows of all metric sets with exactly that column list (plus
    the "run", "node", "job", "experiment" and "workspace" columns).
    '''
    no_metrics = []
    pp_run_names = []
    used_max = False
    got_columns = False

    # frames are collected per column list and concatenated once at the end
    # (DataFrame.append was removed in pandas 2.0, and appending per run
    # re-copies the accumulated frame every time)
    frames_by_cols = {}

    for record in self.run_log_records:
        # extract metrics for this run
        run = record["_id"]
        node = utils.node_id(record["node_index"])
        job = record["job_id"]
        experiment = record["exper_name"]
        workspace = record["ws"]
        search_style = utils.safe_value(record, "search_style")
        if search_style and search_style != "single":
            # parent run with children - skip it
            continue

        log_records = record["log_records"]

        metric_sets = run_helper.build_metrics_sets(log_records)
        if not metric_sets:
            no_metrics.append(run)
            continue

        if self.max_runs and len(pp_run_names) >= self.max_runs:
            used_max = True
            break

        if not got_columns:
            # set x and y columns
            explicit = qfe.get_explicit_options()
            if "x" not in explicit:
                self.x_col = self.get_actual_x_column(metric_sets, self.x_col, self.col_names)

            if not self.col_names:
                # not specified by user, so build defaults
                self.col_names = self.get_default_y_columns(metric_sets, self.x_col)

            got_columns = True

        # merge metric sets into the frames for their column list
        for metric_set in metric_sets:
            # create a pandas DataFrame
            df = pd.DataFrame(metric_set["records"])
            cols = str(list(df.columns))

            # NOTE(review): cols is a string, so the two checks below are
            # substring matches (e.g. "loss" also matches a "train_loss"
            # column) - kept as is; confirm whether exact names were intended.

            # ensure this df has our x_col
            if self.x_col and self.x_col not in cols:
                continue

            # ensure this df has at least 1 y_col
            if not any(y in cols for y in self.col_names):
                continue

            # add the columns that identify the run
            row_count = df.shape[0]
            df["run"] = [run] * row_count
            df["node"] = [node] * row_count
            df["job"] = [job] * row_count
            df["experiment"] = [experiment] * row_count
            df["workspace"] = [workspace] * row_count

            frames_by_cols.setdefault(cols, []).append(df)

        pp_run_names.append(run)

    # a single frame is returned as is; multiple frames are concatenated in run order
    data_frames_by_cols = {
        cols: frames[0] if len(frames) == 1 else pd.concat(frames)
        for cols, frames in frames_by_cols.items()
    }

    if no_metrics:
        console.print(
            "\nnote: following runs were skipped (currently have no logged metrics): \n {}\n"
            .format(", ".join(no_metrics)))

    if used_max:
        console.print(
            "plotting first {} runs (use --max-runs to override)".format(
                self.max_runs))
    else:
        console.print("plotting {} runs...".format(len(pp_run_names)))

    # update our list of run_names to process
    self.run_names = pp_run_names

    return data_frames_by_cols
def monitor_job_node(self, job_id, jupyter, sleep, node_index, log_name, escape):
    '''
    Stream the log and status of one node of a job to the console until the
    node's simple status is "completed", the user presses escape/control-c,
    or the max monitoring time is exceeded.

    job_id: id of the job to monitor.
    jupyter: not used by this method.
    sleep: used to compute how often the log is read; the log is read once
        every max(1, sleep // kb_sleep) keyboard polls.
    node_index: index of the node to start with (None means node 0); the
        user can change it with the +/- keys when the job has multiple nodes.
    log_name: name of the log to read (passed to backend.read_log_file); it
        is replaced by the log name the backend reports back.
    escape: when truthy, the max number of seconds to monitor.
    '''
    if node_index is None:
        node_index = 0

    backend, job_info = job_helper.get_job_backend(self.store, self.core, job_id)
    node_id = utils.node_id(node_index)

    service_info_by_node = job_info["service_info_by_node"]
    service_node_info = service_info_by_node[node_id]

    service_name = backend.get_name()
    node_count = len(job_info["service_info_by_node"])

    console.print("==> monitoring: {}, node{} [{}] (press escape or control-c to exit, +/- to change node)" \
        .format(job_id, node_index, service_name), flush=True)

    # the monitoring loop
    start_offset = 0
    simple_status = None
    service_status = None
    kb_sleep = .1  # be quick to respond to user's key presses
    # number of keyboard polls between 2 reads of the log file
    sleeps_per_call = max(1, sleep // kb_sleep)
    sleep_count = 0
    ch = None
    first_call = True
    # remembers the log offset of each node, so that switching back resumes where we left
    offset_by_node = {}
    node_id = utils.node_id(node_index)
    first_text_of_stream = True
    display_count = 0
    started = time.time()

    try:
        with KeyPressChecker() as checker:
            while simple_status != "completed":
                # poll the keyboard (non-blocking)
                ch = checker.getch_nowait()
                #print(ch, end=" ")

                if ch == constants.ESCAPE or ch == constants.CONTROL_C:
                    break

                if ch in ["+", "-"] and node_count > 1:
                    # save current context
                    offset_by_node[node_id] = start_offset

                    # increment node_index (wraps around in both directions)
                    delta = 1 if ch == "+" else -1
                    node_index = (delta + node_index) % node_count
                    node_id = utils.node_id(node_index)

                    # set new context
                    service_node_info = service_info_by_node[node_id]
                    start_offset = offset_by_node[node_id] if node_id in offset_by_node else 0
                    # a node we have not read from yet starts a new stream
                    first_text_of_stream = not start_offset
                    console.print("==> switching to: node{}".format(node_index))

                if ch == "q":
                    # diagnostic aid (undocumented, only for pool service)
                    console.print("{} service queue:".format(node_id), flush=True)
                    entries = backend.get_service_queue_entries(service_node_info)
                    if entries is None:
                        console.print(" <not supported for this service")
                    else:
                        if entries:
                            for entry in entries:
                                marker = "*" if entry["current"] else " "
                                console.print(" {} {}".format(marker, entry["name"]))
                        else:
                            console.print(" <no entries>")

                if sleep_count == sleeps_per_call:
                    # time to read the log file
                    result = backend.read_log_file(service_node_info, log_name=log_name, start_offset=start_offset)

                    new_text = result["new_text"]
                    new_simple = result["simple_status"]
                    new_service = result["service_status"]
                    # the next read continues where this one ended
                    start_offset = result["next_offset"]
                    new_log = result["log_name"]

                    if new_log != log_name:
                        # the backend reported a different log; follow it
                        log_name = new_log
                        console.print("==> node{} streaming log: {}".format(node_index, log_name))
                        first_text_of_stream = True

                    if new_service != service_status:
                        # statuses are only updated (and shown) when the service status changes
                        service_status = new_service
                        simple_status = new_simple
                        console.print("==> node{} status: {} ({})".format(node_index, service_status, simple_status))

                    if new_text:
                        if node_count > 1:
                            # prepend each new line with node_id
                            prefix = node_id + ": "
                            new_text = new_text.replace("\n", "\n" + prefix)
                            if first_text_of_stream:
                                # the first line has no preceding newline, so prefix it explicitly
                                new_text = prefix + new_text
                                first_text_of_stream = False

                        console.print(new_text, end="")
                        display_count += 1

                    first_call = False
                    sleep_count = 0

                time.sleep(kb_sleep)
                sleep_count += 1

                if escape:
                    # have we exceeded max time in monitoring?
                    elapsed = time.time() - started
                    if elapsed >= escape:
                        break

    except KeyboardInterrupt:
        # treat an interrupt like a control-c key press
        ch = constants.CONTROL_C

    # the final status is only shown when some log text was displayed
    show_final_status = (display_count > 0)

    if ch == constants.ESCAPE:
        console.print("==> monitoring cancelled (escape key detected)")
    elif ch == constants.CONTROL_C:
        console.print("==> monitoring cancelled (control-c detected)")
        ch = single_char_input("do you want to cancel the job? (y/n): ")
        if ch == "y":
            self.cancel_job(job_id)
            show_final_status = False

    if show_final_status:
        console.print("==> node{} status: {} ({})".format(node_index, service_status, simple_status))