def fixup_script_in_cmd(self, cmd):
    cmd_parts = utils.cmd_split(cmd)
    self.remove_script_dir_from_parts(cmd_parts)

    # add "-u" for python cmds
    if len(cmd_parts) > 1 and cmd_parts[0].startswith("python") and cmd_parts[1] != "-u":
        cmd_parts.insert(1, "-u")

    new_cmd = " ".join(cmd_parts)
    return new_cmd

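# usage sketch (hypothetical values, assuming remove_script_dir_from_parts strips the
# script directory prefix): a cmd of "python src/train.py --lr=.1" with script dir "src"
# would come back as "python -u train.py --lr=.1"
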
def get_fn_local_config(args):
    # default value
    fn = os.path.join(".", constants.FN_CONFIG_FILE)

    # is this a run cmd whose script is a .yaml file?
    cmd = " ".join(args)
    parts = utils.cmd_split(cmd)

    found_run = False

    for part in parts:
        if found_run:
            if part.endswith(".yaml"):
                fn = part
            break

        if not part.startswith("--"):
            if part == "run":
                found_run = True
            else:
                break

    return fn

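# usage sketch (hypothetical args): get_fn_local_config(["run", "sweeps.yaml"]) returns
# "sweeps.yaml"; for any other command, the default constants.FN_CONFIG_FILE in the
# current directory is returned
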
def distribute_cmds_to_nodes(self, cmds, num_nodes):
    cmds_by_node = {}

    # set current node
    node_index = 0

    # build cmd_parts and distribute them among nodes
    for cmd in cmds:
        node_id = "node" + str(node_index)
        if node_id not in cmds_by_node:
            cmds_by_node[node_id] = []

        # split on unquoted spaces
        cmd_parts = utils.cmd_split(cmd)
        cmds_by_node[node_id].append(cmd_parts)

        node_index += 1
        if node_index >= num_nodes:
            node_index = 0

    return cmds_by_node

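# usage sketch: distributing 5 cmds across 2 nodes round-robin yields
# {"node0": [parts0, parts2, parts4], "node1": [parts1, parts3]}, where each
# entry is the utils.cmd_split() of the corresponding cmd
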
def build_cmds_with_search(self, service_type, cmd_parts, parent_script, run_script, run_cmd_from_script,
        use_controller, dry_run, args):
    '''
    args:
        - service_type: the type of backend service being used (aml, batch, etc.)
        - cmd_parts: list of the user's ML app and arg/options
        - parent_script: user-specified script that needs to be run to configure box for all child runs
        - run_script: if user's app is a shell script or command line .bat file, the text of that file
        - run_cmd_from_script: if user's ML app is a shell or command line script, the run command located within it
        - use_controller: if False, XT controller is not being used (direct run)
        - dry_run: if True, job will not be submitted (user just wants to see list of static runs)

    processing:
        - determine the search_style needed, the associated list of user commands, and the total number of runs

    returns:
        - cmds: the list of 1 or more commands to be run
        - run_count: the total number of runs to be executed
        - repeat_count: the number of runs per node (approximate)
        - run_specs: a dictionary of run information (easier to pass around)
        - using_hp: if True, a static or dynamic hyperparameter search is being done
        - using_aml_hparam: if True, we are doing a direct-run AML hyperparameter search
        - sweeps_text: hyperparameter search specs
        - pool_info: information about the service/pool target
        - search_style: one of: single, multi, repeat, static, dynamic
    '''
    using_hp = False
    show_run_report = True
    repeat_count = None
    using_aml_hparam = False
    search_style = None
    cmds = None

    # get run_cmd
    run_cmd = run_cmd_from_script
    if not run_cmd:
        run_cmd = " ".join(cmd_parts)

    # by default, we return same cmd
    new_run_cmd = run_cmd

    is_aml = (service_type == "aml")    # self.is_aml_ws(workspace)
    use_aml_for_hparam = (is_aml and not use_controller)

    # get info about nodes/boxes
    boxes, pool_info, service_type = box_information.get_box_list(self.core, args=args)
    node_count = len(boxes)

    # HPARAM SEARCH
    cmds, sweeps_text, new_run_cmd = self.build_static_hparam_cmds(run_cmd, node_count, args)
    using_hp = bool(sweeps_text)

    if using_hp and use_aml_for_hparam:
        using_aml_hparam = True
        # for AML hyperdrive, we pass only constant args from cmd_parts
        #cmd_parts = [tp for tp in template_parts if tp != '{}']

    if cmds:
        # STATIC HPARAM SEARCH
        run_count = len(cmds)
        search_style = "static"

    runs = args["runs"]
    max_runs = args["max_runs"]

    # USER MULTI CMDS
    multi_cmds = self.read_user_multi_commands(using_hp, run_script, cmd_parts, args)

    if multi_cmds:
        if cmds:
            errors.combo_error("cannot specify both --multi and a hyperparameter search")

        cmds = multi_cmds

        if runs:
            run_count = runs
        elif max_runs:
            run_count = min(max_runs, len(cmds))
        else:
            run_count = len(cmds)

        search_style = "multi"
        new_run_cmd = cmds[0]

    if not cmds:
        # SINGLE CMD
        # DYNAMIC HPARAM or REPEAT or SINGLE search style
        # we will use repeat_count on each node, as needed, to reach specified runs
        run_count = runs if runs else node_count

        if using_hp:
            search_style = "dynamic"
        else:
            search_style = "repeat" if run_count > 1 else "single"

        if search_style != "single":
            repeat_count = math.ceil(run_count / node_count)

        cmds = [new_run_cmd]
        show_run_report = False

    if show_run_report:
        console.print()

        dr = " (dry-run)" if dry_run else ""
        search_type = args["search_type"]
        stype = "(search-type=" + search_type + ") " if search_style == "static" else ""

        console.print("{} {}runs{}:".format(search_style, stype, dr))

        for i, run_cmd_parts in enumerate(cmds):
            console.print("  {}. {}".format(i+1, run_cmd_parts))

        console.print()

    # finally, package info into run_specs to make info easier to pass thru various APIs
    new_cmd_parts = utils.cmd_split(new_run_cmd)
    run_specs = {"cmd_parts": new_cmd_parts, "run_script": run_script, "run_cmd": new_run_cmd,
        "parent_script": parent_script}

    return cmds, run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style

def process_args(self, args):
    run_script = None
    parent_script = None
    run_cmd_from_script = None

    target_file = args["script"]
    target_args = args["script_args"]
    code_upload = args["code_upload"]

    # user may have wrong slashes for this OS
    target_file = file_utils.fix_slashes(target_file)

    if os.path.isabs(target_file):
        errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

    is_rerun = "is_rerun" in args
    if is_rerun:
        # will be running from script dir, so remove any path to script file
        self.script_dir = os.path.dirname(target_file)
        target_file = os.path.basename(target_file)

    if target_file.endswith(".py"):
        # PYTHON target
        cmd_parts = ["python"]
        cmd_parts.append("-u")
        cmd_parts.append(target_file)
    else:
        cmd_parts = [target_file]

    if target_args:
        # split on unquoted spaces
        arg_parts = utils.cmd_split(target_args)
        cmd_parts += arg_parts

    if target_file == "docker":
        self.is_docker = True

    if not self.is_docker and code_upload and not os.path.exists(target_file):
        errors.env_error("script file not found: {}".format(target_file))

    ps_path = args["parent_script"]
    if ps_path:
        parent_script = file_utils.read_text_file(ps_path, as_lines=True)

    if target_file.endswith(".bat") or target_file.endswith(".sh"):
        # a RUN SCRIPT was specified as the target
        run_script = file_utils.read_text_file(target_file, as_lines=True)
        run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

    compute = args["target"]
    box_def = self.config.get("boxes", compute, suppress_warning=True)
    setup = utils.safe_value(box_def, "setup")

    compute_def = self.config.get_compute_def(compute)

    if compute_def:
        # compute target must be defined in [compute-targets]
        if "service" not in compute_def:
            errors.config_error("compute target '{}' must define a 'service' property".format(compute))

        service = compute_def["service"]
        if service in ["local", "pool"]:
            # it's a list of box names
            boxes = compute_def["boxes"]
            if len(boxes) == 1 and boxes[0] == "localhost":
                pool = None
                box = "local"
                service_type = "pool"
            else:
                pool = compute
                box = None
                service_type = "pool"
        else:
            # it's a set of compute service properties
            pool = compute
            box = None
            service_name = compute_def["service"]
            service_type = self.config.get_service_type(service_name)
    elif box_def:
        # translate single box name to a compute_def
        box = compute
        pool = None
        service_type = "pool"
        compute_def = {"service": service_type, "boxes": [box], "setup": setup}
    else:
        errors.config_error("unknown target or box: {}".format(compute))

    args["target"] = compute
    args["compute_def"] = compute_def
    args["service_type"] = service_type

    # for legacy code
    args["box"] = box
    args["pool"] = pool

    return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
        compute, compute_def

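# usage sketch (hypothetical config): a target "vm1" defined only under [boxes] is
# translated to compute_def = {"service": "pool", "boxes": ["vm1"], "setup": <box setup>},
# with service_type "pool", box "vm1", and pool None
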
def build_docker_cmd(self, docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args):
    for_windows = True

    docker_def = self.config.get("dockers", docker_name, default_value=None)
    if not docker_def:
        errors.config_error("docker '{}' not found in config file".format(docker_name))

    registry_name = docker_def["registry"]
    image = docker_def["image"]

    if registry_name:
        # get REGISTRY credentials
        registry_creds = self.config.get("external-services", registry_name, suppress_warning=True)
        if not registry_creds:
            errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(registry_name))

        login_server = registry_creds["login-server"]
    else:
        login_server = None

    #pwd = "%cd%" if for_windows else "$(pwd)"
    script_dir = file_utils.fix_slashes(script_dir, True)
    mappings = "-v {}:/usr/src".format(script_dir)
    options = "--rm"

    # collect env vars
    env_vars = {"XT_IN_DOCKER": 1, "XT_USERNAME": pc_utils.get_username()}
    scriptor.add_controller_env_vars(env_vars, self.config, job_secret, "node0")

    # fixup backslash char for target_file
    if ".py" in target_file:
        app = "python -u"
        #target_file = file_utils.fix_slashes(target_file, True)
        target_file = os.path.basename(target_file)
    else:
        app = target_file
        target_file = ""

    full_image = login_server + "/" + image if login_server else image

    # build a mapping for data?
    data_local = args["data_local"]
    if data_local:
        if "$scriptdir" in data_local:
            data_local = data_local.replace("$scriptdir", script_dir)

        data_local = os.path.realpath(data_local)
        mappings += " -v {}:/usr/data".format(data_local)
        env_vars["XT_DATA_DIR"] = "/usr/data"

    # write env vars to file in snapshot dir
    FN_EV = "__dockev__.txt"
    fn_env_var = os.path.join(snapshot_dir, FN_EV)

    lines = [name + "=" + str(value) for name, value in env_vars.items()]
    text = "\n".join(lines)
    file_utils.write_text_file(fn_env_var, text)

    # specify env var file (in current directory) to docker
    options += " --env-file={}".format(FN_EV)

    # inherit ENV VARS from running environment
    options += " -e XT_RUN_NAME -e XT_WORKSPACE_NAME -e XT_EXPERIMENT_NAME"

    docker_cmd = "docker run {} {} {} {} /usr/src/{}".format(options, mappings, full_image, app, target_file)

    new_parts = utils.cmd_split(docker_cmd)
    return new_parts

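# usage sketch (hypothetical docker/registry entries): with image "xt-demo", login-server
# "myregistry.azurecr.io", and target_file "train.py", the generated command is roughly:
#   docker run --rm --env-file=__dockev__.txt -e XT_RUN_NAME -e XT_WORKSPACE_NAME
#       -e XT_EXPERIMENT_NAME -v <script_dir>:/usr/src myregistry.azurecr.io/xt-demo
#       python -u /usr/src/train.py
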
def main(cmd=None, new_start_time=None, capture_output=False, mini=False, raise_syntax_exception=True):
    '''
    This is the XT app, used to manage and scale ML experiments, with support for various
    backends (Philly, Azure Batch, Azure ML).
    '''
    if new_start_time:
        global xt_start_time
        xt_start_time = new_start_time

    # seed numpy for reproducible behavior
    import numpy as np
    seed = 5
    if seed:
        np.random.seed(seed)
        np.random.RandomState(seed)

    if cmd:
        cmd = cmd.strip()

        if cmd.startswith("xt "):
            cmd = cmd[3:]
        elif cmd == "xt":
            cmd = ""

        args = utils.cmd_split(cmd)

        # remove empty args
        args = [arg for arg in args if arg]
    else:
        # caller did not supply cmd; use the process command line args
        args = sys.argv[1:]

    # when executing multiple commands, reset the feedback for each command
    feedback.reset_feedback()

    #console.print("cmd=", cmd, ", args=", args)
    console.diag("in xt_cmds.main")
    #console.print("config=", config)

    fn_local_config = get_fn_local_config(args)

    impl_shared = ImplShared()
    config = impl_shared.init_config(fn_local_config, mini=mini)
    store = impl_shared.store
    mini = config.mini_mode

    cmd_providers = config.get("providers", "command")
    impl_dict = {}

    for name, code_path in cmd_providers.items():
        package, class_name = code_path.rsplit(".", 1)
        module = importlib.import_module(package)
        impl_class = getattr(module, class_name)

        impl = impl_class(config, store)
        impl_dict[package] = impl

        if name == "help":
            impl.set_mini_mode(mini)

    # this enables QFE to match a function by its module name, to the class instance to process the command
    # impl_dict = {"xtlib.impl_utilities": utilities, "xtlib.impl_storage": storage,
    #     "xtlib.impl_compute": compute, "xtlib.impl_help": help_impl}

    # this parses args and calls the correct command function with its args and options correctly set.
    # the config object supplies the default value for most options and flags.
    dispatcher = qfe.Dispatcher(impl_dict, config, preprocessor=impl_shared.pre_dispatch_processing)

    if mini:
        # a dict of commands + arg/options to be surfaced (None means use all args/options)
        show_commands = {
            "cancel_all": ["target"],
            "cancel_job": ["job-id"],
            "cancel_run": ["run-names"],
            "clear_credentials": [],
            "config_cmd": ["default", "create", "reset"],
            "create_demo": ["destination", "response", "overwrite"],
            "create_services_template": [],
            "download": ["local-path", "store-path"],
            "extract": ["runs", "dest-dir", "browse", "workspace"],
            "help": ["command", "about", "browse", "version"],
            "help_topics": ["topic", "browse"],
            "list_blobs": ["path"],
            "list_jobs": ["job-list", "experiment", "all", "first", "last", "filter", "sort", "reverse", "status", "available"],
            "list_runs": ["run-list", "job", "experiment", "all", "first", "last", "filter", "sort", "reverse", "status", "available"],
            "monitor": ["name"],
            "run": ["script", "script-args", "experiment", "hp-config", "max-runs", "nodes", "runs", "search-type", "target"],
            "upload": ["local-path", "store-path"],
            "view_console": ["name", "target", "workspace", "node-index"],
            "view_metrics": ["runs", "metrics"],
            "view_run": ["run-name"],
        }

        dispatcher.show_commands(show_commands)
        qfe.remove_hidden_commands()

    # hide under-development commands
    hide_commands = ["collect_logs", "start_tensorboard", "stop_tensorboard", "zip", "unzip", "wget"]

    # hide internal cmds (for xt development use only)
    hide_commands.append("generate_help")
    dispatcher.hide_commands(hide_commands)

    # expand symbols like $lastjob, $lastrun
    impl_shared.expand_xt_symbols(args)

    # this is the NORMAL outer exception handling block, but
    # also see the client/server exception handling in xt_run.py
    try:
        text = dispatcher.dispatch(args, capture_output=capture_output,
            raise_syntax_exception=raise_syntax_exception)
    except BaseException as ex:
        #console.print("in Exception Handler: utils.show_stack_trace=", utils.show_stack_trace)
        # does user want a stack-trace?
        logger.exception("Error during dispatcher.dispatch, args={}".format(args))

        exc_type, exc_value, exc_traceback = sys.exc_info()
        errors.process_exception(exc_type, exc_value, exc_traceback)

    return text

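# usage sketch: main("xt list jobs --last=3") strips the leading "xt", splits the remaining
# text into args, and dispatches them just as the command line would; with cmd=None, the
# args are taken from sys.argv[1:]
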