def cancel_runs_by_user(self, box_name):
    '''
    Args:
        box_name: the name of the box the runs ran on (pool service)
    Returns:
        cancel_results: a list of kill result records
            (keys: workspace, run_name, exper_name, killed, status, before_status)
    '''
    cancel_results = []

    # get list of active jobs from batch
    active_jobs = self.get_active_jobs()
    console.diag("after get_active_jobs()")

    if active_jobs:
        for job_record in active_jobs:
            # watch out for older jobs that didn't have the
            # service_job_info/service_info_by_node properties
            service_job_info = utils.safe_value(job_record, "service_job_info")
            service_info_by_node = utils.safe_value(job_record, "service_info_by_node")

            if service_job_info and service_info_by_node:
                job_id = job_record["job_id"]
                cancel_result = self.cancel_job(service_job_info, service_info_by_node)
                for _, node_result in cancel_result.items():
                    cancel_results.append(node_result)

    return cancel_results

def get_client_cs(core, job_id, node_index):
    ''' instantiate the backend service that owns the specified job node and
    request its client connection string '''
    cs = None
    box_secret = None

    filter_dict = {"_id": job_id}
    jobs = core.store.mongo.get_info_for_jobs(filter_dict, None)
    if not jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    if compute and node_info:
        backend = core.create_backend(compute)
        cs = backend.get_client_cs(node_info)

    cs_plus = {"cs": cs, "box_secret": box_secret, "job": job}
    return cs_plus

def get_filtered_sorted_limit_runs(store, config, show_gathering, col_dict=None, args=None):
    console.diag("start of: get_filtered_sorted_limit_runs")

    # required
    run_list = args["run_list"]

    # optional
    pool = utils.safe_value(args, "target")
    available = utils.safe_value(args, "available")
    workspace = utils.safe_value(args, "workspace")

    if workspace:
        store.ensure_workspace_exists(workspace, flag_as_error=True)

    mongo = store.get_mongo()

    # have MONGO update any old RUN documents to the new format
    fixup_mongo_runs.fixup_runs_if_needed(mongo.mongo_db, workspace)

    # get info about run properties
    user_to_actual, std_cols_desc = get_run_property_dicts()
    actual_to_user = {value: key for key, value in user_to_actual.items()}

    builder = ReportBuilder(config, store, client=None)

    # get list of specified runs
    pure_run_list, actual_ws = expand_run_list(store, mongo, workspace, run_list)
    if run_list and not pure_run_list:
        errors.general_error("no run(s) found")

    # build a filter dict for all specified filters
    filter_dict = build_run_filter_dict(pure_run_list, user_to_actual, builder, args)

    # if show_gathering:
    #     console.print("gathering run data...", flush=True)

    # get the mongo records for the matching RUNS
    records, using_default_last, last = builder.get_mongo_records(
        mongo, filter_dict, workspace, "runs", actual_to_user,
        col_dict=col_dict, args=args)

    console.diag("end of: get_filtered_sorted_limit_runs")

    return records, using_default_last, user_to_actual, available, builder, last, std_cols_desc

def build_cmds(auto_mode, quick_test, monitoropt, nogui, philly=1):
    config = xt_config.get_merged_config()
    mini_mode = not config.get("general", "advanced-mode")

    is_windows = (os.name == "nt")
    has_gui = pc_utils.has_gui() and not nogui

    browse_flag = "--browse" if has_gui else ""
    browse_opt = "" if auto_mode or not has_gui else "--browse"
    timeout_opt = "--timeout=5" if auto_mode else ""
    monitor_opt = "--monitor=none " if monitoropt else ""

    templ = "{run}_{target}_lr={hparams.lr}_mo={hparams.momentum}_opt={hparams.optimizer}_tt={logdir}"

    # SET THESE before each demo (exper24 should be a multi-service set of simple runs)
    prev_exper = "exper18"
    curr_exper = "exper26"

    if mini_mode:
        command_dicts = commands_basic.get_command_dicts(
            prev_exper, curr_exper, browse_flag, browse_opt, timeout_opt,
            templ, ARCHIVES_DIR, monitor_opt)
    else:
        command_dicts = commands_advanced.get_command_dicts(
            prev_exper, curr_exper, browse_flag, browse_opt, timeout_opt,
            templ, ARCHIVES_DIR, monitor_opt)

    if not has_gui:
        command_dicts = list(filter(
            lambda c_dict: not utils.safe_value(c_dict, "needs_gui", default=False),
            command_dicts))

    if philly == 0:
        command_dicts = list(filter(
            lambda c_dict: not utils.safe_value(c_dict, "needs_philly", default=False),
            command_dicts))

    list(map(
        lambda cmd_dict: add_cmd(cmd_dict["title"], cmd_dict["xt_cmd"]),
        command_dicts))

def get_registry_creds(self, compute, env):
    registry_creds = None

    if not env:
        compute_def = self.config.get_compute_def(compute)
        env = utils.safe_value(compute_def, "environment")

    if env and env != "none":
        env_def = self.config.get("dockers", env, default_value=None)
        if not env_def:
            errors.config_error("docker '{}' not found in config file".format(env))

        registry_name = env_def["registry"]

        # get REGISTRY credentials
        registry_creds = self.config.get("external-services", registry_name,
            suppress_warning=True)
        if not registry_creds:
            errors.config_error("'{}' must be specified in [external-services] "
                "section of XT config file".format(registry_name))

    return registry_creds

def cancel_runs_by_names(self, workspace, run_names, box_name):
    '''
    Args:
        workspace: the name of the workspace containing the run_names
        run_names: a list of run names
        box_name: the name of the box the runs ran on (pool service)
    Returns:
        cancel_results: a list of kill result records
            (keys: workspace, run_name, exper_name, killed, status, before_status)
    '''
    # our strategy for this API:
    #   - use the XT controller to kill specified runs (when controller is available)
    #   - use batch_client "cancel node" if controller not available

    # we build service-based box names to have 3 parts
    job_id, service_name, node_index = box_name.split("-")

    active_jobs = self.get_active_jobs()
    cancel_results = []

    if active_jobs:
        for job_record in active_jobs:
            # watch out for older jobs that didn't have the
            # service_job_info/service_info_by_node properties
            service_info_by_node = utils.safe_value(job_record, "service_info_by_node")

            if service_info_by_node:
                for node, node_service_info in service_info_by_node.items():
                    if node_service_info.get("run_name") in run_names:
                        cancel_result = self.cancel_node(node_service_info)
                        cancel_results.append(cancel_result)

    return cancel_results

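# Hedged sketch (not part of the original module): cancel_runs_by_names() above
# assumes service-based box names have the 3-part form
# "<job_id>-<service_name>-<node_index>"; "job1234-batch-0" is a hypothetical example.
def _example_parse_box_name(box_name="job1234-batch-0"):
    job_id, service_name, node_index = box_name.split("-")
    return job_id, service_name, int(node_index)

# _example_parse_box_name() -> ("job1234", "batch", 0)
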
def get_user_columns(self, args):
    requested_list = args["columns"]

    add_cols = utils.safe_value(args, "add_columns")
    if add_cols:
        # use "+" (not "+=") so the caller's args["columns"] list isn't mutated
        requested_list = requested_list + add_cols

    return requested_list

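# Hedged usage sketch for get_user_columns(), with a hypothetical args dict as
# the command-line layer might build it: --add-columns entries are appended to
# the base --columns list.
def _example_user_columns():
    args = {"columns": ["run", "status"], "add_columns": ["metrics.acc"]}
    requested = list(args["columns"])      # copy, so args["columns"] is not mutated
    add_cols = args.get("add_columns")
    if add_cols:
        requested += add_cols
    return requested                       # ["run", "status", "metrics.acc"]
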
def get_required_service_property(self, creds, prop_name, service_name):
    value = utils.safe_value(creds, prop_name)
    if not value:
        errors.config_error("Missing '{}' property for service '{}' defined in "
            "[external-services] section of the XT config file".format(
                prop_name, service_name))

    return value

def validate_storage_and_mongo(self, mongo):
    '''
    1. ensure storage has been initialized for XT
    2. ensure mongo and storage point to each other
    3. update storage format if needed
    4. update mongo format if needed
    '''
    # ensure storage has been initialized for XT
    self._create_info_container_if_needed()

    # ensure mongo points to our storage
    storage_name = self.provider.get_service_name()
    connected_mongo = mongo.get_service_name()

    mongo_info = mongo.get_mongo_info()
    paired_storage = utils.safe_value(mongo_info, "paired_storage")
    if paired_storage and storage_name != paired_storage:
        errors.combo_error("mongo paired with storage service='{}', but passed "
            "XT storage service='{}'".format(paired_storage, storage_name))

    storage_info = self._get_storage_info()
    paired_mongo = utils.safe_value(storage_info, "paired_mongo")
    if paired_mongo and connected_mongo != paired_mongo:
        errors.combo_error("this storage paired with mongo service='{}', but passed "
            "connection string for mongo service='{}'".format(
                paired_mongo, connected_mongo))

    if not paired_storage:
        mongo_info = {"paired_storage": storage_name,
            "storage_version": constants.STORAGE_VERSION}
        mongo.set_mongo_info(mongo_info)

    if not paired_mongo:
        storage_info = {"paired_mongo": connected_mongo,
            "storage_version": constants.STORAGE_VERSION}
        self._set_storage_info(storage_info)

    # only check once (takes .5 secs if already imported)
    # remove this check after all XT users have imported (approx. Dec 2019)
    # but keep around (good for mongodb repair, if needed)
    self.import_jobs_to_mongo_if_needed(mongo)

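# Minimal sketch of the mutual pairing check performed above, on plain dicts:
# each side records the other's service name, and a mismatch on either side is
# treated as an error (here rendered as a False return). Names are hypothetical.
def _example_pairing_ok(mongo_info, storage_info, storage_name, mongo_name):
    paired_storage = mongo_info.get("paired_storage")
    paired_mongo = storage_info.get("paired_mongo")

    if paired_storage and paired_storage != storage_name:
        return False
    if paired_mongo and paired_mongo != mongo_name:
        return False
    return True

# _example_pairing_ok({"paired_storage": "store1"}, {"paired_mongo": "mongo1"},
#     "store1", "mongo1") -> True
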
def get_client_cs(core, ws, run_name):
    cs = None
    box_secret = None

    filter_dict = {"_id": run_name}
    runs = core.store.mongo.get_info_for_runs(ws, filter_dict, {"run_logs": 0})
    if not runs:
        errors.store_error("Unknown run: {}/{}".format(ws, run_name))

    from xtlib import job_helper

    run = runs[0]
    job_id = utils.safe_value(run, "job_id")
    node_index = utils.safe_value(run, "node_index")

    cs_plus = job_helper.get_client_cs(core, job_id, node_index)
    cs = cs_plus["cs"]
    box_secret = cs_plus["box_secret"]

    return cs, box_secret

def get_node_run(self, service_node_info):
    # get the aml workspace
    aml_ws_name = utils.safe_value(self.compute_def, "service")
    ws = self.get_aml_ws(aml_ws_name)

    # create aml experiment wrapper
    aml_exper_name = service_node_info["aml_exper_name"]
    experiment = Experiment(ws, name=aml_exper_name)

    # create aml run wrapper
    aml_run_id = service_node_info["aml_run_id"]
    run = Run(experiment, aml_run_id)

    return run

def gen_args(self, args, gen_docs=False):
    text = "\n"
    visible_args = [arg for arg in args if not utils.safe_value(arg, "hidden")]

    if visible_args:
        if gen_docs:
            text += "Arguments::\n\n"
        else:
            text += "Arguments:\n"

        text += self.gen_name_help_aligned(visible_args, separator="-")
        #text += "\n"

    return text

def get_activate_cmd(self):
    setup_def = self.config.get_setup_from_target_def(self.compute_def)

    if pc_utils.is_windows():
        activate_cmd = utils.safe_value(setup_def, "activate")
    else:
        # Attempting to activate the Conda shell from within a bash script
        # fails, with Conda saying that the bash environment has not
        # been correctly initialized to use Conda.
        # This thread https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script
        # eventually led me to the following command, which is taken
        # from the lines of bash script that Conda appends to your
        # .bashrc file upon installation. This command is what
        # allows you to activate the Conda environment within a
        # bash shell. It returns a script generated by Conda
        # which is executed, and which sets up the conda
        # activate / deactivate commands in the environment.
        conda_shell_bash_hook_cmd = 'eval "$(conda shell.bash hook)"'

        activate_cmd = utils.safe_value(setup_def, "activate")
        activate_cmd = "{} && {}".format(conda_shell_bash_hook_cmd, activate_cmd)

    return activate_cmd

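# Illustrative sketch of the linux command composed by get_activate_cmd(), using
# a hypothetical setup.activate value of "conda activate py37":
def _example_linux_activate_cmd():
    conda_hook = 'eval "$(conda shell.bash hook)"'
    activate = "conda activate py37"    # assumed value from the target's setup def
    return "{} && {}".format(conda_hook, activate)

# -> 'eval "$(conda shell.bash hook)" && conda activate py37'
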
def get_first_last(self, args):
    first = utils.safe_value(args, "first")
    last = utils.safe_value(args, "last")
    show_all = utils.safe_value(args, "all")

    explicit = qfe.get_explicit_options()

    # explicit options override defaults for all/first/last
    if "all" in explicit:
        first = None
        last = None
    elif "first" in explicit:
        show_all = None
        last = None
    elif "last" in explicit:
        show_all = None
        first = None
    else:
        # priority if no explicit options set
        if show_all:
            first = None
            last = None

    return first, last

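# Hedged sketch of the precedence implemented by get_first_last(): an option the
# user typed explicitly wins over defaults, and --all / --first / --last are
# mutually exclusive. The explicit dict stands in for qfe.get_explicit_options().
def _example_first_last(explicit, first, last, show_all):
    if "all" in explicit:
        first = last = None
    elif "first" in explicit:
        show_all = last = None
    elif "last" in explicit:
        show_all = first = None
    elif show_all:
        first = last = None
    return first, last

# user typed --last=5, so the default first=10 and show_all are discarded:
# _example_first_last({"last": 5}, first=10, last=5, show_all=True) -> (None, 5)
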
def import_run_mongo_document(self, mongo_run_fn, workspace, new_workspace,
        prev_job_id, new_job_id, run_name):

    text = file_utils.read_text_file(mongo_run_fn)
    run = json.loads(text)

    # update job_id
    run["job_id"] = new_job_id

    # update workspace
    run["ws"] = new_workspace

    # add to mongo
    self.store.mongo.update_run_info(new_workspace, run_name, run)

    end_id = utils.safe_value(run, "end_id")
    return end_id

def submit_node_runs(self, job_id, node_runs, workspace, aml_ws_name, xt_exper_name,
        aml_exper_name, compute_def, resume_name, repeat_count, using_hp, compute,
        runs_by_box, code_dir, node_index, show_aml_run_name, nodes, args):

    first_run = node_runs[0]
    first_run_name = first_run["run_name"]
    fake_submit = args["fake_submit"]

    # this indicates we should make serializable versions of estimator and trainer
    self.submit_logs = True or fake_submit    # must be true if we are using fake_submit
    self.serializable_estimator = None
    self.serializable_trainer = None

    box_name = first_run["box_name"]
    run_specs = first_run["run_specs"]
    cmd_parts = run_specs["cmd_parts"]
    target_fn = args["script"]
    node_id = "node" + str(node_index)

    assert cmd_parts[0] == "python"
    assert cmd_parts[1] == "-u"
    assert len(cmd_parts[2]) > 0

    # update the target_fn (might have been switched to the xt controller)
    target_fn = cmd_parts[2]
    arg_parts = cmd_parts[3:]

    # parse target's cmdline args
    arg_dict = {}
    for ap in arg_parts:
        # arg name can start with or without "-" here
        if "=" in ap:
            name, value = ap.split("=")
            if not value.startswith('"[') and not value.startswith('"@'):
                arg_dict[name] = value
        else:
            # for unspecified values
            arg_dict[ap] = 1

    compute_target = utils.safe_value(compute_def, "compute")
    if not compute_target:
        errors.config_error("'compute' property missing on compute target '{}' "
            "in XT config file".format(compute))

    estimator, experiment = self.create_estimator(job_id, workspace, aml_ws_name,
        xt_exper_name, aml_exper_name, first_run_name, code_dir, target_fn, arg_dict,
        compute_target, node_id, nodes, fake_submit, args)

    hp_config = args["hp_config"]
    direct_run = args["direct_run"]

    # assumed: inline hyperparameter search sets, when provided by the caller
    hp_sets = utils.safe_value(args, "hp_sets")

    if using_hp and direct_run:
        # EXPERIMENT with hyperdrive
        max_runs = args["max_runs"]
        max_minutes = args["max_minutes"]
        policy_name = args["early_policy"]
        eval_interval = args["evaluation_interval"]
        delay_eval = args["delay_evaluation"]
        truncation_percentage = args["truncation_percentage"]
        slack_factor = args["slack_factor"]
        slack_amount = args["slack_amount"]
        primary_metric = args["primary_metric"]
        maximize_metric = args["maximize_metric"]
        search_type = args["search_type"]
        concurrent = args["concurrent"]
        max_concurrent_runs = nodes * concurrent

        if max_minutes <= 0:
            #max_minutes = 43200   # aml workaround: None not supported, neither is -1 or 0, so use max value
            max_minutes = 10080    # aml workaround: documented max not supported

        if hp_sets:
            hd_dict = self.build_hyperdrive_dict(hp_sets)
        else:
            hd_dict = self.build_hyperdrive_dict_from_file(hp_config)

        if not policy_name:
            # use default policy (note: not the same as no policy)
            early_term = None
            serializable_early_term = None
        else:
            serializable_early_term = {"policy_type": policy_name,
                "eval_interval": eval_interval, "delay_eval": delay_eval,
                "truncation_percentage": truncation_percentage,
                "slack_factor": slack_factor, "slack_amount": slack_amount}

        if self.submit_logs:
            self.serializable_trainer = {"estimator": self.serializable_estimator,
                "hd_dict": hd_dict, "search_type": search_type,
                "primary_metric": primary_metric, "maximize_metric": maximize_metric,
                "early_term": serializable_early_term, "max_total_runs": max_runs,
                "max_concurrent_runs": max_concurrent_runs, "max_minutes": max_minutes}

        if fake_submit:
            trainer = self.serializable_trainer
        else:
            if policy_name:
                early_term = self.make_early_term_policy(policy_type=policy_name,
                    eval_interval=eval_interval, delay_eval=delay_eval,
                    truncation_percentage=truncation_percentage,
                    slack_factor=slack_factor, slack_amount=slack_amount)

            trainer = self.create_hyperdrive_trainer(estimator, hd_dict, search_type,
                primary_metric, maximize_metric, early_term, max_total_runs=max_runs,
                max_concurrent_runs=max_concurrent_runs, max_minutes=max_minutes)
    else:
        # not using AML hyperdrive
        trainer = estimator

    run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id = \
        self.run_aml_job(job_id, workspace, aml_ws_name, trainer, experiment,
            xt_exper_name, aml_exper_name, compute_target, code_dir, first_run_name,
            box_name, node_index, repeat_count, fake_submit, args)

    if show_aml_run_name:
        fb.feedback("[aml: {}/Run {}], xt: {}/{} ".format(aml_exper_name,
            aml_run_number, workspace, run_name), is_final=True)
    else:
        fb.feedback("{}/{}, xt: {}/{}".format(aml_exper_name, aml_run_number,
            workspace, run_name))

    mongo = self.store.get_mongo()

    run_names = []
    for run in node_runs:
        run_name = run["run_name"]
        run_names.append(run_name)

    node_info = {"ws": workspace}

    for run_name in run_names:
        # we only have 1 run, so OK to hold info in flat dict here
        node_info["aml_exper_name"] = aml_exper_name
        node_info["aml_run_number"] = aml_run_number
        node_info["aml_run_id"] = aml_run_id
        node_info["run_name"] = run_name

        # update mongo db info for run with cluster and service_job_id
        mongo.update_mongo_run_from_dict(workspace, run_name,
            {"aml_exper_name": aml_exper_name, "aml_run_number": aml_run_number})

    if monitor_cmd:
        console.print("monitoring notebook created; to run:")
        console.print("  " + monitor_cmd)

    return node_info

def create_estimator(self, job_id, workspace, aml_ws_name, xt_exper_name,
        aml_exper_name, run_name, code_dir, target_fn, arg_dict, compute_target,
        node_id, nodes, fake_submit, args):

    config = self.config
    ps = None

    if not aml_exper_name:
        errors.config_error("experiment name must be specified (thru config file "
            "or command line option '--experiment')")

    if fake_submit:
        # for speed of testing, avoid creating real Workspace, Experiment instances
        ws = {"name": aml_ws_name}
        experiment = {"ws": ws, "name": aml_exper_name}
    else:
        ws = self.get_aml_ws(aml_ws_name)
        experiment = Experiment(ws, name=aml_exper_name)

    if compute_target == "amlcompute":
        actual_target = "amlcompute"    # AmlCompute(ws, None)
    else:
        if fake_submit:
            actual_target = "amlcompute"
        else:
            if not compute_target in ws.compute_targets:
                errors.config_error("compute target '{}' does not exist in AML "
                    "workspace '{}'".format(compute_target, aml_ws_name))
            actual_target = ws.compute_targets[compute_target]

    # build ENV VARS
    store_creds = self.config.get_storage_creds()
    # store_name = store_creds["name"]
    # store_key = store_creds["key"]

    provider_code_path = config.get_storage_provider_code_path(store_creds)

    mongo_creds, mongo_name = self.config.get_mongo_creds()
    mongo_conn_str = mongo_creds["mongo-connection-string"]

    username = args["username"]
    description = args["description"]
    aggregate_dest = args["aggregate_dest"]

    env_vars = self.build_env_vars(workspace, aml_ws_name, xt_exper_name,
        aml_exper_name, run_name, job_id=job_id, compute_target=compute_target,
        username=username, description=description, aggregate_dest=aggregate_dest,
        node_id=node_id, args=args)

    framework = args["framework"]
    framework = framework.lower()

    is_distributed = args['distributed']
    dist_training = args["distributed_training"]
    dist_training = dist_training.lower()

    from azureml.train.estimator import Estimator, Mpi, Gloo, Nccl
    from azureml.train.dnn import PyTorch, Chainer, TensorFlow

    fw_dict = {"pytorch": PyTorch, "tensorflow": TensorFlow, "chainer": Chainer,
        "estimator": Estimator}
    dt_dict = {"mpi": Mpi, "gloo": Gloo, "nccl": Nccl}

    if not framework in fw_dict:
        errors.config_error("framework must be set to 'pytorch', 'tensorflow', "
            "'chainer', or 'estimator'")

    estimator_ctr = fw_dict[framework]

    if is_distributed:
        if not dist_training in dt_dict:
            errors.config_error("distributed-training must be set to 'mpi', "
                "'gloo', or 'nccl'")

        distributed_ctr = dt_dict[dist_training]
        distributed_obj = distributed_ctr()
    else:
        distributed_obj = None

    compute_def = args["compute_def"]
    direct_run = args["direct_run"]

    if direct_run:
        # relying on AML for full control (not using XT controller)
        node_count = utils.safe_value(compute_def, "nodes")

        # did cmd line overwrite nodes?
        if args["nodes"]:
            node_count = args["nodes"]

        if node_count is None:
            errors.config_error("must specify 'nodes' property for Azure ML "
                "service '{}' in XT config file or as --nodes option in cmd line"
                .format(args["target"]))
    else:
        # run as separate AML runs, each with a single node
        node_count = 1

    vm_size = args["vm_size"]
    conda_packages = args["conda_packages"]
    pip_packages = args["pip_packages"]
    use_gpu = args["use_gpu"]
    framework_version = args["fw_version"]
    max_secs = args["max_seconds"]
    user_managed = args["user_managed"]

    activate_cmd = self.get_activate_cmd()
    if activate_cmd:
        # we have no way of running this on AML before conda_packages and
        # pip_packages are installed (or used to build a docker image)
        errors.config_error("setup.activate property cannot be specified for AML targets")

    #max_secs = 10080 if max_secs <= 0 else max_secs

    use_docker = False
    environment_name = utils.safe_value(compute_def, "docker")
    if environment_name:
        environment_def = self.config.get_docker_def(environment_name)
        if environment_def:
            use_docker = (environment_def["type"] == "docker")

    # workaround AML warning
    if not use_docker:
        use_docker = None

    if self.submit_logs:
        # for testing (this should match exact args used in estimator ctr below)
        self.serializable_estimator = {"source_directory": code_dir,
            "script_params": arg_dict, "compute_target": actual_target,
            "vm_size": vm_size, "entry_script": target_fn,
            "conda_packages": conda_packages, "pip_packages": pip_packages,
            "use_gpu": use_gpu, "use_docker": use_docker,
            "framework_version": framework_version, "user_managed": user_managed,
            "environment_variables": env_vars, "node_count": node_count,
            "distributed_training": {}, "max_run_duration_seconds": max_secs}

    if fake_submit:
        estimator = self.serializable_estimator
    else:
        estimator = estimator_ctr(source_directory=code_dir, script_params=arg_dict,
            compute_target=actual_target, vm_size=vm_size, entry_script=target_fn,
            conda_packages=conda_packages, pip_packages=pip_packages,
            use_gpu=use_gpu, use_docker=use_docker,
            framework_version=framework_version, user_managed=user_managed,
            environment_variables=env_vars, node_count=node_count,
            distributed_training=distributed_obj, max_run_duration_seconds=max_secs)

    return estimator, experiment

def get_client_context(self, exper_name, run_name, app_info, box_info, job_id,
        node_index, run_specs, resume_name=None, using_hp=False, repeat=None, args=None):
    ''' this function gathers up all of the job-level context needed to run the
    job on the specified node (node_index). '''
    config = self.config
    cmd_parts = run_specs["cmd_parts"]

    workspace = args['workspace']
    working_dir = args['working_dir']

    context = Bag()
    context.ws = workspace
    context.working_dir = working_dir
    context.exper_name = exper_name
    context.run_name = run_name
    context.job_id = job_id
    context.sku = args["sku"]
    context.app_name = app_info.app_name if app_info else None
    context.box = args["box"]
    context.from_ip = pc_utils.get_ip_address()
    context.from_host = pc_utils.get_hostname()
    context.box_name = box_info.box_name
    context.target_file, _, _ = self.get_target(cmd_parts)
    context.resume_name = resume_name
    context.generated_sweep_text = None    # will be conditionally set in controller
    context.pool = args["pool"]
    context.node_index = node_index
    context.compute = args["target"]
    context.service_type = args["service_type"]

    # provide all provider info to controller
    context.providers = config.get("providers")

    #context.run_specs = run_specs
    context.cmd_parts = cmd_parts
    context.xt_cmd = args["xt_cmd"]    # log our full cmd to support correct reruns

    context.run_script = run_specs["run_script"]
    context.parent_script = run_specs["parent_script"]

    # for helping docker login to user's Azure Container Registry
    is_docker = (args["docker"] is not None)
    # if cmd_parts:
    #     is_docker = (cmd_parts[0] == "docker") or (cmd_parts[0] == "sudo" and cmd_parts[1] == "docker")

    #registry = config.get("environment", "registry", suppress_warning=True)
    registry = None
    compute_def = args["compute_def"]
    if compute_def and "docker" in compute_def:
        docker_name = compute_def["docker"]
        docker_def = self.config.get_docker_def(docker_name)
        if docker_def and "registry" in docker_def:
            registry = docker_def["registry"]

    if registry:
        registry_creds = config.get("external-services", registry)

        needs_login = is_docker and utils.safe_value(registry_creds, "login")
        login_server = utils.safe_value(registry_creds, "login-server")
        username = utils.safe_value(registry_creds, "username")
        password = utils.safe_value(registry_creds, "password")
    else:
        needs_login = False
        login_server = None
        username = None
        password = None

    context.docker_login = needs_login
    context.docker_server = login_server
    context.docker_username = username
    context.docker_password = password

    context.username = self.config.get("general", "username")

    setup = self.config.get_setup_from_target_def(compute_def)
    activate_cmd = utils.safe_value(setup, "activate")
    context.activate_cmd = activate_cmd

    # config info
    #box_os = self.get_box_os(box_name)
    box_os = box_info.box_os

    after_files_list = args["after_dirs"]
    after_files_list = utils.parse_list_option_value(after_files_list)
    context.after_files_list = after_files_list

    after_omit_list = args["after_omit"]
    after_omit_list = utils.parse_list_option_value(after_omit_list)
    context.after_omit_list = after_omit_list

    context.primary_metric = args["primary_metric"]
    context.maximize_metric = args["maximize_metric"]
    context.report_rollup = args["report_rollup"]

    context.after_upload = args["after_upload"]
    #context.scrape = config.get("general", "scrape")
    context.log = args["log"]

    # PARENT/CHILD info
    context.repeat = repeat
    context.repeats_remaining = None    # will be set in controller

    context.total_run_count = args["total_run_count"]
    context.search_style = args["search_style"]
    context.is_parent = context.search_style != "single"

    # HPARAM search
    hp_config = args["hp_config"]
    if hp_config:
        hp_config = file_utils.path_join(constants.HP_CONFIG_DIR,
            os.path.basename(hp_config))
    context.hp_config = hp_config

    context.fn_generated_config = args["fn_generated_config"]
    context.using_hp = using_hp
    context.search_type = args["search_type"]
    context.option_prefix = args["option_prefix"]

    context.restart = False
    context.concurrent = args["concurrent"]
    context.xtlib_capture = args["xtlib_upload"]

    # for mirroring files to grok server or storage
    context.mirror_dest = args["mirror_dest"]
    context.mirror_files = args["mirror_files"]
    context.grok_server = None    # args["grok_server"]

    context.aggregate_dest = args["aggregate_dest"]
    context.dest_name = exper_name if context.aggregate_dest == "experiment" else job_id

    store_creds = self.config.get_storage_creds()
    context.store_creds = store_creds
    context.store_code_path = config.get_storage_provider_code_path(store_creds)

    mongo_creds, mongo_name = self.config.get_mongo_creds()
    context.mongo_conn_str = mongo_creds["mongo-connection-string"]

    context.shell_launch_prefix = box_info.shell_launch_prefix

    #console.print("context=", context)
    return context

def validate_and_add_defaults(self, arguments, options, arg_dict):
    '''
    args:
        - arguments: list of the arguments for the current cmd
        - options: list of options for the current cmd
        - arg_dict: dict of name/value pairs for user-specified args and options

    processing:
        - copy arg_dict to "explicit_options"
        - validate all names in arg_dict (against arguments & options)
        - flag as error if any required arguments/options are not specified in arg_dict
        - add default values for all arguments/options not yet specified in arg_dict

    return:
        - fully populated copy of arg_dict
    '''
    # ensure all names in arg_dict are dash style (for validation)
    full_arg_dict = {key.replace("_", "-"): value for key, value in arg_dict.items()}

    # remember options that were set explicitly (dash-style)
    global explict_options
    explict_options = dict(full_arg_dict)

    # process all arguments, options, and flags; ensure each has a value in arg_dict
    all_args = arguments + options
    all_arg_names = [aa["name"] for aa in all_args]

    # process user args in arg_dict
    for name, value in full_arg_dict.items():
        # validate arg name
        if not name in all_arg_names:
            errors.api_error("unknown args name: {}".format(name))

    # now add default values for all other args
    for info in all_args:
        name = info["name"]
        required = info["required"] if "required" in info else None

        if not name in full_arg_dict:
            if required:
                self.syntax_error("cmd '{}' missing value for required option: --{}"
                    .format(self.cmd_words, name))

            default_value = utils.safe_value(info, "default")

            # expand "$group.value" type values
            default_value = self.get_default_from_config(default_value)

            # add to user's arg dict
            full_arg_dict[name] = default_value

    # finally, convert all names to underscore style
    full_arg_dict = {key.replace("-", "_"): value for key, value in full_arg_dict.items()}

    console.diag("full_arg_dict=", full_arg_dict)
    return full_arg_dict

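# Hedged sketch of the name normalization used by validate_and_add_defaults():
# user args arrive in underscore style, are validated and defaulted in dash
# style, then returned in underscore style. Values shown are hypothetical.
def _example_normalize_names():
    arg_dict = {"run_list": "run1, run2", "show_all": True}
    dash_style = {k.replace("_", "-"): v for k, v in arg_dict.items()}
    # ... validation and defaulting happen on dash_style here ...
    return {k.replace("-", "_"): v for k, v in dash_style.items()}

# round trip: {"run_list", "show_all"} -> {"run-list", "show-all"} -> {"run_list", "show_all"}
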
def command_help(self, cmd_info, syntax_only=False, args_only=False):
    '''Shows help for the specified xt command'''
    show_all = not syntax_only and not args_only

    if cmd_info == "flags":
        print_flags()
        return

    name = cmd_info["name"]

    args = cmd_info["arguments"]
    args = [arg for arg in args if not utils.safe_value(arg, "hidden")]

    options = cmd_info["options"]
    options = [opt for opt in options if not utils.safe_value(opt, "hidden")]

    examples = cmd_info["examples"]
    see_alsos = cmd_info["see_alsos"]
    faqs = cmd_info["faqs"]
    options_before_args = cmd_info["options_before_args"]

    words = name.replace("_", " ")
    if cmd_info["keyword_optional"]:
        words = "[ " + words + " ]"
    words = " " + words

    opts_text = ""
    if options:
        opts_text += " [OPTIONS]"

    args_text = self.gen_inline_args(args)

    if not syntax_only:
        console.print()

    if options_before_args:
        usage = "Usage: {}".format(self.name) + words + opts_text + args_text
    else:
        usage = "Usage: {}".format(self.name) + words + args_text + opts_text

    # print usage info
    console.print(usage)

    if show_all and not self.mini_mode:
        # print command help
        doc_string = self.get_formatted_doc_str(cmd_info)
        help_text = doc_string if doc_string else "  " + cmd_info["help"]

        console.print()
        console.print(help_text)

    if syntax_only:
        # show a quick list of options
        console.print("  OPTIONS: ", end="")
        for opt in options:
            console.print("--{} ".format(opt["name"]), end="")
        console.print()    # finish line
    else:
        # show each option on its own line with a short description
        text = ""
        if options_before_args:
            text += self.gen_options(options)
            text += self.gen_args(args)
        else:
            text += self.gen_args(args)
            text += self.gen_options(options)
        console.print(text)

    if show_all and examples:
        console.print("Examples:")
        for example in examples:
            console.print("  {}:".format(example["task"]))
            console.print("  > {}".format(example["text"]))
            console.print()

            if self.mini_mode:
                # only show first example for mini mode
                break

    if show_all and faqs:
        console.print("FAQs:")
        for faq in faqs:
            console.print("  {}?".format(faq["question"]))
            console.print("  => {}".format(faq["answer"]))
            console.print()

            if self.mini_mode:
                # only show first FAQ for mini mode
                break

    if show_all and see_alsos:
        console.print("See Also:")
        for also in see_alsos:
            text = also["text"]
            page_path = also["page_path"]
            console.print("  - {}".format(text))

def process_arguments(self, scanner, tok, arguments, arg_dict):
    for arg_info in arguments:
        if utils.safe_value(arg_info, "hidden"):
            continue

        arg_name = arg_info["name"]
        arg_type = arg_info["type"]
        required = arg_info["required"]
        keywords = arg_info["keywords"] if "keywords" in arg_info else None
        current_arg = None

        #print("processing arg=", arg_name, arg_type, tok)

        if arg_type == "cmd" and tok and not tok.startswith("-"):
            # convert remaining tokens to a cmd_info
            if tok:
                # if self.match(tok, "topics"):
                #     cmd_info = {"name": "topics"}
                #     tok = scanner.scan()
                # else:
                cmd_info, tok = self.get_cmd_info(tok, scanner, for_help=True)
                current_arg = cmd_info
        elif arg_type == "text":
            # convert remaining tokens to a string
            if tok:
                text = scanner.get_rest_of_text(include_current_token=True)
                tok = None
            else:
                text = ""
            current_arg = text
        else:
            if tok and not tok.startswith("-"):
                current_arg = tok

        if required and not current_arg:
            self.syntax_error("cmd '{}' missing required argument: {}".format(
                self.cmd_words, arg_name))

        if current_arg:
            if arg_type == "str_list":
                value, tok = self.parse_string_list(tok, scanner)
                if len(value) == 0 and required:
                    self.syntax_error("missing value for required argument: " + arg_name)
            elif arg_type == "num_list":
                value, tok = self.parse_num_list(tok, scanner)
                if len(value) == 0 and required:
                    self.syntax_error("missing value for required argument: " + arg_name)
            elif arg_type == "int_list":
                value, tok = self.parse_int_list(tok, scanner)
                if len(value) == 0 and required:
                    self.syntax_error("missing value for required argument: " + arg_name)
            elif arg_type == "tag_list":
                value, tok = self.parse_tag_list(tok, scanner)
                if len(value) == 0 and required:
                    self.syntax_error("missing value for required argument: " + arg_name)
            else:
                value = current_arg

                if keywords:
                    found = self.match_keyword(value, keywords)
                    if not found:
                        self.syntax_error("Keyword argument {} has unrecognized "
                            "value: {}".format(arg_name, value))
                    value = found

                tok = scanner.scan()

            # store value to be passed
            arg_dict[arg_name] = value

    if tok and not tok.startswith("--"):
        errors.argument_error("unrecognized argument", tok)

    return tok

def process_args(self, args):
    run_script = None
    parent_script = None
    run_cmd_from_script = None
    target_file = args["script"]
    target_args = args["script_args"]
    code_upload = args["code_upload"]

    # user may have wrong slashes for this OS
    target_file = file_utils.fix_slashes(target_file)

    if os.path.isabs(target_file):
        errors.syntax_error("path to app file must be specified with a relative "
            "path: {}".format(target_file))

    is_rerun = "is_rerun" in args
    if is_rerun:
        # will be running from script dir, so remove any path to script file
        self.script_dir = os.path.dirname(target_file)
        target_file = os.path.basename(target_file)

    if target_file.endswith(".py"):
        # PYTHON target
        cmd_parts = ["python"]
        cmd_parts.append("-u")
        cmd_parts.append(target_file)
    else:
        cmd_parts = [target_file]

    if target_args:
        # split on unquoted spaces
        arg_parts = utils.cmd_split(target_args)
        cmd_parts += arg_parts

    if target_file == "docker":
        self.is_docker = True

    if not self.is_docker and code_upload and not os.path.exists(target_file):
        errors.env_error("script file not found: {}".format(target_file))

    ps_path = args["parent_script"]
    if ps_path:
        parent_script = file_utils.read_text_file(ps_path, as_lines=True)

    if target_file.endswith(".bat") or target_file.endswith(".sh"):
        # a RUN SCRIPT was specified as the target
        run_script = file_utils.read_text_file(target_file, as_lines=True)
        run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

    compute = args["target"]
    box_def = self.config.get("boxes", compute, suppress_warning=True)
    setup = utils.safe_value(box_def, "setup")

    compute_def = self.config.get_compute_def(compute)

    if compute_def:
        # must be defined in [compute-targets]
        if not "service" in compute_def:
            errors.config_error("compute target '{}' must define a 'service' "
                "property".format(compute))

        service = compute_def["service"]
        if service in ["local", "pool"]:
            # it's a list of box names
            boxes = compute_def["boxes"]
            if len(boxes) == 1 and boxes[0] == "localhost":
                pool = None
                box = "local"
                service_type = "pool"
            else:
                pool = compute
                box = None
                service_type = "pool"
        else:
            # it's a set of compute service properties
            pool = compute
            box = None
            service_name = compute_def["service"]
            service_type = self.config.get_service_type(service_name)
    elif box_def:
        # translate single box name to a compute_def
        box = compute
        pool = None
        service_type = "pool"
        compute_def = {"service": service_type, "boxes": [box], "setup": setup}
    else:
        errors.config_error("unknown target or box: {}".format(compute))

    args["target"] = compute
    args["compute_def"] = compute_def
    args["service_type"] = service_type

    # for legacy code
    args["box"] = box
    args["pool"] = pool

    return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, \
        run_cmd_from_script, compute, compute_def

def get_mongo_records(self, mongo, filter_dict, workspace, which, actual_to_user,
        col_dict=None, args=None):

    first, last = self.get_first_last(args)
    using_default_last = bool(last)

    reverse = utils.safe_value(args, "reverse")

    # use MONGO to do all of the work (query, sort, first/last)
    sort_col = utils.safe_value(args, "sort", "name")

    if sort_col == "name":
        # special sorting needed; we have created the "run_num" field just for this purpose
        sort_col = "run_num" if which == "runs" else "job_num"
    elif not "." in sort_col:
        # translate name of std col from user-friendly version to logged version
        user_to_actual = {value: key for key, value in actual_to_user.items()}

        if not sort_col in user_to_actual:
            errors.general_error("unknown standard property: {} (did you mean "
                "metrics.{}, hparams.{}, or tags.{}?)".format(
                    sort_col, sort_col, sort_col, sort_col))
        sort_col = user_to_actual[sort_col]

    # this is a TRICK to avoid having to call for the exists_count when calculating
    # the skip count; it works fine, since we re-sort records on the xt client anyway
    sort_dir = -1 if reverse else 1
    if last:
        sort_dir = -sort_dir
        first = last

    # ensure we only ask for records where sort_col exists, or else we MIGHT
    # end up with fewer than LIMIT records
    if not sort_col in filter_dict:
        filter_dict[sort_col] = {"$exists": True}

    container = workspace if which == "runs" else "__jobs__"

    orig_col_dict = col_dict
    if not col_dict:
        col_dict = {"log_records": 0}

    # put our mongo operations together in a retry-compatible function
    def fetch():
        cursor = mongo.mongo_db[container].find(filter_dict, col_dict)
        cursor = cursor.sort(sort_col, sort_dir)
        if first:
            cursor = cursor.limit(first)
        return cursor

    # here is where MONGO does all the hard work for us
    cursor = mongo.mongo_with_retries("get_mongo_records", fetch)
    records = list(cursor)

    console.diag("after full records retrieval, len(records)={}".format(len(records)))

    if not orig_col_dict:
        # pull out standard cols, translating from actual to user-friendly names
        records = [self.translate_record(rec, actual_to_user) for rec in records if rec]

        # pull out requested cols, flattening nested values to their dotted names
        records = self.flatten_records(records, sort_col, args)

    if last:
        # we had to reverse the sort done by mongo, so correct it here
        records.reverse()

    #self.sort_records(records, sort_col, reverse)

    return records, using_default_last, last

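# Pure-python sketch of the "last N" trick used in get_mongo_records(): sort in
# the reversed direction, limit to N, then reverse the results to restore the
# requested order (mongo: .sort(col, -1).limit(n), then records.reverse()).
def _example_last_n(values, n):
    taken = sorted(values, reverse=True)[:n]
    taken.reverse()
    return taken

# _example_last_n([3, 1, 4, 1, 5, 9], 2) -> [5, 9]
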
def process_run_command(self, args):
    self.args = args

    # ensure workspace exists
    workspace = args['workspace']
    dry_run = args['dry_run']
    fake_submit = args["fake_submit"]

    if not fake_submit:
        self.store.ensure_workspace_exists(workspace, flag_as_error=False)

    # PRE-PROCESS ARGS
    service_type, cmd_parts, ps_path, parent_script, target_file, run_script, \
        run_cmd_from_script, compute, compute_def = self.process_args(args)

    # create backend helper (pool, philly, batch, aml)
    cluster = utils.safe_value(compute_def, "cluster")
    vc = utils.safe_value(compute_def, "vc")
    self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

    # add conda_packages and pip_packages from SETUP to ARGS
    setup_def = self.config.get_setup_from_target_def(compute_def)
    conda_packages = utils.safe_value(setup_def, "conda-packages")
    pip_packages = utils.safe_value(setup_def, "pip-packages")

    args["conda_packages"] = conda_packages if conda_packages else []
    args["pip_packages"] = pip_packages if pip_packages else []

    self.adjust_pip_packages(args)

    snapshot_dir = self.temp_dir

    if fake_submit:
        script_dir = snapshot_dir
    else:
        # note: always create a snapshot dir for backends to add needed files
        file_utils.ensure_dir_deleted(snapshot_dir)
        script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

    self.script_dir = script_dir

    direct_run = args["direct_run"]

    # do we need to start the xt controller?
    use_controller = not direct_run
    adjustment_scripts = None

    # create a job_secret that can later be used to authenticate with the XT controller
    # NOTE: we currently log this secret as a job property, which allows all team
    # members to view and control this job
    job_secret = str(uuid.uuid4())

    # do we need to build a "docker run" command?
    if not self.backend.provides_container_support():
        env = args["docker"]
        if not env:
            docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts,
                    script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name    # for use in building run context info

    # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
    cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, \
        sweeps_text, pool_info, search_style = self.build_cmds_with_search(
            service_type, cmd_parts, parent_script, run_script, run_cmd_from_script,
            use_controller, dry_run, args)

    if dry_run:
        return

    # make new values available
    args["search_style"] = search_style
    args["total_run_count"] = total_run_count

    resume_name = args['resume_name']
    keep_name = False    # args['keep_name']
    experiment = args['experiment']
    is_distributed = args['distributed']

    # CREATE JOB to hold all runs
    if fake_submit:
        # use lastrun/lastjob info to get a fast incremental fake job number
        xtd = xt_dict.read_xt_dict()
        fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
        xtd["fake_job_num"] = fake_job_num + 1
        xt_dict.write_xt_dict(xtd)

        job_id = "fake_job" + str(fake_job_num)
    else:
        job_id = self.store.create_job()

    fb.feedback(job_id)    # start the feedback (by parts)
    fb.feedback("{}: {}".format("target", compute))

    # write hparams to FILES
    boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

    if sweeps_text and not fake_submit:
        self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

    # if num_boxes > 1 and service_type != "batch":
    #     fb.feedback("", is_final=True)

    parent_name = None

    # BUILD RUNS, by box
    job_runs = []
    run_count = 1 if is_distributed else len(boxes)
    secrets_by_node = {}
    remote_control = args["remote_control"]

    for i in range(run_count):
        box_name = boxes[i]

        # generate a box secret for talking to XT controller for this node
        box_secret = str(uuid.uuid4()) if remote_control else ""

        # build runs for box_name
        run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path,
            using_hp, using_aml_hparam, run_specs, job_id, parent_name, cmds, pool_info,
            repeat_count, fake_submit, search_style, box_secret, args)

        # for now, adhere to the more general design of multiple runs per box
        box_runs = [run_data]
        job_runs.append(box_runs)

        node_id = utils.node_id(i)
        secrets_by_node[node_id] = box_secret

        # FEEDBACK
        ptype = "single " if search_style == "single" else "parent "
        if is_distributed:
            ptype = "master "

        if run_count == 1:
            node_msg = "creating {}run".format(ptype)
        else:
            node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

        if service_type == "pool":
            node_msg += ", box: " + box_name

        fb.feedback(node_msg, id="node_msg")    # , add_seperator=is_last)
        last_msg = node_msg

    # run the job
    # build box: runs dict for job info file
    runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

    # now that we have run names for all static run names for all nodes, we can
    # adjust cmds (and before files) for using the controller
    if use_controller:
        # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will
        # be captured to JOB); this will also adjust commands for each node to run
        # the XT controller
        adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs,
            cmds, using_hp, experiment, service_type, snapshot_dir, search_style,
            args=args)
    else:
        adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs,
            cmds, using_hp, experiment, service_type, snapshot_dir, search_style,
            args=args)

    # add env vars used by both controller and runs
    env_vars = args["env_vars"]

    # create a job guid to uniquely identify this job across all XT instances
    job_guid = str(uuid.uuid4())

    # we add with "node0" and "job_secret", but backend service will override for each node
    scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

    data_local = args["data_local"]
    if "$scriptdir" in data_local:
        data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
        args["data_local"] = data_local

    model_local = args["model_local"]
    if "$scriptdir" in model_local:
        model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
        args["model_local"] = model_local

    # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed,
    # as a way of adjusting/wrapping run commands
    self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment,
        service_type, snapshot_dir, args=args)

    # upload CODE from snapshot_dir
    code_upload = args["code_upload"]
    code_omit = args["code_omit"]
    code_zip = args["code_zip"]

    if not fake_submit:
        if code_upload:
            self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code",
                code_omit, code_zip, "code", args)

        # upload DATA from data_local (do we need to keep this? should we upload
        # to normal DATA location, vs. job?)
        data_upload = args["data_upload"]
        if data_upload:
            if not data_local:
                errors.config_error("cannot do data-upload because no data-local "
                    "path is defined in the XT config file")

            data_omit = args["data_omit"]
            data_zip = "none"

            self.core.upload_before_files_to_job(job_id, data_local, "before/data",
                data_omit, data_zip, "data", args)

    # dispatch to BACKEND submitters
    '''
    Note: backend submitter functions are responsible for:
        - submitting the job (for each node, queue runs for that node)
        - returning the service job id (or a list of them, if per node)

    NOTE: there is a timing issue where the submitted job needs access to job info,
    but the final piece of job info (service info) is only returned after the job
    is submitted. Therefore, we structure the steps as follows:
        - primary job info is logged
        - job is submitted thru backend
        - service info for job is logged
    '''

    # LOG PRIMARY JOB INFO
    dd = {}

    if not fake_submit:
        # mark runs as QUEUED
        for runs in runs_by_box.values():
            first_run = runs[0]
            self.store.log_run_event(workspace, first_run["run_name"],
                "status-change", {"status": "queued"})

        # write the job info file (now that backend has had a chance to update it)
        job_num = int(job_id[3:])
        xt_cmd = args["xt_cmd"]
        schedule = args["schedule"]
        concurrent = args["concurrent"]

        # this job property is used to ensure we don't exceed the specified # of
        # runs when using repeat_count on each node
        dynamic_runs_remaining = None if search_style == "single" else total_run_count
        node_count = len(runs_by_box)

        # static_runs_by_node = None
        # if schedule == "static":
        #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
        #console.diag("static_runs_by_node=", static_runs_by_node)

        active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

        dd = {"job_id": job_id, "job_num": job_num, "compute": compute,
            "ws_name": workspace, "exper_name": experiment, "pool_info": compute_def,
            "runs_by_box": runs_by_box, "primary_metric": args["primary_metric"],
            "run_count": total_run_count, "repeat": repeat_count,
            "search_type": args["search_type"], "username": args["username"],
            "hold": args["hold"], "started": utils.get_time(),
            "job_status": "submitted", "running_nodes": 0, "running_runs": 0,
            "error_runs": 0, "completed_runs": 0, "job_guid": job_guid,
            "job_secret": job_secret, "dynamic_runs_remaining": dynamic_runs_remaining,
            "search_style": search_style, "active_runs": active_runs,
            "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,
            "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count,
            "concurrent": concurrent, "service_job_info": None,
            "service_info_by_node": None}

        self.store.log_job_info(job_id, dd)

    # SUBMIT JOB
    # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated
    # with explicit args)
    service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs,
        workspace, pool_info, resume_name, repeat_count, using_hp, runs_by_box,
        experiment, snapshot_dir, adjustment_scripts, args)

    # POST SUBMIT processing
    # update job info
    if not fake_submit:
        dd["service_job_info"] = service_job_info
        dd["service_info_by_node"] = service_info_by_node
        self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

    # return values for API support (X)
    return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id

def build_data_frames(self):
    '''
    1. for each run, collect the reported metrics as metric sets (by reported col list)
    2. append to the dataframe for that col list
    '''
    # build "data_frames"
    no_metrics = []
    pp_run_names = []
    used_max = False
    data_frames_by_cols = {}
    got_columns = False

    for i, record in enumerate(self.run_log_records):
        # extract metrics for this run
        run = record["_id"]
        node = utils.node_id(record["node_index"])
        job = record["job_id"]
        experiment = record["exper_name"]
        workspace = record["ws"]

        search_style = utils.safe_value(record, "search_style")
        if search_style and search_style != "single":
            # parent run with children - skip it
            continue

        log_records = record["log_records"]
        metric_sets = run_helper.build_metrics_sets(log_records)
        if not metric_sets:
            no_metrics.append(run)
            continue

        if self.max_runs and len(pp_run_names) >= self.max_runs:
            used_max = True
            break

        if not got_columns:
            # set x and y columns
            explicit = qfe.get_explicit_options()
            if not "x" in explicit:
                self.x_col = self.get_actual_x_column(metric_sets, self.x_col,
                    self.col_names)

            if not self.col_names:
                # not specified by user, so build defaults
                self.col_names = self.get_default_y_columns(metric_sets, self.x_col)

            got_columns = True

        # merge metric sets into dfx
        for metric_set in metric_sets:
            # create a pandas DataFrame
            df = pd.DataFrame(metric_set["records"])
            cols = str(list(df.columns))

            # ensure this df has our x_col
            if self.x_col and not self.x_col in cols:
                continue

            # ensure this df has at least 1 y_col
            found_y = False
            for y in self.col_names:
                if y in cols:
                    found_y = True
                    break
            if not found_y:
                continue

            # add run/node/job/experiment/workspace columns
            df["run"] = [run] * df.shape[0]
            df["node"] = [node] * df.shape[0]
            df["job"] = [job] * df.shape[0]
            df["experiment"] = [experiment] * df.shape[0]
            df["workspace"] = [workspace] * df.shape[0]

            if not cols in data_frames_by_cols:
                data_frames_by_cols[cols] = df
            else:
                dfx = data_frames_by_cols[cols]
                # DataFrame.append() was removed in pandas 2.0; pd.concat is the equivalent
                dfx = pd.concat([dfx, df])
                data_frames_by_cols[cols] = dfx

        pp_run_names.append(run)

    if no_metrics:
        console.print("\nnote: following runs were skipped (currently have no "
            "logged metrics): \n   {}\n".format(", ".join(no_metrics)))

    if used_max:
        console.print("plotting first {} runs (use --max-runs to override)".format(
            self.max_runs))
    else:
        console.print("plotting {} runs...".format(len(pp_run_names)))

    # update our list of run_names to process
    self.run_names = pp_run_names

    return data_frames_by_cols

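# Hedged sketch of the grouping strategy above: runs that log the same column set
# are concatenated into a single DataFrame, keyed by the stringified column list.
import pandas as pd

def _example_group_frames_by_cols(frames):
    grouped = {}
    for df in frames:
        key = str(list(df.columns))
        grouped[key] = df if key not in grouped else pd.concat([grouped[key], df])
    return grouped

# frames logging ("step", "acc") end up in one group; frames logging
# ("step", "loss") end up in another.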