def make_early_term_policy(self, policy_type, eval_interval=1, delay_eval=0, truncation_percentage=.1,
        slack_factor=None, slack_amount=None):
    """Build an Azure ML Hyperdrive early-termination policy.

    args:
        policy_type: one of "bandit", "median", "truncation", "none"
        eval_interval: frequency (in intervals) at which to apply the policy
        delay_eval: number of intervals to delay the first policy evaluation
        truncation_percentage: percent of lowest runs to cancel ("truncation" only)
        slack_factor, slack_amount: allowed slack from the best run ("bandit" only)
    return:
        the constructed policy object
    raises:
        config error for an unrecognized policy_type
    """
    from azureml.train.hyperdrive import BanditPolicy, MedianStoppingPolicy, TruncationSelectionPolicy, NoTerminationPolicy

    if policy_type == "bandit":
        # BUG FIX: the AML keyword is 'delay_evaluation' (as used by the other
        # policies below); the old 'delay_eval=' keyword raised a TypeError
        policy = BanditPolicy(evaluation_interval=eval_interval, slack_factor=slack_factor,
            slack_amount=slack_amount, delay_evaluation=delay_eval)
    elif policy_type == "median":
        policy = MedianStoppingPolicy(evaluation_interval=eval_interval, delay_evaluation=delay_eval)
    elif policy_type == "truncation":
        policy = TruncationSelectionPolicy(truncation_percentage=truncation_percentage,
            evaluation_interval=eval_interval, delay_evaluation=delay_eval)
    elif policy_type == "none":
        policy = NoTerminationPolicy()
    else:
        errors.config_error("Unrecognized policy type=" + policy_type)

    return policy
def get_service_name(self):
    """Return the 'service' property of the current compute target definition."""
    if "service" not in self.compute_def:
        errors.config_error("missing 'service' property for xt config file compute target '{}'".format(self.compute))
    return self.compute_def["service"]
def create_hyperdrive_trainer(self, estimator, hd_dict, search_type, metric_name, maximize_metric,
        early_term_policy, max_total_runs, max_concurrent_runs, max_minutes):
    """Build an Azure ML HyperDriveConfig for the given estimator and search settings."""
    from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling, BayesianParameterSampling
    from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

    # map our search_type names onto the AML sampling classes
    samplers = {"random": RandomParameterSampling, "grid": GridParameterSampling,
        "bayesian": BayesianParameterSampling}

    if search_type not in samplers:
        errors.config_error("Azure ML Hyperdrive search_type not supported: " + search_type)

    ps = samplers[search_type](hd_dict)

    # never allow more concurrent runs than total runs
    concurrent_limit = min(max_total_runs, max_concurrent_runs)
    goal = PrimaryMetricGoal.MAXIMIZE if maximize_metric else PrimaryMetricGoal.MINIMIZE

    trainer = HyperDriveConfig(estimator=estimator, hyperparameter_sampling=ps, policy=early_term_policy,
        primary_metric_name=metric_name, primary_metric_goal=goal, max_total_runs=max_total_runs,
        max_concurrent_runs=concurrent_limit, max_duration_minutes=max_minutes)

    return trainer
def get_registry_creds(self, compute, env):
    """Return docker registry credentials for *env* (or for the docker environment
    configured on the *compute* target when env is not given).

    Returns None when no docker environment applies (env missing or "none").
    """
    registry_creds = None

    if not env:
        # fall back to the docker environment declared on the compute target
        compute_def = self.config.get_compute_def(compute)
        env = utils.safe_value(compute_def, "environment")

    if env and env != "none":
        env_def = self.config.get("dockers", env, default_value=None)
        if not env_def:
            errors.config_error("docker '{}' not found in config file".format(env))

        registry_name = env_def["registry"]

        # get REGISTRY credentials
        registry_creds = self.config.get("external-services", registry_name, suppress_warning=True)
        if not registry_creds:
            # BUG FIX: was a bare 'config_error' call (NameError at runtime);
            # route through the errors module like the rest of the file
            errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(registry_name))

    return registry_creds
def build_actual_store(self):
    """Create the Store instance from XT config settings, cache it on self, and return it."""
    console.diag("start of build_actual_store")

    # a username is required before any store access
    username = self.config.get("general", "username", suppress_warning=True)
    if not username:
        errors.config_error("'username' must be set in the [general] section of XT config file")

    # gather STORAGE and MONGO credentials from the config file
    storage_creds = self.config.get_storage_creds()
    mongo_creds, mongo_name = self.config.get_mongo_creds()

    cache_dir = self.config.get("general", "run-cache-dir")
    conn_str = mongo_creds["mongo-connection-string"]
    code_path = self.config.get_storage_provider_code_path(storage_creds)

    self.store = Store(storage_creds, provider_code_path=code_path, run_cache_dir=cache_dir,
        mongo_conn_str=conn_str)

    console.diag("end of build_actual_store")
    return self.store
def get_aml_ws(self, ws_name):
    """Return an Azure ML Workspace object for the named workspace, using the
    subscription/resource-group recorded in [external-services]."""
    creds = self.config.get("external-services", ws_name, suppress_warning=True)
    if not creds:
        errors.config_error("Azure ML workspace '{}' is not defined in [external-services] section of the XT config file".format(ws_name))

    sub_id = self.config.get_required_service_property(creds, "subscription-id", ws_name)
    res_group = self.config.get_required_service_property(creds, "resource-group", ws_name)

    # NOTE: service-principal authentication is not currently wired up;
    # Workspace uses the default interactive auth
    return Workspace(sub_id, res_group, ws_name)
def get_required_service_property(self, creds, prop_name, service_name):
    """Fetch prop_name from the creds dict, raising a config error when absent or empty."""
    prop_value = utils.safe_value(creds, prop_name)
    if not prop_value:
        errors.config_error("Missing '{}' property for service '{}' defined in [external-services] section of the XT config file".format(prop_name, service_name))
    return prop_value
def warning(self, *msg_args):
    """Report a config-file warning; escalates to a config error when the
    internal 'raise' flag is set."""
    pieces = ["WARNING: xt_config file -"] + [str(arg) for arg in msg_args]
    msg = " ".join(pieces)

    if self.get("internal", "raise", suppress_warning=True):
        errors.config_error(msg)
    else:
        console.print(msg)
def get_storage_provider_code_path(self, storage_creds):
    """Return the code path registered for the storage provider named in storage_creds."""
    provider_name = storage_creds["provider"]
    registered = self.get("providers", "storage")

    if provider_name not in registered:
        errors.config_error("{} provider='{}' not registered in XT config file".format("storage", provider_name))

    return registered[provider_name]
def get_provider_class_ctr(self, provider_type, name):
    ''' return the class constructor method for the specified provider. '''
    registered = self.get("providers", provider_type)

    if name not in registered:
        errors.config_error("{} not registered in XT config file".format(name))

    return utils.get_class_ctr(registered[name])
def get_external_service_from_target(self, target_name):
    """Look up the external service record that backs the given compute target."""
    target = self.get_compute_def(target_name)

    if "service" not in target:
        errors.config_error("'service' property must be defined for target={} in the XT config file".format(target))

    return self.get_service(target["service"])
def get_service(self, service_name):
    """Return the [external-services] entry for service_name, with its name injected."""
    entry = self.get("external-services", service_name, suppress_warning=True)

    if not entry:
        errors.config_error("'{}' must be defined in the [external-services] section of XT config file".format(service_name))

    entry["name"] = service_name
    return entry
def get_provider_code_path_from_context(context, provider_type, name):
    ''' return the class constructor method for the specified provider. '''
    registered = context.providers[provider_type]

    if name not in registered:
        errors.config_error("{} provider='{}' not registered in XT config file".format(provider_type, name))

    return registered[name]
def build_hyperdrive_dict_from_file(self, fn):
    ''' parse hyperdrive params from text file

    Each non-blank, non-comment line has the form "name = value", where value
    is either "@dist(v1, v2, ...)" (an explicit distribution) or a plain
    comma-separated list (treated as a "choice" distribution).  Returns a dict
    mapping param name -> distribution function instance.
    '''
    hd = {}

    with open(fn, "rt") as infile:
        text_lines = infile.readlines()

    for text in text_lines:
        text = text.strip()
        if not text or text.startswith("#"):
            # skip blank lines and full-line comments
            continue

        if "#" in text:
            # remove comment part of line
            # NOTE(review): this also truncates a "#" inside a quoted value — confirm values never contain "#"
            index = text.index("#")
            text = text[0:index].strip()

        # NOTE(review): split("=") raises if the value itself contains "=" — presumably values never do
        name, value = text.split("=")
        name = name.strip()
        value = value.strip()

        if value.startswith("@"):
            # explicit distribution, e.g. "@uniform(0.1, 0.9)"
            dist_name, values = value[1:].split("(")
            if not dist_name in utils.distribution_types:
                errors.config_error("Unsupported distribution type: " + dist_name)

            assert values.endswith(")")
            values = values[:-1]   # remove ending paren

            # convert from comma sep. string to list of float values
            values = utils.get_number_or_string_list_from_text(values)

            #hd[name] = self.make_distribution(dist_name, values)
            hd[name] = hp_helper.build_dist_func_instance(name, dist_name, values)
        else:
            # convert from comma sep. string to list of float values
            values = utils.get_number_or_string_list_from_text(value)

            # treat as "choice"
            #hd[name] = self.make_distribution("choice", values)
            hd[name] = hp_helper.build_dist_func_instance(name, "choice", values)

    return hd
def get_mongo_creds(self):
    """Return (mongo_creds, mongo_name) resolved from the XT config services."""
    # validate MONGO service
    mongo_name = self.get("xt-services", "mongo", suppress_warning=True)
    if not mongo_name:
        errors.config_error("'mongo' must be set in [xt-services] section of XT config file")

    # validate MONGO credentials
    mongo_creds = self.get("external-services", mongo_name, suppress_warning=True)
    if not mongo_creds:
        errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(mongo_name))

    return mongo_creds, mongo_name
def get_me_graph_property(self, token, property_name):
    """Query the Microsoft Graph /me endpoint with the given bearer token and
    return the requested property of the signed-in user."""
    import requests

    endpoint = "https://graph.microsoft.com/v1.0/me"
    headers = {'Authorization': 'Bearer ' + token}
    graph_data = requests.get(endpoint, headers=headers).json()

    if "error" in graph_data:
        err = graph_data["error"]
        errors.config_error("{}: {}".format(err["code"], err["message"]))

    return graph_data[property_name]
def cancel_remote_controller(self, box_name, progress):
    """Kill any running XT controller process on a remote box via ssh.

    args:
        box_name: name of the box in the [boxes] config section
        progress: callable taking a status-message string
    return:
        True if at least one remote process was killed, else False
    """
    # REMOTE BOX: check if controller is running
    box_addr = self.config.get("boxes", box_name, dict_key="address")
    if not box_addr:
        errors.config_error("missing address property for box: {}".format(box_name))

    # run PS on box to determine if controller is running
    box_cmd = "ps aux | grep controller"
    exit_code, output = process_utils.sync_run_ssh(self, box_addr, box_cmd)

    #console.print("result=\n", output)
    # keep only the python processes from the ps listing
    targets = [text for text in output.split("\n") if "python" in text]
    #console.print("targets=", targets)

    cancel_count = 0

    if len(targets):
        for target in targets:
            parts = target.split(" ")
            # remove empty strings
            parts = list(filter(None, parts))
            #console.print("parts=", parts)

            if len(parts) > 1:
                # in "ps aux" output the second column is the PID
                pid = parts[1].strip()

                # send "cancel" command to remote linux box
                box_cmd = 'kill -kill {}'.format(pid)
                progress(" killing remote process: {}".format(pid))
                process_utils.sync_run_ssh(self, box_addr, box_cmd, report_error=True)
                cancel_count += 1

    result = cancel_count > 0
    if not result:
        progress(" remote XT controller not running")
    return result
def get_service_type(self, service_name):
    """Return the type of the named service ("pool" is built in; all others
    come from the 'type' property of their [external-services] entry)."""
    if service_name == "pool":
        return "pool"

    entry = self.get("external-services", service_name, suppress_warning=True)
    if not entry:
        errors.config_error("'{}' must be defined in the [external-services] section of XT config file".format(service_name))

    if "type" not in entry:
        errors.config_error("'type' must be defined for the '{}' service in the XT config file".format(service_name))

    return entry["type"]
def get_compute_def(self, target_name):
    """Return the compute-target definition for target_name.  A bare box name
    is converted into an equivalent pool-style target definition."""
    target = self.get("compute-targets", target_name, suppress_warning=True)

    if not target:
        # maybe the name refers to a box rather than a compute target
        box_info = self.get("boxes", target_name, suppress_warning=True)
        if not box_info:
            errors.config_error("target '{}' must be defined in the [compute-targets] section of XT config file (or be box name)".format(target_name))

        # make box look like a target
        target = {"service": "pool", "boxes": [target_name]}

        # use setup from first box
        if "setup" in box_info:
            target["setup"] = box_info["setup"]

    target["name"] = target_name
    return target
def get_storage_creds(self):
    """Return the storage service credentials from the XT config, with the
    service name injected under "name"."""
    # validate STORAGE service
    storage_name = self.get("xt-services", "storage", suppress_warning=True)
    if not storage_name:
        errors.config_error("'storage' must be set in [xt-services] section of XT config file")

    # validate STORAGE_NAME credentials
    storage_creds = self.get("external-services", storage_name, suppress_warning=True)
    if not storage_creds:
        errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(storage_name))

    storage_creds["name"] = storage_name
    return storage_creds
def yaml_to_dist_dict(self, fn):
    '''
    args:
        fn: name of .yaml file
    processing:
        load data from .yaml file
    return:
        data
    '''
    yd = file_utils.load_yaml(fn)

    if constants.HPARAM_DIST not in yd:
        errors.config_error("hyperparmeter search file missing '{}' section: {}".format(constants.HPARAM_DIST, fn))

    hparams = yd[constants.HPARAM_DIST]

    # parse each hyperparameter value into a distribution spec
    return {key: hp_helper.parse_hp_dist(value) for key, value in hparams.items()}
def get(self, group, name=None, dict_key=None, default_value=None, suppress_warning=False, group_error=None,
        prop_error=None, key_error=None):
    """Look up a value in the config data by group, then (optionally) by
    property name, then (optionally) by a dict key within that property.

    At each level, a missing entry either raises a config error (when the
    corresponding *_error message is supplied) or warns (unless
    suppress_warning) and falls back to default_value.  String values (and
    string values inside a returned dict) that contain "$" are expanded via
    expand_system_symbols before being returned.
    """
    value = default_value

    if group in self.data:
        value = self.data[group]
        if name:
            if name in value:
                value = value[name]
                if dict_key:
                    if dict_key in value:
                        value = value[dict_key]
                    else:
                        # dict_key not found: error out or warn + fall back
                        if key_error:
                            errors.config_error(key_error)
                        if not suppress_warning:
                            self.warning("GET option dict_key not found: ", group, name, dict_key, default_value)
                        value = default_value
            else:
                # property not found in group: error out or warn + fall back
                if prop_error:
                    errors.config_error(prop_error)
                if not suppress_warning:
                    self.warning("GET option not found: ", group, name, dict_key, default_value)
                value = default_value
    else:
        # group itself not found: error out or warn + fall back
        if group_error:
            errors.config_error(group_error)
        if not suppress_warning:
            self.warning("GET option GROUP not found: ", group, name, dict_key, default_value)
        value = default_value

    # expand values containing a "$" id
    if isinstance(value, str) and "$" in value:
        value = self.expand_system_symbols(value, group, name)
    elif isinstance(value, dict):
        # NOTE: mutates the stored dict in place with expanded values
        for key, val in value.items():
            if isinstance(val, str) and "$" in val:
                val = self.expand_system_symbols(val, name, key)
                value[key] = val

    return value
def get_vault_url(self):
    """Return the URL of the configured vault service."""
    # validate VAULT service
    vault_name = self.get("xt-services", "vault", suppress_warning=True)
    if not vault_name:
        errors.config_error("'vault' property must be set in [xt-services] section of XT config file")

    # validate VAULT credentials
    vault_creds = self.get("external-services", vault_name, suppress_warning=True)
    if not vault_creds:
        errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(vault_name))

    if "url" not in vault_creds:
        errors.config_error("URL not specified for '{}' in [external-services] section of XT config file".format(vault_name))

    return vault_creds["url"]
def process_run_command(self, args):
    """Top-level driver for the XT 'run' command: pre-processes args, snapshots
    code, builds per-node runs, logs the job, and submits it to the backend.

    return:
        (cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id)
        or None for a dry run
    """
    self.args = args

    # ensure workspace exists
    workspace = args['workspace']
    dry_run = args['dry_run']
    fake_submit = args["fake_submit"]

    if not fake_submit:
        self.store.ensure_workspace_exists(workspace, flag_as_error=False)

    # PRE-PROCESS ARGS
    service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
        self.process_args(args)

    # create backend helper (pool, philly, batch, aml)
    cluster = utils.safe_value(compute_def, "cluster")
    vc = utils.safe_value(compute_def, "vc")
    self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

    # add conda_packages and pip_packages from SETUP to ARGS
    setup_def = self.config.get_setup_from_target_def(compute_def)
    conda_packages = utils.safe_value(setup_def, "conda-packages")
    pip_packages = utils.safe_value(setup_def, "pip-packages")

    args["conda_packages"] = conda_packages if conda_packages else []
    args["pip_packages"] = pip_packages if pip_packages else []

    self.adjust_pip_packages(args)

    snapshot_dir = self.temp_dir

    if fake_submit:
        script_dir = snapshot_dir
    else:
        # note: always create a snapshot dir for backends to add needed files
        file_utils.ensure_dir_deleted(snapshot_dir)
        script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

    self.script_dir = script_dir
    direct_run = args["direct_run"]

    # do we need to start the xt controller?
    use_controller = not direct_run
    adjustment_scripts = None

    # create a job_secret that can later be used to authenticate with the XT controller
    # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
    job_secret = str(uuid.uuid4())

    # do we need to build a "docker run" command?
    if not self.backend.provides_container_support():
        env = args["docker"]
        if not env:
            # NOTE(review): the docker cmd is only built when args["docker"] is NOT
            # already set; a user-supplied --docker value appears to be ignored
            # here — confirm this is intentional
            docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name     # for use in building run context info

    # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
    cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
        self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

    if dry_run:
        return

    # make new values available
    args["search_style"] = search_style
    args["total_run_count"] = total_run_count

    resume_name = args['resume_name']
    keep_name = False    # args['keep_name']
    experiment = args['experiment']
    is_distributed = args['distributed']
    direct_run = args["direct_run"]

    # CREATE JOB to hold all runs
    if fake_submit:
        # use lastrun/lastjob info to get a fast incremental fake job number
        xtd = xt_dict.read_xt_dict()
        fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
        xtd["fake_job_num"] = fake_job_num + 1
        xt_dict.write_xt_dict(xtd)

        job_id = "fake_job" + str(fake_job_num)
    else:
        job_id = self.store.create_job()

    fb.feedback(job_id)   # start the feedback (by parts)
    fb.feedback("{}: {}".format("target", compute))

    # write hparams to FILES
    boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

    if sweeps_text and not fake_submit:
        self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

    # if num_boxes > 1 and service_type != "batch":
    #     fb.feedback("", is_final=True)

    parent_name = None

    # BUILD RUNS, by box
    job_runs = []
    run_count = 1 if is_distributed else len(boxes)
    secrets_by_node = {}
    remote_control = args["remote_control"]

    for i in range(run_count):
        box_name = boxes[i]

        # generate a box secret for talking to XT controller for this node
        box_secret = str(uuid.uuid4()) if remote_control else ""

        # build runs for box_name
        run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam,
            run_specs, job_id, parent_name, cmds, pool_info, repeat_count, fake_submit, search_style,
            box_secret, args)

        # for now, adhere to the more general design of multiple runs per box
        box_runs = [run_data]
        job_runs.append(box_runs)

        node_id = utils.node_id(i)
        secrets_by_node[node_id] = box_secret

        # FEEDBACK
        ptype = "single " if search_style == "single" else "parent "
        if is_distributed:
            ptype = "master "

        if run_count == 1:
            node_msg = "creating {}run".format(ptype)
        else:
            node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

        if service_type == "pool":
            node_msg += ", box: " + box_name

        fb.feedback(node_msg, id="node_msg")  # , add_seperator=is_last)
        last_msg = node_msg

    # run the job
    # build box: runs dict for job info file
    runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

    # now that we have run names for all static run names for all nodes, we can adjust cmds (and before files) for using the controller
    if use_controller:
        # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
        # this will also adjust commands for each node to run the XT controller
        adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp,
            experiment, service_type, snapshot_dir, search_style, args=args)
    else:
        adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp,
            experiment, service_type, snapshot_dir, search_style, args=args)

    # add env vars used by both controller and runs
    env_vars = args["env_vars"]

    # create a job guid to uniquely identify this job across all XT instances
    job_guid = str(uuid.uuid4())

    # we add with "node0" and "job_secret", but backend service will override for each node
    scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

    # resolve "$scriptdir" in the local data/model paths to a real path
    data_local = args["data_local"]
    if "$scriptdir" in data_local:
        data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
        args["data_local"] = data_local

    model_local = args["model_local"]
    if "$scriptdir" in model_local:
        model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
        args["model_local"] = model_local

    # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
    self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

    # upload CODE from snapshot_dir
    code_upload = args["code_upload"]
    code_omit = args["code_omit"]
    code_zip = args["code_zip"]

    if not fake_submit:
        if code_upload:
            self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

    # upload DATA from data_local (do we need to keep this? should we upload to normal DATA location, vs. job?)
    data_upload = args["data_upload"]
    if data_upload:
        if not data_local:
            errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

        data_omit = args["data_omit"]
        data_zip = "none"

        self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)

    # dispatch to BACKEND submitters
    '''
    Note: backend submitter functions are responsible for:
        - submitting the job (for each node, queue runs for that node)
        - return service job id (or list of them if per node)

    NOTE: there is a timing issue where submitted job needs access to job info, but final piece
    of job info (service info) is only return after job is submitted.  Therefore, we structure steps as follows:
        - primary job info is logged
        - job is submitted thru backend
        - service info for job is logged
    '''

    # LOG PRIMARY JOB INFO
    dd = {}

    if not fake_submit:
        # mark runs as QUEUED
        for runs in runs_by_box.values():
            first_run = runs[0]
            self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"})

        # write the job info file (now that backend has had a chance to update it)
        job_num = int(job_id[3:])
        xt_cmd = args["xt_cmd"]
        schedule = args["schedule"]
        concurrent = args["concurrent"]

        # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
        dynamic_runs_remaining = None if search_style == "single" else total_run_count
        node_count = len(runs_by_box)

        # static_runs_by_node = None
        # if schedule == "static":
        #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
        #console.diag("static_runs_by_node=", static_runs_by_node)

        active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

        dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace,
            "exper_name": experiment, "pool_info": compute_def, "runs_by_box": runs_by_box,
            "primary_metric": args["primary_metric"], "run_count": total_run_count,
            "repeat": repeat_count, "search_type": args["search_type"], "username": args["username"],
            "hold": args["hold"], "started": utils.get_time(), "job_status": "submitted",
            "running_nodes": 0, "running_runs": 0, "error_runs": 0, "completed_runs": 0,
            "job_guid": job_guid, "job_secret": job_secret,
            "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,
            "active_runs": active_runs, "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,
            "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
            "service_job_info": None, "service_info_by_node": None,
        }

        self.store.log_job_info(job_id, dd)

    # SUBMIT JOB
    # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
    service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info,
        resume_name, repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

    # POST SUBMIT processing
    # update job info
    if not fake_submit:
        dd["service_job_info"] = service_job_info
        dd["service_info_by_node"] = service_info_by_node
        self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

    # return values for API support (X)
    return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id
def process_args(self, args):
    """Pre-process run-command args: build the command line for the target
    script and resolve the compute target / box into a compute definition.

    return:
        (service_type, cmd_parts, ps_path, parent_script, target_file,
         run_script, run_cmd_from_script, compute, compute_def)
    raises:
        syntax/env/config errors for an absolute script path, missing script
        file, or unknown target
    """
    run_script = None
    parent_script = None
    run_cmd_from_script = None
    target_file = args["script"]
    target_args = args["script_args"]
    code_upload = args["code_upload"]

    # user may have wrong slashes for this OS
    target_file = file_utils.fix_slashes(target_file)

    if os.path.isabs(target_file):
        errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

    is_rerun = "is_rerun" in args
    if is_rerun:
        # will be running from script dir, so remove any path to script file
        self.script_dir = os.path.dirname(target_file)
        target_file = os.path.basename(target_file)

    if target_file.endswith(".py"):
        # PYTHON target: run unbuffered so output streams promptly
        cmd_parts = ["python", "-u", target_file]
    else:
        cmd_parts = [target_file]

    if target_args:
        # split on unquoted spaces
        arg_parts = utils.cmd_split(target_args)
        cmd_parts += arg_parts

    if target_file == "docker":
        self.is_docker = True

    if not self.is_docker and code_upload and not os.path.exists(target_file):
        errors.env_error("script file not found: {}".format(target_file))

    ps_path = args["parent_script"]
    if ps_path:
        parent_script = file_utils.read_text_file(ps_path, as_lines=True)

    if target_file.endswith(".bat") or target_file.endswith(".sh"):
        # a RUN SCRIPT was specified as the target
        run_script = file_utils.read_text_file(target_file, as_lines=True)
        run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

    compute = args["target"]
    box_def = self.config.get("boxes", compute, suppress_warning=True)
    setup = utils.safe_value(box_def, "setup")

    compute_def = self.config.get_compute_def(compute)

    if compute_def:
        # must be defined in [compute-targets]
        if "service" not in compute_def:
            errors.config_error("compute target '{}' must define a 'service' property".format(compute))

        service = compute_def["service"]
        if service in ["local", "pool"]:
            # its a list of box names
            boxes = compute_def["boxes"]
            if len(boxes) == 1 and boxes[0] == "localhost":
                pool = None
                box = "local"
                service_type = "pool"
            else:
                pool = compute
                box = None
                service_type = "pool"
        else:
            # it a set of compute service properties
            pool = compute
            box = None
            service_name = compute_def["service"]
            service_type = self.config.get_service_type(service_name)
    elif box_def:
        # translate single box name to a compute_def
        box = compute
        pool = None
        service_type = "pool"
        # BUG FIX: the key was the *variable* 'setup' (so the entry landed under
        # e.g. None or the setup value); use the literal "setup" key, matching
        # the shape produced by config.get_compute_def()
        compute_def = {"service": service_type, "boxes": [box], "setup": setup}
    else:
        errors.config_error("unknown target or box: {}".format(compute))

    args["target"] = compute
    args["compute_def"] = compute_def
    args["service_type"] = service_type

    # for legacy code
    args["box"] = box
    args["pool"] = pool

    return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
        compute, compute_def
def build_docker_cmd(self, docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args):
    """Wrap the run command in a "docker run" command for the named docker
    environment, writing the container's env-var file into snapshot_dir.

    return:
        the new cmd_parts list for the docker invocation
    raises:
        config error when the docker environment or its registry creds are missing
    """
    for_windows = True

    docker_def = self.config.get("dockers", docker_name, default_value=None)
    if not docker_def:
        errors.config_error("docker '{}' not found in config file".format(docker_name))

    registry_name = docker_def["registry"]
    image = docker_def["image"]

    if registry_name:
        # get REGISTRY credentials
        registry_creds = self.config.get("external-services", registry_name, suppress_warning=True)
        if not registry_creds:
            # BUG FIX: was a bare 'config_error' call (NameError at runtime);
            # route through the errors module like the rest of the file
            errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(registry_name))

        login_server = registry_creds["login-server"]
    else:
        login_server = None

    #pwd = "%cd%" if for_windows else "$(pwd)"
    script_dir = file_utils.fix_slashes(script_dir, True)
    mappings = "-v {}:/usr/src".format(script_dir)
    options = "--rm"

    # collect env vars
    env_vars = {"XT_IN_DOCKER": 1, "XT_USERNAME": pc_utils.get_username()}
    scriptor.add_controller_env_vars(env_vars, self.config, job_secret, "node0")

    # fixup backslash char for target_file
    if ".py" in target_file:
        app = "python -u"
        #target_file = file_utils.fix_slashes(target_file, True)
        target_file = os.path.basename(target_file)
    else:
        app = target_file
        target_file = ""

    full_image = login_server + "/" + image if login_server else image

    # build a mapping for data?
    data_local = args["data_local"]
    if data_local:
        if "$scriptdir" in data_local:
            data_local = data_local.replace("$scriptdir", script_dir)
        data_local = os.path.realpath(data_local)
        mappings += " -v {}:/usr/data".format(data_local)
        env_vars["XT_DATA_DIR"] = "/usr/data"

    # write env vars to file in snapshot dir
    FN_EV = "__dockev__.txt"
    fn_env_var = os.path.join(snapshot_dir, FN_EV)
    lines = [name + "=" + str(value) for name, value in env_vars.items()]
    text = "\n".join(lines)
    file_utils.write_text_file(fn_env_var, text)

    # specify env var file (in current directory) to docker
    options += " --env-file={}".format(FN_EV)

    # inherit ENV VARS from running environment
    options += " -e XT_RUN_NAME -e XT_WORKSPACE_NAME -e XT_EXPERIMENT_NAME"

    docker_cmd = "docker run {} {} {} {} /usr/src/{}".format(options, mappings, full_image, app, target_file)
    new_parts = utils.cmd_split(docker_cmd)
    return new_parts
def submit_node_runs(self, job_id, node_runs, workspace, aml_ws_name, xt_exper_name, aml_exper_name,
        compute_def, resume_name, repeat_count, using_hp, compute, runs_by_box, code_dir, node_index,
        show_aml_run_name, nodes, args):
    """Submit the runs for a single node to Azure ML (optionally wrapped in a
    Hyperdrive trainer), record run info in mongo, and return the node_info dict.
    """
    first_run = node_runs[0]
    first_run_name = first_run["run_name"]
    fake_submit = args["fake_submit"]

    # this indicates we should make serializable versions of estimator and trainer
    # (currently always True; the 'or fake_submit' is vestigial)
    self.submit_logs = True or fake_submit    # must be true if we are using fake_submit
    self.serializable_estimator = None
    self.serializable_trainer = None

    box_name = first_run["box_name"]
    run_specs = first_run["run_specs"]
    cmd_parts = run_specs["cmd_parts"]
    target_fn = args["script"]
    node_id = "node" + str(node_index)

    # the run cmd must be of the form: python -u <script> [args...]
    assert cmd_parts[0] == "python"
    assert cmd_parts[1] == "-u"
    assert len(cmd_parts[2]) > 0

    # update the target_fn (might have been switched to the xt controller)
    target_fn = cmd_parts[2]
    arg_parts = cmd_parts[3:]

    # parse target's cmdline args
    arg_dict = {}
    for ap in arg_parts:
        # arg name can start with or without "-" here
        if "=" in ap:
            name, value = ap.split("=")
            if not value.startswith('"[') and not value.startswith('"@'):
                arg_dict[name] = value
        else:
            # for unspecified values
            arg_dict[ap] = 1

    compute_target = utils.safe_value(compute_def, "compute")
    if not compute_target:
        errors.config_error("'compute' property missing on compute target '{}' in XT config file".format(compute))

    estimator, experiment = self.create_estimator(job_id, workspace, aml_ws_name, xt_exper_name,
        aml_exper_name, first_run_name, code_dir, target_fn, arg_dict, compute_target, node_id, nodes,
        fake_submit, args)

    hp_config = args["hp_config"]
    direct_run = args["direct_run"]

    if using_hp and direct_run:
        # EXPERIMENT with hyperdrive
        max_runs = args["max_runs"]
        max_minutes = args["max_minutes"]
        policy_name = args["early_policy"]
        eval_interval = args["evaluation_interval"]
        delay_eval = args["delay_evaluation"]
        truncation_percentage = args["truncation_percentage"]
        slack_factor = args["slack_factor"]
        slack_amount = args["slack_amount"]
        primary_metric = args["primary_metric"]
        maximize_metric = args["maximize_metric"]
        search_type = args["search_type"]
        concurrent = args["concurrent"]
        max_concurrent_runs = nodes * concurrent

        if max_minutes <= 0:
            #max_minutes = 43200    # aml workaround: None not supported, either is -1 or 0, so use max value
            max_minutes = 10080     # aml workaround: documented max not supported

        # NOTE(review): 'hp_sets' is not defined anywhere in this function or
        # its parameters — this branch would raise NameError when reached;
        # presumably it should come from args. TODO confirm and fix.
        if hp_sets:
            hd_dict = self.build_hyperdrive_dict(hp_sets)
        else:
            hd_dict = self.build_hyperdrive_dict_from_file(hp_config)

        if not policy_name:
            # use default policy (not that same as no policy)
            early_term = None
        else:
            if self.submit_logs:
                early_term = {"policy_type": policy_name, "eval_interval": eval_interval,
                    "delay_eval": delay_eval, "truncation_percentage": truncation_percentage,
                    "slack_factor": slack_factor, "slack_amount": slack_amount}

                # NOTE(review): 'serializable_estimator' and 'serializable_early_term'
                # are undefined local names — likely intended to be
                # self.serializable_estimator and the 'early_term' dict above;
                # as written this raises NameError. TODO confirm and fix.
                self.serializable_trainer = {"estimator": serializable_estimator, "hd_dict": hd_dict,
                    "search_type": search_type, "primary_metric": primary_metric,
                    "maximize_metric": maximize_metric, "early_term": serializable_early_term,
                    "max_total_runs": max_runs, "max_concurrent_runs": max_concurrent_runs,
                    "max_minutes": max_minutes}

        if fake_submit:
            trainer = self.serializable_trainer
        else:
            early_term = self.make_early_term_policy(policy_type=policy_name, eval_interval=eval_interval,
                delay_eval=delay_eval, truncation_percentage=truncation_percentage,
                slack_factor=slack_factor, slack_amount=slack_amount)

            trainer = self.create_hyperdrive_trainer(estimator, hd_dict, search_type, primary_metric,
                maximize_metric, early_term, max_total_runs=max_runs,
                max_concurrent_runs=max_concurrent_runs, max_minutes=max_minutes)
    else:
        # not using AML hyperdrive
        trainer = estimator

    run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id = \
        self.run_aml_job(job_id, workspace, aml_ws_name, trainer, experiment, xt_exper_name,
            aml_exper_name, compute_target, code_dir, first_run_name, box_name, node_index,
            repeat_count, fake_submit, args)

    if show_aml_run_name:
        fb.feedback("[aml: {}/Run {}], xt: {}/{} ".format(aml_exper_name, aml_run_number, workspace, run_name), is_final=True)
    else:
        # NOTE(review): the format string has 2 placeholders but 4 args are
        # passed (extras are ignored by str.format) — TODO confirm intent
        fb.feedback("{}/{}".format(aml_exper_name, aml_run_number, workspace, run_name))

    mongo = self.store.get_mongo()

    run_names = []
    for run in node_runs:
        run_name = run["run_name"]
        run_names.append(run_name)

    node_info = {"ws": workspace}

    for run_name in run_names:
        # we only have 1 run, so OK to hold info in flat dict here
        node_info["aml_exper_name"] = aml_exper_name
        node_info["aml_run_number"] = aml_run_number
        node_info["aml_run_id"] = aml_run_id
        node_info["run_name"] = run_name

        # update mongo db info for run with cluster and service_job_id
        mongo.update_mongo_run_from_dict(workspace, run_name,
            {"aml_exper_name": aml_exper_name, "aml_run_number": aml_run_number})

    if monitor_cmd:
        console.print("monitoring notebook created; to run:")
        console.print("  " + monitor_cmd)

    return node_info
def create_estimator(self, job_id, workspace, aml_ws_name, xt_exper_name,
                     aml_exper_name, run_name, code_dir, target_fn, arg_dict,
                     compute_target, node_id, nodes, fake_submit, args):
    '''
    Build an Azure ML Estimator (or framework-specific subclass) plus the
    Experiment it will run under.

    :param job_id: XT job id, forwarded into the run's environment variables.
    :param workspace: XT workspace name.
    :param aml_ws_name: Azure ML workspace name.
    :param xt_exper_name: XT experiment name.
    :param aml_exper_name: Azure ML experiment name (required; config error if empty).
    :param run_name: XT run name for this node's run.
    :param code_dir: local directory whose contents are uploaded as the run's source.
    :param target_fn: entry script filename, relative to code_dir.
    :param arg_dict: script_params dict passed to the entry script.
    :param compute_target: name of the AML compute target, or "amlcompute".
    :param node_id: identifier for this node (used for env vars).
    :param nodes: total node count requested (not read here; node_count comes from args).
    :param fake_submit: if True, build lightweight dict stand-ins instead of real
        AML objects (for fast testing).
    :param args: flat dict of submit options (framework, distributed, vm_size, ...).
    :return: (estimator, experiment) tuple. Under fake_submit both are plain dicts.
    '''
    config = self.config

    if not aml_exper_name:
        errors.config_error(
            "experiment name must be specified (thru config file or command line option '--experiment')"
        )

    if fake_submit:
        # for speed of testing, avoid creating real Workspace, Experiment instances
        ws = {"name": aml_ws_name}
        experiment = {"ws": ws, "name": aml_exper_name}
    else:
        ws = self.get_aml_ws(aml_ws_name)
        experiment = Experiment(ws, name=aml_exper_name)

    # resolve the compute target: the literal string "amlcompute" means
    # on-demand compute; otherwise look up a named target in the workspace
    if compute_target == "amlcompute":
        actual_target = "amlcompute"    # AmlCompute(ws, None)
    elif fake_submit:
        actual_target = "amlcompute"
    else:
        if compute_target not in ws.compute_targets:
            errors.config_error(
                "compute target '{}' does not exist in AML workspace '{}'"
                .format(compute_target, aml_ws_name))
        actual_target = ws.compute_targets[compute_target]

    # build ENV VARS
    store_creds = self.config.get_storage_creds()
    # store_name = store_creds["name"]
    # store_key = store_creds["key"]

    # NOTE(review): the next two lookups' results are never used below;
    # the calls are kept because they may validate config (and raise) — confirm.
    provider_code_path = config.get_storage_provider_code_path(store_creds)
    mongo_creds, mongo_name = self.config.get_mongo_creds()
    mongo_conn_str = mongo_creds["mongo-connection-string"]

    username = args["username"]
    description = args["description"]
    aggregate_dest = args["aggregate_dest"]

    env_vars = self.build_env_vars(workspace, aml_ws_name, xt_exper_name,
                                   aml_exper_name, run_name, job_id=job_id,
                                   compute_target=compute_target,
                                   username=username, description=description,
                                   aggregate_dest=aggregate_dest,
                                   node_id=node_id, args=args)

    framework = args["framework"].lower()
    is_distributed = args["distributed"]
    dist_training = args["distributed_training"].lower()

    # import here (not at file top) so the azureml SDK is only required
    # when an estimator is actually built
    from azureml.train.estimator import Estimator, Mpi, Gloo, Nccl
    from azureml.train.dnn import PyTorch, Chainer, TensorFlow

    # map config strings to estimator / distributed-backend constructors
    fw_dict = {"pytorch": PyTorch, "tensorflow": TensorFlow,
               "chainer": Chainer, "estimator": Estimator}
    dt_dict = {"mpi": Mpi, "gloo": Gloo, "nccl": Nccl}

    if framework not in fw_dict:
        # was: errors.user_config_errorerror (typo'd, non-existent attribute that
        # would raise AttributeError and mask the real config error)
        errors.config_error(
            "framework must be set to 'pytorch', 'tensorflow', 'chainer', or 'estimator'"
        )
    estimator_ctr = fw_dict[framework]

    if is_distributed:
        if dist_training not in dt_dict:
            errors.config_error(
                "distributed-training must be set to 'mpi', 'gloo', or 'nccl'"
            )
        distributed_ctr = dt_dict[dist_training]
        distributed_obj = distributed_ctr()
    else:
        distributed_obj = None

    compute_def = args["compute_def"]
    direct_run = args["direct_run"]

    if direct_run:
        # relying on AML for full control (not using XT controller)
        node_count = utils.safe_value(compute_def, "nodes")

        # did cmd line overwrite nodes?
        if args["nodes"]:
            node_count = args["nodes"]

        if node_count is None:
            errors.config_error(
                "must specify 'nodes' property for Azure ML service '{}' in XT config file or as --nodes option in cmd line"
                .format(args["target"]))
    else:
        # run as separate AML runs, each with a single node
        node_count = 1

    vm_size = args["vm_size"]
    conda_packages = args["conda_packages"]
    pip_packages = args["pip_packages"]
    use_gpu = args["use_gpu"]
    framework_version = args["fw_version"]
    max_secs = args["max_seconds"]
    user_managed = args["user_managed"]

    activate_cmd = self.get_activate_cmd()
    if activate_cmd:
        # we have no way of running this on AML before conda_packages and
        # pip_packages are installed (or used to build a docker image)
        errors.config_error(
            "setup.activate property cannot be specified for AML targets")

    #max_secs = 10080 if max_secs <= 0 else max_secs

    # decide whether to run in a docker container, based on the target's
    # "docker" property resolving to a docker-type environment definition
    use_docker = False
    environment_name = utils.safe_value(compute_def, "docker")
    if environment_name:
        environment_def = self.config.get_docker_def(environment_name)
        if environment_def:
            use_docker = (environment_def["type"] == "docker")

    # workaround AML warning
    if not use_docker:
        use_docker = None

    if self.submit_logs:
        # for testing (this should match exact args used in estimator ctr below)
        # NOTE(review): fake_submit below reads self.serializable_estimator,
        # which is only set here when submit_logs is True — confirm fake_submit
        # implies submit_logs in the callers.
        self.serializable_estimator = {
            "source_directory": code_dir, "script_params": arg_dict,
            "compute_target": actual_target, "vm_size": vm_size,
            "entry_script": target_fn, "conda_packages": conda_packages,
            "pip_packages": pip_packages, "use_gpu": use_gpu,
            "use_docker": use_docker, "framework_version": framework_version,
            "user_managed": user_managed, "environment_variables": env_vars,
            "node_count": node_count, "distributed_training": {},
            "max_run_duration_seconds": max_secs
        }

    if fake_submit:
        estimator = self.serializable_estimator
    else:
        estimator = estimator_ctr(source_directory=code_dir,
                                  script_params=arg_dict,
                                  compute_target=actual_target,
                                  vm_size=vm_size,
                                  entry_script=target_fn,
                                  conda_packages=conda_packages,
                                  pip_packages=pip_packages,
                                  use_gpu=use_gpu,
                                  use_docker=use_docker,
                                  framework_version=framework_version,
                                  user_managed=user_managed,
                                  environment_variables=env_vars,
                                  node_count=node_count,
                                  distributed_training=distributed_obj,
                                  max_run_duration_seconds=max_secs)

    return estimator, experiment