def cancel_runs_by_user(self, box_name):
    '''
    Args:
        box_name: the name of the box the runs ran on (pool service)
    Returns:
        cancel_results: a list of kill result records
            (keys: workspace, run_name, exper_name, killed, status, before_status)
    '''
    cancel_results = []

    # get list of active jobs from batch
    active_jobs = self.get_active_jobs()
    console.diag("after get_active_jobs()")

    if active_jobs:
        for job_record in active_jobs:
            # watch out for older jobs that didn't have service_job_info/service_info_by_node properties
            service_job_info = utils.safe_value(job_record, "service_job_info")
            service_info_by_node = utils.safe_value(job_record, "service_info_by_node")

            if service_job_info and service_info_by_node:
                job_id = job_record["job_id"]
                cancel_result = self.cancel_job(service_job_info, service_info_by_node)

                for _, node_result in cancel_result.items():
                    cancel_results.append(node_result)

    return cancel_results
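# For reference, a minimal sketch of consuming the records returned above; the
# "backend" instance and box name are illustrative, and field access follows the docstring.
results = backend.cancel_runs_by_user("vm23")
for rec in results:
    print("{}/{}: killed={}, status={}".format(
        rec["workspace"], rec["run_name"], rec["killed"], rec["status"]))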
def download_files(self, wildcard, dest_folder):
    container, path, wc_target = self._get_container_path_target(wildcard)
    console.diag("container={}, path={}, wc_target={}, wildcard={}".format(
        container, path, wc_target, wildcard))

    return self.store._download_files(container, path, wc_target, dest_folder)
def get_info_for_jobs(self, filter_dict, fields_dict=None):
    cursor = self.mongo_with_retries("get_info_for_jobs",
        lambda: self.mongo_db["__jobs__"].find(filter_dict, fields_dict))

    job_records = list(cursor) if cursor else []
    console.diag("after get_info_for_jobs()")

    return job_records
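# For reference, filter_dict and fields_dict pass straight through to pymongo's
# find(filter, projection); a hedged example (the "mongo" instance and field names are illustrative):
jobs = mongo.get_info_for_jobs(
    filter_dict={"username": "ada", "status": {"$in": ["running", "queued"]}},
    fields_dict={"job_id": 1, "status": 1, "_id": 0})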
def copy_data_to_submit_logs(args, data, fn):
    submit_logs = args["submit_logs"]
    if submit_logs:
        text = json.dumps(data)

        # copy text to submit logs
        fn_dest = os.path.join(submit_logs, os.path.basename(fn))
        with open(fn_dest, "w") as outfile:
            outfile.write(text)

        console.diag("copied {} to: {}".format(fn, fn_dest))
def copy_to_submit_logs(args, fn, fnx=None):
    submit_logs = args["submit_logs"]
    if submit_logs:
        # copy file to submit logs (fnx, when given, overrides the destination name)
        if not fnx:
            fnx = fn

        fn_dest = os.path.join(submit_logs, os.path.basename(fnx))
        shutil.copyfile(fn, fn_dest)
        console.diag("copied {} to: {}".format(fn, fn_dest))
def get_filtered_sorted_limit_runs(store, config, show_gathering, col_dict=None, args=None):
    console.diag("start of: get_filtered_sorted_limit_runs")

    # required
    run_list = args["run_list"]

    # optional
    pool = utils.safe_value(args, "target")
    available = utils.safe_value(args, "available")
    workspace = utils.safe_value(args, "workspace")

    if workspace:
        store.ensure_workspace_exists(workspace, flag_as_error=True)

    mongo = store.get_mongo()

    # have MONGO update any old RUN documents to new format
    fixup_mongo_runs.fixup_runs_if_needed(mongo.mongo_db, workspace)

    # get info about run properties
    user_to_actual, std_cols_desc = get_run_property_dicts()
    actual_to_user = {value: key for key, value in user_to_actual.items()}

    builder = ReportBuilder(config, store, client=None)

    # get list of specified runs
    pure_run_list, actual_ws = expand_run_list(store, mongo, workspace, run_list)
    if run_list and not pure_run_list:
        errors.general_error("no run(s) found")

    # build a filter dict for all specified filters
    filter_dict = build_run_filter_dict(pure_run_list, user_to_actual, builder, args)

    # if show_gathering:
    #     console.print("gathering run data...", flush=True)

    # get the mongo records for the matching RUNS
    records, using_default_last, last = builder.get_mongo_records(mongo, filter_dict,
        workspace, "runs", actual_to_user, col_dict=col_dict, args=args)

    console.diag("end of: get_filtered_sorted_limit_runs")

    return records, using_default_last, user_to_actual, available, builder, last, std_cols_desc
def make_local_snapshot(self, snapshot_dir, code_dir, dest_name, omit_list):
    '''
    keep code simple (and BEFORE upload fast):
        - always copy code dir to temp dir
        - if needed, copy xtlib subdir
        - later: if needed, add 2 extra controller files
        - later: zip the whole thing at once & upload
    '''
    if dest_name and dest_name != ".":
        snapshot_dir += "/" + dest_name

    console.diag("before create local snapshot")

    # fixup slashes for good comparison
    snapshot_dir = os.path.realpath(snapshot_dir)

    # fully qualify path to code_dir for simpler code & more informative logging
    code_dir = os.path.realpath(code_dir)

    recursive = True
    if code_dir.endswith("**"):
        code_dir = code_dir[:-2]   # drop the **
    elif code_dir.endswith("*"):
        recursive = False

    # copy user's source dir (as per config file options)
    omit_list = utils.parse_list_option_value(omit_list)

    # build list of files matching both criteria
    filenames = file_helper.get_filenames_from_include_lists(None, omit_list,
        recursive=recursive, from_dir=code_dir)

    file_utils.ensure_dir_exists(snapshot_dir)
    prefix_len = 2 if code_dir == "." else len(code_dir)
    copy_count = 0

    # copy files recursively, preserving subdir names
    for fn in filenames:
        fn = os.path.realpath(fn)   # fix slashes

        if fn.startswith(code_dir) and fn != code_dir:
            fn_dest = snapshot_dir + "/" + fn[prefix_len:]
            file_utils.ensure_dir_exists(file=fn_dest)
            shutil.copyfile(fn, fn_dest)
        else:
            shutil.copy(fn, snapshot_dir)

        copy_count += 1

    #console.diag("after snapshot copy of {} files".format(copy_count))

    return snapshot_dir
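# A hedged usage sketch of make_local_snapshot; the "runner" instance, directory
# names, and omit list are illustrative (a "**" suffix requests a recursive copy, per the code above).
import tempfile

temp_dir = tempfile.mkdtemp(prefix="xt_snapshot_")
snapshot = runner.make_local_snapshot(temp_dir, "./src/**", ".", "logs, checkpoints")
print("snapshot written to:", snapshot)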
def get_info_for_runs(self, ws_name, filter_dict, fields_dict=None):
    # filter_dict = {}
    # filter_dict["run_name"] = {"$in": run_names}

    cursor = self.mongo_with_retries("get_info_for_runs",
        lambda: self.mongo_db[ws_name].find(filter_dict, fields_dict))

    run_records = list(cursor) if cursor else []
    console.diag("after get_info_for_runs()")

    return run_records
def get_all_experiments_in_ws(self, ws_name):
    # cannot get "distinct" command to work ("command not supported")
    #cursor = db["__jobs__"].distinct("ws_name")

    cursor = self.mongo_with_retries("get_all_experiments_in_ws",
        lambda: self.mongo_db["__jobs__"].find({"ws_name": ws_name}, {"exper_name": 1}))

    exper_names = [rec["exper_name"] for rec in cursor if "exper_name" in rec]
    exper_names = list(set(exper_names))   # remove dups

    console.diag("after get_all_experiments_in_ws()")

    return exper_names
def run_aml_job(self, job_id, workspace, aml_ws_name, trainer, experiment, xt_exper_name,
        aml_exper_name, compute_target, cwd, run_name, box_name, node_index, repeat,
        fake_submit, args):
    monitor_cmd = None

    console.diag("before AML experiment.submit(trainer)")

    # SUBMIT the run and return an AML run object
    if fake_submit:
        aml_run = None
        aml_run_id = "fake_aml_id"
        aml_run_number = 999
    else:
        aml_run = experiment.submit(trainer)
        aml_run_id = aml_run.id
        aml_run_number = aml_run.number

    # copy to submit-logs
    utils.copy_data_to_submit_logs(args, self.serializable_trainer, "aml_submit.json")

    console.diag("after AML experiment.submit(trainer)")

    config = self.config
    username = args["username"]
    description = args["description"]
    aggregate_dest = args["aggregate_dest"]
    jupyter_monitor = args["jupyter_monitor"]

    aml_run_name = aml_exper_name + ".{}".format(run_name)

    # set "xt_run_name" property for fast access to run in future
    if not fake_submit:
        aml_run.add_properties({"xt_run_name": aml_run_name})
        aml_run.set_tags({"xt_run_name": aml_run_name})

    # # partially log the start of the RUN
    # self.store.start_run_core(workspace, run_name, exper_name=xt_exper_name, description=description, username=username,
    #     box_name=box_name, app_name=None, repeat=repeat, is_parent=False, job_id=job_id, pool=compute_target, node_index=node_index,
    #     aggregate_dest=aggregate_dest, path=cwd, aml_run_id=aml_run_id)

    if jupyter_monitor:
        fn = self.make_monitor_notebook(aml_ws_name, aml_run_name)
        dir_name = os.path.dirname(fn)
        #console.print("jupyter notebook written to: " + fn)
        monitor_cmd = "jupyter notebook --notebook-dir=" + dir_name

    return run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id
def _read_blob(self, ws_name, blob_path):
    console.diag("_read_blob: ws_name={}, blob_path={}".format(ws_name, blob_path))

    if not self.does_workspace_exist(ws_name):
        # avoid 10 retries and unfriendly storage errors
        errors.store_error("container doesn't exist: " + ws_name)

    if not self.provider.does_blob_exist(ws_name, blob_path):
        # avoid 10 retries and unfriendly storage errors
        errors.store_error("blob doesn't exist: container={}, path={}".format(ws_name, blob_path))

    blob_text = self.provider.get_blob_text(ws_name, blob_path)
    return blob_text
def parse_string_list(self, tok, scanner, pipe_objects_enabled=True):
    global pipe_object_list
    #print("parse_string_list, tok=", tok)

    if not tok:
        # empty string specified
        value = []
        tok = scanner.scan()   # skip over the empty string
    elif tok == "$":
        if pipe_objects_enabled:
            pipe_object_list = get_xt_objects_from_cmd_piping()
            console.diag("pipe_object_list: {}".format(pipe_object_list))

        if pipe_objects_enabled and pipe_object_list:
            #print("found '$', pipe_object_list=", pipe_object_list)
            value = pipe_object_list
            console.print("replacing '$' with: ", value)
        else:
            errors.combo_error("'$' can only be used for piping the output of a previous XT command into this run")

        # mark pipe objects as having been consumed by this parsing
        pipe_object_list = None
        tok = scanner.scan()   # skip over the $
    else:
        # scan a comma-separated list of tokens (some of which can be single-quoted strings)
        value = []

        while tok is not None:
            if tok.startswith("--"):
                break

            ev = self.expand_system_values(tok)
            value.append(ev)

            tok = scanner.scan()
            if tok != ",":
                break
            tok = scanner.scan()   # skip over the comma

    return value, tok
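# By way of example (assuming Scanner tokenizes as above): given the input
#     a, b, 'c d' --target=aml
# the loop collects ['a', 'b', 'c d'] and returns with '--target=aml' as the pending token.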
def create_vault_if_needed(self):
    if not self.vault:
        console.diag("before vault login")

        from xtlib import xt_vault

        # create our vault manager
        vault_url = self.get_vault_url()
        team_name = self.get("general", "xt-team-name")
        azure_tenant_id = self.get("general", "azure-tenant-id")
        self.vault = xt_vault.XTVault(vault_url, team_name, azure_tenant_id=azure_tenant_id)

        authentication = self.get("general", "authentication")
        self.vault.init_creds(authentication)

        console.diag("after vault login")
def get_next_sequential_ws_id(self, ws_name, path, default_next_run):
    db = self.mongo_db

    assert "/" not in ws_name
    assert "/" not in path

    console.diag("ws={}, path={}, default_next_run={}".format(ws_name, path, default_next_run))

    # does a counters doc exist for this ws_name?
    cursor = db.ws_counters.find({"_id": ws_name}).limit(1)
    if not cursor.count():
        console.diag("LEGACY ws={} found in get_next_sequential_ws_id".format(ws_name))

        # we need BOTH next_run and next_end for a new record
        last_id = self.get_legacy_end_id(ws_name)
        default_next_end = 1 + last_id if last_id else 1

        info = {"_id": ws_name, "next_run": default_next_run, "next_end": default_next_end, "next_child": {}}
        db.ws_counters.insert_one(info)

    document = db.ws_counters.find_and_modify({"_id": ws_name}, update={"$inc": {path: 1}}, new=False)
    next_id = utils.safe_nested_value(document, path)

    if not next_id:
        # child ids start at 0; if we got that, skip it and get the next one
        document = db.ws_counters.find_and_modify({"_id": ws_name}, update={"$inc": {path: 1}}, new=False)
        next_id = utils.safe_nested_value(document, path)

    return next_id
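# The counter logic above depends on MongoDB's atomic "$inc"; a minimal sketch of
# the same pattern using the modern pymongo API (find_one_and_update replaced the
# deprecated find_and_modify). Names here are illustrative, not part of XT.
import pymongo

def next_sequential_id(db, ws_name, path="next_run", first_id=1):
    # seed the counter document on first use (no-op if it already exists)
    db.ws_counters.update_one({"_id": ws_name}, {"$setOnInsert": {path: first_id}}, upsert=True)

    # atomically claim the current value and advance the counter;
    # ReturnDocument.BEFORE mirrors find_and_modify(..., new=False) above
    doc = db.ws_counters.find_one_and_update({"_id": ws_name}, {"$inc": {path: 1}},
        return_document=pymongo.ReturnDocument.BEFORE)
    return doc[path]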
def download_runs(self, store, ws_name, run_group_name, run_group_type, hp_config_cloud_path, hp_config_local_dir):
    # download the all_runs file
    local_cache_path = "{}/{}/{}/".format(hp_config_local_dir, ws_name, run_group_type)
    local_config_file_path = "{}{}".format(local_cache_path, "hp-config.yaml")

    if run_group_name == "experiment":
        console.print("downloading runs for EXPERIMENT={}...".format(run_group_type))

        # files are at EXPERIMENT LEVEL
        # read SWEEPS file
        if not store.does_experiment_file_exist(ws_name, run_group_type, hp_config_cloud_path):
            errors.store_error("missing experiment hp_config file (ws={}, exper={}, fn={})".format(
                ws_name, run_group_type, hp_config_cloud_path))

        store.download_file_from_experiment(ws_name, run_group_type, hp_config_cloud_path, local_config_file_path)

        # read ALLRUNS info aggregated in EXPERIMENT
        allrun_records = store.get_all_runs(run_group_name, ws_name, run_group_type)
    else:
        console.print("downloading runs for JOB={}...".format(run_group_type))

        # files are at JOB LEVEL
        # read SWEEPS file
        if not store.does_job_file_exist(run_group_type, hp_config_cloud_path):
            errors.store_error("missing job hp_config file (job={}, fn={})".format(
                run_group_type, hp_config_cloud_path))

        store.download_file_from_job(run_group_type, hp_config_cloud_path, local_config_file_path)

        # read ALLRUNS info aggregated in JOB
        allrun_records = store.get_all_runs(run_group_name, ws_name, run_group_type)

    console.diag("after downloading all runs")

    return local_config_file_path, allrun_records
def remove_workspace(self, ws_name):
    self.remove_cache(ws_name)

    # remove associated mongo_db container
    container = self.mongo_db[ws_name]
    container.drop()

    count = container.count()
    console.diag("after mongo_db container={} dropped, count={}".format(container, count))

    # remove counters for this workspace
    cmd = lambda: self.mongo_db.ws_counters.remove({"_id": ws_name})
    self.mongo_with_retries("remove_workspace", cmd, ignore_error=True)

    # remove legacy counters for this workspace
    end_id = ws_name + "-end_id"
    cmd = lambda: self.mongo_db.ws_counters.remove({"_id": end_id})
    self.mongo_with_retries("remove_workspace", cmd)
def _list_directories(self, container, path, wc_target, subdirs=0):
    console.diag("_list_directories: container={}, path={}, wc_target={}, subdirs={}".format(
        container, path, wc_target, subdirs))

    service_name = self.provider.get_service_name()
    dd = {"store_name": "XT Store ({})".format(service_name)}
    #console.print("dd=", dd)

    if not container:
        # getting a list of all containers is a special case
        if path:
            errors.syntax_error("path cannot be set when the container is set to '/'")

        folder, folder_names = self._get_root_folders()
        folders = [folder]

        if subdirs:
            base_path = ""

            for ws_name in folder_names:
                # get blobs from AZURE
                console.diag("reading blobs for ws={}".format(ws_name))
                blobs = self.provider.list_blobs(ws_name, path=None, return_names=False)
                blobs = list(blobs)

                ws_folders = self._build_folders_from_blobs(blobs, ws_name, base_path, subdirs)
                folders += ws_folders
    else:
        # get blobs from AZURE
        actual_path = path if path else None
        blobs = self.provider.list_blobs(container, path=actual_path, return_names=False)
        blobs = list(blobs)

        if wc_target:
            # apply filter
            blobs = [blob for blob in blobs if fnmatch(blob.name, wc_target)]

        console.diag("list_blobs returned: len(blobs)={}".format(len(blobs)))

        folders = self._build_folders_from_blobs(blobs, container, path, subdirs)

    # filter folders as per subdirs
    if subdirs is not True:
        # subdirs is set to an int value
        #console.print("filtering by subdirs=", subdirs)
        folders = [f for f in folders if f["level"] <= subdirs]

    dd["folders"] = folders
    return dd
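# The wildcard filter above uses fnmatch glob semantics (not regexes); a small
# self-contained illustration:
from fnmatch import fnmatch

blob_names = ["runs/run23/output.txt", "runs/run23/model.pt", "runs/run24/output.txt"]

# '*' matches any span of characters (including '/'), '?' matches a single character
matches = [name for name in blob_names if fnmatch(name, "runs/run2?/*.txt")]
print(matches)   # ['runs/run23/output.txt', 'runs/run24/output.txt']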
def cancel_run(self, ws_name, run_name):
    console.diag("start of azure_ml.cancel_run()")

    target_run = self.get_run(ws_name, run_name)
    if not target_run:
        errors.store_error("run not found: {}".format(run_name))

    console.diag("after get_run() call")

    before_status = target_run.status.lower()
    if before_status in ["preparing", "queued", "starting", "running"]:
        target_run.cancel()
        killed = True
        status = "cancelled"
    else:
        killed = False
        status = target_run.status

    console.diag("after run.cancel() call")

    return {"workspace": ws_name, "run_name": run_name, "cancelled": killed, "status": status}
def is_controller_running(self, box_name, box_addr, port=constants.CONTROLLER_PORT):
    if not port:
        port = constants.CONTROLLER_PORT

    # KISS: just try to connect
    is_running = False

    try:
        ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
        console.diag("  trying to connect with: ip_addr={}, port={}".format(ip_addr, port))

        self.connect(box_name, ip_addr, port=port)
        is_running = True
    except BaseException as ex:
        console.diag("  received exception: " + str(ex))
        is_running = False
        #raise ex   # uncomment to see the stack trace

    console.diag("  is_controller_running: " + str(is_running))
    return is_running
def snapshot_all_code(self, snapshot_dir, cmd_parts, args):
    ''' make a local snapshot of each code_dir (and xtlib, if needed) '''
    code_dirs = args["code_dirs"]
    xtlib_capture = args["xtlib_upload"]
    code_omit = args["code_omit"]
    script_dir = None
    code_upload = args["code_upload"]

    # this step should always be done so that script_dir is removed from cmd_parts
    script_dir = self.remove_script_dir_from_parts(cmd_parts)

    if code_upload:
        for i, code_dir in enumerate(code_dirs):
            # fixup "$scriptdir" relative paths
            if "$scriptdir" in code_dir:
                code_dir = code_dir.replace("$scriptdir", script_dir)

            if "::" in code_dir:
                code_dir, dest_dir = code_dir.split("::")
            else:
                dest_dir = "."

            self.make_local_snapshot(snapshot_dir, code_dir, dest_dir, code_omit)
    else:
        script_dir = snapshot_dir

    if xtlib_capture:
        # copy XTLIB directory to "xtlib" subdir of temp
        xtlib_dir = file_utils.get_xtlib_dir()
        dest_dir = snapshot_dir + "/xtlib"
        file_utils.ensure_dir_deleted(dest_dir)

        # don't copy the "demo_files" directory
        shutil.copytree(xtlib_dir, dest_dir, ignore=shutil.ignore_patterns("demo_files"))

    console.diag("after create local snapshot")
    return script_dir
def import_jobs_to_mongo_if_needed(self, mongo):
    console.diag("before mongo import check")
    found = mongo.does_jobs_exist()
    console.diag("after mongo import check")

    if not found:
        # first time we have seen this data; import all jobs into mongo-db now
        console.print("one-time import of jobs data into mongo-db:")

        job_names = self.get_job_names()
        if job_names:
            console.print("  {:,} jobs will be imported".format(len(job_names)))
            count = 0

            for job_id in job_names:
                job_json = self.read_job_info_file(job_id)
                dd = json.loads(job_json)
                mongo.update_job_info(job_id, dd)

                count += 1
                if count % 100 == 0:
                    console.print("  " + job_id)

            console.print("  {} jobs imported".format(count))
def _download_files(self, container, path, wc_target, dest_folder):
    #console.print("ws_name=", ws_name, ", ws_wildcard=", ws_wildcard)
    files_copied = []

    names = self._list_wild_blobs(container, path, wc_target, include_folder_names=True)
    console.diag("_download_files: names=", names)

    blob_dir = path
    bd_index = 1 + len(blob_dir)   # add 1 for the trailing slash
    #console.print("blob_dir=", blob_dir, ", bd_index=", bd_index)

    for bn in names:
        base_bn = bn[bd_index:]
        dest_fn = dest_folder + "/" + base_bn
        console.detail("_download_files: bn=", bn, ", dest_fn=", dest_fn)

        file_utils.ensure_dir_exists(file=dest_fn)
        self.provider.get_blob_to_path(container, bn, dest_fn)
        files_copied.append(dest_fn)

    return files_copied
def connect_to_controller(self, box_name=None, ip_addr=None, port=None):
    '''
    establish communication with the XT controller process on the specified box.
    return True if connection established, False otherwise.
    '''
    connected = False
    console.diag("connect_to_controller: box_name={}".format(box_name))

    if self.conn == box_name:
        connected = True
    else:
        if ip_addr:
            box_addr = ip_addr
        else:
            info = box_information.get_box_addr(self.config, box_name, self.store)
            box_addr = info["box_addr"]
            controller_port = info["controller_port"]
            self.token = info["box_secret"]

            ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
            port = controller_port if controller_port else constants.CONTROLLER_PORT

        # the controller should now be running - try to connect
        try:
            console.diag("  connecting to controller")
            self.connect(box_name, ip_addr, port=port)
            console.diag("  connection successful!")

            # magic step: allows our callback to work correctly!
            # this must always be executed (even if self.conn is already true)
            bgsrv = rpyc.BgServingThread(self.conn)
            console.diag("  now running BgServingThread")

            connected = True
        except BaseException as ex:
            #self.report_controller_init_failure(box_name, box_addr, self.port, ex)
            # most common reasons for failure: not yet running (backend service) or finished running
            pass

    return connected
def get_run(self, ws_name, run_name):
    if "." not in run_name:
        errors.general_error("Azure ML run name must be of the form: exper.runname")

    ws = self.get_aml_ws(ws_name)
    console.diag("after get_aml_ws() call")

    exper_name, run_part = run_name.split(".")
    experiment = Experiment(ws, name=exper_name)

    runs = experiment.get_runs(properties={"xt_run_name": run_name})
    console.diag("after experiment.get_runs() call")

    runs = list(runs)
    console.diag("after list(runs), len={}".format(len(runs)))

    # run_number = int(run_part[3:])
    # target_run = None
    #runs = [run for run in runs if run.number == run_number]

    target_run = runs[0] if len(runs) else None
    return target_run
def set_timer(timeout):
    console.print("set_timer called: timeout=", timeout)
    time.sleep(timeout)

    console.diag("timer triggered!")
    plt.close("all")
def create_run(self, job_id, user_cmd_parts, box_name="local", parent_name=None, rerun_name=None,
        node_index=0, using_hp=False, repeat=None, app_info=None, path=None, exper_name=None,
        pool_info=None, fake_submit=False, search_style=None, args=None):
    '''
    'create_run' does the following:
        - creates a new run name (and matching run directory in the store)
        - logs a "created" record in the run log
        - logs a "created" record in the workspace summary log
        - logs a "cmd" record in the run log
        - logs an optional "notes" record in the run log
        - captures the run's "before" files to the store's run directory
    '''
    console.diag("create_run: start")

    app_name = None    # app_info.app_name
    box_name = args["box"]
    pool = args["pool"]

    run_name = ""
    log_to_store = self.config.get("logging", "log")
    aggregate_dest = args["aggregate_dest"]

    if log_to_store:
        if not exper_name:
            exper_name = input("experiment name (for grouping this run): ")

        #console.print("calling store.start_run with exper_name=", exper_name)
        username = args["username"]
        description = args["description"]
        workspace = args["workspace"]

        console.diag("create_run: before start_run")

        service_type = args["service_type"]
        compute = args["target"]
        search_type = args["search_type"]

        sku = args["sku"]
        if not sku:
            # make default sku explicit
            if pool_info and "sku" in pool_info:
                sku = pool_info["sku"].lower()

        # create RUN in store
        if fake_submit:
            run_name = "fake_run123"
        else:
            if parent_name:
                run_name = self.store.start_child_run(workspace, parent_name, box_name=box_name,
                    username=username, exper_name=exper_name, app_name=app_name, pool=pool,
                    job_id=job_id, node_index=node_index, sku=sku, description=description,
                    aggregate_dest=aggregate_dest, path=path, compute=compute,
                    service_type=service_type, search_style=search_style)
            else:
                is_parent = search_style != "single"
                run_name = self.store.start_run(workspace, exper_name=exper_name, box_name=box_name,
                    app_name=app_name, username=username, repeat=repeat, pool=pool, job_id=job_id,
                    node_index=node_index, sku=sku, description=description,
                    aggregate_dest=aggregate_dest, path=path, compute=compute,
                    service_type=service_type, search_style=search_style, is_parent=is_parent)

        console.diag("create_run: after start_run")

        # always log cmd (for re-run purposes)
        xt_cmd = args["xt_cmd"]

        if not fake_submit:
            self.store.log_run_event(workspace, run_name, "cmd", {"cmd": user_cmd_parts, "xt_cmd": xt_cmd})

        # for now, don't log args (they contain private credentials, and it's not clear we really need them)
        # record all "args" (from cmd line, user config, default config) in log (for audit/re-run purposes)
        #self.store.log_run_event(workspace, run_name, "args", args)

        store_type = self.config.get_storage_type()
        full_run_name = utils.format_workspace_exper_run(store_type, workspace, exper_name, run_name)

        # log NOTES record
        if not fake_submit:
            if self.config.get("logging", "notes") in ["before", "all"]:
                text = input("Notes: ")
                if text:
                    self.store.log_run_event(workspace, run_name, "notes", {"notes": text})
    else:
        full_run_name = ""

    console.diag("create_run: after logging")

    workspace = args["workspace"]
    return run_name, full_run_name, box_name, pool
def dispatch(self, args, is_rerun=False, capture_output=False, raise_syntax_exception=False):
    self.raise_syntax_exception = raise_syntax_exception
    # TODO: change to cmd_parts parsing, which naturally separates options cleanly (utils.cmd_split)

    # be sure to reset this for each parse (for multi-command XT sessions)
    global explict_options
    explict_options = {}

    orig_text = " ".join(args)
    self.dispatch_cmd = orig_text

    text = self.replace_curlies_with_quotes(orig_text)
    console.diag("fixed cmd={}".format(text))

    scanner = Scanner(text)
    tok = scanner.scan()
    #console.print("first tok=", tok)

    # process any ROOT FLAGS
    if root_cmd_info:
        tok = self.process_root_options(scanner, tok)
    else:
        # there is no command to process --console, so set it explicitly now
        console.set_level("normal")

    console.diag("start of command parsing: {}".format(text))

    # process any options before the cmd as RAW options
    # raw_options = []
    # tok = self.collect_raw_options(raw_options, scanner, tok)

    # process COMMAND keywords
    cmd_info, tok = self.get_cmd_info(tok, scanner)
    self.cmd_info = cmd_info

    if "kwgroup_name" in cmd_info:
        cmd_info = get_command("help")
        self.cmd_info = cmd_info

        # # user typed an incomplete command - display appropriate help
        # if raise_syntax_exception:
        #     errors.syntax_error("incomplete command")
        # if command_help_func:
        #     # parse any help-specific options
        #     help_options = {}
        #     self.parse_options(help_options, options, scanner, tok)
        #     caller = self.impl_dict[command_help_func.__module__]
        #     kwgroup_help_func(caller, cmd_info)
        #     return
        # else:
        #     errors.env_error("no registered 'help' command")

    cmd_name = cmd_info["name"]
    self.cmd_words = cmd_name.replace("_", " ")

    func = cmd_info["func"]
    options = cmd_info["options"]
    arguments = cmd_info["arguments"]
    options_before_args = cmd_info["options_before_args"]

    # build a dictionary of arguments and options to be passed
    arg_dict = {}

    # command-specific help?
    # if "help" in raw_options:
    #     help_value = raw_options["help"]
    #     if help_value != None:
    #         self.syntax_error("unexpected text after '--help': " + help_value)

    if tok == "--help":
        help_value = scanner.scan()
        if help_value != None:
            self.syntax_error("unexpected text after '--help': " + help_value)

        caller = self.impl_dict[command_help_func.__module__]
        if self.preprocessor:
            self.preprocessor("help", caller, arg_dict)

        command_help_func("help", caller, cmd_info)
        return

    if options_before_args:
        # options come before arguments
        tok = self.parse_options(arg_dict, options, scanner, tok)
        tok = self.process_arguments(scanner, tok, arguments, arg_dict)
    else:
        # arguments come before options
        tok = self.process_arguments(scanner, tok, arguments, arg_dict)
        tok = self.parse_options(arg_dict, options, scanner, tok)

    # there should be no remaining tokens
    if tok:
        errors.argument_error("end of input", tok)

    full_arg_dict = self.validate_and_add_defaults(arguments, options, arg_dict)

    console.diag("dispatching to command func")

    # select the caller using the function's module name
    caller = self.impl_dict[func.__module__]

    if capture_output:
        caller.set_capture_output(True)

    if is_rerun:
        full_arg_dict["is_rerun"] = 1

    # call the matching command function with collected func args
    if self.preprocessor:
        self.preprocessor("command", caller, full_arg_dict)

    if cmd_info["pass_by_args"]:
        func(caller, args=full_arg_dict)
    else:
        func(caller, **full_arg_dict)

    console.diag("end of command processing")

    output = None
    if capture_output:
        output = caller.set_capture_output(False)

    return output
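# A hedged sketch of driving dispatch() directly; the command text is illustrative
# and assumes impl_dict/config were built as in xt_cmds.main below.
dispatcher = qfe.Dispatcher(impl_dict, config)
output = dispatcher.dispatch(["list", "runs", "--last=5"], capture_output=True)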
def main(cmd=None, disable_quickstart=False, capture_output=False, mini=False):
    utils.init_logging(constants.FN_XT_EVENTS, logger, "XT session")

    # fix artifact: if no args are passed, we end up with python's first arg
    if cmd:
        # treat as if it came from the shell (for consistent debugging/support)
        console.diag("orig cmd={}".format(cmd))

        # shlex on linux loses single quotes around strings, but windows does not
        orig_args = shlex.split(cmd)
        console.diag("shlex args={}".format(orig_args))
    else:
        orig_args = sys.argv[1:]
        console.diag("orig_args={}".format(orig_args))

    cmd = " ".join(orig_args)
    cmd = cmd.strip()

    use_server = "--quic" in cmd
    if not use_server:
        from .helpers.xt_config import XTConfig

        config = XTConfig(create_if_needed=True)
        use_server = config.get("general", "quick-start")

    mid_elapsed = time.time() - xt_start_time
    #console.print("mid_elapsed={:.2f}".format(mid_elapsed))

    if not use_server or disable_quickstart:
        # NORMAL start-up mode
        from xtlib import xt_cmds
        output = xt_cmds.main(cmd, capture_output=capture_output, mini=mini, raise_syntax_exception=False)
    else:
        # QUICK-START mode
        output = None
        log.info("using xt_server")

        import psutil
        need_start = True

        for proc in psutil.process_iter():
            try:
                # check if the process cmdline looks like a running xt_server
                ptext = str(proc.cmdline())
                # if "python" in ptext:
                #     console.print(ptext)
                if "python" in ptext and "xt_server.py" in ptext:
                    need_start = False
                    break
            except BaseException as ex:
                logger.exception("Error while enumerating processes looking for xt_server, ex={}".format(ex))

        if need_start:
            from .cmd_core import CmdCore
            CmdCore.start_xt_server()

        # for now, always turn on stack traces for server-run cmd
        cmd = "--stack-trace " + cmd
        cmd_dict = {"text": cmd, "cwd": os.getcwd()}

        # retry up to 5 secs (to handle case where xt_server is being restarted)
        retry_count = 0

        for i in range(5):
            try:
                run_cmd_on_server(cmd_dict, retry_count)
                break
            except BaseException as ex:
                logger.exception("Error sending cmd to xt_server, will retry. Last ex={}".format(ex))
                console.print(".", end="", flush=True)
                #console.print(ex)
                time.sleep(1)
                retry_count += 1

    elapsed = time.time() - xt_start_time
    #console.print("(elapsed: {:.2f} secs)".format(elapsed))

    # add adjustment for average exit time
    console.diag("end of xt_run (includes exit time={:.2f})".format(EXIT_TIME), exit_time=EXIT_TIME)

    # don't return output if we were called from xt.exe (it would console.print a confusing "[]" to output)
    return output if capture_output else None
def main(cmd=None, new_start_time=None, capture_output=False, mini=False, raise_syntax_exception=True):
    '''
    This is the XT app, used to manage and scale ML experiments, supporting
    various backends (Philly, Azure Batch, Azure ML).
    '''
    if new_start_time:
        global xt_start_time
        xt_start_time = new_start_time

    import numpy as np
    seed = 5
    if seed:
        np.random.seed(seed)
        np.random.RandomState(seed)

    if cmd:
        cmd = cmd.strip()
        if cmd.startswith("xt "):
            cmd = cmd[3:]
        elif cmd == "xt":
            cmd = ""

        args = utils.cmd_split(cmd)

        # remove empty args
        args = [arg for arg in args if arg]
    else:
        # caller did not supply cmd
        args = sys.argv[1:]

    # when executing multiple commands, reset the feedback for each command
    feedback.reset_feedback()

    #console.print("cmd=", cmd, ", args=", args)
    console.diag("in xt_cmds.main")
    #console.print("config=", config)

    fn_local_config = get_fn_local_config(args)

    impl_shared = ImplShared()
    config = impl_shared.init_config(fn_local_config, mini=mini)
    store = impl_shared.store
    mini = config.mini_mode

    cmd_providers = config.get("providers", "command")
    impl_dict = {}

    for name, code_path in cmd_providers.items():
        package, class_name = code_path.rsplit(".", 1)
        module = importlib.import_module(package)
        impl_class = getattr(module, class_name)

        impl = impl_class(config, store)
        impl_dict[package] = impl

        if name == "help":
            impl.set_mini_mode(mini)

    # this enables QFE to match a function, by its module name, to the class instance that processes the command
    # impl_dict = {"xtlib.impl_utilities": utilities, "xtlib.impl_storage": storage,
    #     "xtlib.impl_compute": compute, "xtlib.impl_help": help_impl}

    # this parses args and calls the correct command function with its args and options correctly set.
    # the config object supplies the default value for most options and flags.
    dispatcher = qfe.Dispatcher(impl_dict, config, preprocessor=impl_shared.pre_dispatch_processing)

    if mini:
        # a dict of commands + args/options to be surfaced (None means use all args/options)
        show_commands = {
            "cancel_all": ["target"],
            "cancel_job": ["job-id"],
            "cancel_run": ["run-names"],
            "clear_credentials": [],
            "config_cmd": ["default", "create", "reset"],
            "create_demo": ["destination", "response", "overwrite"],
            "create_services_template": [],
            "download": ["local-path", "store-path"],
            "extract": ["runs", "dest-dir", "browse", "workspace"],
            "help": ["command", "about", "browse", "version"],
            "help_topics": ["topic", "browse"],
            "list_blobs": ["path"],
            "list_jobs": ["job-list", "experiment", "all", "first", "last", "filter", "sort", "reverse", "status", "available"],
            "list_runs": ["run-list", "job", "experiment", "all", "first", "last", "filter", "sort", "reverse", "status", "available"],
            "monitor": ["name"],
            "run": ["script", "script-args", "experiment", "hp-config", "max-runs", "nodes", "runs", "search-type", "target"],
            "upload": ["local-path", "store-path"],
            "view_console": ["name", "target", "workspace", "node-index"],
            "view_metrics": ["runs", "metrics"],
            "view_run": ["run-name"],
        }

        dispatcher.show_commands(show_commands)
        qfe.remove_hidden_commands()

    # hide under-development commands
    hide_commands = ["collect_logs", "start_tensorboard", "stop_tensorboard", "zip", "unzip", "wget"]

    # hide internal cmds (for xt development use only)
    hide_commands.append("generate_help")
    dispatcher.hide_commands(hide_commands)

    # expand symbols like $lastjob, $lastrun
    impl_shared.expand_xt_symbols(args)

    # this is the NORMAL outer exception handling block, but
    # also see the client/server exception handling in xt_run.py
    try:
        text = dispatcher.dispatch(args, capture_output=capture_output,
            raise_syntax_exception=raise_syntax_exception)
    except BaseException as ex:
        #console.print("in Exception Handler: utils.show_stack_trace=", utils.show_stack_trace)
        # does the user want a stack-trace?
        logger.exception("Error during dispatcher.dispatch, args={}".format(args))

        exc_type, exc_value, exc_traceback = sys.exc_info()
        errors.process_exception(exc_type, exc_value, exc_traceback)

    return text
def get_all_runs(self, aggregator_dest, ws_name, job_or_exper_name, filter_dict=None, fields_dict=None,
        use_cache=True, fn_cache=None, first_count=None, last_count=None, sort_dict=None):
    '''
    cache design:
        - organize all cached run information by the way it was accessed: a folder for each
          workspace (created on demand), and under each, a folder specifying the filter_dict
          and fields_dict. This way, we only use cache records for exactly matching query info.
        - whenever sort, first_count, or last_count is used (that is, included in the mongo db
          query), we should set "use_cache" to False.
        - note: since the Azure Cosmos version of mongo-db doesn't correctly support
          sort/first/last (totally busted as of Aug 2019), we never include sort/first/last
          in the mongo db query.
        - as of 12/20/2019, the only code that correctly uses the fn_cache is hparam_search;
          all other code should call with use_cache=False.
    '''
    # PERF-critical function

    # below code is not yet cache-compliant
    use_cache = False

    records = []
    target = 0
    cache = None

    if use_cache and not fn_cache:
        # fn_cache = self.run_cache_dir + "/" + constants.ALL_RUNS_CACHE_FN
        # fn_cache = fn_cache.replace("$aggregator", ws_name)
        use_cache = False   # play it safe for now

    if use_cache and os.path.exists(fn_cache):
        # read CACHED runs
        started = time.time()
        cache = utils.load(fn_cache)
        elapsed = time.time() - started

        target = max([rec["end_id"] if "end_id" in rec else 0 for rec in cache])
        console.print("loaded {:,} records in {:.2f} secs from cache: {}".format(
            len(cache), elapsed, fn_cache))

    if not filter_dict:
        if aggregator_dest == "job":
            filter_dict = {"job_id": job_or_exper_name}
        elif aggregator_dest == "experiment":
            filter_dict = {"exper_name": job_or_exper_name}

    # if not fields_dict:
    #     # by default, do NOT return inner log records
    #     fields_dict = {"log_records": 0}

    # adjust filter to get only missing records
    if target:
        filter_dict["end_id"] = {"$gt": target}

    #console.print("  mongo: filter: {}, fields: {}, sort: {}".format(filter_dict, fields_dict, sort_dict))
    console.diag("  mongo: filter: {}, fields: {}".format(filter_dict, fields_dict))

    # limit query to avoid "message max exceeded" errors
    max_query_records = 3000

    started = time.time()

    #records = self.mongo_db[ws_name].find(filter_dict, fields_dict)
    cmd_func = lambda: self.mongo_db[ws_name].find(filter_dict, fields_dict)
    cursor = self.mongo_with_retries("get_all_runs", cmd_func)

    # SORT TOTALLY BUSTED ON COSMOS:
    #   - sort of "-id" returns random order each time
    #   - sort of "test-acc" returns 0 records (if ANY missing values, NO records returned)
    #   - docs say pass a dict, but code wants list of 2-tuples (pymongo library)
    # if sort_dict:
    #     items = list(sort_dict.items())
    #     key, value = items[0]
    #     import pymongo
    #     cursor = cursor.sort("job", 1)   # key, value)

    # adjust cursor per first_count, last_count
    # because SORT is busted, we can't use mongo for first/last either
    # if last_count:
    #     if last_count is True:
    #         last_count = 25
    #     avail = cursor.count()
    #     skip_count = avail - last_count
    #     if skip_count > 0:
    #         cursor = cursor.skip(skip_count)
    # elif first_count:
    #     if first_count is True:
    #         first_count = 25
    #     cursor = cursor.limit(first_count)

    records = list(cursor)
    return_count = len(records)
    total_count = self.mongo_db[ws_name].count()
    elapsed = time.time() - started

    console.diag("  mongo query returned {} records (of {}), took: {:.2f} secs".format(
        return_count, total_count, elapsed))

    if cache:
        cache += records
        records = cache

    if return_count and use_cache:
        # write to cache
        started = time.time()
        utils.save(records, fn_cache)
        elapsed = time.time() - started
        console.print("wrote {:,} records to cache, took: {:.2f} secs".format(
            len(records), elapsed))

    return records
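# A hedged usage sketch of get_all_runs; the workspace/job names and metric field
# are illustrative (fields_dict is a standard mongo projection).
records = mongo.get_all_runs("job", "ws1", "job2338",
    fields_dict={"run_name": 1, "metrics.test-acc": 1}, use_cache=False)
print("{} run records".format(len(records)))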