def init_cuda(self, args):
    """Initialize CUDA (and optionally Horovod) state for this process.

    Args:
        args: parsed command-line options; reads the .cuda, .parallel,
            .gpu and .distributed attributes.

    Returns:
        (use_cuda, device, logging): whether CUDA will be used, the
        torch.device to run on, and whether this process should log
        HPARAMS/METRICS (only rank 0 logs when distributed).
    """
    #---- CUDA init ----
    cuda_avail = torch.cuda.is_available()
    use_cuda = cuda_avail and args.cuda
    gpu_count = torch.cuda.device_count()

    if use_cuda and not args.parallel:
        torch.cuda.set_device(args.gpu)

    print(" cuda_avail={}, GPU count={}, use_cuda={}, gpu={} ---".format(
        cuda_avail, gpu_count, use_cuda, args.gpu))

    # BUG FIX: the original condition was "use_cuda and not cuda_avail",
    # which is unreachable (use_cuda already implies cuda_avail).  The
    # intent, per the comment below, is to fail hard when the user REQUESTED
    # cuda but no GPU is visible.
    if args.cuda and not cuda_avail:
        # if we cannot find a GPU, consider that a hard error (used to detect
        # problems with seeing Philly GPUs)
        errors.env_error("CUDA not available on this platform")

    if args.distributed:
        # Initialize Horovod
        global hvd
        import horovod.torch as hvd
        hvd.init()

        # Pin GPU to be used to process local rank (one GPU per process)
        print(" distributed: rank={}, size={}".format(hvd.rank(), hvd.size()))
        device = torch.device("cuda:" + str(hvd.local_rank()))

        # only log HPARAMS and METRICS for job if running as rank 0
        logging = (hvd.rank() == 0)
    else:
        device = torch.device("cuda" if use_cuda else "cpu")
        logging = True

    return use_cuda, device, logging
def help_topics(self, topic, browse, prefix="topics", title="help topics"):
    '''Display one named help topic, or list all available topics when
    no topic is given.  Topic files live under xtlib/help_topics/<prefix>.'''
    topics_dir = os.path.join(file_utils.get_xtlib_dir(), "help_topics", prefix)
    if not os.path.isdir(topics_dir):
        errors.env_error("Missing help topics dir: {}".format(topics_dir))

    file_list, _ = file_utils.get_local_filenames(topics_dir)

    # map each topic name (file root name) to its full filename
    name_to_file = {file_utils.root_name(path): path for path in file_list}

    if topic:
        # print the text of the one requested topic
        wanted = topic.lower()
        if wanted not in name_to_file:
            errors.general_error("help topic not found: {}".format(wanted))
        print(file_utils.read_text_file(name_to_file[wanted]))
    else:
        # no topic given: list everything we found, sorted by name
        console.print("available {}:".format(title))
        for name in sorted(name_to_file):
            console.print(" {}".format(name))
        console.print()
        console.print("To display a help topic, use 'xt help topic <topic name>'")
def config_cmd(self, response, default, create, reset):
    '''
    The --create option accepts a template name to create a new local XT config file.  The currently available templates are:
        - philly  (create config file for Philly users)
        - batch   (create config file for Azure Batch users)
        - aml     (create config file for Azure Machine Learning users)
        - pool    (create config file for users running ML apps on local machines)
        - all     (create config file for users who want to have access to all backend services)
        - empty   (create an empty config file)
    '''
    if default and reset:
        # restore the factory default config file and stop
        xt_config.overwrite_default_config()
        return

    # decide which config file we are operating on
    if default:
        fn = get_default_config_path()
        if not os.path.exists(fn):
            errors.env_error("the XT default config file is missing: {}".format(fn))
    else:
        fn = constants.FN_CONFIG_FILE

    edit = True
    if create:
        if not os.path.exists(fn):
            self.create_local_config_file(fn, create)
        else:
            # file already exists: confirm before clobbering it
            console.print("the local config file already exists: {}".format(fn))
            answer = pc_utils.input_response("OK to overwrite? (y/n) [n]: ", response)
            if answer == "y":
                self.create_local_config_file(fn, create)
            else:
                edit = False
    elif not os.path.exists(fn):
        # no --create given but the file is missing: offer to create an empty one
        console.print("the config file doesn't exist: {}".format(fn))
        answer = pc_utils.input_response("OK to create? (y/n) [y]: ", response)
        if answer in ["", "y"]:
            self.create_local_config_file(fn, "empty")
        else:
            edit = False

    if edit:
        console.print("invoking your default .yaml editor on: {}".format(fn))
        from xtlib import process_utils
        process_utils.open_file_with_default_app(fn)
def docker_login(self, target, docker):
    '''Log in to the docker registry defined for *docker* (or for *target*
    when no docker name is given) and print the login output.'''
    creds = self.get_registry_creds(target, docker)

    if not creds:
        # no credentials found: report which lookup failed
        if docker:
            msg = "no dockers entry defined for docker '{}'".format(docker)
        else:
            msg = "no docker property defined for target '{}'".format(target)
        errors.env_error(msg)

    result = self.core.docker_login(
        creds["login-server"], creds["username"], creds["password"])
    console.print(result)
def adjust_pip_packages(self, args):
    ''' convert any package==* entry in pip-packages to pin the version
    installed on the local machine (as reported by pip freeze) '''
    adjusted = []

    for spec in args["pip_packages"]:
        if spec.endswith("==*"):
            # replace the "*" with the locally installed version
            name = spec[:-3]
            version = get_installed_package_version(name)
            if not version:
                errors.env_error("version number for specified pip package not found in environment: " + name)
            spec = name + "==" + version
        adjusted.append(spec)

    args["pip_packages"] = adjusted
def monitor_work():
    # One polling step of the "attach to run" monitor loop.
    #
    # Returns either a status-text string (caller keeps polling and displays
    # the text) or the 5-tuple (azure_task_state, connected, box_name,
    # job_id, attach_attempts) once the run has started (caller exits the
    # monitor loop).
    #
    # NOTE(review): this is a closure — it reads self, ws and run_name from
    # the enclosing method's scope (not visible in this chunk) and mutates
    # the enclosing attach_attempts counter.
    nonlocal attach_attempts

    connected = self.xtc.connect()
    #azure_task_state, connected, box_name, job_id = self.connect_to_box_for_run(ws, run_name)
    # NOTE(review): azure_task_state is hard-set to None below, so the
    # azure-batch branch is currently dead code until the call above is
    # re-enabled.
    azure_task_state = None
    box_name = self.xtc.box_name
    job_id = "xxxxx"    # TODO
    attach_attempts += 1

    if azure_task_state:
        #console.print("azure_task_state=", azure_task_state)
        # its an azure-batch controlled run
        if azure_task_state == "active":
            text = "Waiting for run to start: {} ({} in azure-batch)".format(
                run_name.upper(), job_id)
        elif azure_task_state == "running" and not connected:
            text = "Waiting for run to initialize: {} ({} in azure-batch)".format(
                run_name.upper(), job_id)
        else:
            # exit monitor loop
            return azure_task_state, connected, box_name, job_id, attach_attempts
    else:
        # its a normal box-controller run
        if not connected:
            errors.env_error("could not connect to box: " + box_name)

        # we are connected, but has run started yet?
        status_dict = self.xtc.get_status_of_runs(ws, [run_name])

        # controller may not have heard of run yet (if we were fast)
        status = status_dict[run_name] if run_name in status_dict else "created"

        if status in ["created", "queued"]:
            text = "Waiting for run to start: {} (queued to run on {})".format(
                run_name.upper(), box_name)
        else:
            # status is one of running, killed, completed, spawning, ...
            # exit monitor loop
            return azure_task_state, connected, box_name, job_id, attach_attempts

    return text
def process_args(self, args):
    """Validate the run's script/target arguments and resolve the compute target.

    Args:
        args: the run's argument dict; reads "script", "script_args",
            "code_upload", "parent_script", "target"; writes back "target",
            "compute_def", "service_type", "box", "pool".

    Returns:
        (service_type, cmd_parts, ps_path, parent_script, target_file,
        run_script, run_cmd_from_script, compute, compute_def)
    """
    run_script = None
    parent_script = None
    run_cmd_from_script = None

    target_file = args["script"]
    target_args = args["script_args"]
    code_upload = args["code_upload"]

    # user may have wrong slashes for this OS
    target_file = file_utils.fix_slashes(target_file)

    if os.path.isabs(target_file):
        errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

    is_rerun = "is_rerun" in args
    if is_rerun:
        # will be running from script dir, so remove any path to script file
        self.script_dir = os.path.dirname(target_file)
        target_file = os.path.basename(target_file)

    if target_file.endswith(".py"):
        # PYTHON target: run unbuffered so output streams promptly
        cmd_parts = ["python", "-u", target_file]
    else:
        cmd_parts = [target_file]

    if target_args:
        # split on unquoted spaces
        cmd_parts += utils.cmd_split(target_args)

    if target_file == "docker":
        self.is_docker = True

    if not self.is_docker and code_upload and not os.path.exists(target_file):
        errors.env_error("script file not found: {}".format(target_file))

    ps_path = args["parent_script"]
    if ps_path:
        parent_script = file_utils.read_text_file(ps_path, as_lines=True)

    if target_file.endswith(".bat") or target_file.endswith(".sh"):
        # a RUN SCRIPT was specified as the target
        run_script = file_utils.read_text_file(target_file, as_lines=True)
        run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

    compute = args["target"]
    box_def = self.config.get("boxes", compute, suppress_warning=True)
    setup = utils.safe_value(box_def, "setup")

    compute_def = self.config.get_compute_def(compute)
    if compute_def:
        # must be defined in [compute-targets]
        if "service" not in compute_def:
            errors.config_error("compute target '{}' must define a 'service' property".format(compute))

        service = compute_def["service"]
        if service in ["local", "pool"]:
            # its a list of box names
            boxes = compute_def["boxes"]
            if len(boxes) == 1 and boxes[0] == "localhost":
                pool = None
                box = "local"
            else:
                pool = compute
                box = None
            service_type = "pool"
        else:
            # it a set of compute service properties
            pool = compute
            box = None
            service_type = self.config.get_service_type(service)
    elif box_def:
        # translate single box name to a compute_def
        box = compute
        pool = None
        service_type = "pool"
        # BUG FIX: the original used the bare variable "setup" as the dict
        # key (so the key was the setup VALUE, often None) instead of the
        # string "setup"
        compute_def = {"service": service_type, "boxes": [box], "setup": setup}
    else:
        errors.config_error("unknown target or box: {}".format(compute))

    args["target"] = compute
    args["compute_def"] = compute_def
    args["service_type"] = service_type

    # for legacy code
    args["box"] = box
    args["pool"] = pool

    return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
        compute, compute_def
def hex(self, fn):
    '''Print a hex dump of the file named *fn*.'''
    file_found = os.path.exists(fn)
    if not file_found:
        # a missing file is a hard error
        errors.env_error("cannot open file: " + fn)
    hex_dump(fn)
def upload(self, local_path, store_path, share, workspace, experiment, job, run, feedback, show_output=True):
    """Upload one or more local files to store blobs.

    Args:
        local_path: local file, directory, or wildcard pattern to upload.
        store_path: destination path in the store (None or "." defaults it).
        share, workspace, experiment, job, run: identify the destination location.
        feedback: when True, report transfer progress via FeedbackProgress.
        show_output: when True, print per-file status lines.

    Returns:
        the number of files uploaded (None when no matching files are found).
    """
    use_blobs = True
    use_multi = True
    upload_count = 0

    # expand ~/ in front of local path
    local_path = os.path.expanduser(local_path)

    if os.path.exists(local_path) and os.path.isfile(local_path):
        use_multi = False

    # if directory, default to copy nested
    if os.path.isdir(local_path):
        local_path += "/**"
        use_multi = True

    if not store_path or store_path == ".":
        if not use_multi:
            # single file defaults to the base name of the local file
            store_path = os.path.basename(local_path)
        else:
            store_path = "."

    fs = self.create_file_accessor(use_blobs, share, workspace, experiment, job, run)
    uri = fs.get_uri(store_path)

    # strip any wildcard part and normalize the local path before checking it
    actual_path, _ = file_utils.split_wc_path(local_path)
    actual_path = file_utils.relative_path(actual_path)
    actual_path = file_utils.fix_slashes(actual_path)

    if not os.path.exists(actual_path):
        errors.env_error("Cannot find the local file/folder: {}".format(actual_path))

    feedback_progress = FeedbackProgress(feedback, show_output)
    progress_callback = feedback_progress.progress if feedback else None

    if use_multi:
        # upload MULTIPLE files/blobs
        file_names, local_path = file_utils.get_local_filenames(local_path)
        what = "blobs" if use_blobs else "files"

        if len(file_names) == 0:
            if show_output:
                # BUG FIX: the format string had one placeholder for two
                # args, so it printed "blobs"/"files" where the path belonged
                console.print("no matching {} found in: {}".format(what, actual_path))
            return
        elif len(file_names) == 1:
            what = "blob" if use_blobs else "file"

        if show_output:
            console.print("\nto {}, uploading {} {}:".format(uri, len(file_names), what))

        # width used to align the per-file progress column
        max_name_len = max(len(name) for name in file_names)
        name_width = 1 + max_name_len

        for f, fn in enumerate(file_names):
            blob_path = self.make_dest_fn(local_path, fn, store_path)
            actual_fn = file_utils.fix_slashes(fn)

            if show_output:
                file_msg = "file {}/{}".format(1 + f, len(file_names))
                console.print(" {2:}: {1:<{0:}} ".format(name_width, actual_fn + ":", file_msg),
                    end="", flush=True)

            feedback_progress.start()
            fs.upload_file(blob_path, actual_fn, progress_callback=progress_callback)
            feedback_progress.end()

            upload_count += 1
    else:
        # upload SINGLE file/blob
        what = "blob" if use_blobs else "file"
        if show_output:
            console.print("\nto: {}, uploading {}:".format(uri, what))

        local_path = file_utils.fix_slashes(local_path)

        if show_output:
            console.print(" {}: ".format(local_path), end="", flush=True)

        feedback_progress.start()
        fs.upload_file(store_path, local_path, progress_callback=progress_callback)
        feedback_progress.end()

        upload_count += 1

    return upload_count
def keysend(self, box_name):
    '''Copy the local XT public key to *box_name* and append it to that
    box's authorized_keys file.  Returns True on success, False when
    either the scp copy or the remote append commands fail.'''
    addr = self.config.get("boxes", box_name, dict_key="address", default_value=box_name)
    is_windows = self.config.get("boxes", box_name, dict_key="os", default_value="linux") == "windows"
    #console.print("box_addr=", addr)

    pub_key_path = os.path.expanduser(constants.LOCAL_KEYPAIR_PUBLIC)
    #fn_log = utils.expand_vars(TEMP_SSH_LOG)
    if not os.path.exists(pub_key_path):
        errors.env_error("xt keypair not yet created; please run the 'xt keygen' command first")

    # stage the key at a temp file location on the box
    remote_tmp = "temp_key_file" if is_windows else "/tmp/temp_key_file"

    # NOTE: the "-o IdentitiesOnly=yes" option of is used to prevent the "too many authentication errors" problem
    #cmd = 'scp -o IdentitiesOnly=yes "{}" {}:{}'.format(pub_key_path, addr, remote_tmp)
    scp_cmd = ["scp", "-o", "IdentitiesOnly=yes", pub_key_path,
        "{}:{}".format(addr, remote_tmp)]
    console.diag(" copying key file to box: cmd={}".format(scp_cmd))

    # SCP COPY
    rc, out = process_utils.sync_run(scp_cmd)
    if rc:
        console.print(out)
        return False

    # now, run commands on box to append the temp file to ~/.ssh/authorized_keys
    if is_windows:
        keys_file = ".ssh/authorized_keys"
        cmdline = "&".join([
            "mkdir .ssh",                                   # ensure directory exists (if first key)
            "del {}".format(keys_file),
            "type {} >> {}".format(remote_tmp, keys_file),  # append key to file
            "del {}".format(remote_tmp),                    # remove temp file
        ])
    else:
        keys_file = "~/.ssh/authorized_keys"
        cmdline = ";".join([
            "mkdir -p ~/.ssh",                              # ensure directory exists (if first key)
            "cat {} >> {}".format(remote_tmp, keys_file),   # append key to file
            "rm {}".format(remote_tmp),                     # remove temp file
        ])

    # NOTE: the "-o IdentitiesOnly=yes" option of is used to prevent the "too many authentication errors" problem
    #cmd = 'ssh -o IdentitiesOnly=yes {} "{}"'.format(addr, cmdline)
    ssh_cmd = ["ssh", "-o", "IdentitiesOnly=yes", addr, cmdline]
    console.diag(" running cmds on box={}".format(ssh_cmd))

    # SSH COMMANDS
    rc, out = process_utils.sync_run(ssh_cmd)
    if rc:
        console.print(out)
        return False

    return True