Esempio n. 1
0
    def init_cuda(self, args):
        """Initialize CUDA (and optionally Horovod) and report the run device.

        Args:
            args: parsed options; reads .cuda, .parallel, .gpu, .distributed.

        Returns:
            (use_cuda, device, logging):
                use_cuda (bool): True when CUDA was requested and is available.
                device (torch.device): the device this process should use.
                logging (bool): True when this process should log HPARAMS and
                    METRICS (only rank 0 when running under Horovod).
        """
        #---- CUDA init ----
        cuda_avail = torch.cuda.is_available()
        use_cuda = cuda_avail and args.cuda
        gpu_count = torch.cuda.device_count()

        # BUG FIX: the original checked "use_cuda and not cuda_avail", which can
        # never be true since use_cuda already implies cuda_avail.  Compare the
        # user's *request* against availability, before touching any device.
        if args.cuda and not cuda_avail:
            # if we cannot find a GPU, consider that a hard error (used to detect problems with seeing Philly GPUs)
            errors.env_error("CUDA not available on this platform")

        if use_cuda and not args.parallel:
            torch.cuda.set_device(args.gpu)

        print("  cuda_avail={}, GPU count={}, use_cuda={}, gpu={} ---".format(
            cuda_avail, gpu_count, use_cuda, args.gpu))

        if args.distributed:
            # Initialize Horovod
            global hvd
            import horovod.torch as hvd

            hvd.init()
            # Pin GPU to be used to process local rank (one GPU per process)
            print("  distributed: rank={}, size={}".format(
                hvd.rank(), hvd.size()))
            device = torch.device("cuda:" + str(hvd.local_rank()))

            # only log HPARAMS and METRICS for job if running as rank 0
            logging = (hvd.rank() == 0)
        else:
            device = torch.device("cuda" if use_cuda else "cpu")
            logging = True

        return use_cuda, device, logging
Esempio n. 2
0
    def help_topics(self, topic, browse, prefix="topics", title="help topics"):
        """List the available help topics, or print the text of one topic.

        Args:
            topic: name of the topic to display; falsy lists all topics.
            browse: accepted for interface compatibility (unused in this body).
            prefix: subdirectory of xtlib/help_topics to search.
            title: heading used when listing the available topics.

        Raises (via the errors module):
            env_error when the topics directory is missing;
            general_error when the requested topic is unknown.
        """
        # build list of help topics from xtlib/help_topics directory
        topics_dir = os.path.join(file_utils.get_xtlib_dir(), "help_topics",
                                  prefix)
        if not os.path.isdir(topics_dir):
            errors.env_error("Missing help topics dir: {}".format(topics_dir))
        topic_files, _ = file_utils.get_local_filenames(topics_dir)

        # build a map from topic names to the files
        topic_map = {file_utils.root_name(fn): fn for fn in topic_files}

        if not topic:
            console.print("available {}:".format(title))

            # show topics in alphabetical order
            for topic_name in sorted(topic_map):
                console.print("  {}".format(topic_name))

            console.print()
            console.print(
                "To display a help topic, use 'xt help topic <topic name>'")
        else:
            # print a specific topic (topic names are matched case-insensitively)
            topic_low = topic.lower()
            if topic_low not in topic_map:
                errors.general_error(
                    "help topic not found: {}".format(topic_low))

            text = file_utils.read_text_file(topic_map[topic_low])
            # NOTE(review): bare print() here while the rest of the method uses
            # console.print() -- confirm whether that difference is intentional.
            print(text)
    def config_cmd(self, response, default, create, reset):
        '''
        The --create option accepts a template name to create a new local XT config file.  
        
        The currently available templates are:
            - philly   (create config file for Philly users)
            - batch    (create config file for Azure Batch users)
            - aml      (create config file for Azure Machine Learning users)
            - pool     (create config file for users running ML apps on local machines)
            - all      (create config file for users who want to have access to all backend services)
            - empty    (create an empty config file)
        '''

        # --default --reset: restore the factory default config and stop
        if default and reset:
            xt_config.overwrite_default_config()
            return

        # choose which config file this command operates on
        if default:
            fn = get_default_config_path()
            if not os.path.exists(fn):
                errors.env_error(
                    "the XT default config file is missing: {}".format(fn))
        else:
            fn = constants.FN_CONFIG_FILE

        edit = True

        if create:
            if not os.path.exists(fn):
                self.create_local_config_file(fn, create)
            else:
                # file already present: ask before clobbering it
                console.print(
                    "the local config file already exists: {}".format(fn))

                answer = pc_utils.input_response(
                    "OK to overwrite?  (y/n) [n]: ", response)
                if answer == "y":
                    self.create_local_config_file(fn, create)
                else:
                    edit = False

        elif not os.path.exists(fn):
            # no --create and no file: offer to create an empty one
            console.print("the config file doesn't exist: {}".format(fn))

            answer = pc_utils.input_response("OK to create?  (y/n) [y]: ",
                                             response)
            if answer not in ["", "y"]:
                edit = False
            else:
                self.create_local_config_file(fn, "empty")

        if edit:
            console.print(
                "invoking your default .yaml editor on: {}".format(fn))
            from xtlib import process_utils
            process_utils.open_file_with_default_app(fn)
    def docker_login(self, target, docker):
        """Log in to the docker registry configured for *target*/*docker* and print the result."""
        creds = self.get_registry_creds(target, docker)

        if not creds:
            # no credentials found: report which lookup failed
            if docker:
                errors.env_error(
                    "no dockers entry defined for docker '{}'".format(docker))
            else:
                errors.env_error(
                    "no docker property defined for target '{}'".format(
                        target))

        result = self.core.docker_login(creds["login-server"],
                                        creds["username"], creds["password"])
        console.print(result)
Esempio n. 5
0
    def adjust_pip_packages(self, args):
        '''
        Convert any "package==*" entry in pip-packages to the version installed
        on the local machine (as reported by pip), so the remote environment
        matches the local one.

        Args:
            args: dict-like options; reads and rewrites args["pip_packages"].

        Raises (via errors.env_error):
            when a wildcard package is not installed in the local environment.
        '''
        pip_packages = args["pip_packages"]
        new_pip_packages = []

        for pp in pip_packages:
            if pp.endswith("==*"):
                # strip the "==*" suffix and look up the installed version
                package = pp[:-3]
                version = get_installed_package_version(package)
                if not version:
                    errors.env_error("version number for specified pip package not found in environment: " + package)
                pp = package + "==" + version

            new_pip_packages.append(pp)

        args["pip_packages"] = new_pip_packages
Esempio n. 6
0
        def monitor_work():
            """Poll the status of the run once, as part of an attach/monitor loop.

            Returns either a status-text string (meaning: keep polling) or the
            tuple (azure_task_state, connected, box_name, job_id,
            attach_attempts) to signal that the caller should exit the loop.
            Uses self, ws and run_name from the enclosing scope.
            """
            # closure counter shared with the enclosing function
            nonlocal attach_attempts

            connected = self.xtc.connect()
            #azure_task_state, connected, box_name, job_id = self.connect_to_box_for_run(ws, run_name)
            # NOTE(review): azure_task_state is hard-coded to None, so the
            # azure-batch branch below is currently dead code -- confirm.
            azure_task_state = None
            box_name = self.xtc.box_name
            job_id = "xxxxx"  # TODO

            attach_attempts += 1

            if azure_task_state:
                #console.print("azure_task_state=", azure_task_state)
                # its an azure-batch controlled run
                if azure_task_state == "active":
                    text = "Waiting for run to start: {} ({} in azure-batch)".format(
                        run_name.upper(), job_id)
                elif azure_task_state == "running" and not connected:
                    text = "Waiting for run to initialize: {} ({} in azure-batch)".format(
                        run_name.upper(), job_id)
                else:
                    # exit monitor loop
                    return azure_task_state, connected, box_name, job_id, attach_attempts
            else:
                # its a normal box-controller run
                if not connected:
                    errors.env_error("could not connect to box: " + box_name)
                # we are connected, but has run started yet?
                status_dict = self.xtc.get_status_of_runs(ws, [run_name])
                # controller may not have heard of run yet (if we were fast)
                status = status_dict[
                    run_name] if run_name in status_dict else "created"
                if status in ["created", "queued"]:
                    text = "Waiting for run to start: {} (queued to run on {})".format(
                        run_name.upper(), box_name)
                else:
                    # status is one of running, killed, completed, spawning, ...
                    # exit monitor loop
                    return azure_task_state, connected, box_name, job_id, attach_attempts
            # still waiting: hand the status text back for display
            return text
Esempio n. 7
0
    def process_args(self, args):
        """Parse run-submission args into the run command and a compute-target definition.

        Reads args["script"], ["script_args"], ["code_upload"], ["parent_script"]
        and ["target"]; writes back args["target"], ["compute_def"],
        ["service_type"], and (for legacy code) args["box"] and ["pool"].

        Returns:
            (service_type, cmd_parts, ps_path, parent_script, target_file,
             run_script, run_cmd_from_script, compute, compute_def)

        Raises (via the errors module):
            syntax_error for an absolute script path; env_error when the script
            file is missing; config_error for a bad or unknown compute target.
        """
        run_script = None
        parent_script = None
        run_cmd_from_script = None
        target_file = args["script"]
        target_args = args["script_args"]
        code_upload = args["code_upload"]

        # user may have wrong slashes for this OS
        target_file = file_utils.fix_slashes(target_file)

        if os.path.isabs(target_file):
            errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

        is_rerun = "is_rerun" in args
        if is_rerun:
            # will be running from script dir, so remove any path to script file
            self.script_dir = os.path.dirname(target_file)
            target_file = os.path.basename(target_file)

        if target_file.endswith(".py"):
            # PYTHON target: run unbuffered (-u) so output streams promptly
            cmd_parts = ["python", "-u", target_file]
        else:
            cmd_parts = [target_file]

        if target_args:
            # split on unquoted spaces
            arg_parts = utils.cmd_split(target_args)
            cmd_parts += arg_parts

        if target_file == "docker":
            self.is_docker = True

        if not self.is_docker and code_upload and not os.path.exists(target_file):
            errors.env_error("script file not found: {}".format(target_file))

        ps_path = args["parent_script"]
        if ps_path:
            parent_script = file_utils.read_text_file(ps_path, as_lines=True)

        if target_file.endswith(".bat") or target_file.endswith(".sh"):
            # a RUN SCRIPT was specified as the target
            run_script = file_utils.read_text_file(target_file, as_lines=True)
            run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

        compute = args["target"]
        box_def = self.config.get("boxes", compute, suppress_warning=True)
        setup = utils.safe_value(box_def, "setup")

        compute_def = self.config.get_compute_def(compute)
        if compute_def:
            # must be defined in [compute-targets]
            # (removed a redundant second get_compute_def() call here)
            if "service" not in compute_def:
                errors.config_error("compute target '{}' must define a 'service' property".format(compute))

            service = compute_def["service"]
            if service in ["local", "pool"]:
                # its a list of box names
                boxes = compute_def["boxes"]
                if len(boxes) == 1 and boxes[0] == "localhost":
                    pool = None
                    box = "local"
                    service_type = "pool"
                else:
                    pool = compute
                    box = None
                    service_type = "pool"
            else:
                # it's a set of compute service properties
                pool = compute
                box = None
                service_type = self.config.get_service_type(service)
        elif box_def:
            # translate single box name to a compute_def
            box = compute
            pool = None
            service_type = "pool"
            # BUG FIX: the original used the *variable* setup as the dict key
            # ({..., setup: setup}); the key must be the string "setup".
            compute_def = {"service": service_type, "boxes": [box], "setup": setup}
        else:
            errors.config_error("unknown target or box: {}".format(compute))

        args["target"] = compute
        args["compute_def"] = compute_def
        args["service_type"] = service_type

        # for legacy code
        args["box"] = box
        args["pool"] = pool

        return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
            compute, compute_def
    def hex(self, fn):
        """Print a hex dump of the file named *fn*.

        A missing file is treated as a hard environment error.
        """
        file_found = os.path.exists(fn)
        if not file_found:
            errors.env_error("cannot open file: " + fn)

        hex_dump(fn)
Esempio n. 9
0
    def upload(self,
               local_path,
               store_path,
               share,
               workspace,
               experiment,
               job,
               run,
               feedback,
               show_output=True):
        """Upload one or more local files to store blobs/files.

        Args:
            local_path: local file, directory, or wildcard pattern ("~" is expanded;
                a directory implies "/**", i.e. copy nested).
            store_path: destination path; None or "." defaults to the file's base
                name (single file) or "." (multiple files).
            share, workspace, experiment, job, run: store location for the file accessor.
            feedback: when truthy, show per-file progress feedback.
            show_output: when True, print per-file status lines.

        Returns:
            The number of files uploaded, or None when no files matched.

        Raises (via errors.env_error):
            when the local file/folder does not exist.
        """
        use_blobs = True
        use_multi = True
        upload_count = 0

        # expand ~/ in front of local path
        local_path = os.path.expanduser(local_path)

        if os.path.exists(local_path) and os.path.isfile(local_path):
            use_multi = False

        #console.print("local_path=", local_path)

        # if directory, default to copy nested
        if os.path.isdir(local_path):
            local_path += "/**"
            use_multi = True

        if not store_path or store_path == ".":
            if not use_multi:
                # single file defaults to the base name of the local file
                store_path = os.path.basename(local_path)
            else:
                store_path = "."

        fs = self.create_file_accessor(use_blobs, share, workspace, experiment,
                                       job, run)
        uri = fs.get_uri(store_path)
        actual_path, _ = file_utils.split_wc_path(local_path)

        actual_path = file_utils.relative_path(actual_path)
        actual_path = file_utils.fix_slashes(actual_path)

        if not os.path.exists(actual_path):
            errors.env_error(
                "Cannot find the local file/folder: {}".format(actual_path))

        feedback_progress = FeedbackProgress(feedback, show_output)
        progress_callback = feedback_progress.progress if feedback else None

        if use_multi:
            # upload MULTIPLE files/blobs
            file_names, local_path = file_utils.get_local_filenames(local_path)
            what = "blobs" if use_blobs else "files"

            if len(file_names) == 0:
                if show_output:
                    # BUG FIX: the original format string had one placeholder for
                    # two args and printed "blobs"/"files" instead of the path
                    console.print("no matching {} found in: {}".format(
                        what, actual_path))
                return
            elif len(file_names) == 1:
                what = "blob" if use_blobs else "file"

            if show_output:
                console.print("\nto {}, uploading {} {}:".format(
                    uri, len(file_names), what))

            #file_utils.ensure_dir_exists(local_path)
            # pad file names to a common width for aligned progress output
            max_name_len = max([len(name) for name in file_names])
            name_width = 1 + max_name_len
            #console.print("max_name_len=", max_name_len, ", name_width=", name_width)

            for f, fn in enumerate(file_names):
                blob_path = self.make_dest_fn(local_path, fn, store_path)
                actual_fn = file_utils.fix_slashes(fn)

                if show_output:
                    file_msg = "file {}/{}".format(1 + f, len(file_names))
                    console.print("  {2:}: {1:<{0:}} ".format(
                        name_width, actual_fn + ":", file_msg),
                                  end="",
                                  flush=True)

                feedback_progress.start()
                fs.upload_file(blob_path,
                               actual_fn,
                               progress_callback=progress_callback)
                feedback_progress.end()

                upload_count += 1
        else:
            # upload SINGLE file/blob
            what = "blob" if use_blobs else "file"

            if show_output:
                console.print("\nto: {}, uploading {}:".format(uri, what))

            blob_name = os.path.basename(local_path)
            local_path = file_utils.fix_slashes(local_path)

            if show_output:
                #console.print("store_path=", store_path, ", local_path=", local_path)
                console.print("  {}:    ".format(local_path),
                              end="",
                              flush=True)

            feedback_progress.start()
            fs.upload_file(store_path,
                           local_path,
                           progress_callback=progress_callback)
            feedback_progress.end()

            upload_count += 1

        return upload_count
Esempio n. 10
0
    def keysend(self, box_name):
        """Copy the local XT public key to *box_name* and append it to the box's
        authorized_keys file, enabling key-based ssh access.

        Args:
            box_name: name of a box defined in the "boxes" config section; its
                "address" (default: the name itself) and "os" (default "linux")
                properties select the address and command syntax used.

        Returns:
            True on success; False when either the scp copy or the remote
            append commands fail (their output is printed first).

        Raises (via errors.env_error):
            when the local keypair has not been generated yet.
        """
        box_addr = self.config.get("boxes",
                                   box_name,
                                   dict_key="address",
                                   default_value=box_name)
        box_os = self.config.get("boxes",
                                 box_name,
                                 dict_key="os",
                                 default_value="linux")

        #console.print("box_addr=", box_addr)
        fn_local_key = os.path.expanduser(constants.LOCAL_KEYPAIR_PUBLIC)
        #fn_log = utils.expand_vars(TEMP_SSH_LOG)

        if not os.path.exists(fn_local_key):
            errors.env_error(
                "xt keypair not yet created; please run the 'xt keygen' command first"
            )

        # copy the key to a temp file location on the box
        if box_os == "windows":
            temp_key_fn = "temp_key_file"
        else:
            temp_key_fn = "/tmp/temp_key_file"

        # NOTE: the "-o IdentitiesOnly=yes" option of is used to prevent the "too many authentication errors" problem
        #cmd = 'scp -o IdentitiesOnly=yes "{}" {}:{}'.format(fn_local_key, box_addr, temp_key_fn)
        cmd_parts = [
            "scp", "-o", "IdentitiesOnly=yes", fn_local_key,
            "{}:{}".format(box_addr, temp_key_fn)
        ]
        console.diag("  copying key file to box: cmd={}".format(cmd_parts))

        # SCP COPY
        exit_code, output = process_utils.sync_run(cmd_parts)
        if exit_code:
            console.print(output)
            return False

        # now, run commands on box to append the temp file to ~/.ssh/authorized_keys

        if box_os == "windows":
            # NOTE(review): the "del" below removes the existing authorized_keys
            # before the append, so previously-authorized keys are discarded on
            # Windows boxes (the linux branch does not delete) -- confirm this
            # asymmetry is intentional.  "mkdir .ssh" also errors when the
            # directory already exists; the "&" join continues regardless.
            AUTHORIZED_KEYS_FILE = ".ssh/authorized_keys"
            cmds = [
                "mkdir .ssh",  # ensure directory exists (if first key)
                "del {}".format(AUTHORIZED_KEYS_FILE),
                "type {} >> {}".format(
                    temp_key_fn, AUTHORIZED_KEYS_FILE),  # append key to file
                "del {}".format(temp_key_fn)  # remove temp file
            ]
            cmdline = "&".join(cmds)
        else:
            AUTHORIZED_KEYS_FILE = "~/.ssh/authorized_keys"
            cmds = [
                "mkdir -p ~/.ssh",  # ensure directory exists (if first key)
                "cat {} >> {}".format(
                    temp_key_fn, AUTHORIZED_KEYS_FILE),  # append key to file
                "rm {}".format(temp_key_fn)  # remove temp file
            ]
            cmdline = ";".join(cmds)

        # NOTE: the "-o IdentitiesOnly=yes" option of is used to prevent the "too many authentication errors" problem
        #cmd = 'ssh -o IdentitiesOnly=yes {} "{}"'.format(box_addr, cmdline)
        cmd_parts = ['ssh', '-o', 'IdentitiesOnly=yes', box_addr, cmdline]
        console.diag("  running cmds on box={}".format(cmd_parts))

        # SSH COMMANDS
        exit_code, output = process_utils.sync_run(cmd_parts)
        if exit_code:
            console.print(output)
            return False

        return True