Esempio n. 1
0
    def fixup_script_in_cmd(self, cmd):
        '''
        Normalize a command line string before it is run.

        args:
            - cmd: the command line, as a single string

        processing:
            - split cmd into parts with utils.cmd_split()
            - hand the parts to remove_script_dir_from_parts() (its return value is
              ignored, so it presumably edits the list in place)
            - when the first part starts with "python" and at least one more part
              follows, make sure "-u" is the second part

        returns:
            - the resulting parts, joined with single spaces
        '''
        parts = utils.cmd_split(cmd)
        self.remove_script_dir_from_parts(parts)

        # a bare "python" with nothing after it is left untouched
        is_python_cmd = len(parts) > 1 and parts[0].startswith("python")
        if is_python_cmd and parts[1] != "-u":
            parts.insert(1, "-u")

        return " ".join(parts)
Esempio n. 2
0
def get_fn_local_config(args):
    '''
    Choose the local config file name for this invocation.

    args:
        - args: the command line arguments, as a list of strings

    processing:
        - leading arguments that start with "--" are skipped
        - if the first remaining argument is "run" and the argument right after it
          ends with ".yaml", that argument is used as the config file name

    returns:
        - the ".yaml" argument described above, otherwise constants.FN_CONFIG_FILE
          in the current directory
    '''
    default_fn = os.path.join(".", constants.FN_CONFIG_FILE)

    # re-split the joined args so each token is examined on its own
    tokens = utils.cmd_split(" ".join(args))

    # locate the first token that is not a "--" option
    first_plain = next(
        (index for index, token in enumerate(tokens) if not token.startswith("--")),
        None)

    if first_plain is None or tokens[first_plain] != "run":
        return default_fn

    # only the single token right after "run" is considered
    following = tokens[first_plain + 1:first_plain + 2]
    if following and following[0].endswith(".yaml"):
        return following[0]

    return default_fn
Esempio n. 3
0
    def distribute_cmds_to_nodes(self, cmds, num_nodes):
        '''
        Deal commands out to nodes in round-robin order.

        args:
            - cmds: the command strings to distribute
            - num_nodes: how many nodes to cycle through

        returns:
            - a dict keyed by "node0", "node1", ...; each value is the list of
              commands assigned to that node, where every command has been
              split into parts by utils.cmd_split()
        '''
        assigned = {}
        slot = 0

        for cmd in cmds:
            # split on unquoted spaces and file the parts under the current node
            assigned.setdefault("node" + str(slot), []).append(utils.cmd_split(cmd))

            # advance to the next node, wrapping back to the first after the last
            slot += 1
            if slot >= num_nodes:
                slot = 0

        return assigned
Esempio n. 4
0
    def build_cmds_with_search(self, service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args):
        '''
        args:
            - service_type: the type of backend service being used (aml, batch, etc.)
            - cmd_parts: list of the user's ML app and arg/options 
            - parent_script: user-specified script that needs to be run to configure box for all child runs
            - run_script: if user app is a shell script or command line .bat file, the text of file
            - run_cmd_from_script: if user's ML app is a shell or command line script, the run command located within it
            - use_controller: if False, XT controller is not being used (direct run)
            - dry_run: if True, job will not be submitted (user just wants to see list of static runs)
            - args: dict of command options; "runs", "max_runs" and "search_type" are read here,
              and the dict is also passed on to the helper methods

        processing:
            - determine the search_style needed, the associated list of user commands, and the total number of runs

        returns:
            - cmds: the list of 1 or more commands to be run
            - run_count: the total number of runs to be executed
            - repeat_count: the number of runs per node (approximately); None for static, multi and single styles
            - run_specs: a dictionary of run information (easier to pass around)
            - using_hp: if True, a static or dynamic hyperparameter search is being done
            - using_aml_hparam: if True, we are doing a direct-run AML hyperparameter search
            - sweeps_text: hyperparameter search specs 
            - pool_info: information about the service/pool target
            - search_style: one of: single, multi, repeat, static, dynamic

        raises:
            - errors.ComboError: if user multi commands are present together with static hyperparameter search commands
        '''
        show_run_report = True
        repeat_count = None
        using_aml_hparam = False

        # get run_cmd
        run_cmd = run_cmd_from_script
        if not run_cmd:
            run_cmd = " ".join(cmd_parts)

        # NOTE: this is computed from the service_type passed by the caller, before
        # get_box_list() below rebinds the local service_type
        is_aml = (service_type == "aml")        # self.is_aml_ws(workspace)
        use_aml_for_hparam = (is_aml and not use_controller)

        # get info about nodes/boxes
        boxes, pool_info, service_type = box_information.get_box_list(self.core, args=args)
        node_count = len(boxes)

        # HPARAM SEARCH
        cmds, sweeps_text, new_run_cmd = self.build_static_hparam_cmds(run_cmd, node_count, args)

        using_hp = bool(sweeps_text)
        if using_hp and use_aml_for_hparam:
            using_aml_hparam = True
            # for AML hyperdrive, we pass only constant args from cmd_parts
            #cmd_parts = [tp for tp in template_parts if tp != '{}']

        if cmds:
            # STATIC HPARAM SEARCH
            run_count = len(cmds)
            search_style = "static"

        runs = args["runs"]
        max_runs = args["max_runs"]

        # USER MULTI CMDS
        multi_cmds = self.read_user_multi_commands(using_hp, run_script, cmd_parts, args)
        if multi_cmds:
            if cmds:
                # the exception must be raised, not just constructed; otherwise the
                # multi commands silently replace the static hparam commands
                raise errors.ComboError("cannot specify both --multi with hyperparameter search")

            cmds = multi_cmds
            if runs:
                run_count = runs
            elif max_runs:
                run_count = min(max_runs, len(cmds))
            else:
                run_count = len(cmds)

            search_style = "multi"
            new_run_cmd = cmds[0]

        if not cmds:
            # SINGLE CMD 
            # DYNAMIC HPARAM or REPEAT or SINGLE search style

            # we will use repeat_count on each node, as needed, to reach specified runs
            run_count = runs if runs else node_count

            if using_hp:
                search_style = "dynamic"
            else:
                search_style = "repeat" if run_count > 1 else "single"

            if search_style != "single":
                repeat_count = math.ceil(run_count / node_count)

            cmds = [new_run_cmd]
            show_run_report = False

        if show_run_report:
            # list the static/multi commands for the user
            console.print()
            dr = " (dry-run)" if dry_run else ""
            search_type = args["search_type"]
            stype = "(search-type=" + search_type + ") " if search_style == "static" else ""

            console.print("{} {}runs{}:".format(search_style, stype, dr))

            for i, run_cmd_parts in enumerate(cmds):
                console.print("  {}. {}".format(i+1, run_cmd_parts))

            console.print()

        # finally, package info into run_specs to make info easier to pass thru various APIs
        new_cmd_parts = utils.cmd_split(new_run_cmd)
        run_specs = {"cmd_parts": new_cmd_parts, "run_script": run_script, "run_cmd": new_run_cmd, "parent_script": parent_script}

        return cmds, run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style
Esempio n. 5
0
    def process_args(self, args):
        '''
        Validate the script-related arguments of a run command and resolve its compute target.

        args:
            - args: dict of command options; reads "script", "script_args", "code_upload",
              "parent_script" and "target", and checks for the presence of "is_rerun"

        processing:
            - builds cmd_parts for the target script (".py" scripts are run as "python -u <script>")
            - reads the parent script and, for ".bat"/".sh" targets, the run script
            - resolves the target to a compute definition, or to a single box wrapped
              in a synthesized compute definition
            - writes "target", "compute_def", "service_type", "box" and "pool" back into args
            - on a rerun, sets self.script_dir; sets self.is_docker when the target is "docker"

        returns:
            - a tuple: service_type, cmd_parts, ps_path, parent_script, target_file,
              run_script, run_cmd_from_script, compute, compute_def
        '''
        run_script = None
        parent_script = None
        run_cmd_from_script = None
        target_file = args["script"]
        target_args = args["script_args"]
        code_upload = args["code_upload"]

        # user may have wrong slashes for this OS
        target_file = file_utils.fix_slashes(target_file)

        if os.path.isabs(target_file):
            errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

        is_rerun = "is_rerun" in args
        if is_rerun:
            # will be running from script dir, so remove any path to script file
            self.script_dir = os.path.dirname(target_file)
            target_file = os.path.basename(target_file)

        if target_file.endswith(".py"):
            # PYTHON target
            cmd_parts = ["python", "-u", target_file]
        else:
            cmd_parts = [target_file]

        if target_args:
            # split on unquoted spaces
            cmd_parts += utils.cmd_split(target_args)

        if target_file == "docker":
            self.is_docker = True

        if not self.is_docker and code_upload and not os.path.exists(target_file):
            errors.env_error("script file not found: {}".format(target_file))

        ps_path = args["parent_script"]
        if ps_path:
            parent_script = file_utils.read_text_file(ps_path, as_lines=True)

        if target_file.endswith((".bat", ".sh")):
            # a RUN SCRIPT was specified as the target
            run_script = file_utils.read_text_file(target_file, as_lines=True)
            run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

        compute = args["target"]
        box_def = self.config.get("boxes", compute, suppress_warning=True)
        setup = utils.safe_value(box_def, "setup")

        compute_def = self.config.get_compute_def(compute)
        if compute_def:
            # must be defined in [compute-targets]
            if "service" not in compute_def:
                errors.config_error("compute target '{}' must define a 'service' property".format(compute))

            service = compute_def["service"]
            if service in ["local", "pool"]:
                # its a list of box names
                boxes = compute_def["boxes"]
                service_type = "pool"

                if len(boxes) == 1 and boxes[0] == "localhost":
                    pool = None
                    box = "local"
                else:
                    pool = compute
                    box = None
            else:
                # it a set of compute service properties
                pool = compute
                box = None
                service_type = self.config.get_service_type(service)
        elif box_def:
            # translate single box name to a compute_def
            box = compute
            pool = None
            service_type = "pool"
            # the key is the literal string "setup" (not the value of the setup variable)
            compute_def = {"service": service_type, "boxes": [box], "setup": setup}
        else:
            errors.config_error("unknown target or box: {}".format(compute))

        args["target"] = compute
        args["compute_def"] = compute_def
        args["service_type"] = service_type

        # for legacy code
        args["box"] = box
        args["pool"] = pool

        return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
            compute, compute_def
Esempio n. 6
0
    def build_docker_cmd(self, docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args):
        '''
        Build the "docker run" command line for the named docker entry.

        args:
            - docker_name: name of an entry in the "dockers" section of the config file
            - target_file: the user's script or app; if its name contains ".py", it is run with "python -u"
            - cmd_parts: not used by this method
            - script_dir: local directory that is mounted in the container as /usr/src
            - snapshot_dir: directory where the env var file (__dockev__.txt) is written
            - job_secret: passed to scriptor.add_controller_env_vars()
            - args: dict of command options; "data_local" is read here

        processing:
            - looks up the docker definition and, when it names a registry, the registry's login server
            - writes the collected env vars to __dockev__.txt in snapshot_dir (side effect)

        returns:
            - the docker command, split into parts by utils.cmd_split()
        '''
        docker_def = self.config.get("dockers", docker_name, default_value=None)
        if not docker_def:
            errors.config_error("docker '{}' not found in config file".format(docker_name))

        registry_name = docker_def["registry"]
        image = docker_def["image"]

        if registry_name:
            # get REGISTRY credentials
            registry_creds = self.config.get("external-services", registry_name, suppress_warning=True)
            if not registry_creds:
                # qualified with errors., matching the docker lookup above
                errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(registry_name))

            login_server = registry_creds["login-server"]
        else:
            login_server = None

        script_dir = file_utils.fix_slashes(script_dir, True)
        mappings = "-v {}:/usr/src".format(script_dir)
        options = "--rm"

        # collect env vars 
        env_vars = {"XT_IN_DOCKER": 1, "XT_USERNAME": pc_utils.get_username()}
        scriptor.add_controller_env_vars(env_vars, self.config, job_secret, "node0")

        # python scripts are addressed by base name under /usr/src; anything else
        # is treated as the app itself, with no target file
        if ".py" in target_file:
            app = "python -u"
            #target_file = file_utils.fix_slashes(target_file, True)
            target_file = os.path.basename(target_file)
        else:
            app = target_file
            target_file = ""

        full_image = login_server + "/" + image if login_server else image

        # build a mapping for data?
        data_local = args["data_local"]
        if data_local:
            if "$scriptdir" in data_local:
                data_local = data_local.replace("$scriptdir", script_dir)

            data_local = os.path.realpath(data_local)
            mappings += " -v {}:/usr/data".format(data_local)
            env_vars["XT_DATA_DIR"] = "/usr/data"

        # write env vars to file in snapshot dir
        FN_EV = "__dockev__.txt"
        fn_env_var = os.path.join(snapshot_dir, FN_EV)
        lines = [name + "=" + str(value) for name, value in env_vars.items()]
        text = "\n".join(lines)
        file_utils.write_text_file(fn_env_var, text)

        # specify env var file (in current directory) to docker
        options += " --env-file={}".format(FN_EV)

        # inherit ENV VARS from running environment
        options += " -e XT_RUN_NAME -e XT_WORKSPACE_NAME -e XT_EXPERIMENT_NAME"

        docker_cmd = "docker run {} {} {} {} /usr/src/{}".format(options, mappings, full_image, app, target_file)
        new_parts = utils.cmd_split(docker_cmd)
        return new_parts
Esempio n. 7
0
def main(cmd=None,
         new_start_time=None,
         capture_output=False,
         mini=False,
         raise_syntax_exception=True):
    '''
    This is the XT app, used to manage and scale ML experiments, support various backends (Philly, Azure Batch, Azure ML).

    args:
        - cmd: optional command line string; a leading "xt" is stripped.  When not
          supplied, the arguments are taken from sys.argv[1:]
        - new_start_time: if supplied, stored in the module-level xt_start_time
        - capture_output: passed through to dispatcher.dispatch()
        - mini: passed to init_config(); the resulting config.mini_mode then decides
          whether only the reduced command set is surfaced
        - raise_syntax_exception: passed through to dispatcher.dispatch()

    returns:
        - the value returned by dispatcher.dispatch(), or None if dispatch raised
          and errors.process_exception() returned normally
    '''
    global xt_start_time

    if new_start_time:
        xt_start_time = new_start_time

    import numpy as np

    # seed numpy's global random state with a fixed value
    np.random.seed(5)

    if cmd:
        cmd = cmd.strip()

        if cmd.startswith("xt "):
            cmd = cmd[3:]
        elif cmd == "xt":
            cmd = ""

        args = utils.cmd_split(cmd)

        # remove empty args
        args = [arg for arg in args if arg]
    else:
        # if caller did not supply cmd
        args = sys.argv[1:]

    # when executing multiple commands, reset the feedback for each command
    feedback.reset_feedback()

    #console.print("cmd=", cmd, ", args=", args)
    console.diag("in xt_cmds.main")

    #console.print("config=", config)
    fn_local_config = get_fn_local_config(args)

    impl_shared = ImplShared()
    config = impl_shared.init_config(fn_local_config, mini=mini)
    store = impl_shared.store
    mini = config.mini_mode

    cmd_providers = config.get("providers", "command")
    impl_dict = {}

    # instantiate each command provider class named in the config
    for name, code_path in cmd_providers.items():
        package, class_name = code_path.rsplit(".", 1)
        module = importlib.import_module(package)
        impl_class = getattr(module, class_name)

        impl = impl_class(config, store)
        impl_dict[package] = impl

        if name == "help":
            impl.set_mini_mode(mini)

    # this enables QFE to match a function by its module name, to the class instance to process the command
    # impl_dict = {"xtlib.impl_utilities": utilities, "xtlib.impl_storage": storage,
    #     "xtlib.impl_compute": compute, "xtlib.impl_help": help_impl}

    # this parses args and calls the correct command function with its args and options correctly set.
    # the config object supplies the default value for most options and flags.
    dispatcher = qfe.Dispatcher(
        impl_dict, config, preprocessor=impl_shared.pre_dispatch_processing)

    if mini:
        # a dict of commands + arg/options to be surfaced (None means use all args/options)
        show_commands = {
            "cancel_all": ["target"],
            "cancel_job": ["job-id"],
            "cancel_run": ["run-names"],
            "clear_credentials": [],
            "config_cmd": ["default", "create", "reset"],
            "create_demo": ["destination", "response", "overwrite"],
            "create_services_template": [],
            "download": ["local-path", "store-path"],
            "extract": ["runs", "dest-dir", "browse", "workspace"],
            "help": ["command", "about", "browse", "version"],
            "help_topics": ["topic", "browse"],
            "list_blobs": ["path"],
            "list_jobs": [
                "job-list", "experiment", "all", "first", "last", "filter",
                "sort", "reverse", "status", "available"
            ],
            "list_runs": [
                "run-list", "job", "experiment", "all", "first", "last",
                "filter", "sort", "reverse", "status", "available"
            ],
            "monitor": ["name"],
            "run": [
                "script", "script-args", "experiment", "hp-config", "max-runs",
                "nodes", "runs", "search-type", "target"
            ],
            "upload": ["local-path", "store-path"],
            "view_console": ["name", "target", "workspace", "node-index"],
            "view_metrics": ["runs", "metrics"],
            "view_run": ["run-name"]
        }

        dispatcher.show_commands(show_commands)

        qfe.remove_hidden_commands()

    # hide under-development commands
    hide_commands = [
        "collect_logs", "start_tensorboard", "stop_tensorboard", "zip",
        "unzip", "wget"
    ]

    # hide internal cmds (for xt development use only)
    hide_commands.append("generate_help")
    dispatcher.hide_commands(hide_commands)

    # expand symbols like $lastjob, $lastrun
    impl_shared.expand_xt_symbols(args)

    # pre-set the result so the final return is valid even when dispatch raises
    # and the exception handler below returns normally
    text = None

    # this is the NORMAL outer exeception handling block, but
    # also see the client/server exception handling in xt_run.py
    try:
        text = dispatcher.dispatch(
            args,
            capture_output=capture_output,
            raise_syntax_exception=raise_syntax_exception)
    except BaseException as ex:
        #console.print("in Exception Handler: utils.show_stack_trace=", utils.show_stack_trace)
        # does user want a stack-trace?
        logger.exception(
            "Error during displatcher.dispatch, args={}".format(args))

        exc_type, exc_value, exc_traceback = sys.exc_info()
        errors.process_exception(exc_type, exc_value, exc_traceback)

    return text