Example #1
    def make_early_term_policy(self,
                               policy_type,
                               eval_interval=1,
                               delay_eval=0,
                               truncation_percentage=.1,
                               slack_factor=None,
                               slack_amount=None):
        from azureml.train.hyperdrive import BanditPolicy, MedianStoppingPolicy, TruncationSelectionPolicy, NoTerminationPolicy

        if policy_type == "bandit":
            policy = BanditPolicy(evaluation_interval=eval_interval,
                                  slack_factor=slack_factor,
                                  slack_amount=slack_amount,
                                  delay_evaluation=delay_eval)
        elif policy_type == "median":
            policy = MedianStoppingPolicy(evaluation_interval=eval_interval,
                                          delay_evaluation=delay_eval)
        elif policy_type == "truncation":
            policy = TruncationSelectionPolicy(
                truncation_percentage=truncation_percentage,
                evaluation_interval=eval_interval,
                delay_evaluation=delay_eval)
        elif policy_type == "none":
            policy = NoTerminationPolicy()
        else:
            errors.config_error("Unrecognized policy type=" + policy_type)

        return policy
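A hedged usage sketch for the most common case above (assumes azureml-sdk is installed): a bandit policy that, starting after 5 metric evaluations, stops any run whose best reported metric falls more than 10% short of the best run so far.

from azureml.train.hyperdrive import BanditPolicy

# evaluation_interval=1: re-check at every metric report;
# delay_evaluation=5: let each run report 5 values before it can be terminated
policy = BanditPolicy(evaluation_interval=1, slack_factor=0.1, delay_evaluation=5)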
Example #2
 def get_service_name(self):
     if "service" not in self.compute_def:
         errors.config_error(
             "missing 'service' property for xt config file compute target '{}'"
             .format(self.compute))
     service_name = self.compute_def["service"]
     return service_name
Example #3
    def create_hyperdrive_trainer(self, estimator, hd_dict, search_type,
                                  metric_name, maximize_metric,
                                  early_term_policy, max_total_runs,
                                  max_concurrent_runs, max_minutes):

        from azureml.train.hyperdrive import RandomParameterSampling, GridParameterSampling, BayesianParameterSampling

        if search_type == "random":
            ps = RandomParameterSampling(hd_dict)
        elif search_type == "grid":
            ps = GridParameterSampling(hd_dict)
        elif search_type == "bayesian":
            ps = BayesianParameterSampling(hd_dict)
        else:
            errors.config_error(
                "Azure ML Hyperdrive search_type not supported: " +
                search_type)

        max_concurrent_runs = min(max_total_runs, max_concurrent_runs)

        from azureml.train.hyperdrive import HyperDriveConfig, PrimaryMetricGoal

        trainer = HyperDriveConfig(
            estimator=estimator,
            hyperparameter_sampling=ps,
            policy=early_term_policy,
            primary_metric_name=metric_name,
            primary_metric_goal=PrimaryMetricGoal.MAXIMIZE
            if maximize_metric else PrimaryMetricGoal.MINIMIZE,
            max_total_runs=max_total_runs,
            max_concurrent_runs=max_concurrent_runs,
            max_duration_minutes=max_minutes)

        return trainer
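For context, a hedged sketch of what the hd_dict argument typically holds (assumes azureml-sdk): keys are the target script's argument names and values are hyperdrive parameter expressions.

from azureml.train.hyperdrive import choice, loguniform, uniform

hd_dict = {
    "--lr": loguniform(-6, -1),          # log of the sampled value is uniform in [-6, -1]
    "--batch-size": choice(16, 32, 64),  # discrete choices
    "--dropout": uniform(0.1, 0.5),      # uniform between 0.1 and 0.5
}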
Example #4
    def get_registry_creds(self, compute, env):
        registry_creds = None

        if not env:
            compute_def = self.config.get_compute_def(compute)
            env = utils.safe_value(compute_def, "environment")

        if env and env != "none":
            env_def = self.config.get("dockers", env, default_value=None)
            if not env_def:
                errors.config_error(
                    "docker '{}' not found in config file".format(env))

            registry_name = env_def["registry"]

            # get REGISTRY credentials
            registry_creds = self.config.get("external-services",
                                             registry_name,
                                             suppress_warning=True)
            if not registry_creds:
                errors.config_error(
                    "'{}' must be specified in [external-services] section of XT config file"
                    .format(registry_name))

        return registry_creds
Example #5
    def build_actual_store(self):
        console.diag("start of build_actual_store")
        # validate USERNAME
        username = self.config.get("general",
                                   "username",
                                   suppress_warning=True)
        if not username:
            errors.config_error(
                "'username' must be set in the [general] section of XT config file"
            )

        # STORAGE name/creds
        storage_creds = self.config.get_storage_creds()

        # MONGO name/creds
        mongo_creds, mongo_name = self.config.get_mongo_creds()

        run_cache_dir = self.config.get("general", "run-cache-dir")

        #store_key = storage_creds["key"]
        mongo_conn_str = mongo_creds["mongo-connection-string"]
        provider_code_path = self.config.get_storage_provider_code_path(
            storage_creds)

        self.store = Store(storage_creds,
                           provider_code_path=provider_code_path,
                           run_cache_dir=run_cache_dir,
                           mongo_conn_str=mongo_conn_str)
        console.diag("end of build_actual_store")

        return self.store
Example #6
    def get_aml_ws(self, ws_name):

        creds = self.config.get("external-services",
                                ws_name,
                                suppress_warning=True)
        if not creds:
            errors.config_error(
                "Azure ML workspace '{}' is not defined in [external-services] section of the XT config file"
                .format(ws_name))

        subscription_id = self.config.get_required_service_property(
            creds, "subscription-id", ws_name)
        resource_group = self.config.get_required_service_property(
            creds, "resource-group", ws_name)

        #from azureml.core.authentication import ServicePrincipalAuthentication
        # ws_ex = ws_name + "-ex"
        # svc_pr = None
        # if self.config.name_exists(section, ws_ex):
        #     client_id = self.config.get(section, ws_ex, "client-id")
        #     tenant_id = self.config.get(section, ws_ex, "tenant-id")
        #     client_secret = self.config.get(section, ws_ex, "client-secret")
        #     svc_pr = ServicePrincipalAuthentication(tenant_id=tenant_id, service_principal_id=client_id, service_principal_password=client_secret)

        ws = Workspace(subscription_id, resource_group,
                       ws_name)  # , auth=svc_pr)
        return ws
Example #7
    def get_required_service_property(self, creds, prop_name, service_name):
        value = utils.safe_value(creds, prop_name)
        if not value:
            errors.config_error(
                "Missing '{}' property for service '{}' defined in [external-services] section of the XT config file"
                .format(prop_name, service_name))

        return value
Example #8
 def warning(self, *msg_args):
     msg = "WARNING: xt_config file -"
     for arg in msg_args:
         msg += " " + str(arg)
     if self.get("internal", "raise", suppress_warning=True):
         errors.config_error(msg)
     else:
         console.print(msg)
Example #9
    def get_storage_provider_code_path(self, storage_creds):
        # get the provider_code_path
        provider_name = storage_creds["provider"]
        providers = self.get("providers", "storage")
        if provider_name not in providers:
            errors.config_error(
                "{} provider='{}' not registered in XT config file".format(
                    "storage", provider_name))

        code_path = providers[provider_name]
        return code_path
Example #10
    def get_provider_class_ctr(self, provider_type, name):
        '''
        return the class constructor method for the specified provider.
        '''
        providers = self.get("providers", provider_type)

        if name not in providers:
            errors.config_error(
                "{} not registered in XT config file".format(name))

        code_path = providers[name]
        return utils.get_class_ctr(code_path)
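A self-contained sketch of what a provider "code path" resolves to, assuming utils.get_class_ctr follows the usual importlib pattern of splitting a dotted "package.module.ClassName" string:

import importlib

def class_from_code_path(code_path):
    module_name, class_name = code_path.rsplit(".", 1)
    module = importlib.import_module(module_name)    # import the module portion
    return getattr(module, class_name)               # return the class constructor

dict_ctr = class_from_code_path("collections.OrderedDict")
print(dict_ctr())    # OrderedDict()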
Example #11
    def get_external_service_from_target(self, target_name):
        target = self.get_compute_def(target_name)

        if "service" not in target:
            errors.config_error(
                "'service' property must be defined for target={} in the XT config file"
                .format(target_name))
        service_name = target["service"]

        service = self.get_service(service_name)
        #self.expand_symbols_in_creds(service, service_name)
        return service
Example #12
    def get_service(self, service_name):
        service = self.get("external-services",
                           service_name,
                           suppress_warning=True)
        if not service:
            errors.config_error(
                "'{}' must be defined in the [external-services] section of XT config file"
                .format(service_name))

        service["name"] = service_name
        #self.expand_symbols_in_creds(service, service_name)
        return service
Example #13
def get_provider_code_path_from_context(context, provider_type, name):
    '''
    return the class constructor method for the specified provider.
    '''
    providers = context.providers[provider_type]

    if name not in providers:
        errors.config_error(
            "{} provider='{}' not registered in XT config file".format(
                provider_type, name))

    code_path = providers[name]
    return code_path
Example #14
    def build_hyperdrive_dict_from_file(self, fn):
        ''' parse hyperdrive params from text file '''
        hd = {}

        with open(fn, "rt") as infile:
            text_lines = infile.readlines()

        for text in text_lines:
            text = text.strip()
            if not text or text.startswith("#"):
                continue

            if "#" in text:
                # remove comment part of line
                index = text.index("#")
                text = text[0:index].strip()

            name, value = text.split("=")
            name = name.strip()
            value = value.strip()

            if value.startswith("@"):
                dist_name, values = value[1:].split("(")
                if dist_name not in utils.distribution_types:
                    errors.config_error("Unsupported distribution type: " +
                                        dist_name)

                assert values.endswith(")")
                values = values[:-1]  # remove ending paren

                # convert from comma sep. string to list of float values
                values = utils.get_number_or_string_list_from_text(values)

                #hd[name] = self.make_distribution(dist_name, values)
                hd[name] = hp_helper.build_dist_func_instance(
                    name, dist_name, values)
            else:
                # convert from comma sep. string to list of float values
                values = utils.get_number_or_string_list_from_text(value)
                # treat as "choice"
                #hd[name] = self.make_distribution("choice", values)
                hd[name] = hp_helper.build_dist_func_instance(
                    name, "choice", values)

        return hd
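Inferred from the parser above (and only from it), the params file holds one name = value line per hyperparameter, '#' comments, an '@dist(args)' form for an explicit distribution, and a bare comma-separated list that is treated as a "choice". A hypothetical file, assuming "uniform" is among utils.distribution_types, might look like:

sample_params = """
# hypothetical hyperdrive params file
lr = @uniform(0.0001, 0.01)     # explicit distribution
optimizer = adam, sgd           # bare list -> treated as choice
batch_size = 16, 32, 64
"""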
Example #15
    def get_mongo_creds(self):
        # validate MONGO service
        mongo_name = self.get("xt-services", "mongo", suppress_warning=True)
        if not mongo_name:
            errors.config_error(
                "'mongo' must be set in [xt-services] section of XT config file"
            )

        # validate MONGO credentials
        mongo_creds = self.get("external-services",
                               mongo_name,
                               suppress_warning=True)
        if not mongo_creds:
            errors.config_error(
                "'{}' must be specified in [external-services] section of XT config file"
                .format(mongo_name))

        #self.expand_symbols_in_creds(mongo_creds, mongo_name)
        return mongo_creds, mongo_name
Example #16
    def get_me_graph_property(self, token, property_name):
        #console.print("get_user_principle_name: token=", token)

        import requests
        import json

        endpoint = "https://graph.microsoft.com/v1.0/me"
        headers = {'Authorization': 'Bearer ' + token}

        graph_data = requests.get(endpoint, headers=headers).json()
        if "error" in graph_data:
            error = graph_data["error"]
            errors.config_error("{}: {}".format(error["code"],
                                                error["message"]))

        #console.print("get_user_principle_name: graph_data=", graph_data)

        upn = graph_data[property_name]
        return upn
Example #17
    def cancel_remote_controller(self, box_name, progress):
        # REMOTE BOX: check if controller is running
        box_addr = self.config.get("boxes", box_name, dict_key="address")
        if not box_addr:
            errors.config_error(
                "missing address property for box: {}".format(box_name))

        # run PS on box to determine if controller is running
        box_cmd = "ps aux | grep controller"
        exit_code, output = process_utils.sync_run_ssh(self, box_addr, box_cmd)

        #console.print("result=\n", output)
        targets = [text for text in output.split("\n") if "python" in text]
        #console.print("targets=", targets)

        cancel_count = 0

        if len(targets):
            for target in targets:
                parts = target.split(" ")

                # remove empty strings
                parts = list(filter(None, parts))

                #console.print("parts=", parts)
                if len(parts) > 1:
                    pid = parts[1].strip()

                    # send "cancel" command to remote linux box
                    box_cmd = 'kill -kill {}'.format(pid)
                    progress("  killing remote process: {}".format(pid))
                    process_utils.sync_run_ssh(self,
                                               box_addr,
                                               box_cmd,
                                               report_error=True)

                    cancel_count += 1

        result = cancel_count > 0
        if not result:
            progress("  remote XT controller not running")

        return result
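A standalone sketch of the PID-extraction step above: the second whitespace-delimited column of each matching `ps aux` line is the process id (the sample output is illustrative).

sample_output = (
    "user  4242  0.3  1.2 123456 7890 ?  S  10:01  0:02 python controller.py\n"
    "user  4300  0.0  0.1  23456  789 ?  S  10:05  0:00 grep controller\n")

targets = [text for text in sample_output.split("\n") if "python" in text]
pids = [list(filter(None, target.split(" ")))[1] for target in targets]
print(pids)    # ['4242']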
Example #18
    def get_service_type(self, service_name):
        if service_name == "pool":
            service_type = "pool"
        else:
            service = self.get("external-services",
                               service_name,
                               suppress_warning=True)
            if not service:
                errors.config_error(
                    "'{}' must be defined in the [external-services] section of XT config file"
                    .format(service_name))

            if "type" not in service:
                errors.config_error(
                    "'type' must be defined for the '{}' service in the XT config file"
                    .format(service_name))

            service_type = service["type"]

        return service_type
Example #19
    def get_compute_def(self, target_name):
        target = self.get("compute-targets",
                          target_name,
                          suppress_warning=True)

        if not target:
            # is this target a box name?
            box_info = self.get("boxes", target_name, suppress_warning=True)
            if not box_info:
                errors.config_error(
                    "target '{}' must be defined in the [compute-targets] section of XT config file (or be box name)"
                    .format(target_name))
            # make box look like a target
            target = {"service": "pool", "boxes": [target_name]}

            # use setup from first box
            if "setup" in box_info:
                target["setup"] = box_info["setup"]

        target["name"] = target_name
        #self.expand_symbols_in_creds(target, target_name)
        return target
Example #20
    def get_storage_creds(self):
        # validate STORAGE service
        storage_name = self.get("xt-services",
                                "storage",
                                suppress_warning=True)
        if not storage_name:
            errors.config_error(
                "'storage' must be set in [xt-services] section of XT config file"
            )

        # validate STORAGE_NAME credentials
        storage_creds = self.get("external-services",
                                 storage_name,
                                 suppress_warning=True)
        if not storage_creds:
            errors.config_error(
                "'{}' must be specified in [external-services] section of XT config file"
                .format(storage_name))

        #self.expand_symbols_in_creds(storage_creds, storage_name)
        storage_creds["name"] = storage_name
        return storage_creds
Example #21
    def yaml_to_dist_dict(self, fn):
        '''
        args:
            fn: name of .yaml file

        processing:
            load data from .yaml file

        return:
            data
        '''
        yd = file_utils.load_yaml(fn)
        if constants.HPARAM_DIST not in yd:
            errors.config_error(
                "hyperparameter search file missing '{}' section: {}".format(
                    constants.HPARAM_DIST, fn))

        hparams = yd[constants.HPARAM_DIST]
        dd = {}
        for key, value in hparams.items():
            dd[key] = hp_helper.parse_hp_dist(value)

        return dd
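file_utils.load_yaml is assumed here to behave like PyYAML's safe_load; a minimal stand-in for reference:

import yaml

def load_yaml(fn):
    with open(fn, "rt") as infile:
        return yaml.safe_load(infile)    # returns the YAML document as a dict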
Example #22
    def get(self,
            group,
            name=None,
            dict_key=None,
            default_value=None,
            suppress_warning=False,
            group_error=None,
            prop_error=None,
            key_error=None):

        value = default_value

        if group in self.data:
            value = self.data[group]
            if name:
                if name in value:
                    value = value[name]
                    if dict_key:
                        if dict_key in value:
                            value = value[dict_key]
                        else:
                            if key_error:
                                errors.config_error(key_error)
                            if not suppress_warning:
                                self.warning("GET option dict_key not found: ",
                                             group, name, dict_key,
                                             default_value)
                            value = default_value
                else:
                    if prop_error:
                        errors.config_error(prop_error)
                    if not suppress_warning:
                        self.warning("GET option not found: ", group, name,
                                     dict_key, default_value)
                    value = default_value
        else:
            if group_error:
                errors.config_error(group_error)
            if not suppress_warning:
                self.warning("GET option GROUP not found: ", group, name,
                             dict_key, default_value)
            value = default_value

        # expand values containing a "$" id
        if isinstance(value, str) and "$" in value:
            value = self.expand_system_symbols(value, group, name)
        elif isinstance(value, dict):
            for key, val in value.items():
                if isinstance(val, str) and "$" in val:
                    val = self.expand_system_symbols(val, name, key)
                    value[key] = val

        return value
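A self-contained sketch of the group / name / dict_key lookup order used above, against a hypothetical in-memory config (warnings and '$' symbol expansion omitted):

data = {"external-services": {"mystorage": {"type": "storage", "key": "..."}}}

def get(data, group, name=None, dict_key=None, default_value=None):
    value = data.get(group, default_value)
    if name is not None and isinstance(value, dict):
        value = value.get(name, default_value)        # drill into the named entry
    if dict_key is not None and isinstance(value, dict):
        value = value.get(dict_key, default_value)    # then into the nested key
    return value

print(get(data, "external-services", "mystorage", "type"))    # 'storage'
print(get(data, "external-services", "missing"))              # None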
Example #23
    def get_vault_url(self):
        # validate VAULT service
        vault_name = self.get("xt-services", "vault", suppress_warning=True)
        if not vault_name:
            errors.config_error(
                "'vault' property must be set in [xt-services] section of XT config file"
            )

        # validate VAULT credentials
        vault_creds = self.get("external-services",
                               vault_name,
                               suppress_warning=True)
        if not vault_creds:
            errors.config_error(
                "'{}' must be specified in [external-services] section of XT config file"
                .format(vault_name))

        if "url" not in vault_creds:
            errors.config_error(
                "URL not specified for '{}' in [external-services] section of XT config file"
                .format(vault_name))

        url = vault_creds["url"]
        return url
Example #24
    def process_run_command(self, args):
        self.args = args

        # ensure workspace exists
        workspace = args['workspace']
        dry_run = args['dry_run']
        fake_submit = args["fake_submit"]

        if not fake_submit:
            self.store.ensure_workspace_exists(workspace, flag_as_error=False)

        # PRE-PROCESS ARGS
        service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
            self.process_args(args)

        # create backend helper (pool, philly, batch, aml)
        cluster = utils.safe_value(compute_def, "cluster")
        vc = utils.safe_value(compute_def, "vc")
        self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

        # add conda_packages and pip_packages from SETUP to ARGS
        setup_def = self.config.get_setup_from_target_def(compute_def)

        conda_packages = utils.safe_value(setup_def, "conda-packages")
        pip_packages = utils.safe_value(setup_def, "pip-packages")

        args["conda_packages"] = conda_packages if conda_packages else []
        args["pip_packages"] = pip_packages if pip_packages else []

        self.adjust_pip_packages(args)

        snapshot_dir = self.temp_dir

        if fake_submit:
            script_dir = snapshot_dir
        else:
            # note: always create a snapshot dir for backends to add needed files
            file_utils.ensure_dir_deleted(snapshot_dir)
            script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

        self.script_dir = script_dir
        direct_run = args["direct_run"]

        # do we need to start the xt controller?
        use_controller = not direct_run
        adjustment_scripts = None

        # create a job_secret that can later be used to authenticate with the XT controller
        # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
        job_secret = str(uuid.uuid4())

        # do we need to build a "docker run" command?
        if not self.backend.provides_container_support():
            docker_name = args["docker"]
            if not docker_name:
                docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name     # for use in building run context info

        # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
        cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
            self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

        if dry_run:
            return

        # make new values available
        args["search_style"] = search_style
        args["total_run_count"] = total_run_count

        resume_name = args['resume_name']
        keep_name = False  # args['keep_name']
        experiment = args['experiment']
        is_distributed = args['distributed']
        direct_run = args["direct_run"]

        # CREATE JOB to hold all runs
        if fake_submit:
            # use lastrun/lastjob info to get a fast incremental fake job number
            xtd = xt_dict.read_xt_dict()
            fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
            xtd["fake_job_num"] = fake_job_num + 1
            xt_dict.write_xt_dict(xtd)
            job_id = "fake_job" + str(fake_job_num)
        else:
            job_id = self.store.create_job()
        fb.feedback(job_id)

        # start the feedback (by parts)
        fb.feedback("{}: {}".format("target", compute))

        # write hparams to FILES
        boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

        if sweeps_text and not fake_submit:
            self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

        # if num_boxes > 1 and service_type != "batch":
        #     fb.feedback("", is_final=True)

        parent_name = None

        # BUILD RUNS, by box
        job_runs = []
        run_count = 1 if is_distributed else len(boxes) 
        secrets_by_node = {}
        remote_control = args["remote_control"]

        for i in range(run_count):
            box_name = boxes[i]

            # generate a box secret for talking to XT controller for this node
            box_secret =  str(uuid.uuid4()) if remote_control else ""

            # build runs for box_name
            run_data = self.build_first_run_for_node(i, boxes[i], target_file, ps_path, using_hp, using_aml_hparam, run_specs, job_id, 
                parent_name, cmds, pool_info, repeat_count, fake_submit, search_style, box_secret, args)

            # for now, adhere to the more general design of multiple runs per box
            box_runs = [run_data]      
            job_runs.append(box_runs)

            node_id = utils.node_id(i)            
            secrets_by_node[node_id] = box_secret

            # FEEDBACK 
            ptype = "single " if search_style == "single" else "parent "
            if is_distributed:
                ptype = "master "

            if run_count == 1:
                node_msg = "creating {}run".format(ptype)
            else:
                node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

            if service_type == "pool":
                node_msg += ", box: " + box_name

            fb.feedback(node_msg, id="node_msg")  # , add_seperator=is_last)
            last_msg = node_msg

            # run the job

        # build box: runs dict for job info file
        runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)

        # now that we have static run names for all nodes, we can adjust cmds (and before files) for using the controller
        if use_controller:
            # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
            # this will also adjust commands for each node to run the XT controller
            adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        else:
            adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        # add env vars used by both controller and runs
        env_vars = args["env_vars"]

        # create a job guid to uniquely identify this job across all XT instances
        job_guid = str(uuid.uuid4())

        # we add with "node0" and "job_secret", but backend service will override for each node
        scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

        data_local = args["data_local"]
        if "$scriptdir" in data_local:
            data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
            args["data_local"] = data_local

        model_local = args["model_local"]
        if "$scriptdir" in model_local:
            model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
            args["model_local"] = model_local

        # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
        self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

        # upload CODE from snapshot_dir
        code_upload = args["code_upload"]
        code_omit = args["code_omit"]
        code_zip = args["code_zip"]
    
        if not fake_submit:
            if code_upload:
                self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

            # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
            data_upload = args["data_upload"]
            if data_upload:
                if not data_local:
                    errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

                data_omit = args["data_omit"]
                data_zip = "none"

                self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)
        
        # dispatch to BACKEND submitters
        '''
        Note: backend submitter functions are responsible for:
            - submitting the job (for each node, queue runs for that node)
            - return service job id (or list of them if per node)

        NOTE: there is a timing issue where submitted job needs access to job info, but final piece
        of job info (service info) is only returned after the job is submitted.  Therefore, we structure steps as follows:

            - primary job info is logged
            - job is submitted thru backend
            - service info for job is logged
        '''

        # LOG PRIMARY JOB INFO
        dd = {}

        if not fake_submit:
            # mark runs as QUEUED
            for runs in runs_by_box.values():
                first_run = runs[0]
                self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"}) 

            # write the job info file (now that backend has had a chance to update it)
            job_num = int(job_id[3:])

            xt_cmd = args["xt_cmd"]
            schedule = args["schedule"]
            concurrent = args["concurrent"]

            # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
            dynamic_runs_remaining = None if search_style == "single" else total_run_count
            node_count = len(runs_by_box)

            # static_runs_by_node = None
            # if schedule == "static":
            #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
            #console.diag("static_runs_by_node=", static_runs_by_node)

            active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)

            dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace, "exper_name": experiment, 
                "pool_info": compute_def, "runs_by_box": runs_by_box, 
                "primary_metric": args["primary_metric"], 
                "run_count": total_run_count, "repeat": repeat_count, "search_type": args["search_type"], 
                "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
                "job_status": "submitted", "running_nodes": 0, 
                "running_runs": 0, "error_runs": 0, "completed_runs": 0, "job_guid": job_guid, "job_secret": job_secret,
                "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,     
                "active_runs": active_runs,  "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,  
                "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
                "service_job_info": None, "service_info_by_node": None,
            }

            self.store.log_job_info(job_id, dd)

        # SUBMIT JOB 
        # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
        service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info, resume_name, 
            repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

        # POST SUBMIT processing

        # update job info 
        if not fake_submit:
            dd["service_job_info"] = service_job_info
            dd["service_info_by_node"] = service_info_by_node
            self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

        # return values for API support (X)
        return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id 
Example #25
    def process_args(self, args):

        run_script = None
        parent_script = None
        run_cmd_from_script = None
        target_file = args["script"]
        target_args = args["script_args"]
        code_upload = args["code_upload"]

        # user may have wrong slashes for this OS
        target_file = file_utils.fix_slashes(target_file)

        if os.path.isabs(target_file):
            errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

        is_rerun = "is_rerun" in args
        if is_rerun:
            # will be running from script dir, so remove any path to script file
            self.script_dir = os.path.dirname(target_file)
            target_file = os.path.basename(target_file)

        if target_file.endswith(".py"):
            # PYTHON target
            cmd_parts = ["python"]
            cmd_parts.append("-u")
            cmd_parts.append(target_file)
        else:
            cmd_parts = [target_file] 

        if target_args:
            # split on unquoted spaces
            arg_parts = utils.cmd_split(target_args)
            cmd_parts += arg_parts

        if target_file == "docker":
            self.is_docker = True
            
        if not self.is_docker and code_upload and not os.path.exists(target_file):
            errors.env_error("script file not found: {}".format(target_file))

        ps_path = args["parent_script"]
        if ps_path:
            parent_script = file_utils.read_text_file(ps_path, as_lines=True)

        if target_file.endswith(".bat") or target_file.endswith(".sh"):
            # a RUN SCRIPT was specified as the target
            run_script = file_utils.read_text_file(target_file, as_lines=True)
            run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

        compute = args["target"]
        box_def = self.config.get("boxes", compute, suppress_warning=True)
        setup = utils.safe_value(box_def, "setup")

        compute_def = self.config.get_compute_def(compute)
        if compute_def:
            # must be defined in [compute-targets]
            if "service" not in compute_def:
                errors.config_error("compute target '{}' must define a 'service' property".format(compute))

            service = compute_def["service"]
            if service in ["local", "pool"]:
                # its a list of box names
                boxes = compute_def["boxes"]
                if len(boxes) == 1 and boxes[0] == "localhost":
                    pool = None
                    box = "local"
                    service_type = "pool"
                else:
                    pool = compute
                    box = None
                    service_type = "pool"
            else:
                # it's a set of compute service properties
                pool = compute
                box = None
                service_name = compute_def["service"]
                service_type = self.config.get_service_type(service_name)
        elif box_def:
            # translate single box name to a compute_def
            box = compute
            pool = None
            service_type = "pool"
            compute_def = {"service": service_type, "boxes": [box], "setup": setup}
        else:
            errors.config_error("unknown target or box: {}".format(compute))

        args["target"] = compute
        args["compute_def"] = compute_def
        args["service_type"] = service_type

        # for legacy code
        args["box"] = box
        args["pool"] = pool

        return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
            compute, compute_def
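A hedged sketch of the command-assembly step in process_args: a .py target becomes ["python", "-u", script, *script_args], and utils.cmd_split is assumed to behave roughly like shlex.split (splitting on unquoted spaces).

import shlex

target_file = "train.py"
target_args = '--lr=0.01 --tag "run 1"'

cmd_parts = ["python", "-u", target_file] if target_file.endswith(".py") else [target_file]
cmd_parts += shlex.split(target_args)
print(cmd_parts)    # ['python', '-u', 'train.py', '--lr=0.01', '--tag', 'run 1']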
Example #26
    def build_docker_cmd(self, docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args):
        for_windows = True

        docker_def = self.config.get("dockers", docker_name, default_value=None)
        if not docker_def:
            errors.config_error("docker '{}' not found in config file".format(docker_name))

        registry_name = docker_def["registry"]
        image = docker_def["image"]
        
        if registry_name:
            # get REGISTRY credentials
            registry_creds = self.config.get("external-services", registry_name, suppress_warning=True)
            if not registry_creds:
                errors.config_error("'{}' must be specified in [external-services] section of XT config file".format(registry_name))

            login_server = registry_creds["login-server"]
        else:
            login_server = None

        #pwd = "%cd%" if for_windows else "$(pwd)"
        script_dir = file_utils.fix_slashes(script_dir, True)
        mappings = "-v {}:/usr/src".format(script_dir)
        options = "--rm"

        # collect env vars 
        env_vars = {"XT_IN_DOCKER": 1, "XT_USERNAME": pc_utils.get_username()}
        scriptor.add_controller_env_vars(env_vars, self.config, job_secret, "node0")

        # fixup backslash char for target_file
        if ".py" in target_file:
            app = "python -u"
            #target_file = file_utils.fix_slashes(target_file, True)
            target_file = os.path.basename(target_file)
        else:
            app = target_file
            target_file = ""

        full_image = login_server + "/" + image if login_server else image

        # build a mapping for data?
        data_local = args["data_local"]
        if data_local:
            if "$scriptdir" in data_local:
                data_local = data_local.replace("$scriptdir", script_dir)

            data_local = os.path.realpath(data_local)
            mappings += " -v {}:/usr/data".format(data_local)
            env_vars["XT_DATA_DIR"] = "/usr/data"

        # write env vars to file in snapshot dir
        FN_EV = "__dockev__.txt"
        fn_env_var = os.path.join(snapshot_dir, FN_EV)
        lines = [name + "=" + str(value) for name,value in env_vars.items()]
        text = "\n".join(lines)
        file_utils.write_text_file(fn_env_var, text)

        # specify env var file (in current directory) to docker
        options += " --env-file={}".format(FN_EV)

        # inherit ENV VARS from running environment
        options += " -e XT_RUN_NAME -e XT_WORKSPACE_NAME -e XT_EXPERIMENT_NAME"

        docker_cmd = "docker run {} {} {} {} /usr/src/{}".format(options, mappings, full_image, app, target_file)
        new_parts = utils.cmd_split(docker_cmd)
        return new_parts
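A standalone sketch of the command this method assembles for a hypothetical image and Python target (paths, image name, and env-var file are illustrative only):

options = "--rm --env-file=__dockev__.txt -e XT_RUN_NAME -e XT_WORKSPACE_NAME -e XT_EXPERIMENT_NAME"
mappings = "-v /home/me/code:/usr/src -v /home/me/data:/usr/data"
full_image = "myregistry.azurecr.io/xt-train:latest"
app = "python -u"
target_file = "train.py"

docker_cmd = "docker run {} {} {} {} /usr/src/{}".format(
    options, mappings, full_image, app, target_file)
print(docker_cmd)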
Example #27
    def submit_node_runs(self, job_id, node_runs, workspace, aml_ws_name,
                         xt_exper_name, aml_exper_name, compute_def,
                         resume_name, repeat_count, using_hp, compute,
                         runs_by_box, code_dir, node_index, show_aml_run_name,
                         nodes, args):

        first_run = node_runs[0]
        first_run_name = first_run["run_name"]
        fake_submit = args["fake_submit"]

        # this indicates we should make serializable versions of estimator and trainer
        self.submit_logs = True or fake_submit  # must be true if we are using fake_submit

        self.serializable_estimator = None
        self.serializable_trainer = None

        box_name = first_run["box_name"]

        run_specs = first_run["run_specs"]
        cmd_parts = run_specs["cmd_parts"]
        target_fn = args["script"]
        node_id = "node" + str(node_index)

        assert cmd_parts[0] == "python"
        assert cmd_parts[1] == "-u"
        assert len(cmd_parts[2]) > 0

        # update the target_fn (might have been switched to the xt controller)
        target_fn = cmd_parts[2]
        arg_parts = cmd_parts[3:]

        # parse target's cmdline args
        arg_dict = {}
        for ap in arg_parts:
            # arg name can start with or without "-" here
            if "=" in ap:
                name, value = ap.split("=")
                if not value.startswith('"[') and not value.startswith('"@'):
                    arg_dict[name] = value
            else:
                # for unspecified values
                arg_dict[ap] = 1

        compute_target = utils.safe_value(compute_def, "compute")
        if not compute_target:
            errors.config_error(
                "'compute' property missing on compute target '{}' in XT config file"
                .format(compute))

        estimator, experiment = self.create_estimator(
            job_id, workspace, aml_ws_name, xt_exper_name, aml_exper_name,
            first_run_name, code_dir, target_fn, arg_dict, compute_target,
            node_id, nodes, fake_submit, args)

        hp_config = args["hp_config"]
        direct_run = args["direct_run"]

        if using_hp and direct_run:
            # EXPERIMENT with hyperdrive
            max_runs = args["max_runs"]
            max_minutes = args["max_minutes"]

            policy_name = args["early_policy"]
            eval_interval = args["evaluation_interval"]
            delay_eval = args["delay_evaluation"]
            truncation_percentage = args["truncation_percentage"]
            slack_factor = args["slack_factor"]
            slack_amount = args["slack_amount"]

            primary_metric = args["primary_metric"]
            maximize_metric = args["maximize_metric"]
            search_type = args["search_type"]
            concurrent = args["concurrent"]

            max_concurrent_runs = nodes * concurrent

            if max_minutes <= 0:
                #max_minutes = 43200   # aml workaround: None not supported, either is -1 or 0, so use max value
                max_minutes = 10080  # aml workaround: documented max not supported

            if hp_sets:
                hd_dict = self.build_hyperdrive_dict(hp_sets)
            else:
                hd_dict = self.build_hyperdrive_dict_from_file(hp_config)

            if not policy_name:
                # use default policy (note: not the same as no policy)
                early_term = None
            else:
                if self.submit_logs:
                    early_term = {
                        "policy_type": policy_name,
                        "eval_interval": eval_interval,
                        "delay_eval": delay_eval,
                        "truncation_percentage": truncation_percentage,
                        "slack_factor": slack_factor,
                        "slack_amount": slack_amount
                    }

                    self.serializable_trainer = {
                        "estimator": self.serializable_estimator,
                        "hd_dict": hd_dict,
                        "search_type": search_type,
                        "primary_metric": primary_metric,
                        "maximize_metric": maximize_metric,
                        "early_term": early_term,
                        "max_total_runs": max_runs,
                        "max_concurrent_runs": max_concurrent_runs,
                        "max_minutes": max_minutes
                    }

                if fake_submit:
                    trainer = self.serializable_trainer
                else:
                    early_term = self.make_early_term_policy(
                        policy_type=policy_name,
                        eval_interval=eval_interval,
                        delay_eval=delay_eval,
                        truncation_percentage=truncation_percentage,
                        slack_factor=slack_factor,
                        slack_amount=slack_amount)

                    trainer = self.create_hyperdrive_trainer(
                        estimator,
                        hd_dict,
                        search_type,
                        primary_metric,
                        maximize_metric,
                        early_term,
                        max_total_runs=max_runs,
                        max_concurrent_runs=max_concurrent_runs,
                        max_minutes=max_minutes)
        else:
            # not using AML hyperdrive
            trainer = estimator

        run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id = \
            self.run_aml_job(job_id, workspace, aml_ws_name, trainer, experiment, xt_exper_name, aml_exper_name, compute_target, code_dir, first_run_name,
                box_name, node_index, repeat_count, fake_submit, args)

        if show_aml_run_name:
            fb.feedback("[aml: {}/Run {}], xt: {}/{} ".format(
                aml_exper_name, aml_run_number, workspace, run_name),
                        is_final=True)
        else:
            fb.feedback("{}/{}".format(aml_exper_name, aml_run_number))

        mongo = self.store.get_mongo()
        run_names = []
        for run in node_runs:
            run_name = run["run_name"]
            run_names.append(run_name)

        node_info = {"ws": workspace}

        for run_name in run_names:
            # we only have 1 run, so OK to hold info in flat dict here
            node_info["aml_exper_name"] = aml_exper_name
            node_info["aml_run_number"] = aml_run_number
            node_info["aml_run_id"] = aml_run_id
            node_info["run_name"] = run_name

            # update mongo db info for run with cluster and service_job_id
            mongo.update_mongo_run_from_dict(workspace, run_name, {
                "aml_exper_name": aml_exper_name,
                "aml_run_number": aml_run_number
            })

        if monitor_cmd:
            console.print("monitoring notebook created; to run:")
            console.print("  " + monitor_cmd)

        return node_info
Example #28
    def create_estimator(self, job_id, workspace, aml_ws_name, xt_exper_name,
                         aml_exper_name, run_name, code_dir, target_fn,
                         arg_dict, compute_target, node_id, nodes, fake_submit,
                         args):

        config = self.config
        ps = None

        if not aml_exper_name:
            errors.config_error(
                "experiment name must be specified (thru config file or command line option '--experiment')"
            )

        if fake_submit:
            # for speed of testing, avoid creating real Workspace, Experiment instances
            ws = {"name": aml_ws_name}
            experiment = {"ws": ws, "name": aml_exper_name}
        else:
            ws = self.get_aml_ws(aml_ws_name)
            experiment = Experiment(ws, name=aml_exper_name)

        if compute_target == "amlcompute":
            actual_target = "amlcompute"  # AmlCompute(ws, None)
        else:
            if fake_submit:
                actual_target = "amlcompute"
            else:
                if compute_target not in ws.compute_targets:
                    errors.config_error(
                        "compute target '{}' does not exist in AML workspace '{}'"
                        .format(compute_target, aml_ws_name))

                actual_target = ws.compute_targets[compute_target]

        # build ENV VARS
        store_creds = self.config.get_storage_creds()

        # store_name = store_creds["name"]
        # store_key = store_creds["key"]

        provider_code_path = config.get_storage_provider_code_path(store_creds)

        mongo_creds, mongo_name = self.config.get_mongo_creds()
        mongo_conn_str = mongo_creds["mongo-connection-string"]

        username = args["username"]
        description = args["description"]
        aggregate_dest = args["aggregate_dest"]

        env_vars = self.build_env_vars(workspace,
                                       aml_ws_name,
                                       xt_exper_name,
                                       aml_exper_name,
                                       run_name,
                                       job_id=job_id,
                                       compute_target=compute_target,
                                       username=username,
                                       description=description,
                                       aggregate_dest=aggregate_dest,
                                       node_id=node_id,
                                       args=args)

        framework = args["framework"]
        framework = framework.lower()

        is_distributed = args['distributed']
        dist_training = args["distributed_training"]
        dist_training = dist_training.lower()

        from azureml.train.estimator import Estimator, Mpi, Gloo, Nccl
        from azureml.train.dnn import PyTorch, Chainer, TensorFlow

        fw_dict = {
            "pytorch": PyTorch,
            "tensorflow": TensorFlow,
            "chainer": Chainer,
            "estimator": Estimator
        }
        dt_dict = {"mpi": Mpi, "gloo": Gloo, "nccl": Nccl}

        if framework not in fw_dict:
            errors.config_error(
                "framework must be set to 'pytorch', 'tensorflow', 'chainer', or 'estimator'"
            )

        estimator_ctr = fw_dict[framework]

        if is_distributed:
            if dist_training not in dt_dict:
                errors.config_error(
                    "distributed-training must be set to 'mpi', 'gloo', or 'nccl'"
                )

            distributed_ctr = dt_dict[dist_training]
            distributed_obj = distributed_ctr()
        else:
            distributed_obj = None

        compute_def = args["compute_def"]
        direct_run = args["direct_run"]

        if direct_run:
            # relying on AML for full control (not using XT controller)
            node_count = utils.safe_value(compute_def, "nodes")

            # did cmd line overwrite nodes?
            if args["nodes"]:
                node_count = args["nodes"]

            if node_count is None:
                errors.config_error(
                    "must specify 'nodes' property for Azure ML service '{}' in XT config file or as --nodes option in cmd line"
                    .format(args["target"]))
        else:
            # run as separate AML runs, each with a single node
            node_count = 1

        vm_size = args["vm_size"]
        conda_packages = args["conda_packages"]
        pip_packages = args["pip_packages"]
        use_gpu = args["use_gpu"]
        framework_version = args["fw_version"]
        max_secs = args["max_seconds"]
        user_managed = args["user_managed"]

        activate_cmd = self.get_activate_cmd()
        if activate_cmd:
            # we have no way of running this on AML before conda_packages and pip_packages are installed (or used to build a docker image)
            errors.config_error(
                "setup.activate property cannot be specified for AML targets")

        #max_secs = 10080 if max_secs <= 0 else max_secs

        use_docker = False
        environment_name = utils.safe_value(compute_def, "docker")
        if environment_name:
            environment_def = self.config.get_docker_def(environment_name)
            if environment_def:
                use_docker = (environment_def["type"] == "docker")

        # workaround AML warning
        if not use_docker:
            use_docker = None

        if self.submit_logs:
            # for testing (this should match exact args used in estimator ctr below)
            self.serializable_estimator = {
                "source_directory": code_dir,
                "script_params": arg_dict,
                "compute_target": actual_target,
                "vm_size": vm_size,
                "entry_script": target_fn,
                "conda_packages": conda_packages,
                "pip_packages": pip_packages,
                "use_gpu": use_gpu,
                "use_docker": use_docker,
                "framework_version": framework_version,
                "user_managed": user_managed,
                "environment_variables": env_vars,
                "node_count": node_count,
                "distributed_training": {},
                "max_run_duration_seconds": max_secs
            }

        if fake_submit:
            estimator = self.serializable_estimator
        else:
            estimator = estimator_ctr(source_directory=code_dir,
                                      script_params=arg_dict,
                                      compute_target=actual_target,
                                      vm_size=vm_size,
                                      entry_script=target_fn,
                                      conda_packages=conda_packages,
                                      pip_packages=pip_packages,
                                      use_gpu=use_gpu,
                                      use_docker=use_docker,
                                      framework_version=framework_version,
                                      user_managed=user_managed,
                                      environment_variables=env_vars,
                                      node_count=node_count,
                                      distributed_training=distributed_obj,
                                      max_run_duration_seconds=max_secs)

        return estimator, experiment
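A hedged follow-on sketch (assumes azureml-sdk and a local workspace config.json; names are illustrative): once an estimator or HyperDriveConfig trainer is built, submission happens with a single Experiment.submit call.

from azureml.core import Workspace, Experiment
from azureml.train.estimator import Estimator

ws = Workspace.from_config()                      # reads ./config.json for the workspace
experiment = Experiment(ws, name="xt-demo")       # hypothetical experiment name
estimator = Estimator(source_directory=".",       # minimal stand-in for create_estimator's output
                      entry_script="train.py",
                      compute_target="amlcompute",
                      vm_size="Standard_D2_v2")
run = experiment.submit(estimator)                # queue the run; returns an azureml.core.Run
print(run.get_portal_url())                       # monitor the run in the Azure portal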