def cancel_runs_by_user(self, box_name):
        '''
        Args:
            box_name: the name of the box the runs ran on (pool service)
        Returns:
            cancel_results: a list of kill results records 
                (keys: workspace, run_name, exper_name, killed, status, before_status)
        '''
        cancel_results = []

        # get list of active jobs from batch
        active_jobs = self.get_active_jobs()
        console.diag("after get_active_jobs()")

        if active_jobs:
            for job_record in active_jobs:
                # watch out for older jobs that didn't have service_job_info/service_info_by_node properties
                service_job_info = utils.safe_value(job_record,
                                                    "service_job_info")
                service_info_by_node = utils.safe_value(
                    job_record, "service_info_by_node")

                if service_job_info and service_info_by_node:
                    job_id = job_record["job_id"]
                    cancel_result = self.cancel_job(service_job_info,
                                                    service_info_by_node)
                    for _, node_result in cancel_result.items():
                        cancel_results.append(node_result)

        return cancel_results
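Nearly every snippet on this page guards a dictionary lookup with utils.safe_value. A minimal sketch of that helper, assuming it is just a None-tolerant dict lookup (the real xtlib implementation may differ):

def safe_value(d, key, default=None):
    # sketch only: return d[key] when d is a non-empty dict containing key, else default
    return d.get(key, default) if d else default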
def get_client_cs(core, job_id, node_index):
    '''
    instantiate the backend service that owns the specified job node and
    request its client connection string
    '''
    cs = None
    box_secret = None

    filter = {"_id": job_id}
    jobs = core.store.mongo.get_info_for_jobs(filter, None)
    if not jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(
            node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    if compute and node_info:
        backend = core.create_backend(compute)
        cs = backend.get_client_cs(node_info)

    cs_plus = {"cs": cs, "box_secret": box_secret, "job": job}
    return cs_plus
Example #3
def get_filtered_sorted_limit_runs(store,
                                   config,
                                   show_gathering,
                                   col_dict=None,
                                   args=None):

    console.diag("start of: get_filtered_sorted_limit_runs")
    # required
    run_list = args["run_list"]

    # optional
    pool = utils.safe_value(args, "target")
    available = utils.safe_value(args, "available")
    workspace = utils.safe_value(args, "workspace")

    if workspace:
        store.ensure_workspace_exists(workspace, flag_as_error=True)

    mongo = store.get_mongo()

    # have MONGO update any old RUN documents to new format
    fixup_mongo_runs.fixup_runs_if_needed(mongo.mongo_db, workspace)

    # get info about run properties
    user_to_actual, std_cols_desc = get_run_property_dicts()
    actual_to_user = {value: key for key, value in user_to_actual.items()}

    builder = ReportBuilder(config, store, client=None)

    # get list of specified runs
    pure_run_list, actual_ws = expand_run_list(store, mongo, workspace,
                                               run_list)
    if run_list and not pure_run_list:
        errors.general_error("no run(s) found")

    # build a filter dict for all specified filters
    filter_dict = build_run_filter_dict(pure_run_list, user_to_actual, builder,
                                        args)

    # if show_gathering:
    #     console.print("gathering run data...", flush=True)

    # get the mongo records for the matching RUNS
    records, using_default_last, last = builder.get_mongo_records(
        mongo,
        filter_dict,
        workspace,
        "runs",
        actual_to_user,
        col_dict=col_dict,
        args=args)

    console.diag("end of: get_filtered_sorted_limit_runs")

    return records, using_default_last, user_to_actual, available, builder, last, std_cols_desc
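A hedged usage sketch of the seven-value return; the args keys shown are taken from the lookups above, and "ws1" is a hypothetical workspace name:

# sketch only: unpack the seven return values in declaration order
records, using_default_last, user_to_actual, available, builder, last, std_cols_desc = \
    get_filtered_sorted_limit_runs(store, config, show_gathering=False,
                                   args={"run_list": [], "workspace": "ws1"})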
Example #4
def build_cmds(auto_mode, quick_test, monitoropt, nogui, philly=1):
    config = xt_config.get_merged_config()
    mini_mode = not config.get("general", "advanced-mode")

    is_windows = (os.name == "nt")
    has_gui = pc_utils.has_gui() and not nogui
    browse_flag = "--browse" if has_gui else ""
    browse_opt = "" if auto_mode or not has_gui else "--browse"
    timeout_opt = "--timeout=5" if auto_mode else ""
    monitor_opt = "--monitor=none " if monitoropt else ""
    templ = "{run}_{target}_lr={hparams.lr}_mo={hparams.momentum}_opt={hparams.optimizer}_tt={logdir}"

    # SET THESE before each demo (exper24 should be a multi-service set of simple runs)
    prev_exper = "exper18"
    curr_exper = "exper26"

    if mini_mode:
        command_dicts = commands_basic.get_command_dicts(
            prev_exper,
            curr_exper,
            browse_flag,
            browse_opt,
            timeout_opt,
            templ,
            ARCHIVES_DIR, 
            monitor_opt)
    else:
        command_dicts = commands_advanced.get_command_dicts(
            prev_exper,
            curr_exper,
            browse_flag,
            browse_opt,
            timeout_opt,
            templ,
            ARCHIVES_DIR,
            monitor_opt)

    if not has_gui:
        command_dicts = list(filter(
            lambda c_dict: not utils.safe_value(c_dict, "needs_gui", default=False),
            command_dicts))

    if philly == 0:
        command_dicts = list(filter(
            lambda c_dict: not utils.safe_value(c_dict, "needs_philly", default=False),
            command_dicts))

    list(map(
        lambda cmd_dict: add_cmd(cmd_dict["title"], cmd_dict["xt_cmd"]),
        command_dicts
    ))
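The needs_gui/needs_philly filters above imply a command-dict shape; a hypothetical record for illustration (only the keys come from the code, the values are made up):

cmd_dict = {
    "title": "explore previous experiment",    # passed to add_cmd
    "xt_cmd": "xt explore exper18 --browse",   # hypothetical xt command line
    "needs_gui": True,       # dropped when no GUI is available
    "needs_philly": False,   # dropped when philly == 0
}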
    def get_registry_creds(self, compute, env):
        registry_creds = None

        if not env:
            compute_def = self.config.get_compute_def(compute)
            env = utils.safe_value(compute_def, "environment")

        if env and env != "none":
            env_def = self.config.get("dockers", env, default_value=None)
            if not env_def:
                errors.config_error(
                    "docker '{}' not found in config file".format(env))

            registry_name = env_def["registry"]

            # get REGISTRY credentials
            registry_creds = self.config.get("external-services",
                                             registry_name,
                                             suppress_warning=True)
            if not registry_creds:
                errors.config_error(
                    "'{}' must be specified in [external-services] section of XT config file"
                    .format(registry_name))

        return registry_creds
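get_registry_creds reads two config sections; a hedged sketch of the entry shapes it expects, with hypothetical names and values:

# [dockers] entry: maps a docker environment to its registry
env_def = {"registry": "xtcontainerregistry", "type": "docker"}

# [external-services] entry: credentials for that registry
registry_creds = {"login-server": "xtcontainerregistry.azurecr.io",
                  "username": "xtuser", "password": "..."}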
    def cancel_runs_by_names(self, workspace, run_names, box_name):
        '''
        Args:
            workspace: the name of the workspace containing the run_names
            run_names: a list of run names
            box_name: the name of the box the runs ran on (pool service)
        Returns:
            cancel_results: a list of kill results records 
                (keys: workspace, run_name, exper_name, killed, status, before_status)
        '''

        # our strategy for this API:
        #   - use the XT controller to kill specified runs (when controller is available)
        #   - use batch_client "cancel node" if controller not available

        # we build service-based box names to have 3 parts
        job_id, service_name, node_index = box_name.split("-")
        active_jobs = self.get_active_jobs()
        cancel_results = []
        if active_jobs:
            for job_record in active_jobs:
                # watch out for older jobs that didn't have service_job_info/service_info_by_node properties
                service_info_by_node = utils.safe_value(
                    job_record, "service_info_by_node")

                if service_info_by_node:
                    for node, node_service_info in service_info_by_node.items():
                        if node_service_info.get("run_name") in run_names:
                            cancel_result = self.cancel_node(node_service_info)
                            cancel_results.append(cancel_result)

        return cancel_results
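The three-part box-name convention noted above, illustrated with a hypothetical name:

# "job2338-batch-0" -> job_id="job2338", service_name="batch", node_index="0"
job_id, service_name, node_index = "job2338-batch-0".split("-")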
Example #7
    def get_user_columns(self, args):
        requested_list = args["columns"]
        add_cols = utils.safe_value(args, "add_columns")
        if add_cols:
            # build a new list so args["columns"] isn't mutated in place
            requested_list = requested_list + add_cols

        return requested_list
    def get_required_service_property(self, creds, prop_name, service_name):
        value = utils.safe_value(creds, prop_name)
        if not value:
            errors.config_error(
                "Missing '{}' property for service '{}' defined in [external-services] section of the XT config file"
                .format(prop_name, service_name))

        return value
    def validate_storage_and_mongo(self, mongo):
        '''
        1. ensure storage has been initialized for XT
        2. ensure mongo and storage point to each other
        3. update storage format if needed
        4. update mongo format if needed
        '''

        # ensure storage has been initialized for XT
        self._create_info_container_if_needed()

        # ensure mongo points to our storage
        storage_name = self.provider.get_service_name()
        connected_mongo = mongo.get_service_name()

        mongo_info = mongo.get_mongo_info()
        paired_storage = utils.safe_value(mongo_info, "paired_storage")
        if paired_storage and storage_name != paired_storage:
            errors.combo_error("mongo paired with storage service='{}', but passed XT storage service='{}'".format(  \
                paired_storage, storage_name))

        storage_info = self._get_storage_info()
        paired_mongo = utils.safe_value(storage_info, "paired_mongo")
        if paired_mongo and connected_mongo != paired_mongo:
            errors.combo_error("this storage paired with mongo service='{}', but passed connection string for mongo service='{}'".format(  \
                connected_mongo, paired_mongo))

        if not paired_storage:
            mongo_info = {
                "paired_storage": storage_name,
                "storage_version": constants.STORAGE_VERSION
            }
            mongo.set_mongo_info(mongo_info)

        if not paired_mongo:
            storage_info = {
                "paired_mongo": connected_mongo,
                "storage_version": constants.STORAGE_VERSION
            }
            self._set_storage_info(storage_info)

        # only check once, (takes .5 secs if already imported)
        # remove this check after all XT users have imported (approx. Dec 2019)
        # but keep around (good for mongodb repair, if needed)
        self.import_jobs_to_mongo_if_needed(mongo)
Example #10
def get_client_cs(core, ws, run_name):

    cs = None
    box_secret = None

    filter = {"_id": run_name}
    runs = core.store.mongo.get_info_for_runs(ws, filter, {"run_logs": 0})
    if not runs:
        errors.store_error("Unknown run: {}/{}".format(ws, run_name))

    if runs:
        from xtlib import job_helper

        run = runs[0]
        job_id = utils.safe_value(run, "job_id")
        node_index = utils.safe_value(run, "node_index")

        cs_plus = job_helper.get_client_cs(core, job_id, node_index)
        cs = cs_plus["cs"]
        box_secret = cs_plus["box_secret"]

    return cs, box_secret
    def get_node_run(self, service_node_info):
        # create aml workspace
        aml_ws_name = utils.safe_value(self.compute_def, "service")
        ws = self.get_aml_ws(aml_ws_name)

        # create aml experiment
        aml_exper_name = service_node_info["aml_exper_name"]
        experiment = Experiment(ws, name=aml_exper_name)

        # create aml run
        aml_run_id = service_node_info["aml_run_id"]
        run = Run(experiment, aml_run_id)

        return run
Example #12
    def gen_args(self, args, gen_docs=False):
        text = "\n"

        visible_args = [
            arg for arg in args if not utils.safe_value(arg, "hidden")
        ]

        if visible_args:
            if gen_docs:
                text += "Arguments::\n\n"
            else:
                text += "Arguments:\n"
            text += self.gen_name_help_aligned(visible_args, separator="-")
            #text += "\n"
        return text
Example #13
    def get_activate_cmd(self):

        setup_def = self.config.get_setup_from_target_def(self.compute_def)

        if pc_utils.is_windows():
            activate_cmd = utils.safe_value(setup_def, "activate")
        else:
            # Attempting to activate the Conda shell from within a bash script
            # fails, with Conda saying that the bash environment has not
            # been correctly initialized to use Conda.
            # This thread https://stackoverflow.com/questions/34534513/calling-conda-source-activate-from-bash-script
            # eventually led me to the following command which is taken
            # from the lines of bash script that Conda appends to your
            # .bashrc file upon installation. This command is what
            # allows you to activate the Conda environment within a
            # bash shell. It returns a script generated by Conda
            # which is executed, and which sets up the conda
            # activate / deactivate commands in the environment.
            conda_shell_bash_hook_cmd = 'eval "$(conda shell.bash hook)"'
            activate_cmd = utils.safe_value(setup_def, "activate")
            activate_cmd = "{} && {}".format(conda_shell_bash_hook_cmd,
                                             activate_cmd)

        return activate_cmd
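On a non-Windows target the method returns a single composed shell line. Assuming setup_def["activate"] were, say, "conda activate py36" (hypothetical), the result would be:

# sketch of the composed command returned above
activate_cmd = 'eval "$(conda shell.bash hook)" && conda activate py36'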
Example #14
    def get_first_last(self, args):
        first = utils.safe_value(args, "first")
        last = utils.safe_value(args, "last")
        show_all = utils.safe_value(args, "all")

        explicit = qfe.get_explicit_options()

        # explicit overrides default for all/first/last
        if "all" in explicit:
            first = None
            last = None
        elif "first" in explicit:
            show_all = None
            last = None
        elif "last" in explicit:
            show_all = None
            first = None
        else:
            # priority if no explicit options set
            if show_all:
                first = None
                last = None

        return first, last
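A hedged illustration of the precedence: an option the user typed explicitly clears the defaults of the other two, so with --first given explicitly (and hypothetical defaults last=25, all=None):

# assuming qfe.get_explicit_options() returns {"first": ...}:
#   args = {"first": 10, "last": 25, "all": None}  ->  get_first_last returns (10, None)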
Example #15
    def import_run_mongo_document(self, mongo_run_fn, workspace, new_workspace,
                                  prev_job_id, new_job_id, run_name):
        text = file_utils.read_text_file(mongo_run_fn)
        run = json.loads(text)

        # update job_id
        run["job_id"] = new_job_id

        # update workspace
        run["ws"] = new_workspace

        # add to mongo
        self.store.mongo.update_run_info(new_workspace, run_name, run)

        end_id = utils.safe_value(run, "end_id")
        return end_id
    def submit_node_runs(self, job_id, node_runs, workspace, aml_ws_name,
                         xt_exper_name, aml_exper_name, compute_def,
                         resume_name, repeat_count, using_hp, compute,
                         runs_by_box, code_dir, node_index, show_aml_run_name,
                         nodes, args):

        first_run = node_runs[0]
        first_run_name = first_run["run_name"]
        fake_submit = args["fake_submit"]

        # this indicates we should make serializable versions of estimator and trainer
        self.submit_logs = True or fake_submit  # must be true if we are using fake_submit

        self.serializable_estimator = None
        self.serializable_trainer = None

        box_name = first_run["box_name"]

        run_specs = first_run["run_specs"]
        cmd_parts = run_specs["cmd_parts"]
        target_fn = args["script"]
        node_id = "node" + str(node_index)

        assert cmd_parts[0] == "python"
        assert cmd_parts[1] == "-u"
        assert len(cmd_parts[2]) > 0

        # update the target_fn (might have been switched to the xt controller)
        target_fn = cmd_parts[2]
        arg_parts = cmd_parts[3:]

        # parse target's cmdline args
        arg_dict = {}
        for ap in arg_parts:
            # arg name can start with or without "-" here
            if "=" in ap:
                name, value = ap.split("=")
                if not value.startswith('"[') and not value.startswith('"@'):
                    arg_dict[name] = value
            else:
                # for unspecified values
                arg_dict[ap] = 1

        compute_target = utils.safe_value(compute_def, "compute")
        if not compute_target:
            errors.config_error(
                "'compute' property missing on compute target '{}' in XT config file"
                .format(compute))

        estimator, experiment = self.create_estimator(
            job_id, workspace, aml_ws_name, xt_exper_name, aml_exper_name,
            first_run_name, code_dir, target_fn, arg_dict, compute_target,
            node_id, nodes, fake_submit, args)

        hp_config = args["hp_config"]
        hp_sets = utils.safe_value(args, "hp_sets")  # assumption: hp_sets comes from args (it is read below)
        direct_run = args["direct_run"]

        if using_hp and direct_run:
            # EXPERIMENT with hyperdrive
            max_runs = args["max_runs"]
            max_minutes = args["max_minutes"]

            policy_name = args["early_policy"]
            eval_interval = args["evaluation_interval"]
            delay_eval = args["delay_evaluation"]
            truncation_percentage = args["truncation_percentage"]
            slack_factor = args["slack_factor"]
            slack_amount = args["slack_amount"]

            primary_metric = args["primary_metric"]
            maximize_metric = args["maximize_metric"]
            search_type = args["search_type"]
            concurrent = args["concurrent"]

            max_concurrent_runs = nodes * concurrent

            if max_minutes <= 0:
                #max_minutes = 43200   # aml workaround: None not supported, neither is -1 or 0, so use max value
                max_minutes = 10080  # aml workaround: documented max not supported

            if hp_sets:
                hd_dict = self.build_hyperdrive_dict(hp_sets)
            else:
                hd_dict = self.build_hyperdrive_dict_from_file(hp_config)

            if not policy_name:
                # use default policy (not the same as no policy)
                early_term = None
            else:
                if self.submit_logs:
                    early_term = {
                        "policy_type": policy_name,
                        "eval_interval": eval_interval,
                        "delay_eval": delay_eval,
                        "truncation_percentage": truncation_percentage,
                        "slack_factor": slack_factor,
                        "slack_amount": slack_amount
                    }

                    self.serializable_trainer = {
                        "estimator": self.serializable_estimator,
                        "hd_dict": hd_dict,
                        "search_type": search_type,
                        "primary_metric": primary_metric,
                        "maximize_metric": maximize_metric,
                        "early_term": early_term,
                        "max_total_runs": max_runs,
                        "max_concurrent_runs": max_concurrent_runs,
                        "max_minutes": max_minutes
                    }

                if not fake_submit:
                    early_term = self.make_early_term_policy(
                        policy_type=policy_name,
                        eval_interval=eval_interval,
                        delay_eval=delay_eval,
                        truncation_percentage=truncation_percentage,
                        slack_factor=slack_factor,
                        slack_amount=slack_amount)

            if fake_submit:
                trainer = self.serializable_trainer
            else:
                trainer = self.create_hyperdrive_trainer(
                    estimator,
                    hd_dict,
                    search_type,
                    primary_metric,
                    maximize_metric,
                    early_term,
                    max_total_runs=max_runs,
                    max_concurrent_runs=max_concurrent_runs,
                    max_minutes=max_minutes)
        else:
            # not using AML hyperdrive
            trainer = estimator

        run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id = \
            self.run_aml_job(job_id, workspace, aml_ws_name, trainer, experiment, xt_exper_name, aml_exper_name, compute_target, code_dir, first_run_name,
                box_name, node_index, repeat_count, fake_submit, args)

        if show_aml_run_name:
            fb.feedback("[aml: {}/Run {}], xt: {}/{} ".format(
                aml_exper_name, aml_run_number, workspace, run_name),
                        is_final=True)
        else:
            fb.feedback("{}/{}".format(aml_exper_name, aml_run_number))

        mongo = self.store.get_mongo()
        run_names = []
        for run in node_runs:
            run_name = run["run_name"]
            run_names.append(run_name)

        node_info = {"ws": workspace}

        for run_name in run_names:
            # we only have 1 run, so OK to hold info in flat dict here
            node_info["aml_exper_name"] = aml_exper_name
            node_info["aml_run_number"] = aml_run_number
            node_info["aml_run_id"] = aml_run_id
            node_info["run_name"] = run_name

            # update mongo db info for run with cluster and service_job_id
            mongo.update_mongo_run_from_dict(workspace, run_name, {
                "aml_exper_name": aml_exper_name,
                "aml_run_number": aml_run_number
            })

        if monitor_cmd:
            console.print("monitoring notebook created; to run:")
            console.print("  " + monitor_cmd)

        return node_info
    def create_estimator(self, job_id, workspace, aml_ws_name, xt_exper_name,
                         aml_exper_name, run_name, code_dir, target_fn,
                         arg_dict, compute_target, node_id, nodes, fake_submit,
                         args):

        config = self.config
        ps = None

        if not aml_exper_name:
            errors.config_error(
                "experiment name must be specified (thru config file or command line option '--experiment')"
            )

        if fake_submit:
            # for speed of testing, avoid creating real Workspace, Experiment instances
            ws = {"name": aml_ws_name}
            experiment = {"ws": ws, "name": aml_exper_name}
        else:
            ws = self.get_aml_ws(aml_ws_name)
            experiment = Experiment(ws, name=aml_exper_name)

        if compute_target == "amlcompute":
            actual_target = "amlcompute"  # AmlCompute(ws, None)
        else:
            if fake_submit:
                actual_target = "amlcompute"
            else:
                if compute_target not in ws.compute_targets:
                    errors.config_error(
                        "compute target '{}' does not exist in AML workspace '{}'"
                        .format(compute_target, aml_ws_name))

                actual_target = ws.compute_targets[compute_target]

        # build ENV VARS
        store_creds = self.config.get_storage_creds()

        # store_name = store_creds["name"]
        # store_key = store_creds["key"]

        provider_code_path = config.get_storage_provider_code_path(store_creds)

        mongo_creds, mongo_name = self.config.get_mongo_creds()
        mongo_conn_str = mongo_creds["mongo-connection-string"]

        username = args["username"]
        description = args["description"]
        aggregate_dest = args["aggregate_dest"]

        env_vars = self.build_env_vars(workspace,
                                       aml_ws_name,
                                       xt_exper_name,
                                       aml_exper_name,
                                       run_name,
                                       job_id=job_id,
                                       compute_target=compute_target,
                                       username=username,
                                       description=description,
                                       aggregate_dest=aggregate_dest,
                                       node_id=node_id,
                                       args=args)

        framework = args["framework"]
        framework = framework.lower()

        is_distributed = args['distributed']
        dist_training = args["distributed_training"]
        dist_training = dist_training.lower()

        from azureml.train.estimator import Estimator, Mpi, Gloo, Nccl
        from azureml.train.dnn import PyTorch, Chainer, TensorFlow

        fw_dict = {
            "pytorch": PyTorch,
            "tensorflow": TensorFlow,
            "chainer": Chainer,
            "estimator": Estimator
        }
        dt_dict = {"mpi": Mpi, "gloo": Gloo, "nccl": Nccl}

        if framework not in fw_dict:
            errors.config_error(
                "framework must be set to 'pytorch', 'tensorflow', 'chainer', or 'estimator'"
            )

        estimator_ctr = fw_dict[framework]

        if is_distributed:
            if dist_training not in dt_dict:
                errors.config_error(
                    "distributed-training must be set to 'mpi', 'gloo', or 'nccl'"
                )

            distributed_ctr = dt_dict[dist_training]
            distributed_obj = distributed_ctr()
        else:
            distributed_obj = None

        compute_def = args["compute_def"]
        direct_run = args["direct_run"]

        if direct_run:
            # relying on AML for full control (not using XT controller)
            node_count = utils.safe_value(compute_def, "nodes")

            # did cmd line overwrite nodes?
            if args["nodes"]:
                node_count = args["nodes"]

            if node_count is None:
                errors.config_error(
                    "must specify 'nodes' property for Azure ML service '{}' in XT config file or as --nodes option in cmd line"
                    .format(args["target"]))
        else:
            # run as separate AML runs, each with a single node
            node_count = 1

        vm_size = args["vm_size"]
        conda_packages = args["conda_packages"]
        pip_packages = args["pip_packages"]
        use_gpu = args["use_gpu"]
        framework_version = args["fw_version"]
        max_secs = args["max_seconds"]
        user_managed = args["user_managed"]

        activate_cmd = self.get_activate_cmd()
        if activate_cmd:
            # we have no way of running this on AML before conda_packages and pip_packages are installed (or used to build a docker image)
            errors.config_error(
                "setup.activate property cannot be specified for AML targets")

        #max_secs = 10080 if max_secs <= 0 else max_secs

        use_docker = False
        environment_name = utils.safe_value(compute_def, "docker")
        if environment_name:
            environment_def = self.config.get_docker_def(environment_name)
            if environment_def:
                use_docker = (environment_def["type"] == "docker")

        # workaround AML warning
        if not use_docker:
            use_docker = None

        if self.submit_logs:
            # for testing (this should match exact args used in estimator ctr below)
            self.serializable_estimator = {
                "source_directory": code_dir,
                "script_params": arg_dict,
                "compute_target": actual_target,
                "vm_size": vm_size,
                "entry_script": target_fn,
                "conda_packages": conda_packages,
                "pip_packages": pip_packages,
                "use_gpu": use_gpu,
                "use_docker": use_docker,
                "framework_version": framework_version,
                "user_managed": user_managed,
                "environment_variables": env_vars,
                "node_count": node_count,
                "distributed_training": {},
                "max_run_duration_seconds": max_secs
            }

        if fake_submit:
            estimator = self.serializable_estimator
        else:
            estimator = estimator_ctr(source_directory=code_dir,
                                      script_params=arg_dict,
                                      compute_target=actual_target,
                                      vm_size=vm_size,
                                      entry_script=target_fn,
                                      conda_packages=conda_packages,
                                      pip_packages=pip_packages,
                                      use_gpu=use_gpu,
                                      use_docker=use_docker,
                                      framework_version=framework_version,
                                      user_managed=user_managed,
                                      environment_variables=env_vars,
                                      node_count=node_count,
                                      distributed_training=distributed_obj,
                                      max_run_duration_seconds=max_secs)

        return estimator, experiment
Example #18
    def get_client_context(self,
                           exper_name,
                           run_name,
                           app_info,
                           box_info,
                           job_id,
                           node_index,
                           run_specs,
                           resume_name=None,
                           using_hp=False,
                           repeat=None,
                           args=None):
        '''
        this function gathers up all of the job-level context needed to run the job on the specified node (node_index).
        '''
        config = self.config
        cmd_parts = run_specs["cmd_parts"]
        workspace = args['workspace']
        working_dir = args['working_dir']

        context = Bag()

        context.ws = workspace
        context.working_dir = working_dir
        context.exper_name = exper_name
        context.run_name = run_name
        context.job_id = job_id
        context.sku = args["sku"]
        context.app_name = app_info.app_name if app_info else None
        context.box = args["box"]
        context.from_ip = pc_utils.get_ip_address()
        context.from_host = pc_utils.get_hostname()
        context.box_name = box_info.box_name
        context.target_file, _, _ = self.get_target(cmd_parts)
        context.resume_name = resume_name
        context.generated_sweep_text = None  # will be conditionally set in controller

        context.pool = args["pool"]
        context.node_index = node_index
        context.compute = args["target"]
        context.service_type = args["service_type"]

        # provide all provider info to controller
        context.providers = config.get("providers")

        #context.run_specs = run_specs
        context.cmd_parts = cmd_parts
        # log our full cmd to support correct reruns
        context.xt_cmd = args["xt_cmd"]
        context.run_script = run_specs["run_script"]
        context.parent_script = run_specs["parent_script"]

        # for helping docker login to user's Azure Container Registry
        is_docker = (args["docker"] is not None)
        # if cmd_parts:
        #     is_docker = (cmd_parts[0] == "docker") or (cmd_parts[0] == "sudo" and cmd_parts[1] == "docker")

        #registry = config.get("environment", "registry", suppress_warning=True)
        registry = None
        compute_def = args["compute_def"]
        if compute_def and "docker" in compute_def:
            docker_name = compute_def["docker"]
            docker_def = self.config.get_docker_def(docker_name)
            if docker_def and "registry" in docker_def:
                registry = docker_def["registry"]

        if registry:
            registry_creds = config.get("external-services", registry)
            needs_login = is_docker and utils.safe_value(registry_creds, "login")
            login_server = utils.safe_value(registry_creds, "login-server")
            username = utils.safe_value(registry_creds, "username")
            password = utils.safe_value(registry_creds, "password")
        else:
            needs_login = False
            login_server = None
            username = None
            password = None

        context.docker_login = needs_login
        context.docker_server = login_server
        context.docker_username = username
        context.docker_password = password

        context.username = self.config.get("general", "username")

        setup = self.config.get_setup_from_target_def(compute_def)
        activate_cmd = utils.safe_value(setup, "activate")
        context.activate_cmd = activate_cmd

        # config info
        #box_os = self.get_box_os(box_name)
        box_os = box_info.box_os

        after_files_list = args["after_dirs"]
        after_files_list = utils.parse_list_option_value(after_files_list)
        context.after_files_list = after_files_list

        after_omit_list = args["after_omit"]
        after_omit_list = utils.parse_list_option_value(after_omit_list)
        context.after_omit_list = after_omit_list

        context.primary_metric = args["primary_metric"]
        context.maximize_metric = args["maximize_metric"]
        context.report_rollup = args["report_rollup"]

        context.after_upload = args["after_upload"]
        #context.scrape = config.get("general", "scrape")
        context.log = args["log"]

        # PARENT/CHILD info
        context.repeat = repeat
        context.repeats_remaining = None  # will be set in controller
        context.total_run_count = args["total_run_count"]
        context.search_style = args["search_style"]
        context.is_parent = context.search_style != "single"

        # HPARAM search
        hp_config = args["hp_config"]
        if hp_config:
            hp_config = file_utils.path_join(constants.HP_CONFIG_DIR,
                                             os.path.basename(hp_config))

        context.hp_config = hp_config
        context.fn_generated_config = args["fn_generated_config"]
        context.using_hp = using_hp
        context.search_type = args["search_type"]
        context.option_prefix = args["option_prefix"]

        context.restart = False
        context.concurrent = args["concurrent"]
        context.xtlib_capture = args["xtlib_upload"]

        # for mirroring files to grok server or storage
        context.mirror_dest = args["mirror_dest"]
        context.mirror_files = args["mirror_files"]
        context.grok_server = None  # args["grok_server"]

        context.aggregate_dest = args["aggregate_dest"]
        context.dest_name = exper_name if context.aggregate_dest == "experiment" else job_id

        store_creds = self.config.get_storage_creds()
        context.store_creds = store_creds
        context.store_code_path = config.get_storage_provider_code_path(
            store_creds)

        mongo_creds, mongo_name = self.config.get_mongo_creds()
        context.mongo_conn_str = mongo_creds["mongo-connection-string"]

        context.shell_launch_prefix = box_info.shell_launch_prefix

        #console.print("context=", context)
        return context
Example #19
    def validate_and_add_defaults(self, arguments, options, arg_dict):
        '''
        args:
            - arguments: list of the arguments for the current cmd 
            - options: list of options for the current cmd
            - arg_dict: dict of name/value pairs for user-specified args and options

        processing:
            - copy arg_dict to "explicit_options"
            - validate all names in arg_dict (against arguments & options)
            - flag as error if any required arguments/options are not specified in arg_dict
            - add default values for all arguments/options not yet specified in arg_dict

        return:
            - fully populated copy of arg_dict
        '''
        # ensure all names in arg_dict are dash style (for validation)
        full_arg_dict = {
            key.replace("_", "-"): value
            for key, value in arg_dict.items()
        }

        # remember options that were set explicitly (dash-style)
        global explict_options
        explict_options = dict(full_arg_dict)

        # process all arguments, options, and flags; ensure each has a value in arg_dict
        all_args = arguments + options
        all_arg_names = [aa["name"] for aa in all_args]

        # process user args in arg_dict
        for name, value in full_arg_dict.items():

            # validate arg name
            if name not in all_arg_names:
                errors.api_error("unknown args name: {}".format(name))

        # now add default values for all other args
        for info in all_args:
            name = info["name"]
            required = info["required"] if "required" in info else None

            if name not in full_arg_dict:
                if required:
                    self.syntax_error(
                        "cmd '{}' missing value for required option: --{}".
                        format(self.cmd_words, name))

                default_value = utils.safe_value(info, "default")

                # expand "$group.value" type values
                default_value = self.get_default_from_config(default_value)

                # add to user's arg dict
                full_arg_dict[name] = default_value

        # finally, convert all names to underscore style
        full_arg_dict = {
            key.replace("-", "_"): value
            for key, value in full_arg_dict.items()
        }

        console.diag("full_arg_dict=", full_arg_dict)
        return full_arg_dict
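A sketch of the dash/underscore round-trip this method performs (option names hypothetical):

# caller passes underscore style:        {"max_runs": 5}
# validated internally in dash style:    {"max-runs": 5}
# defaults merged, returned underscore:  {"max_runs": 5, "sort": "name", ...}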
Example #20
    def command_help(self, cmd_info, syntax_only=False, args_only=False):
        '''Shows help for the specified xt command'''

        show_all = not syntax_only and not args_only

        if cmd_info == "flags":
            print_flags()
            return

        name = cmd_info["name"]

        args = cmd_info["arguments"]
        args = [arg for arg in args if not utils.safe_value(arg, "hidden")]

        options = cmd_info["options"]
        options = [
            opt for opt in options if not utils.safe_value(opt, "hidden")
        ]

        examples = cmd_info["examples"]
        see_alsos = cmd_info["see_alsos"]
        faqs = cmd_info["faqs"]
        options_before_args = cmd_info["options_before_args"]

        words = name.replace("_", " ")
        if cmd_info["keyword_optional"]:
            words = "[ " + words + " ]"
        words = " " + words

        opts_text = ""
        if options:
            opts_text += " [OPTIONS]"

        args_text = self.gen_inline_args(args)

        if not syntax_only:
            console.print()

        if options_before_args:
            usage = "Usage: {}".format(
                self.name) + words + opts_text + args_text
        else:
            usage = "Usage: {}".format(
                self.name) + words + args_text + opts_text

        # print usage info
        console.print(usage)

        if show_all and not self.mini_mode:
            # print command help
            doc_string = self.get_formatted_doc_str(cmd_info)

            help_text = doc_string if doc_string else "    " + cmd_info["help"]

            console.print()
            console.print(help_text)

        if syntax_only:
            # show a quick list of options
            console.print("  OPTIONS: ", end="")
            for opt in options:
                console.print("--{} ".format(opt["name"]), end="")
            console.print()  # finish line

        else:
            # show each option on its own line with a short description
            text = ""
            if options_before_args:
                text += self.gen_options(options)
                text += self.gen_args(args)
            else:
                text += self.gen_args(args)
                text += self.gen_options(options)
            console.print(text)

        if show_all and examples:
            console.print("Examples:")
            for example in examples:
                console.print("  {}:".format(example["task"]))
                console.print("  > {}".format(example["text"]))
                console.print()

                if self.mini_mode:
                    # only show first example for mini mode
                    break

        if show_all and faqs:
            console.print("FAQs:")
            for faq in faqs:
                console.print("  {}?".format(faq["question"]))
                console.print("  => {}".format(faq["answer"]))
                console.print()

                if self.mini_mode:
                    # only show first FAQ for mini mode
                    break

        if show_all and see_alsos:
            console.print("See Also:")
            for also in see_alsos:
                text = also["text"]
                page_path = also["page_path"]

                console.print("  - {}".format(text))
Example #21
    def process_arguments(self, scanner, tok, arguments, arg_dict):
        for arg_info in arguments:
            if utils.safe_value(arg_info, "hidden"):
                continue

            arg_name = arg_info["name"]
            arg_type = arg_info["type"]
            required = arg_info["required"]
            keywords = arg_info["keywords"] if "keywords" in arg_info else None
            current_arg = None

            #print("processing arg=", arg_name, arg_type, tok)

            if arg_type == "cmd" and tok and not tok.startswith("-"):
                # convert remaining tokens to a cmd_info
                if tok:
                    # if self.match(tok, "topics"):
                    #     cmd_info = {"name": "topics"}
                    #     tok = scanner.scan()
                    # else:
                    cmd_info, tok = self.get_cmd_info(tok,
                                                      scanner,
                                                      for_help=True)
                    current_arg = cmd_info
            elif arg_type == "text":
                # convert remaining tokens to a string
                if tok:
                    text = scanner.get_rest_of_text(include_current_token=True)
                    tok = None
                else:
                    text = ""
                current_arg = text
            else:
                if tok and not tok.startswith("-"):
                    current_arg = tok

            if required and not current_arg:
                self.syntax_error(
                    "cmd '{}' missing required argument: {}".format(
                        self.cmd_words, arg_name))

            if current_arg:
                # the four list-valued arg types share one parse/validate pattern
                list_parsers = {
                    "str_list": self.parse_string_list,
                    "num_list": self.parse_num_list,
                    "int_list": self.parse_int_list,
                    "tag_list": self.parse_tag_list
                }

                if arg_type in list_parsers:
                    value, tok = list_parsers[arg_type](tok, scanner)
                    if len(value) == 0 and required:
                        self.syntax_error(
                            "missing value for required argument: " + arg_name)
                else:
                    value = current_arg
                    if keywords:
                        found = self.match_keyword(value, keywords)
                        if not found:
                            self.syntax_error(
                                "Keyword argument {} has unrecognized value: {}"
                                .format(arg_name, value))
                        value = found
                    tok = scanner.scan()

                # store value to be passed
                arg_dict[arg_name] = value

        if tok and not tok.startswith("--"):
            errors.argument_error("unrecognized argument", tok)
        return tok
Example #22
    def process_args(self, args):

        run_script = None
        parent_script = None
        run_cmd_from_script = None
        target_file = args["script"]
        target_args = args["script_args"]
        code_upload = args["code_upload"]

        # user may have wrong slashes for this OS
        target_file = file_utils.fix_slashes(target_file)

        if os.path.isabs(target_file):
            errors.syntax_error("path to app file must be specified with a relative path: {}".format(target_file))

        is_rerun = "is_rerun" in args
        if is_rerun:
            # will be running from script dir, so remove any path to script file
            self.script_dir = os.path.dirname(target_file)
            target_file = os.path.basename(target_file)

        if target_file.endswith(".py"):
            # PYTHON target
            cmd_parts = ["python"]
            cmd_parts.append("-u")
            cmd_parts.append(target_file)
        else:
            cmd_parts = [target_file] 

        if target_args:
            # split on unquoted spaces
            arg_parts = utils.cmd_split(target_args)
            cmd_parts += arg_parts

        if target_file == "docker":
            self.is_docker = True
            
        if not self.is_docker and code_upload and not os.path.exists(target_file):
            errors.env_error("script file not found: {}".format(target_file))

        ps_path = args["parent_script"]
        if ps_path:
            parent_script = file_utils.read_text_file(ps_path, as_lines=True)

        if target_file.endswith(".bat") or target_file.endswith(".sh"):
            # a RUN SCRIPT was specified as the target
            run_script = file_utils.read_text_file(target_file, as_lines=True)
            run_cmd_from_script = scriptor.get_run_cmd_from_script(run_script)

        compute = args["target"]
        box_def = self.config.get("boxes", compute, suppress_warning=True)
        setup = utils.safe_value(box_def, "setup")

        compute_def = self.config.get_compute_def(compute)
        if compute_def:
            # must be defined in [compute-targets]
            if "service" not in compute_def:
                errors.config_error("compute target '{}' must define a 'service' property".format(compute))

            service = compute_def["service"]
            if service in ["local", "pool"]:
                # it's a list of box names
                boxes = compute_def["boxes"]
                if len(boxes) == 1 and boxes[0] == "localhost":
                    pool = None
                    box = "local"
                    service_type = "pool"
                else:
                    pool = compute
                    box = None
                    service_type = "pool"
            else:
                # it's a set of compute service properties
                pool = compute
                box = None
                service_name = compute_def["service"]
                service_type = self.config.get_service_type(service_name)
        elif box_def:
            # translate single box name to a compute_def
            box = compute
            pool = None
            service_type = "pool"
            compute_def = {"service": service_type, "boxes": [box], setup: setup}
        else:
            errors.config_error("unknown target or box: {}".format(compute))

        args["target"] = compute
        args["compute_def"] = compute_def
        args["service_type"] = service_type

        # for legacy code
        args["box"] = box
        args["pool"] = pool

        return service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, \
            compute, compute_def
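Hedged sketches of the two compute_def shapes the branches above distinguish (all values hypothetical):

pool_target = {"service": "pool", "boxes": ["localhost"], "setup": "local"}  # pool/local service
batch_target = {"service": "xtbatch", "nodes": 4}  # service_type resolved from the service name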
Example #23
    def get_mongo_records(self,
                          mongo,
                          filter_dict,
                          workspace,
                          which,
                          actual_to_user,
                          col_dict=None,
                          args=None):

        first, last = self.get_first_last(args)

        using_default_last = bool(last)

        reverse = utils.safe_value(args, "reverse")
        # use MONGO to do all of the work (query, sort, first/last)
        sort_col = utils.safe_value(args, "sort", "name")

        if sort_col == "name":
            # special sorting needed; we have created "run_num" field just for this purpose
            sort_col = "run_num" if which == "runs" else "job_num"
        elif not "." in sort_col:
            # translate name of std col from user-friendly version to logged version
            user_to_actual = {
                value: key
                for key, value in actual_to_user.items()
            }

            if not sort_col in user_to_actual:
                errors.general_error("unknown standard property: {} (did you mean metrics.{}, hparams.{}, or tags.{}?)". \
                    format(sort_col, sort_col, sort_col, sort_col))

            sort_col = user_to_actual[sort_col]

        # this is a TRICK to avoid having to call for the exists_count for calculation of skip count
        # it works fine, since we re-sort records on the xt client anyway
        sort_dir = -1 if reverse else 1
        if last:
            sort_dir = -sort_dir
            first = last

        # ensure we only ask for records where sort_col exists, or else we MIGHT end up with less than LIMIT records
        if sort_col not in filter_dict:
            filter_dict[sort_col] = {"$exists": True}

        container = workspace if which == "runs" else "__jobs__"

        orig_col_dict = col_dict
        if not col_dict:
            col_dict = {"log_records": 0}

        # put our mongo operations together in a retry-compatible function
        def fetch():
            cursor = mongo.mongo_db[container].find(filter_dict, col_dict)
            cursor = cursor.sort(sort_col, sort_dir)
            if first:
                cursor = cursor.limit(first)
            return cursor

        # here is where MONGO does all the hard work for us
        cursor = mongo.mongo_with_retries("get_mongo_records", fetch)
        records = list(cursor)

        console.diag("after full records retreival, len(records)={}".format(
            len(records)))

        if not orig_col_dict:
            # pull out standard cols, translating from actual to user-friendly names
            records = [
                self.translate_record(rec, actual_to_user) for rec in records
                if rec
            ]

            # pull out requested cols, flattening nested values to their dotted names
            records = self.flatten_records(records, sort_col, args)

        if last:
            # we had to reverse the sort done by mongo, so correct it here
            records.reverse()
            #self.sort_records(records, sort_col, reverse)

        return records, using_default_last, last
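The first/last "TRICK" above in miniature: to fetch the last N records without first counting the collection, reverse the sort, limit, then restore order on the client. A standalone sketch against a pymongo-style collection (names hypothetical):

def fetch_last_n(collection, sort_col, n):
    # sort descending, take n, then undo the reversal locally
    records = list(collection.find({sort_col: {"$exists": True}})
                   .sort(sort_col, -1).limit(n))
    records.reverse()
    return records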
Example #24
    def process_run_command(self, args):
        self.args = args

        # ensure workspace exists
        workspace = args['workspace']
        dry_run = args['dry_run']
        fake_submit = args["fake_submit"]

        if not fake_submit:
            self.store.ensure_workspace_exists(workspace, flag_as_error=False)

        # PRE-PROCESS ARGS
        service_type, cmd_parts, ps_path, parent_script, target_file, run_script, run_cmd_from_script, compute, compute_def = \
            self.process_args(args)

        # create backend helper (pool, philly, batch, aml)
        cluster = utils.safe_value(compute_def, "cluster")
        vc = utils.safe_value(compute_def, "vc")
        self.backend = self.core.create_backend(compute, cluster=cluster, vc=vc, username=None)

        # add conda_packages and pip_packages from SETUP to ARGS
        setup_def = self.config.get_setup_from_target_def(compute_def)

        conda_packages = utils.safe_value(setup_def, "conda-packages")
        pip_packages = utils.safe_value(setup_def, "pip-packages")

        args["conda_packages"] = conda_packages if conda_packages else []
        args["pip_packages"] = pip_packages if pip_packages else []

        self.adjust_pip_packages(args)

        snapshot_dir = self.temp_dir

        if fake_submit:
            script_dir = snapshot_dir
        else:
            # note: always create a snapshot dir for backends to add needed files
            file_utils.ensure_dir_deleted(snapshot_dir)
            script_dir = self.snapshot_all_code(snapshot_dir, cmd_parts, args)

        self.script_dir = script_dir
        direct_run = args["direct_run"]

        # do we need to start the xt controller?
        use_controller = not direct_run
        adjustment_scripts = None

        # create a job_secret that can later be used to authenticate with the XT controller
        # NOTE: we currently log this secret as a job property, which allows all team members to view and control this job
        job_secret = str(uuid.uuid4())

        # do we need to build a "docker run" command?
        if not self.backend.provides_container_support():
            docker_name = args["docker"]
            if not docker_name:
                docker_name = utils.safe_value(compute_def, "docker")
            if docker_name and docker_name != "none":
                cmd_parts = self.build_docker_cmd(docker_name, target_file, cmd_parts, script_dir, snapshot_dir, job_secret, args)
                args["docker"] = docker_name     # for use in building run context info

        # BUILD CMDS (from static hparam search, user multi cmds, or single user cmd)
        cmds, total_run_count, repeat_count, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, search_style = \
            self.build_cmds_with_search(service_type, cmd_parts, parent_script, run_script, run_cmd_from_script, use_controller, dry_run, args)

        if dry_run:
            return

        # make new values available
        args["search_style"] = search_style
        args["total_run_count"] = total_run_count

        resume_name = args['resume_name']
        keep_name = False  # args['keep_name']
        experiment = args['experiment']
        is_distributed = args['distributed']
        direct_run = args["direct_run"]

        # CREATE JOB to hold all runs
        if fake_submit:
            # use lastrun/lastjob info to get a fast incremental fake job number
            xtd = xt_dict.read_xt_dict()
            fake_job_num = xtd["fake_job_num"] if "fake_job_num" in xtd else 1
            xtd["fake_job_num"] = fake_job_num + 1
            xt_dict.write_xt_dict(xtd)
            job_id = "fake_job" + str(fake_job_num)
        else:
            job_id = self.store.create_job()
        fb.feedback(job_id)

        # start the feedback (by parts)
        fb.feedback("{}: {}".format("target", compute))

        # write hparams to FILES
        boxes, num_boxes = self.write_hparams_to_files(job_id, cmds, fake_submit, using_hp, args)

        if sweeps_text and not fake_submit:
            self.upload_sweep_data(sweeps_text, experiment, job_id, args=args)

        # if num_boxes > 1 and service_type != "batch":
        #     fb.feedback("", is_final=True)

        parent_name = None

        # BUILD RUNS, by box
        job_runs = []
        run_count = 1 if is_distributed else len(boxes) 
        secrets_by_node = {}
        remote_control = args["remote_control"]

        for i in range(run_count):
            box_name = boxes[i]

            # generate a box secret for talking to XT controller for this node
            box_secret = str(uuid.uuid4()) if remote_control else ""

            # build runs for box_name
            run_data = self.build_first_run_for_node(i, box_name, target_file, ps_path, using_hp, using_aml_hparam, run_specs, job_id,
                parent_name, cmds, pool_info, repeat_count, fake_submit, search_style, box_secret, args)

            # for now, adhere to the more general design of multiple runs per box
            box_runs = [run_data]
            job_runs.append(box_runs)

            node_id = utils.node_id(i)            
            secrets_by_node[node_id] = box_secret
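            # e.g. secrets_by_node = {"node0": "<uuid4>", "node1": "<uuid4>"}
            # (the secrets are empty strings when remote_control is disabled)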

            # FEEDBACK 
            ptype = "single " if search_style == "single" else "parent "
            if is_distributed:
                ptype = "master "

            if run_count == 1:
                node_msg = "creating {}run".format(ptype)
            else:
                node_msg = "creating {}runs: {}/{}".format(ptype, i+1, run_count)

            if service_type == "pool":
                node_msg += ", box: " + box_name

            fb.feedback(node_msg, id="node_msg")

        # build box: runs dict for job info file
        runs_by_box, last_run = self.build_runs_by_box(job_runs, workspace)
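        # runs_by_box maps each box name to its list of run records (each record holding
        # at least "run_name"); last_run is the name of the final run, saved to the
        # xt_dict at the end of this method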

        # now that we have run names for all static runs on all nodes, we can adjust the cmds (and "before" files) to run under the controller
        if use_controller:
            # we will create 2 temp. controller files in the CURRENT DIRECTORY (that will be captured to JOB)
            # this will also adjust commands for each node to run the XT controller
            adjustment_scripts = self.core.adjust_job_for_controller_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        else:
            adjustment_scripts = self.core.adjust_job_for_direct_run(job_id, job_runs, cmds, using_hp, experiment, service_type, snapshot_dir, 
                search_style, args=args)

        # add env vars used by both controller and runs
        env_vars = args["env_vars"]

        # create a job guid to uniquely identify this job across all XT instances
        job_guid = str(uuid.uuid4())

        # we add these with placeholder values ("node0", no box secret); the backend service will override them for each node
        scriptor.add_controller_env_vars(env_vars, self.config, None, "node0")

        data_local = args["data_local"]
        if "$scriptdir" in data_local:
            data_local = os.path.realpath(data_local.replace("$scriptdir", script_dir))
            args["data_local"] = data_local

        model_local = args["model_local"]
        if "$scriptdir" in model_local:
            model_local = os.path.realpath(model_local.replace("$scriptdir", script_dir))
            args["model_local"] = model_local

        # ADJUST CMDS: this allows backend to write scripts to snapshot dir, if needed, as a way of adjusting/wrapping run commands
        self.backend.adjust_run_commands(job_id, job_runs, using_hp, experiment, service_type, snapshot_dir, args=args)

        # upload CODE from snapshot_dir
        code_upload = args["code_upload"]
        code_omit = args["code_omit"]
        code_zip = args["code_zip"]
    
        if not fake_submit:
            if code_upload:
                self.core.upload_before_files_to_job(job_id, snapshot_dir, "before/code", code_omit, code_zip, "code", args)

            # upload DATA from data_local (do we need to keep this?  should we upload to normal DATA location, vs. job?)
            data_upload = args["data_upload"]
            if data_upload:
                if not data_local:
                    errors.config_error("cannot do data-upload because no data-local path is defined in the XT config file")

                data_omit = args["data_omit"]
                data_zip = "none"

                self.core.upload_before_files_to_job(job_id, data_local, "before/data", data_omit, data_zip, "data", args)
        
        # dispatch to BACKEND submitters
        '''
        Note: backend submitter functions are responsible for:
            - submitting the job (for each node, queueing the runs for that node)
            - returning the service job id (or a list of them, if per-node)

        NOTE: there is a timing issue here: the submitted job needs access to the job info, but the final piece
        of job info (the service info) is only returned after the job is submitted.  Therefore, we structure the steps as follows:

            - primary job info is logged
            - job is submitted thru the backend
            - service info for the job is logged
        '''
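        # these three steps map directly to the code below: store.log_job_info(job_id, dd),
        # backend.submit_job(...), then log_job_info() again with the service info filled in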

        # LOG PRIMARY JOB INFO
        dd = {}

        if not fake_submit:
            # mark runs as QUEUED
            for runs in runs_by_box.values():
                first_run = runs[0]
                self.store.log_run_event(workspace, first_run["run_name"], "status-change", {"status": "queued"}) 

            # write the job info file (now that backend has had a chance to update it)
            job_num = int(job_id[3:])
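            # job ids are of the form "job<num>", e.g. "job2054" -> job_num 2054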

            xt_cmd = args["xt_cmd"]
            schedule = args["schedule"]
            concurrent = args["concurrent"]

            # this job property is used to ensure we don't exceed the specified # of runs when using repeat_count on each node
            dynamic_runs_remaining = None if search_style == "single" else total_run_count
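            # e.g. a non-single search with total_run_count=50 starts with
            # dynamic_runs_remaining=50; nodes are assumed to decrement this shared
            # job property as they dynamically claim runs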
            node_count = len(runs_by_box)

            # static_runs_by_node = None
            # if schedule == "static":
            #     static_runs_by_node = self.build_static_runs_by_node(total_run_count, node_count)
            #console.diag("static_runs_by_node=", static_runs_by_node)

            active_runs = mongo_run_index.build_active_runs(schedule, total_run_count, node_count)
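            # the exact shape of active_runs is owned by mongo_run_index; it is assumed
            # to be the per-node index of run slots that nodes draw from at scheduling time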

            dd = {"job_id": job_id, "job_num": job_num, "compute": compute, "ws_name": workspace, "exper_name": experiment, 
                "pool_info": compute_def, "runs_by_box": runs_by_box, 
                "primary_metric": args["primary_metric"], 
                "run_count": total_run_count, "repeat": repeat_count, "search_type": args["search_type"], 
                "username": args["username"], "hold": args["hold"], "started": utils.get_time(),
                "job_status": "submitted", "running_nodes": 0, 
                "running_runs": 0, "error_runs": 0, "completed_runs": 0, "job_guid": job_guid, "job_secret": job_secret,
                "dynamic_runs_remaining": dynamic_runs_remaining, "search_style": search_style,     
                "active_runs": active_runs,  "connect_info_by_node": {}, "secrets_by_node": secrets_by_node,  
                "xt_cmd": xt_cmd, "schedule": schedule, "node_count": node_count, "concurrent": concurrent,
                "service_job_info": None, "service_info_by_node": None,
            }

            self.store.log_job_info(job_id, dd)

        # SUBMIT JOB 
        # NOTE: we use "pool_info" here (vs. compute_def, which has not been updated with explicit args)
        service_job_info, service_info_by_node = self.backend.submit_job(job_id, job_runs, workspace, pool_info, resume_name, 
            repeat_count, using_hp, runs_by_box, experiment, snapshot_dir, adjustment_scripts, args)

        # POST SUBMIT processing

        # update job info 
        if not fake_submit:
            dd["service_job_info"] = service_job_info
            dd["service_info_by_node"] = service_info_by_node
            self.store.log_job_info(job_id, dd)

        # update lastrun/lastjob info
        xtd = xt_dict.read_xt_dict()
        xtd["last_run"] = last_run
        xtd["last_job"] = job_id
        xt_dict.write_xt_dict(xtd)

        # return values for API support (X)
        return cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id 
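
    # a minimal sketch of how a caller might consume the values returned above
    # (the entry-point name here is hypothetical):
    #
    #   cmds, run_specs, using_hp, using_aml_hparam, sweeps_text, pool_info, job_id = \
    #       runner.submit_job_core(...)
    #   console.print("submitted {} ({} commands)".format(job_id, len(cmds)))
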
    def build_data_frames(self):
        '''
        1. for each run, collect the reported metrics as metric sets (by reported col list)

        2. append to the dataframe for that col list
        '''
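        # each metric set (as built by run_helper.build_metrics_sets) is assumed to look
        # like {"records": [{"epoch": 1, "train-loss": 0.7, ...}, ...]}, one set per
        # distinct combination of columns the run reported together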
        # build "data_frames"
        no_metrics = []
        pp_run_names = []
        used_max = False
        data_frames_by_cols = {}
        got_columns = False

        for i, record in enumerate(self.run_log_records):
            # extract metrics for this run
            run = record["_id"]
            node = utils.node_id(record["node_index"])
            job = record["job_id"]
            experiment = record["exper_name"]
            workspace = record["ws"]
            search_style = utils.safe_value(record, "search_style")
            if search_style and search_style != "single":
                # parent run with children - skip it
                continue

            log_records = record["log_records"]

            metric_sets = run_helper.build_metrics_sets(log_records)
            if not metric_sets:
                no_metrics.append(run)
                continue

            if self.max_runs and len(pp_run_names) >= self.max_runs:
                used_max = True
                break

            if not got_columns:
                # set x and y columns
                explicit = qfe.get_explicit_options()
                if not "x" in explicit:
                    self.x_col = self.get_actual_x_column(
                        metric_sets, self.x_col, self.col_names)

                if not self.col_names:
                    # not specified by user, so build defaults
                    self.col_names = self.get_default_y_columns(
                        metric_sets, self.x_col)

                got_columns = True

            # merge metric sets into dfx
            for metric_set in metric_sets:

                # create a pandas DataFrame
                df = pd.DataFrame(metric_set["records"])
                cols = str(list(df.columns))
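                # cols is the stringified column list, e.g. "['epoch', 'train-loss']";
                # it serves below as the dict key that groups compatible dataframes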

                # ensure this df has our x_col (check the actual column names rather than
                # the stringified cols key, to avoid substring false-positives such as
                # "loss" matching "val_loss")
                if self.x_col and self.x_col not in df.columns:
                    continue

                # ensure this df has at least 1 y_col
                found_y = any(y in df.columns for y in self.col_names)
                if not found_y:
                    continue

                # add identity columns (pandas broadcasts each scalar to every row)
                df["run"] = run
                df["node"] = node
                df["job"] = job
                df["experiment"] = experiment
                df["workspace"] = workspace

                if cols not in data_frames_by_cols:
                    data_frames_by_cols[cols] = df
                else:
                    # DataFrame.append() was removed in pandas 2.0; pd.concat is the
                    # supported equivalent for accumulating rows
                    dfx = data_frames_by_cols[cols]
                    data_frames_by_cols[cols] = pd.concat([dfx, df])

            pp_run_names.append(run)

        if no_metrics:
            console.print(
                "\nnote: the following runs were skipped (they currently have no logged metrics): \n    {}\n"
                .format(", ".join(no_metrics)))

        if used_max:
            console.print(
                "plotting first {} runs (use --max-runs to override)".format(
                    self.max_runs))
        else:
            console.print("plotting {} runs...".format(len(pp_run_names)))

        # update our list of run_names to process
        self.run_names = pp_run_names

        return data_frames_by_cols