Example #1
    def ensure_script_ext_matches_box(self, script_name, fn_script, box_info):
        _, file_ext = os.path.splitext(fn_script)
        if file_ext in [".bat", ".sh"]:
            expected_ext = ".bat" if box_info.box_os == "windows" else ".sh"

            if file_ext != expected_ext:
                errors.combo_error("{} file ext='{}' doesn't match box.os='{}'".format(script_name, file_ext, box_info.box_os))
Example #2
    def read_user_multi_commands(self, using_hp, run_script, cmd_parts, args):
        cmds = None
        
        lines = self.config.get("commands")
        if lines:
            # commands specified in the config file
            args["multi_commands"] = True
            multi_commands = True
        else:
            # did user specify --multi-commands
            multi_commands = args["multi_commands"]

        if multi_commands:
            if using_hp:
                errors.combo_error("Cannot specify both -multi-commands and hyperparameter search")

            # read MULTI CMDS
            if not lines:
                fn_cmds = args["script"]  # run_script if run_script else cmd_parts[0]
                lines = file_utils.read_text_file(fn_cmds, as_lines=True)
                lines = [line.strip() for line in lines if line and not line.strip().startswith("#")]

            cmds = [self.fixup_script_in_cmd(line) for line in lines]

        return cmds
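
A rough sketch of the two ways commands can be supplied; the `runner` instance, config contents, and file name are hypothetical:

    # via the config file (hypothetical YAML):
    #   commands:
    #     - python train.py --lr=0.01
    #     - python train.py --lr=0.001

    # or via a commands file named by --script, with --multi-commands set
    args = {"multi_commands": True, "script": "cmds.txt"}
    cmds = runner.read_user_multi_commands(using_hp=False, run_script=None,
                                           cmd_parts=[], args=args)
    # blank lines and "#" comment lines in cmds.txt are skipped; each remaining
    # line is passed through fixup_script_in_cmd()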
Example #3
    def __init__(self, run_record, plot_x_metric_name, plot_y_metric_name):
        metric_dict = run_record["data"]
        if plot_x_metric_name not in metric_dict:
            errors.combo_error(
                "step name hyperparameter '{}' (named in XT config file) not found in hp search file"
                .format(plot_x_metric_name))
        if plot_y_metric_name not in metric_dict:
            errors.combo_error(
                "primary_metric hyperparameter '{}' (named in XT config file) not found in hp search file"
                .format(plot_y_metric_name))
        self.x = int(metric_dict[plot_x_metric_name])
        self.y = float(metric_dict[plot_y_metric_name])
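
A small sketch of the run_record shape this constructor expects; the class name `PlotPoint` and the metric names are hypothetical stand-ins:

    run_record = {"data": {"step": "100", "test-acc": "0.93"}}
    point = PlotPoint(run_record, "step", "test-acc")
    # point.x == 100 (int), point.y == 0.93 (float)
    # a missing metric name raises a combo_error rather than a KeyError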
Example #4
    def monitor_with_jupyter(self, workspace, run_name):
        if not self.is_aml_ws(workspace):
            errors.combo_error(
                "the monitor command is only supported for Azure ML runs")

        run_name, actual_ws = run_helper.parse_run_name(workspace, run_name)

        fn = self.azure_ml.make_monitor_notebook(actual_ws, run_name)
        notebook_dir = os.path.dirname(fn)   # avoid shadowing the dir() builtin
        #console.print("jupyter notebook written to: " + fn)
        monitor_cmd = "jupyter notebook --notebook-dir=" + notebook_dir
        console.print("monitoring notebook created; to run:")
        console.print("  " + monitor_cmd)
Example #5
    def parse_string_list(self, tok, scanner, pipe_objects_enabled=True):
        global pipe_object_list
        #print("parse_string_list, tok=", tok)

        if not tok:
            # empty string specified
            value = []
            tok = scanner.scan()  # skip over the empty string
        elif tok == "$":
            if pipe_objects_enabled:
                pipe_object_list = get_xt_objects_from_cmd_piping()
                console.diag("pipe_object_list: {}".format(pipe_object_list))
                console.diag("pipe_object_list: {}".format(pipe_object_list))

            if pipe_objects_enabled and pipe_object_list:
                #print("found '*', pipe_object_list=", pipe_object_list)
                value = pipe_object_list
                console.print("replacing '$' with: ", value)
            else:
                errors.combo_error(
                    "'$' can only be used for piping the output of a previous XT command into this run"
                )

            # mark pipe objects as having been consumed by this parsing
            pipe_object_list = None

            tok = scanner.scan()  # skip over the $
        else:
            # scan a comma separated list of tokens (some of which can be single quoted strings)
            value = []

            while tok is not None:
                if tok.startswith("--"):
                    break

                ev = self.expand_system_values(tok)
                value.append(ev)

                tok = scanner.scan()
                if tok != ",":
                    break

                tok = scanner.scan()  # skip over the comma

        return value, tok
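
A sketch of the comma-separated path through the parser; the `parser` instance and the scanner's token stream are hypothetical, assuming scan() returns the next token or None at end of input:

    # hypothetical token stream: "a" (current tok), then ",", "b", "--verbose"
    value, tok = parser.parse_string_list("a", scanner)
    # value == ["a", "b"]; tok == "--verbose", left for the caller to handle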
Example #6
    def validate_storage_and_mongo(self, mongo):
        '''
        1. ensure storage has been initialized for XT
        2. ensure mongo and storage point to each other
        3. update storage format if needed
        4. update mongo format if needed
        '''

        # ensure storage has been initialized for XT
        self._create_info_container_if_needed()

        # ensure mongo points to our storage
        storage_name = self.provider.get_service_name()
        connected_mongo = mongo.get_service_name()

        mongo_info = mongo.get_mongo_info()
        paired_storage = utils.safe_value(mongo_info, "paired_storage")
        if paired_storage and storage_name != paired_storage:
            errors.combo_error("mongo paired with storage service='{}', but passed XT storage service='{}'".format(
                paired_storage, storage_name))

        storage_info = self._get_storage_info()
        paired_mongo = utils.safe_value(storage_info, "paired_mongo")
        if paired_mongo and connected_mongo != paired_mongo:
            errors.combo_error("storage is paired with mongo service='{}', but passed connection string for mongo service='{}'".format(
                paired_mongo, connected_mongo))

        if not paired_storage:
            mongo_info = {
                "paired_storage": storage_name,
                "storage_version": constants.STORAGE_VERSION
            }
            mongo.set_mongo_info(mongo_info)

        if not paired_mongo:
            storage_info = {
                "paired_mongo": connected_mongo,
                "storage_version": constants.STORAGE_VERSION
            }
            self._set_storage_info(storage_info)

        # only check once (takes ~0.5 secs if already imported)
        # remove this check after all XT users have imported (approx. Dec 2019),
        # but keep the code around (useful for mongodb repair, if needed)
        self.import_jobs_to_mongo_if_needed(mongo)
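
A sketch of the pairing records the first successful validation writes to each side; the service names here are hypothetical:

    store.validate_storage_and_mongo(mongo)
    # mongo side:   {"paired_storage": "xtstore123", "storage_version": constants.STORAGE_VERSION}
    # storage side: {"paired_mongo": "xtmongo123", "storage_version": constants.STORAGE_VERSION}
    # a later run against a mismatched storage/mongo pairing raises a combo_error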
Example #7
    def import_workspace(self,
                         input_file,
                         new_workspace,
                         job_prefix,
                         overwrite,
                         show_output=True):
        if not job_prefix:
            errors.combo_error("job prefix cannot be blank")

        with tempfile.TemporaryDirectory(prefix="import-") as temp_dir:
            self.import_workspace_core(temp_dir,
                                       input_file,
                                       new_workspace,
                                       job_prefix,
                                       overwrite,
                                       show_output=show_output)

        if show_output:
            console.print("  import completed")
Example #8
    def write_hparams_to_files(self, job_id, cmds, fake_submit, using_hp, args):
        # write to job-level sweeps-list file
        #console.print("cmds=", cmds)   
        cmds_text = json.dumps(cmds)

        if not fake_submit:
            self.store.create_job_file(job_id, constants.HP_SWEEP_LIST_FN, cmds_text)

        boxes, pool_info, service_type = box_information.get_box_list(self, job_id=job_id, args=args)
        num_boxes = len(boxes)

        is_distributed = args["distributed"]
        if is_distributed:
            # check for conflicts
            if using_hp:
                errors.combo_error("Cannot do hyperparamer search on a distributed-training job")

            if service_type != "aml":
                errors.combo_error("Distributed-training is currently only supported for AML jobs")

        return boxes, num_boxes
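
A sketch of the conflict checks, with hypothetical values; a real call needs the full args dict that box_information.get_box_list() expects:

    # hypothetical: a distributed AML job without HP search passes both checks
    cmds = ["python train.py"]
    args = {"distributed": True}   # plus the box/pool keys get_box_list() needs
    boxes, num_boxes = submitter.write_hparams_to_files(
        "job1234", cmds, fake_submit=True, using_hp=False, args=args)
    # distributed + using_hp=True, or a non-"aml" service_type, raises a combo_error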
Example #9
    def calc_actual_layout(self, count, layout):
        if not "x" in layout:
            errors.syntax_error(
                "layout string must be of form RxC (R=# rows, C=# cols)")

        r, c = layout.split("x", 1)

        if r:
            r = int(r)
            c = int(c) if c else math.ceil(count / r)
        elif c:
            c = int(c)
            r = int(r) if r else math.ceil(count / c)

        full_count = r * c
        if full_count < count:
            errors.combo_error(
                "too many plots ({}) for layout cells ({})".format(
                    count, full_count))

        return r, c
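
A quick sketch of the layout arithmetic, assuming a hypothetical plotter instance:

    r, c = plotter.calc_actual_layout(5, "2x")   # rows given, cols inferred: (2, 3)
    r, c = plotter.calc_actual_layout(5, "x3")   # cols given, rows inferred: (2, 3)
    plotter.calc_actual_layout(5, "1x3")         # 3 cells < 5 plots: combo_error
    plotter.calc_actual_layout(5, "23")          # no "x": syntax_error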
Example #10
    def import_workspace_core(self, temp_dir, input_file, new_workspace,
                              job_prefix, overwrite, show_output):

        # unzip files and use contents.json
        file_helper.unzip_files(input_file, temp_dir)

        fn_contents = os.path.join(temp_dir, "contents.json")
        text = file_utils.read_text_file(fn_contents)
        contents = json.loads(text)

        workspaces = contents["workspaces"]
        if len(workspaces) > 1:
            errors.combo_error(
                "import of archive files with multiple workspaces not yet supported"
            )

        workspace = workspaces[0]
        jobs = contents["jobs"]

        if not new_workspace:
            new_workspace = workspace

        if self.store.does_workspace_exist(new_workspace):
            errors.combo_error(
                "cannot import to an existing workspace name: {}".format(
                    new_workspace))

        if show_output:
            console.print(
                "\nimporting workspace {} ({} jobs) as {} from: {}".format(
                    workspace, len(jobs), new_workspace, input_file))

        if not overwrite:
            # before making any changes, verify all job names are available
            job_ids = []

            for jc in jobs:
                prev_job_id = jc["job_id"]
                prev_base = prev_job_id.split("_")[-1]
                new_job_id = "{}_{}".format(job_prefix, prev_base)
                job_ids.append(new_job_id)

            filter_dict = {"job_id": {"$in": job_ids}}
            records = self.store.mongo.get_info_for_jobs(
                filter_dict, {"_id": 1})
            if records:
                found_id = records[0]["_id"]   # avoid shadowing the id() builtin
                errors.general_error(
                    "at least 1 job ID with prefix already exists: {}".format(
                        found_id))

        # create the new workspace
        self.store.create_workspace(new_workspace)

        # now, import each JOB
        max_run_seen = 0
        max_end_seen = 0

        for jc in jobs:
            prev_job_id = jc["job_id"]
            prev_base = prev_job_id.split("_")[-1]

            new_job_id = "{}_{}".format(job_prefix, prev_base)
            runs = jc["runs"]

            if show_output:
                console.print("  importing: {} => {} ({} runs)".format(
                    prev_job_id, new_job_id, len(runs)))

            # create MONGO JOB document
            mongo_job_fn = os.path.join(
                temp_dir, "mongo/jobs/{}/mongo_job.json".format(prev_job_id))
            self.import_job_mongo_document(mongo_job_fn, new_workspace,
                                           prev_job_id, new_job_id)

            # create STORAGE JOB blobs
            storage_job_path = os.path.join(
                temp_dir, "storage/jobs/{}".format(prev_job_id))
            self.import_job_storage_blobs(storage_job_path, new_workspace,
                                          prev_job_id, new_job_id)

            # for each run in job
            for run_name in runs:

                run_number = run_helper.get_parent_run_number(run_name)
                max_run_seen = max(max_run_seen, run_number)

                # copy MONGO RUN document
                mongo_run_fn = os.path.join(
                    temp_dir,
                    "mongo/workspaces/{}/runs/{}/mongo_run.json".format(
                        workspace, run_name))
                end_id = self.import_run_mongo_document(
                    mongo_run_fn, workspace, new_workspace, prev_job_id,
                    new_job_id, run_name)
                max_end_seen = max(max_end_seen, end_id)

                # copy STORAGE RUN blobs
                storage_run_path = os.path.join(
                    temp_dir, "storage/workspaces/{}/runs/{}".format(
                        workspace, run_name))
                self.import_run_storage_blobs(storage_run_path, workspace,
                                              new_workspace, prev_job_id,
                                              new_job_id, run_name)

        # update MONGO counters for new workspace
        self.store.mongo.init_workspace_counters(new_workspace,
                                                 1 + max_run_seen,
                                                 1 + max_end_seen)
Example #11
    def export_workspace_core(self, temp_dir, output_file, workspace, tags_all,
                              tags_any, jobs, experiment, show_output):

        # get specified jobs from workspace (by job name, or by workspace name)
        args = {
            "job_list": jobs,
            "tags_all": tags_all,
            "tags_any": tags_any,
            "workspace": workspace,
            "all": True,
            "target": None,
            "available": None,
            "experiment": experiment,
            "service_type": None,
            "username": None,
            "filter": None,
            "columns": ["job", "workspace"]
        }

        job_list, _, _, _, _ = job_helper.get_list_jobs_records(
            self.store, self.config, args)

        if show_output:
            console.print("\nexporting workspace {} ({} jobs) to: {}".format(
                workspace, len(job_list), output_file))

        # build a table of contents structure describing this archive
        archive_version = "1"
        build = constants.BUILD
        username = self.config.get("general", "username")
        dt = datetime.datetime.now()
        dt_text = str(dt)
        storage_name = self.store.get_name()
        mongo_name = self.store.mongo.get_service_name()

        workspaces = []
        jobs = []
        contents = {
            "user": username,
            "export_date": dt_text,
            "archive_version": archive_version,
            "xt_build": build,
            "storage": storage_name,
            "mongo": mongo_name,
            "workspaces": workspaces,
            "jobs": jobs
        }

        first_job = None
        first_ws = None

        # for each job in workspace
        for jr in job_list:
            job_id = jr["job"]
            job_ws = jr["workspace"]

            mongo_runs = self.store.mongo.get_info_for_runs(
                job_ws, {"job_id": job_id}, None)
            run_names = [mr["run_name"] for mr in mongo_runs]

            if show_output:
                console.print("  exporting: {} ({} runs)".format(
                    job_id, len(mongo_runs)))

            job_content = {
                "job_id": job_id,
                "workspace": job_ws,
                "runs": run_names
            }
            jobs.append(job_content)

            if first_job is None:
                first_job = job_id
                first_ws = job_ws

                workspaces.append(job_ws)

            if job_ws != first_ws:
                errors.combo_error("can only export jobs from a single workspace (job {} has ws={}, job {} has ws={})".format(
                    first_job, first_ws, job_id, job_ws))

            # copy MONGO JOB document
            temp_mongo_path = os.path.join(temp_dir,
                                           "mongo/jobs/{}".format(job_id))
            self.export_job_mongo_document(job_id, temp_mongo_path)

            # copy STORAGE JOB blobs
            temp_store_path = os.path.join(temp_dir,
                                           "storage/jobs/{}".format(job_id))
            self.export_job_storage_blobs(job_id, temp_store_path)

            # for each run in job
            for mr in mongo_runs:

                # copy MONGO RUN document
                run_name = mr["run_name"]
                temp_mongo_path = os.path.join(
                    temp_dir,
                    "mongo/workspaces/{}/runs/{}".format(job_ws, run_name))
                self.export_run_mongo_document(mr, temp_mongo_path)

                # copy STORAGE RUN blobs
                temp_store_path = os.path.join(
                    temp_dir,
                    "storage/workspaces/{}/runs/{}".format(job_ws, run_name))
                self.export_run_storage_blobs(job_ws, run_name,
                                              temp_store_path)

        # add contents
        text = json.dumps(contents, indent=4)
        fn_contents = os.path.join(temp_dir, "contents.json")
        file_utils.write_text_file(fn_contents, text)

        # create zip file
        filenames, local_path = file_utils.get_local_filenames(temp_dir + "/**")
        prefix_len = 1 + len(temp_dir)
        file_helper.zip_up_filenames(output_file,
                                     filenames,
                                     compress=True,
                                     remove_prefix_len=prefix_len)
Example #12
    def emit_mount_cmds(
        self,
        cmds,
        storage_name,
        storage_key,
        container,
        store_path,
        mnt_path,
        is_writable,
        install_blobfuse,
        sudo_available,
        use_username,
        use_allow_other,
        env_var_name,
        env_var_name2,
        nonempty=False,
        cleanup_needed=False,
    ):

        if cleanup_needed:
            # on pool machines, for any action, always UNMOUNT mnt_dir, which an
            # earlier run may have left mounted or used for downloading files
            if self.is_windows:
                self.append(cmds, "rd /s /q {}".format(mnt_path))
            else:
                sudo = "sudo " if sudo_available else ""

                self.append(cmds,
                            "{}fusermount -u -q {}".format(sudo, mnt_path))

                # do NOT rm the folder here: if fusermount -u failed,
                # rm -rf could delete cloud data
                #self.append(cmds,"{}rm -rf {}".format(sudo, mnt_path))

        if self.is_windows:
            # TODO: provide pseudo-mount for local machine by using data-local and store-local config properties
            errors.combo_error(
                "Mounting of Azure storage (for '{}') not supported by target OS (Windows)"
                .format(store_path))

        # for now, all commands can assume linux form
        self.append(
            cmds,
            "echo MOUNTING {} to container {}".format(mnt_path, container))
        full_mnt_path = mnt_path + "/" + store_path
        self.append(
            cmds, "echo running export {}={}".format(env_var_name,
                                                     full_mnt_path))

        self.append(cmds,
                    'echo setting {}="{}"'.format(env_var_name, full_mnt_path))

        self.append(cmds, 'export {}="{}"'.format(env_var_name, full_mnt_path))
        self.append(cmds, 'export {}="{}"'.format(env_var_name2,
                                                  full_mnt_path))

        requests = [{
            "container": container,
            "mnt_dir": mnt_path,
            "readonly": not is_writable
        }]
        sub_cmds = self.create_blobfuse_commands(
            storage_name,
            storage_key,
            sudo_available,
            requests,
            install_blobfuse=install_blobfuse,
            use_username=use_username,
            use_allow_other=use_allow_other,
            nonempty=nonempty)
        cmds += sub_cmds
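
A sketch of a typical call for a Linux target; the `builder` instance and all argument values here are hypothetical:

    cmds = []
    builder.emit_mount_cmds(cmds, "mystorage", storage_key,
        container="jobs", store_path="job1234/data", mnt_path="/mnt/xt_store",
        is_writable=False, install_blobfuse=True, sudo_available=True,
        use_username=True, use_allow_other=True,
        env_var_name="XT_DATA_DIR", env_var_name2="XT_DATA_MNT",
        cleanup_needed=True)
    # cmds now holds: fusermount -u, echo/export lines, then the blobfuse mount commands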