Example #1
    def cancel_runs_by_user(self, box_name):
        '''
        Args:
            box_name: the name of the box the runs ran on (pool service)
        Returns:
            cancel_results: a list of kill result records
                (keys: workspace, run_name, exper_name, killed, status, before_status)
        '''
        cancel_results = []

        # get list of active jobs from batch
        active_jobs = self.get_active_jobs()
        console.diag("after get_active_jobs()")

        if active_jobs:
            for job_record in active_jobs:
                # watch out for older jobs that didn't have service_job_info/service_info_by_node properties
                service_job_info = utils.safe_value(job_record,
                                                    "service_job_info")
                service_info_by_node = utils.safe_value(
                    job_record, "service_info_by_node")

                if service_job_info and service_info_by_node:
                    job_id = job_record["job_id"]
                    cancel_result = self.cancel_job(service_job_info,
                                                    service_info_by_node)
                    for _, node_result in cancel_result.items():
                        cancel_results.append(node_result)

        return cancel_results
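
A minimal usage sketch for the records returned above; the 'backend' instance and box name are illustrative, and the record keys follow the docstring:

# hypothetical caller: report which runs were actually killed
results = backend.cancel_runs_by_user("vm23")
for rec in results:
    if rec["killed"]:
        print("cancelled {}/{} (was: {})".format(
            rec["workspace"], rec["run_name"], rec["before_status"]))
    else:
        print("left {}/{} as-is (status: {})".format(
            rec["workspace"], rec["run_name"], rec["status"]))
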
Example #2
    def download_files(self, wildcard, dest_folder):
        container, path, wc_target = self._get_container_path_target(wildcard)
        console.diag("container={}, path={}, wc_target={}, wildcard={}".format(
            container, path, wc_target, wildcard))

        return self.store._download_files(container, path, wc_target,
                                          dest_folder)
Example #3
    def get_info_for_jobs(self, filter_dict, fields_dict=None):

        cursor = self.mongo_with_retries(
            "get_info_for_jobs",
            lambda: self.mongo_db["__jobs__"].find(filter_dict, fields_dict))
        job_records = list(cursor) if cursor else []

        console.diag("after get_info_for_jobs()")
        return job_records
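
The filter_dict and fields_dict pass straight through to pymongo's find(filter, projection); a hedged sketch of typical values (field names assumed from the surrounding examples):

# hypothetical query: completed or cancelled jobs of one experiment
filter_dict = {
    "exper_name": "exper7",
    "status": {"$in": ["completed", "cancelled"]},
}

# projection: return only these fields (pymongo adds _id unless excluded)
fields_dict = {"job_id": 1, "status": 1, "exper_name": 1}

job_records = mongo.get_info_for_jobs(filter_dict, fields_dict)
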
Example #4
def copy_data_to_submit_logs(args, data, fn):
    submit_logs = args["submit_logs"]
    if submit_logs:
        text = json.dumps(data)
        # copy text to submit logs
        fn_dest = os.path.join(submit_logs, os.path.basename(fn))
        with open(fn_dest, "w") as outfile:
            outfile.write(text)
        console.diag("copied {} to: {}".format(fn, fn_dest))
Example #5
def copy_to_submit_logs(args, fn, fnx=None):
    submit_logs = args["submit_logs"]
    if submit_logs:
        # copy file to submit logs (fnx, when given, overrides the destination name)
        if not fnx:
            fnx = fn
        fn_dest = os.path.join(submit_logs, os.path.basename(fnx))
        shutil.copyfile(fn, fn_dest)
        console.diag("copied {} to: {}".format(fn, fn_dest))
Example #6
def get_filtered_sorted_limit_runs(store,
                                   config,
                                   show_gathering,
                                   col_dict=None,
                                   args=None):

    console.diag("start of: get_filtered_sorted_limit_runs")
    # required
    run_list = args["run_list"]

    # optional
    pool = utils.safe_value(args, "target")
    available = utils.safe_value(args, "available")
    workspace = utils.safe_value(args, "workspace")

    if workspace:
        store.ensure_workspace_exists(workspace, flag_as_error=True)

    mongo = store.get_mongo()

    # have MONGO update any old RUN documents to new format
    fixup_mongo_runs.fixup_runs_if_needed(mongo.mongo_db, workspace)

    # get info about run properties
    user_to_actual, std_cols_desc = get_run_property_dicts()
    actual_to_user = {value: key for key, value in user_to_actual.items()}

    builder = ReportBuilder(config, store, client=None)

    # get list of specified runs
    pure_run_list, actual_ws = expand_run_list(store, mongo, workspace,
                                               run_list)
    if run_list and not pure_run_list:
        errors.general_error("no run(s) found")

    # build a filter dict for all specified filters
    filter_dict = build_run_filter_dict(pure_run_list, user_to_actual, builder,
                                        args)

    # if show_gathering:
    #     console.print("gathering run data...", flush=True)

    # get the mongo records for the matching RUNS
    records, using_default_last, last = builder.get_mongo_records(
        mongo,
        filter_dict,
        workspace,
        "runs",
        actual_to_user,
        col_dict=col_dict,
        args=args)

    console.diag("end of: get_filtered_sorted_limit_runs")

    return records, using_default_last, user_to_actual, available, builder, last, std_cols_desc
Example #7
    def make_local_snapshot(self, snapshot_dir, code_dir, dest_name, omit_list):
        '''
        keep code simple (and BEFORE upload fast):
            - always copy code dir to temp dir
            - if needed, copy xtlib subdir
            - later: if needed, add 2 extra controller files
            - later: zip the whole thing at once & upload 
        '''
        if dest_name and dest_name != ".":
            snapshot_dir += "/" + dest_name

        console.diag("before create local snapshot")

        # fixup slashes for good comparison
        snapshot_dir = os.path.realpath(snapshot_dir)

        # fully qualify path to code_dir for simpler code & more informative logging
        code_dir = os.path.realpath(code_dir)

        recursive = True

        if code_dir.endswith("**"):
            code_dir = code_dir[:-2]   # drop the **
        elif code_dir.endswith("*"):
            recursive = False

        # copy user's source dir (as per config file options)
        omit_list = utils.parse_list_option_value(omit_list)

        # build list of files matching both criteria
        filenames = file_helper.get_filenames_from_include_lists(None, omit_list, recursive=recursive, from_dir=code_dir)

        file_utils.ensure_dir_exists(snapshot_dir)
        prefix_len = 2 if code_dir == "." else len(code_dir)
        copy_count = 0

        # copy files recursively, preserving subdir names
        for fn in filenames:
            fn = os.path.realpath(fn)           # fix slashes

            if fn.startswith(code_dir) and fn != code_dir:
                fn_dest = snapshot_dir + "/" + fn[prefix_len:]
                file_utils.ensure_dir_exists(file=fn_dest)
                shutil.copyfile(fn, fn_dest)
            else:
                shutil.copy(fn, snapshot_dir)
            copy_count += 1

        #console.diag("after snapshot copy of {} files".format(copy_count))

        return snapshot_dir
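
The trailing-wildcard convention on code_dir controls recursion; a standalone sketch of that rule (helper name invented):

def split_wildcard(code_dir):
    # "**" => copy the entire tree; "*" => top-level files only
    if code_dir.endswith("**"):
        return code_dir[:-2], True
    if code_dir.endswith("*"):
        return code_dir[:-1], False
    return code_dir, True

assert split_wildcard("src/**") == ("src/", True)
assert split_wildcard("src/*") == ("src/", False)
assert split_wildcard("src") == ("src", True)
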
Example #8
    def get_info_for_runs(self, ws_name, filter_dict, fields_dict=None):

        # filter_dict = {}
        # filter_dict["run_name"] = {"$in": run_names}

        cursor = self.mongo_with_retries(
            "get_info_for_runs",
            lambda: self.mongo_db[ws_name].find(filter_dict, fields_dict))
        run_records = list(cursor) if cursor else []

        console.diag("after get_info_for_runs()")
        return run_records
Example #9
    def get_all_experiments_in_ws(self, ws_name):
        # cannot get "distinct" command to work ("command not supported")
        #cursor = db["__jobs__"].distinct("ws_name")

        cursor = self.mongo_with_retries(
            "get_all_experiments_in_ws",
            lambda: self.mongo_db["__jobs__"].find({"ws_name": ws_name},
                                                   {"exper_name": 1}))
        exper_names = [
            rec["exper_name"] for rec in cursor if "exper_name" in rec
        ]
        exper_names = list(set(exper_names))  # remove dups

        console.diag("after get_all_experiments()")
        return exper_names
Example #10
    def run_aml_job(self, job_id, workspace, aml_ws_name, trainer, experiment,
                    xt_exper_name, aml_exper_name, compute_target, cwd,
                    run_name, box_name, node_index, repeat, fake_submit, args):
        monitor_cmd = None

        console.diag("before AML experiment.submit(trainer)")

        # SUBMIT the run and return an AML run object
        if fake_submit:
            aml_run = None
            aml_run_id = "fake_aml_id"
            aml_run_number = 999
        else:
            aml_run = experiment.submit(trainer)
            aml_run_id = aml_run.id
            aml_run_number = aml_run.number

        # copy to submit-logs
        utils.copy_data_to_submit_logs(args, self.serializable_trainer,
                                       "aml_submit.json")

        console.diag("after AML experiment.submit(trainer)")

        config = self.config
        username = args["username"]
        description = args["description"]
        aggregate_dest = args["aggregate_dest"]
        jupyter_monitor = args["jupyter_monitor"]

        aml_run_name = aml_exper_name + ".{}".format(run_name)

        # set "xt_run_name" property for fast access to run in future
        if not fake_submit:
            aml_run.add_properties({"xt_run_name": aml_run_name})
            aml_run.set_tags({"xt_run_name": aml_run_name})

        # # partially log the start of the RUN
        # self.store.start_run_core(workspace, run_name, exper_name=xt_exper_name, description=description, username=username,
        #         box_name=box_name, app_name=None, repeat=repeat, is_parent=False, job_id=job_id, pool=compute_target, node_index=node_index,
        #         aggregate_dest=aggregate_dest, path=cwd, aml_run_id=aml_run_id)

        if jupyter_monitor:
            fn = self.make_monitor_notebook(aml_ws_name, aml_run_name)
            notebook_dir = os.path.dirname(fn)
            #console.print("jupyter notebook written to: " + fn)
            monitor_cmd = "jupyter notebook --notebook-dir=" + notebook_dir

        return run_name, monitor_cmd, aml_run_name, aml_run_number, aml_run_id
Example #11
    def _read_blob(self, ws_name, blob_path):
        console.diag("_read_blob: ws_name={}, blob_path={}".format(
            ws_name, blob_path))

        if not self.does_workspace_exist(ws_name):
            # avoid 10 retries and unfriendly storage errors
            errors.store_error("container doesn't exist: " + ws_name)

        if not self.provider.does_blob_exist(ws_name, blob_path):
            # avoid 10 retries and unfriendly storage errors
            errors.store_error(
                "blob doesn't exist: container={}, path={}".format(
                    ws_name, blob_path))

        blob_text = self.provider.get_blob_text(ws_name, blob_path)
        return blob_text
Example #12
    def parse_string_list(self, tok, scanner, pipe_objects_enabled=True):
        global pipe_object_list
        #print("parse_string_list, tok=", tok)

        if not tok:
            # empty string specified
            value = []
            tok = scanner.scan()  # skip over the empty string
        elif tok == "$":
            if pipe_objects_enabled:
                global pipe_object_list
                pipe_object_list = get_xt_objects_from_cmd_piping()
                console.diag("pipe_object_list: {}".format(pipe_object_list))

            if pipe_objects_enabled and pipe_object_list:
                #print("found '*', pipe_object_list=", pipe_object_list)
                value = pipe_object_list
                console.print("replacing '$' with: ", value)
            else:
                errors.combo_error(
                    "'$' can only be used for piping the output of a previous XT command into this run"
                )

            # mark pipe objects as having been consumed by this parsing
            pipe_object_list = None

            tok = scanner.scan()  # skip over the $
        else:
            # scan a comma separated list of tokens (some of which can be single quoted strings)
            value = []

            while tok is not None:
                if tok.startswith("--"):
                    break

                ev = self.expand_system_values(tok)
                value.append(ev)

                tok = scanner.scan()
                if tok != ",":
                    break

                tok = scanner.scan()  # skip over the comma

        return value, tok
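
This parser accepts three input shapes: an empty string, "$" (consume piped XT objects), or a comma-separated token list terminated by an option. A self-contained sketch of just the comma-list branch, over a pre-scanned token list (the real method pulls tokens from its scanner):

def parse_comma_list(tokens):
    # collect tokens until an option ("--...") or a missing comma ends the list
    value = []
    i = 0
    while i < len(tokens):
        tok = tokens[i]
        if tok.startswith("--"):
            break
        value.append(tok)
        i += 1
        if i >= len(tokens) or tokens[i] != ",":
            break
        i += 1  # skip over the comma
    return value, tokens[i:]

print(parse_comma_list(["run23", ",", "run24", ",", "run25", "--last"]))
# -> (['run23', 'run24', 'run25'], ['--last'])
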
Example #13
    def create_vault_if_needed(self):
        if not self.vault:
            console.diag("before vault login")
            from xtlib import xt_vault

            # create our vault manager
            vault_url = self.get_vault_url()
            team_name = self.get("general", "xt-team-name")

            azure_tenant_id = self.get("general", "azure-tenant-id")
            self.vault = xt_vault.XTVault(vault_url,
                                          team_name,
                                          azure_tenant_id=azure_tenant_id)

            authentication = self.get("general", "authentication")
            self.vault.init_creds(authentication)

            console.diag("after vault login")
Example #14
    def get_next_sequential_ws_id(self, ws_name, path, default_next_run):
        db = self.mongo_db

        assert not "/" in ws_name
        assert not "/" in path

        console.diag("ws={}, path={}, default_next_run={}".format(
            ws_name, path, default_next_run))

        # does a counters doc exist for this ws_name?
        cursor = db.ws_counters.find({"_id": ws_name}).limit(1)
        if not cursor.count():
            console.diag(
                "LEGACY ws={} found in get_next_sequential_ws_id".format(
                    ws_name))

            # we need BOTH next_run and next_end for a new record
            last_id = self.get_legacy_end_id(ws_name)
            default_next_end = 1 + last_id if last_id else 1

            info = {
                "_id": ws_name,
                "next_run": default_next_run,
                "next_end": default_next_end,
                "next_child": {}
            }
            db.ws_counters.insert_one(info)

        document = db.ws_counters.find_and_modify({"_id": ws_name},
                                                  update={"$inc": {
                                                      path: 1
                                                  }},
                                                  new=False)
        next_id = utils.safe_nested_value(document, path)

        if not next_id:
            # child id's start at 0; if we got that, skip it and get next one
            document = db.ws_counters.find_and_modify(
                {"_id": ws_name}, update={"$inc": {path: 1}}, new=False)
            next_id = utils.safe_nested_value(document, path)

        return next_id
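
find_and_modify is the legacy pymongo call; the same atomic-counter pattern with the current API might look like the sketch below. The connection string and names are assumptions, and this variant returns the post-increment value rather than seeding the counter and returning the pre-increment value as the method above does:

from pymongo import MongoClient, ReturnDocument

client = MongoClient("mongodb://localhost:27017")  # assumed connection
db = client["xt_db"]                               # illustrative db name

def next_sequential_id(ws_name, path="next_run"):
    # $inc is atomic, so concurrent callers each see a unique value;
    # upsert=True creates the counters doc on first use
    doc = db.ws_counters.find_one_and_update(
        {"_id": ws_name},
        {"$inc": {path: 1}},
        upsert=True,
        return_document=ReturnDocument.AFTER)
    return doc[path]
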
Example #15
    def download_runs(self, store, ws_name, run_group_name, run_group_type,
                      hp_config_cloud_path, hp_config_local_dir):
        # Download the all_runs file
        local_cache_path = "{}/{}/{}/".format(hp_config_local_dir, ws_name,
                                              run_group_type)
        local_config_file_path = "{}{}".format(local_cache_path,
                                               "hp-config.yaml")

        if run_group_name == "experiment":
            console.print(
                "downloading runs for EXPERIMENT={}...".format(run_group_type))
            # files are at EXPERIMENT LEVEL
            # read SWEEPS file
            if not store.does_experiment_file_exist(ws_name, run_group_type,
                                                    hp_config_cloud_path):
                errors.store_error(
                    "missing experiment hp_config file (ws={}, exper={}, fn={})"
                    .format(ws_name, run_group_type, hp_config_cloud_path))
            store.download_file_from_experiment(ws_name, run_group_type,
                                                hp_config_cloud_path,
                                                local_config_file_path)

            # read ALLRUNS info aggregated in EXPERIMENT
            allrun_records = store.get_all_runs(run_group_name, ws_name,
                                                run_group_type)
        else:
            console.print(
                "downloading runs for JOB={}...".format(run_group_type))
            # files are at JOB LEVEL
            # read SWEEPS file
            if not store.does_job_file_exist(run_group_type,
                                             hp_config_cloud_path):
                errors.store_error(
                    "missing job hp_config file (job={}, fn={})".format(
                        run_group_type, hp_config_cloud_path))
            store.download_file_from_job(run_group_type, hp_config_cloud_path,
                                         local_config_file_path)

            # read ALLRUNS info aggregated in JOB
            allrun_records = store.get_all_runs(run_group_name, ws_name,
                                                run_group_type)

        console.diag("after downloading all runs")
        return local_config_file_path, allrun_records
Example #16
    def remove_workspace(self, ws_name):
        self.remove_cache(ws_name)

        # remove associated mongo_db container
        container = self.mongo_db[ws_name]
        container.drop()
        count = container.count()

        console.diag("  after mongo_db container={} dropped, count={}=".format(
            container, count))

        # remove counters for this workspace
        cmd = lambda: self.mongo_db.ws_counters.remove({"_id": ws_name})
        self.mongo_with_retries("remove_workspace", cmd, ignore_error=True)

        # remove legacy counters for this workspace
        end_id = ws_name + "-end_id"
        cmd = lambda: self.mongo_db.ws_counters.remove({"_id": end_id})
        self.mongo_with_retries("remove_workspace", cmd)
Example #17
    def _list_directories(self, container, path, wc_target, subdirs=0):
        console.diag(
            "_list_directories: container={}, path={}, wc_target={}, subdirs={}"
            .format(container, path, wc_target, subdirs))

        service_name = self.provider.get_service_name()
        dd = {"store_name": "XT Store ({})".format(service_name)}
        #console.print("dd=", dd)

        if not container:
            # getting the list of all containers is a special case
            if path:
                errors.syntax_error(
                    "path cannot be set when the container is set to '/'")

            folder, folder_names = self._get_root_folders()
            folders = [folder]

            if subdirs:
                base_path = ""
                for ws_name in folder_names:
                    # get blobs from AZURE
                    console.diag("reading blobs for ws={}".format(ws_name))
                    blobs = self.provider.list_blobs(ws_name,
                                                     path=None,
                                                     return_names=False)
                    blobs = list(blobs)

                    ws_folders = self._build_folders_from_blobs(
                        blobs, ws_name, base_path, subdirs)

                    folders += ws_folders
        else:
            # get blobs from AZURE
            actual_path = path if path else None
            blobs = self.provider.list_blobs(container,
                                             path=actual_path,
                                             return_names=False)
            blobs = list(blobs)

            if wc_target:
                # apply filter
                blobs = [
                    blob for blob in blobs if fnmatch(blob.name, wc_target)
                ]

            console.diag("list_blobs returned: len(blobs)={}".format(
                len(blobs)))

            folders = self._build_folders_from_blobs(blobs, container, path,
                                                     subdirs)

        # filter folders as per subdirs
        if subdirs is not True:
            # subdirs is set to an int value
            #console.print("filtering by subdirs=", subdirs)
            folders = [f for f in folders if f["level"] <= subdirs]

        dd["folders"] = folders
        return dd
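
The wc_target filter above relies on fnmatch, where "*" matches any run of characters (including slashes); a small illustration with invented blob names:

from fnmatch import fnmatch

blob_names = ["runs/run23/log.txt", "runs/run23/model.pt", "runs/run24/log.txt"]
matches = [bn for bn in blob_names if fnmatch(bn, "runs/*/log.txt")]
print(matches)  # ['runs/run23/log.txt', 'runs/run24/log.txt']
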
Example #18
    def cancel_run(self, ws_name, run_name):
        console.diag("start of azure_ml.cancel_run()")

        target_run = self.get_run(ws_name, run_name)
        if not target_run:
            errors.store_error("run not found: {}".format(run_name))

        console.diag("after get_run() call")

        before_status = target_run.status.lower()
        if before_status in ["preparing", "queued"]:
            target_run.cancel()
            killed = True
            status = "cancelled"
        elif before_status in ["starting", "running"]:
            target_run.cancel()
            killed = True
            status = "cancelled"
        else:
            killed = False
            status = target_run.status

        console.diag("after run.cancel() call")

        return {
            "workspace": ws_name,
            "run_name": run_name,
            "cancelled": killed,
            "status": status
        }
Example #19
    def is_controller_running(self,
                              box_name,
                              box_addr,
                              port=constants.CONTROLLER_PORT):
        if not port:
            port = constants.CONTROLLER_PORT

        # KISS: just try to connect
        is_running = False

        try:
            ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
            console.diag(
                "  trying to connect with: ip_addr={}, port={}".format(
                    ip_addr, port))

            self.connect(box_name, ip_addr, port=port)
            is_running = True
        except BaseException as ex:
            console.diag("  received exception: " + str(ex))
            is_running = False
            #raise ex   # uncomment to see the stack trace

        console.diag("  is_controller_running: " + str(is_running))
        return is_running
Example #20
    def snapshot_all_code(self, snapshot_dir, cmd_parts, args):
        '''
        make local snapshot of each code_dir (and xtlib, if needed)
        '''
        code_dirs = args["code_dirs"]
        xtlib_capture = args["xtlib_upload"]
        code_omit = args["code_omit"]
        script_dir = None

        code_upload = args["code_upload"]
        
        # this step should always be done so that script_dir is removed from cmd_parts
        script_dir = self.remove_script_dir_from_parts(cmd_parts)

        if code_upload:
            for i, code_dir in enumerate(code_dirs):
                # fixup "$scriptdir" relative paths
                if "$scriptdir" in code_dir:
                    code_dir = code_dir.replace("$scriptdir", script_dir)

                if "::" in code_dir:
                    code_dir, dest_dir = code_dir.split("::")
                else:
                    dest_dir = "."
                self.make_local_snapshot(snapshot_dir, code_dir, dest_dir, code_omit)
        else:
            script_dir = snapshot_dir

        if xtlib_capture:
            # copy XTLIB directory to "xtlib" subdir of temp
            xtlib_dir = file_utils.get_xtlib_dir()
            dest_dir = snapshot_dir + "/xtlib"
            file_utils.ensure_dir_deleted(dest_dir)

            # don't copy the "demo_files" directory
            shutil.copytree(xtlib_dir, dest_dir, ignore=shutil.ignore_patterns("demo_files"))

        console.diag("after create local snapshot")
        return script_dir
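
Each code_dirs entry can carry a "::" suffix naming its destination subdir in the snapshot; a quick sketch of that convention (paths invented):

def split_code_dir(code_dir):
    # "src::lib" lands in the snapshot's "lib" subdir; plain "src" at the root
    if "::" in code_dir:
        return tuple(code_dir.split("::", 1))
    return code_dir, "."

assert split_code_dir("src::lib") == ("src", "lib")
assert split_code_dir("$scriptdir/utils") == ("$scriptdir/utils", ".")
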
Example #21
    def import_jobs_to_mongo_if_needed(self, mongo):
        console.diag("before mongo import check")
        found = mongo.does_jobs_exist()
        console.diag("after mongo import check")

        if not found:
            # first time we have seen this data; import all jobs into mongo-db now
            console.print("one-time import of jobs data into mongo-db:")
            job_names = self.get_job_names()
            if job_names:
                console.print("  {:,} jobs will be imported".format(
                    len(job_names)))
                count = 0
                for job_id in job_names:
                    job_json = self.read_job_info_file(job_id)
                    dd = json.loads(job_json)
                    mongo.update_job_info(job_id, dd)
                    count += 1
                    if count % 100 == 0:
                        console.print("  " + job_id)

                console.print("  {} jobs imported".format(count))
Example #22
    def _download_files(self, container, path, wc_target, dest_folder):
        #console.print("ws_name=", ws_name, ", ws_wildcard=", ws_wildcard)
        files_copied = []

        names = self._list_wild_blobs(container,
                                      path,
                                      wc_target,
                                      include_folder_names=True)
        console.diag("_download_files: names=", names)

        blob_dir = path
        bd_index = 1 + len(blob_dir)  # add one for the trailing slash
        #console.print("blob_dir=", blob_dir, ", bd_index=", bd_index)

        for bn in names:
            base_bn = bn[bd_index:]
            dest_fn = dest_folder + "/" + base_bn
            console.detail("_download_files: bn=", bn, ", dest_fn=", dest_fn)

            file_utils.ensure_dir_exists(file=dest_fn)
            self.provider.get_blob_to_path(container, bn, dest_fn)
            files_copied.append(dest_fn)

        return files_copied
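
The bd_index arithmetic strips the blob folder prefix plus its slash, so subdir structure is preserved under dest_folder; shown standalone with invented names:

blob_dir = "runs/run23"
bd_index = 1 + len(blob_dir)             # skip prefix and its trailing slash

bn = "runs/run23/checkpoints/last.pt"
base_bn = bn[bd_index:]                  # "checkpoints/last.pt"
dest_fn = "downloads" + "/" + base_bn    # "downloads/checkpoints/last.pt"
print(dest_fn)
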
Example #23
    def connect_to_controller(self, box_name=None, ip_addr=None, port=None):
        '''
        establish communication with the XT controller process on the specified box.
        return True if connection established, False otherwise.
        '''
        connected = False
        console.diag("init_controler: box_name={}".format(box_name))

        if self.conn == box_name:
            connected = True
        else:
            if ip_addr:
                box_addr = ip_addr
            else:
                info = box_information.get_box_addr(self.config, box_name,
                                                    self.store)
                box_addr = info["box_addr"]
                controller_port = info["controller_port"]
                self.token = info["box_secret"]

                ip_addr = self.core.get_ip_addr_from_box_addr(box_addr)
                port = controller_port if controller_port else constants.CONTROLLER_PORT

            # the controller should now be running - try to connect
            try:
                console.diag("  connecting to controller")
                self.connect(box_name, ip_addr, port=port)
                console.diag("  connection successful!")

                # magic step: allows our callback to work correctly!
                # this must always be executed (even if self.conn is already true)
                bgsrv = rpyc.BgServingThread(self.conn)
                console.diag("  now running BgServingThread")
                connected = True
            except BaseException as ex:
                #self.report_controller_init_failure(box_name, box_addr, self.port, ex)
                # most common reasons for failure: not yet running (backend service) or finished running
                pass

        return connected
Example #24
    def get_run(self, ws_name, run_name):
        if not "." in run_name:
            errors.general_error(
                "Azure ML run name must be of the form: exper.runname")

        ws = self.get_aml_ws(ws_name)
        console.diag("after get_aml_ws() call")

        exper_name, run_part = run_name.split(".")
        experiment = Experiment(ws, name=exper_name)
        runs = experiment.get_runs(properties={"xt_run_name": run_name})
        console.diag("after experiment.get_runs() call")

        runs = list(runs)
        console.diag("after list(runs), len={}".format(len(runs)))

        # run_number = int(run_part[3:])
        # target_run = None

        #runs = [run for run in runs if run.number == run_number]
        target_run = runs[0] if len(runs) else None

        return target_run
Example #25
def set_timer(timeout):
    console.print("set_timer called: timeout=", timeout)
    time.sleep(timeout)
    console.diag("timer triggered!")
    plt.close("all")
Example #26
    def create_run(self, job_id, user_cmd_parts, box_name="local", parent_name=None, rerun_name=None, node_index=0, 
            using_hp=False, repeat=None, app_info=None, path=None, exper_name=None, pool_info=None, fake_submit=False, 
            search_style=None, args=None):
        '''
        'create_run' does the following:
            - creates a new run name (and matching run directory in the store)
            - logs a "created" record in the run log
            - logs a "created" record in the workspace summary log
            - logs a "cmd" record in the run log
            - logs an optional "notes" record in the run log
            - captures the run's "before" files to the store's run directory
        '''
        console.diag("create_run: start")

        app_name = None   # app_info.app_name
        box_nane = args["box"]
        pool = args["pool"]
        run_name = ""
        log_to_store = self.config.get("logging", "log")
        aggregate_dest = args["aggregate_dest"]

        if log_to_store:
            if not exper_name:
                exper_name = input("experiment name (for grouping this run): ")

            #console.print("calling store.start_run with exper_name=", exper_name)
            username = args["username"]
            description = args["description"]
            workspace = args["workspace"]

            console.diag("create_run: before start_run")

            service_type = args["service_type"]
            compute = args["target"]
            search_type = args["search_type"]
            sku = args["sku"]

            if not sku:
                # make default sku explicit
                if pool_info and "sku" in pool_info:
                    sku = pool_info["sku"].lower()

            # create RUN in store
            if fake_submit:
                run_name = "fake_run123"
            else:
                if parent_name:
                    run_name = self.store.start_child_run(workspace, parent_name, box_name=box_name, username=username,
                        exper_name=exper_name, app_name=app_name, pool=pool, job_id=job_id, node_index=node_index, sku=sku,
                        description=description, aggregate_dest=aggregate_dest, path=path, compute=compute, service_type=service_type,
                        search_style=search_style)
                else:
                    is_parent = search_style != "single"

                    run_name = self.store.start_run(workspace, exper_name=exper_name, box_name=box_name, app_name=app_name, 
                        username=username, repeat=repeat, pool=pool, job_id=job_id, node_index=node_index, sku=sku,
                        description=description, aggregate_dest=aggregate_dest, path=path, compute=compute, service_type=service_type, 
                        search_style=search_style, is_parent=is_parent)

            console.diag("create_run: after start_run")

            # always log cmd (for re-run purposes)
            xt_cmd = args["xt_cmd"]

            if not fake_submit:
                self.store.log_run_event(workspace, run_name, "cmd", {"cmd": user_cmd_parts, "xt_cmd": xt_cmd })

            # for now, don't log args (contain private credentials and not clear if we really need it)
            # record all "args" (from cmd line, user config, default config) in log (for audit/re-run purposes)
            #self.store.log_run_event(workspace, run_name, "args", args)

            store_type = self.config.get_storage_type()
            full_run_name = utils.format_workspace_exper_run(store_type, workspace, exper_name, run_name)

            # log NOTES record
            if not fake_submit:
                if self.config.get("logging", "notes") in ["before", "all"]:
                    text = input("Notes: ")
                    if text:
                        self.store.log_run_event(workspace, run_name, "notes", {"notes": text})
        else:
            full_run_name = ""

        console.diag("create_run: after logging")
        workspace = args['workspace']

        return run_name, full_run_name, box_name, pool
Example #27
    def dispatch(self,
                 args,
                 is_rerun=False,
                 capture_output=False,
                 raise_syntax_exception=False):
        self.raise_syntax_exception = raise_syntax_exception

        # TODO: change to cmd_parts parsing, which naturally separates options cleanly (utils.cmd_split)

        # be sure to reset this for each parse (for multi-command XT sessions)
        global explict_options
        explict_options = {}

        orig_text = " ".join(args)
        self.dispatch_cmd = orig_text

        text = self.replace_curlies_with_quotes(orig_text)
        console.diag("fixed cmd={}".format(text))

        scanner = Scanner(text)
        tok = scanner.scan()
        #console.print("first tok=", tok)

        # process any ROOT FLAGS
        if root_cmd_info:
            tok = self.process_root_options(scanner, tok)
        else:
            # there is no command to process --console, so set it explicitly now
            console.set_level("normal")

        console.diag("start of command parsing: {}".format(text))

        # process any options before the cmd as RAW options
        # raw_options = []
        # tok = self.collect_raw_options(raw_options, scanner, tok)

        # process COMMAND keywords
        cmd_info, tok = self.get_cmd_info(tok, scanner)

        if "kwgroup_name" in cmd_info:
            cmd_info = get_command("help")

        self.cmd_info = cmd_info

        # # user type incomplete command - display appropriate help
        # if raise_syntax_exception:
        #     errors.syntax_error("incomplete command")

        # if command_help_func:
        #     # parse any help-specific options
        #     help_options = {}
        #     self.parse_options(help_options, options, scanner, tok)

        #     caller = self.impl_dict[command_help_func.__module__]
        #     kwgroup_help_func(caller, cmd_info)
        #     return
        # else:
        #     errors.env_error("no registered 'help' command")

        cmd_name = cmd_info["name"]
        self.cmd_words = cmd_name.replace("_", " ")
        func = cmd_info["func"]
        options = cmd_info["options"]
        arguments = cmd_info["arguments"]
        options_before_args = cmd_info["options_before_args"]

        # command-specific help?
        # if "help" in raw_options:
        #     help_value = raw_options["help"]
        #     if help_value != None:
        #         self.syntax_error("unexpected text after '--help': " + help_value)
        if tok == "--help":
            help_value = scanner.scan()

            if help_value != None:
                self.syntax_error("unexpected text after '--help': " +
                                  help_value)

            caller = self.impl_dict[command_help_func.__module__]
            if self.preprocessor:
                # no arg_dict has been built yet on the help path
                self.preprocessor("help", caller, {})

            command_help_func("help", caller, cmd_info)
            return

        # build a dictionary of arguments and options to be passed
        arg_dict = {}

        if options_before_args:
            # options come before arguments
            tok = self.parse_options(arg_dict, options, scanner, tok)
            tok = self.process_arguments(scanner, tok, arguments, arg_dict)
        else:
            # arguments come before options
            tok = self.process_arguments(scanner, tok, arguments, arg_dict)
            tok = self.parse_options(arg_dict, options, scanner, tok)

        # there should be no remaining tokens
        if tok:
            errors.argument_error("end of input", tok)

        full_arg_dict = self.validate_and_add_defaults(arguments, options,
                                                       arg_dict)

        console.diag("dispatching to command func")

        # select the caller using function's module name
        caller = self.impl_dict[func.__module__]
        if capture_output:
            caller.set_capture_output(True)

        if is_rerun:
            full_arg_dict["is_rerun"] = 1

        # call the matching command function with collected func args
        if self.preprocessor:
            self.preprocessor("command", caller, full_arg_dict)

        if cmd_info["pass_by_args"]:
            func(caller, args=full_arg_dict)
        else:
            func(caller, **full_arg_dict)

        console.diag("end of command processing")
        output = None

        if capture_output:
            output = caller.set_capture_output(False)

        return output
Example #28
def main(cmd=None, disable_quickstart=False, capture_output=False, mini=False):

    utils.init_logging(constants.FN_XT_EVENTS, logger, "XT session")

    # if no cmd was passed in, fall back to the shell args (skipping python's first arg)
    if cmd:
        # treat as if it came from the shell (for consistent debugging/support)
        console.diag("orig cmd={}".format(cmd))

        # shlex on linux drops single quotes around strings, but windows does not
        orig_args = shlex.split(cmd)
        console.diag("shlex args={}".format(orig_args))
    else:
        orig_args = sys.argv[1:]
        console.diag("orig_args={}".format(orig_args))

    cmd = " ".join(orig_args)
    cmd = cmd.strip()

    use_server = "--quic" in cmd
    if not use_server:
        from .helpers.xt_config import XTConfig
        config = XTConfig(create_if_needed=True)
        use_server = config.get("general", "quick-start")

    mid_elapsed = time.time() - xt_start_time
    #console.print("mid_elapsed={:.2f}".format(mid_elapsed))

    if not use_server or disable_quickstart:
        # NORMAL start-up mode
        from xtlib import xt_cmds
        output = xt_cmds.main(cmd,
                              capture_output=capture_output,
                              mini=mini,
                              raise_syntax_exception=False)
    else:
        # QUICK-START mode
        output = None
        log.info("using xt_server")

        import psutil

        need_start = True

        for proc in psutil.process_iter():
            try:
                # Check if process name contains the given name string.
                ptext = str(proc.cmdline())

                # if "python" in ptext:
                #     console.print(ptext)

                if "python" in ptext and "xt_server.py" in ptext:
                    need_start = False
                    break
            except BaseException as ex:
                logger.exception(
                    "Error while enumerating processes looking for xt_server, ex={}"
                    .format(ex))
                pass

        if need_start:
            from .cmd_core import CmdCore
            CmdCore.start_xt_server()

        # for now, always turn on stack traces for server-run cmd
        cmd = "--stack-trace " + cmd

        cmd_dict = {"text": cmd, "cwd": os.getcwd()}

        # retry up to 5 secs (to handle case where xt_server is being restarted)
        retry_count = 0

        for i in range(5):
            try:
                run_cmd_on_server(cmd_dict, retry_count)
                break
            except BaseException as ex:
                logger.exception(
                    "Error retry exceeded sending cmd to xt_server.  Last ex={}"
                    .format(ex))
                console.print(".", end="", flush=True)
                #console.print(ex)
                time.sleep(1)
                retry_count += 1

    elapsed = time.time() - xt_start_time
    #console.print("(elapsed: {:.2f} secs)".format(elapsed))

    # add adjustment for average exit time
    console.diag("end of xt_run (includes exit time={:.2f})".format(EXIT_TIME),
                 exit_time=EXIT_TIME)

    # don't return output if we were called from xt.exe (it will console.print a confusing "[]" to output)
    return output if capture_output else None
Example #29
def main(cmd=None,
         new_start_time=None,
         capture_output=False,
         mini=False,
         raise_syntax_exception=True):
    '''
    This is the XT app, used to manage and scale ML experiments, support various backends (Philly, Azure Batch, Azure ML).
    '''
    if new_start_time:
        global xt_start_time
        xt_start_time = new_start_time

    import numpy as np
    seed = 5
    if seed:
        np.random.seed(seed)

    if cmd:
        cmd = cmd.strip()

        if cmd.startswith("xt "):
            cmd = cmd[3:]
        elif cmd == "xt":
            cmd = ""

        args = utils.cmd_split(cmd)

        # remove empty args
        args = [arg for arg in args if arg]
    else:
        # if caller did not supply cmd
        args = sys.argv[1:]

    # when executing multiple commands, reset the feedback for each command
    feedback.reset_feedback()

    #console.print("cmd=", cmd, ", args=", args)
    console.diag("in xt_cmds.main")

    #console.print("config=", config)
    fn_local_config = get_fn_local_config(args)

    impl_shared = ImplShared()
    config = impl_shared.init_config(fn_local_config, mini=mini)
    store = impl_shared.store
    mini = config.mini_mode

    cmd_providers = config.get("providers", "command")
    impl_dict = {}

    for name, code_path in cmd_providers.items():
        package, class_name = code_path.rsplit(".", 1)
        module = importlib.import_module(package)
        impl_class = getattr(module, class_name)

        impl = impl_class(config, store)
        impl_dict[package] = impl

        if name == "help":
            impl.set_mini_mode(mini)

    # this enables QFE to match a function by its module name, to the class instance to process the command
    # impl_dict = {"xtlib.impl_utilities": utilities, "xtlib.impl_storage": storage,
    #     "xtlib.impl_compute": compute, "xtlib.impl_help": help_impl}

    # this parses args and calls the correct command function with its args and options correctly set.
    # the config object supplies the default value for most options and flags.
    dispatcher = qfe.Dispatcher(
        impl_dict, config, preprocessor=impl_shared.pre_dispatch_processing)

    if mini:
        # a dict of commands + arg/options to be surfaced (None means use all args/options)
        show_commands = {
            "cancel_all": ["target"],
            "cancel_job": ["job-id"],
            "cancel_run": ["run-names"],
            "clear_credentials": [],
            "config_cmd": ["default", "create", "reset"],
            "create_demo": ["destination", "response", "overwrite"],
            "create_services_template": [],
            "download": ["local-path", "store-path"],
            "extract": ["runs", "dest-dir", "browse", "workspace"],
            "help": ["command", "about", "browse", "version"],
            "help_topics": ["topic", "browse"],
            "list_blobs": ["path"],
            "list_jobs": [
                "job-list", "experiment", "all", "first", "last", "filter",
                "sort", "reverse", "status", "available"
            ],
            "list_runs": [
                "run-list", "job", "experiment", "all", "first", "last",
                "filter", "sort", "reverse", "status", "available"
            ],
            "monitor": ["name"],
            "run": [
                "script", "script-args", "experiment", "hp-config", "max-runs",
                "nodes", "runs", "search-type", "target"
            ],
            "upload": ["local-path", "store-path"],
            "view_console": ["name", "target", "workspace", "node-index"],
            "view_metrics": ["runs", "metrics"],
            "view_run": ["run-name"]
        }

        dispatcher.show_commands(show_commands)

        qfe.remove_hidden_commands()

    # hide under-development commands
    hide_commands = [
        "collect_logs", "start_tensorboard", "stop_tensorboard", "zip",
        "unzip", "wget"
    ]

    # hide internal cmds (for xt development use only)
    hide_commands.append("generate_help")
    dispatcher.hide_commands(hide_commands)

    # expand symbols like $lastjob, $lastrun
    impl_shared.expand_xt_symbols(args)

    # this is the NORMAL outer exception handling block, but
    # also see the client/server exception handling in xt_run.py
    try:
        text = dispatcher.dispatch(
            args,
            capture_output=capture_output,
            raise_syntax_exception=raise_syntax_exception)
    except BaseException as ex:
        #console.print("in Exception Handler: utils.show_stack_trace=", utils.show_stack_trace)
        # does user want a stack-trace?
        logger.exception(
            "Error during dispatcher.dispatch, args={}".format(args))

        exc_type, exc_value, exc_traceback = sys.exc_info()
        errors.process_exception(exc_type, exc_value, exc_traceback)

    return text
Example #30
    def get_all_runs(self,
                     aggregator_dest,
                     ws_name,
                     job_or_exper_name,
                     filter_dict=None,
                     fields_dict=None,
                     use_cache=True,
                     fn_cache=None,
                     first_count=None,
                     last_count=None,
                     sort_dict=None):
        '''
        cache design:
            - organize all cached run information by the way it was accessed: a folder for each workspace (created on demand),
              and under each, a folder specifying the filter_dict and fields_dict.  This way, we only use cache records for
              exactly matching query info.

            - whenever sort, first_count, or last_count is used (that is, included in the mongo db query), we should set "use_cache" to False.

            - note: since the Azure Cosmos version of mongo-db doesn't correctly support sort/first/last (totally busted as of Aug 2019), we never
              include sort/first/last in the mongo db query.

            - as of 12/20/2019, the only code that correctly uses fn_cache is hparam_search; all other code should call with use_cache=False.
        '''
        # PERF-critical function
        # below code not yet cache-compliant
        use_cache = False

        records = []
        target = 0
        cache = None

        if use_cache and not fn_cache:
            # fn_cache = self.run_cache_dir + "/" + constants.ALL_RUNS_CACHE_FN
            # fn_cache = fn_cache.replace("$aggregator", ws_name)
            use_cache = False  # play it safe for now

        if use_cache and os.path.exists(fn_cache):
            # read CACHED runs
            started = time.time()
            cache = utils.load(fn_cache)
            elapsed = time.time() - started

            target = max(
                [rec["end_id"] if "end_id" in rec else 0 for rec in cache])
            console.print(
                "loaded {:,} records in {:.2f} secs from cache: {}".format(
                    len(cache), elapsed, fn_cache))

        if not filter_dict:
            if aggregator_dest == "job":
                filter_dict = {"job_id": job_or_exper_name}
            elif aggregator_dest == "experiment":
                filter_dict = {"exper_name": job_or_exper_name}

        # if not fields_dict:
        #     # by default, do NOT return inner log records
        #     fields_dict = {"log_records": 0}

        # adjust filter to get only missing records
        if target:
            filter_dict["end_id"] = {"$gt": target}

        #console.print("  mongo: filter: {}, fields: {}, sort: {}".format(filter_dict, fields_dict, sort_dict))
        console.diag("  mongo: filter: {}, fields: {}".format(
            filter_dict, fields_dict))

        # limit query to avoid "message max exceeded" errors
        max_query_records = 3000
        started = time.time()

        #records = self.mongo_db[ws_name].find(filter_dict, fields_dict)
        cmd_func = lambda: self.mongo_db[ws_name].find(filter_dict, fields_dict)
        cursor = self.mongo_with_retries("get_all_runs", cmd_func)

        # SORT TOTALLY BUSTED ON COSMOS:
        #   - sort of "-id" returns random order each time
        #   - sort of "test-acc" returns 0 records (if ANY missing values, NO records returned)
        #   - docs say pass a dict, but code wants list of 2-tuples (pymongo library)

        # if sort_dict:
        #     items = list(sort_dict.items())
        #     key, value = items[0]
        #     import pymongo
        #     cursor = cursor.sort("job", 1)  # key, value)

        # adjust cursor per first_count, last_count

        # because SORT is busted, we can't use mongo for first/last either
        # if last_count:
        #     if last_count is True:
        #         last_count = 25
        #     avail = cursor.count()
        #     skip_count = avail - last_count
        #     if skip_count > 0:
        #         cursor = cursor.skip(skip_count)
        # elif first_count:
        #     if first_count is True:
        #         first_count = 25
        #     cursor = cursor.limit(first_count)

        records = list(cursor)

        return_count = len(records)
        total_count = self.mongo_db[ws_name].count()

        elapsed = time.time() - started
        console.diag(
            "  mongo query returned {} records (of {}), took: {:.2f} secs".format(
                return_count, total_count, elapsed))

        if cache:
            cache += records
            records = cache

        if return_count and use_cache:
            # write to cache
            started = time.time()
            utils.save(records, fn_cache)
            elapsed = time.time() - started
            console.print(
                "wrote {:,} records to cache, took: {:.2f} secs".format(
                    len(records), elapsed))

        return records
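
The end_id watermark is what makes the cache incremental: cached records set a high-water mark, and the query asks only for newer rows. A standalone sketch of that merge logic (record shape assumed from the code above):

def merge_cache(cache, fetch_records):
    # high-water mark of what the cache already holds
    target = max([rec.get("end_id", 0) for rec in cache], default=0)

    # only pull records newer than the watermark
    filter_dict = {"end_id": {"$gt": target}} if target else {}
    return cache + fetch_records(filter_dict)

# illustrative use against an in-memory "store"
store = [{"run": "run1", "end_id": 1}, {"run": "run2", "end_id": 2}]
cache = [{"run": "run1", "end_id": 1}]

fetch = lambda fd: [r for r in store
                    if r["end_id"] > fd.get("end_id", {}).get("$gt", 0)]
print(merge_cache(cache, fetch))  # cache plus the newly stored run2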