    def ensure_workspace_exists(self, ws_name, flag_as_error=True):
        self._check_ws_name(ws_name)
        exists = self.does_workspace_exist(ws_name)
        if not exists:
            if flag_as_error:
                errors.store_error("Workspace not found: {}".format(ws_name))
            self.create_workspace(ws_name)
    def delete_workspace(self, ws_name):
        result = self.provider.delete_container(ws_name)

        if not result:
            errors.store_error("could not delete workspace: " + ws_name)

        return result
    def get_run_log(self, ws_name, run_name):
        blob_path = self._run_path(run_name) + "/" + constants.RUN_LOG

        if not self.provider.does_blob_exist(ws_name, blob_path):
            # limited support for old-style run logging
            blob_path = run_name + "/" + constants.RUN_LOG

        #console.print("blob_path=", blob_path)
        if not self.provider.does_blob_exist(ws_name, blob_path):
            errors.store_error("unknown run: ws={}, run_name={}".format(
                ws_name, run_name))

        #console.print("get_run_log: ws_name=", ws_name, ", blob_path=", blob_path)

        # watch out for 0-length blobs (azure will throw retryable exception if you use "get_blob_to_text")
        blob = self.provider.get_blob_properties(ws_name, blob_path)
        #console.print("blob.properties.content_length=", blob.properties.content_length)
        lines = []

        if blob.properties.content_length:
            text = self.provider.get_blob_text(ws_name, blob_path)
            #console.print("get_run_log: text=", text)

            lines = text.split("\n")
            #console.print("lines=", lines)
            lines = [json.loads(line) for line in lines if line.strip()]

        return lines
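
# A minimal, standalone sketch of the JSON-lines parsing performed by
# get_run_log() above; the sample records here are hypothetical.
import json

def parse_run_log(text):
    # each non-blank line of a run log is one JSON record
    return [json.loads(line) for line in text.split("\n") if line.strip()]

sample = '{"event": "created"}\n\n{"event": "completed"}\n'
assert [r["event"] for r in parse_run_log(sample)] == ["created", "completed"]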
def get_run_record(store, workspace, run_name, fields_dict=None):
    run_records = get_run_records(store, workspace, [run_name], fields_dict)
    if not run_records:
        errors.store_error("Run {} does not exist in workspace {}".format(
            run_name, workspace))
    rr = run_records[0]
    return rr
    def root_files(self, root_name, use_blobs=False):
        if use_blobs:
            return RootBlobs(self, root_name)
        else:
            #return store_azure_file.RootFiles(self, ws_name)
            errors.store_error(
                "Root files are not currently supported (use RootBlobs)")
    def cancel_run(self, ws_name, run_name):
        console.diag("start of azure_ml.cancel_run()")

        target_run = self.get_run(ws_name, run_name)
        if not target_run:
            errors.store_error("run not found: {}".format(run_name))

        console.diag("after get_run() call")

        before_status = target_run.status.lower()
        if before_status in ["preparing", "queued", "starting", "running"]:
            target_run.cancel()
            killed = True
            status = "cancelled"
        else:
            killed = False
            status = target_run.status

        console.diag("after run.cancel() call")

        return {
            "workspace": ws_name,
            "run_name": run_name,
            "cancelled": killed,
            "status": status
        }
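
# A standalone sketch of the status test used by cancel_run() above: only
# runs that have not yet finished can be cancelled; any other status is
# assumed terminal and is left unchanged.
CANCELLABLE_STATUSES = {"preparing", "queued", "starting", "running"}

def can_cancel(status):
    return status.lower() in CANCELLABLE_STATUSES

assert can_cancel("Queued")
assert not can_cancel("completed")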
def get_client_cs(core, job_id, node_index):
    '''
    instantiate the backend service that owns the specified job node and
    request its client connection string
    '''
    cs = None
    box_secret = None

    filter = {"_id": job_id}
    jobs = core.store.mongo.get_info_for_jobs(filter, None)
    if not jobs:
        errors.store_error("unknown job_id: {}".format(job_id))

    job = jobs[0]
    node_id = utils.node_id(node_index)

    compute = utils.safe_value(job, "compute")
    secrets_by_node = utils.safe_value(job, "secrets_by_node")
    if not secrets_by_node:
        errors.store_error("unknown node_index={} for job={}".format(
            node_index, job_id))

    box_secret = utils.safe_value(secrets_by_node, node_id)

    service_info_by_node = utils.safe_value(job, "service_info_by_node")
    node_info = utils.safe_value(service_info_by_node, node_id)

    if compute and node_info:
        backend = core.create_backend(compute)
        cs = backend.get_client_cs(node_info)

    cs_plus = {"cs": cs, "box_secret": box_secret, "job": job}
    return cs_plus
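
# A standalone sketch of the guarded lookups in get_client_cs() above, with
# utils.safe_value() approximated by dict.get(); the job record is hypothetical.
def safe_value(record, key):
    return record.get(key) if record else None

job = {"secrets_by_node": {"node0": "s3cr3t"}, "compute": "batch"}
secrets_by_node = safe_value(job, "secrets_by_node")
assert safe_value(secrets_by_node, "node0") == "s3cr3t"
assert safe_value(None, "anything") is None   # a missing parent is tolerated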
    def create_workspace(self, ws_name, description=None):
        ''' create workspace as top level container '''
        self._check_ws_name(ws_name)

        if self.does_workspace_exist(ws_name):
            errors.store_error("workspace already exists: {}".format(ws_name))

        # note: this operation often must retry several times if same container has just been deleted
        #console.print("creating workspace=", ws_name)

        # MULTIPROCESS: this is the step that will fail (if any)
        result = self.provider.create_container(ws_name)
        if not result:
            errors.store_error("could not create workspace: " + ws_name)

        # MULTIPROCESS: safe now

        # create a holder file for RUNS directory
        runs_holder_fn = constants.RUNS_DIR + "/" + constants.HOLDER_FILE
        self._create_blob(ws_name, runs_holder_fn, "1", True)

        # create a holder file for EXPERIMENTS directory
        experiments_holder_fn = constants.EXPERIMENTS_DIR + "/" + constants.HOLDER_FILE
        self._create_blob(ws_name, experiments_holder_fn, "1", True)

        # create NEXT_RUN_NAME (for extra safety, ensure file doesn't already exist)
        blob_fn = constants.WORKSPACE_DIR + "/" + constants.WORKSPACE_NEXT
        self._create_blob(ws_name, blob_fn, "1", True)
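
# Why the holder files: blob storage has no real directories, so writing a
# small placeholder blob under each logical folder makes that folder appear
# in listings before any real content exists. A sketch of the layout, with
# the constants approximated by hypothetical values:
RUNS_DIR, EXPERIMENTS_DIR, HOLDER_FILE = "runs", "experiments", "__dir_holder__"

holder_paths = [d + "/" + HOLDER_FILE for d in (RUNS_DIR, EXPERIMENTS_DIR)]
assert holder_paths == ["runs/__dir_holder__", "experiments/__dir_holder__"]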
    def ensure_share_exists(self, share_name, flag_as_error=True):
        container_name = utils.make_share_name(share_name)
        self._check_ws_name(container_name)
        exists = self.does_share_exist(share_name)
        if not exists:
            if flag_as_error:
                errors.store_error("Share not found: {}".format(share_name))
            self.create_share(share_name)
    def delete_share(self, share_name):
        container_name = utils.make_share_name(share_name)
        self._check_ws_name(container_name)

        result = self.provider.delete_container(container_name)

        if not result:
            errors.store_error("could not delete share: " + share_name)

        return result
def validate_job_name_with_ws(store, job_name, validate):
    job_name = job_name.lower()
    if not is_job_id(job_name):
        return errors.syntax_error("Illegal job name: {}".format(job_name))

    ws = store.get_job_workspace(job_name)
    if validate and not ws:
        errors.store_error("job '{}' does not exist".format(job_name))

    return ws
    def create_share(self, share_name, description=None):
        ''' create share as top level container '''

        container_name = utils.make_share_name(share_name)
        self._check_ws_name(container_name)

        # note: this operation often must retry several times if same container has just been deleted
        #console.print("creating share=", share_name)

        # MULTIPROCESS: this is the step that will fail (if any)
        result = self.provider.create_container(container_name)
        if not result:
            errors.store_error("could not create share: " + share_name)
    def download_file(self,
                      fn,
                      dest_fn,
                      progress_callback=None,
                      use_snapshot=False):
        container, path, wc_target = self._get_container_path_target(fn)
        #console.print("container=", container, ", path=", path)

        # ensure blob exists ourselves so we can issue a friendly error
        if not self.store.provider.does_blob_exist(container, path):
            errors.store_error("Blob not found: container={}, path={}".format(
                container, path))

        # ensure the directory of the dest_fn exists
        file_utils.ensure_dir_exists(file=dest_fn)

        if use_snapshot:
            # create temp. snapshot
            if progress_callback:
                progress_callback(status="creating-snapshot")
            props = self.store.provider.snapshot_blob(container, path)
            snapshot_id = props.snapshot

            # download the snapshot
            if progress_callback:
                progress_callback(status="downloading-snapshot")
            text = self.store.provider.get_blob_to_path(
                container,
                path,
                dest_fn,
                snapshot=snapshot_id,
                progress_callback=progress_callback)

            # delete the snapshot
            if progress_callback:
                progress_callback(status="deleting-snapshot")
            self.store.provider.delete_blob(container,
                                            path,
                                            snapshot=snapshot_id)

            if progress_callback:
                progress_callback(status="deleted-snapshot")
        else:
            # normal download
            text = self.store.provider.get_blob_to_path(
                container, path, dest_fn, progress_callback=progress_callback)

        return text
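
# A standalone sketch of the snapshot pattern in download_file() above
# (snapshot, download, delete), using a hypothetical provider stub and a
# try/finally so the temp snapshot is removed even if the download fails
# (the method above deletes it after a successful download).
class _StubProvider:
    def snapshot_blob(self, container, path):
        return "snap-001"
    def get_blob_to_path(self, container, path, dest_fn, snapshot=None):
        return "blob contents"
    def delete_blob(self, container, path, snapshot=None):
        pass

def download_via_snapshot(provider, container, path, dest_fn):
    snapshot_id = provider.snapshot_blob(container, path)
    try:
        return provider.get_blob_to_path(container, path, dest_fn,
                                         snapshot=snapshot_id)
    finally:
        # always remove the temporary snapshot
        provider.delete_blob(container, path, snapshot=snapshot_id)

assert download_via_snapshot(_StubProvider(), "ws1", "runs/run1/log",
                             "out.txt") == "blob contents"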
    def _read_blob(self, ws_name, blob_path):
        console.diag("_read_blob: ws_name={}, blob_path={}".format(
            ws_name, blob_path))

        if not self.does_workspace_exist(ws_name):
            # avoid 10 retries and unfriendly storage errors
            errors.store_error("container doesn't exist: " + ws_name)

        if not self.provider.does_blob_exist(ws_name, blob_path):
            # avoid 10 retries and unfriendly storage errors
            errors.store_error(
                "blob doesn't exist: container={}, path={}".format(
                    ws_name, blob_path))

        blob_text = self.provider.get_blob_text(ws_name, blob_path)
        return blob_text
    def copy_run(self, source_workspace_name, source_run_name,
                 dest_workspace_name, dest_run_name):
        if self.does_run_exist(dest_workspace_name, dest_run_name):
            errors.store_error(
                "destination run already exists: ws={}, run={}".format(
                    dest_workspace_name, dest_run_name))

        # copy a single blob at a time
        #for source_blob_path in self.bs.list_blob_names(source_workspace_name, source_run_name):
        for source_blob in self.provider.list_blobs(
                source_workspace_name,
                path=self._run_path(source_run_name) + "/"):
            dest_blob_path = self._run_path(
                dest_run_name) + "/" + self._remove_first_node(source_blob)

            # copy single blob within same storage service
            self.provider.copy_blob(source_workspace_name, source_blob,
                                    dest_workspace_name, dest_blob_path)
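
# A standalone sketch of the path rewriting in copy_run() above: each source
# blob path has its first node (the source run folder) swapped for the
# destination run folder. _remove_first_node() is approximated here.
def remove_first_node(blob_path):
    return blob_path.split("/", 1)[1]

source_blob = "run23/output/model.bin"      # hypothetical blob path
dest_blob = "run57/" + remove_first_node(source_blob)
assert dest_blob == "run57/output/model.bin"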
    def download_runs(self, store, ws_name, run_group_name, run_group_type,
                      hp_config_cloud_path, hp_config_local_dir):
        # download the hp-config file and the aggregated all-runs records
        local_cache_path = "{}/{}/{}/".format(hp_config_local_dir, ws_name,
                                              run_group_type)
        local_config_file_path = "{}{}".format(local_cache_path,
                                               "hp-config.yaml")

        if run_group_name == "experiment":
            console.print(
                "downloading runs for EXPERIMENT={}...".format(run_group_type))
            # files are at EXPERIMENT LEVEL
            # read SWEEPS file
            if not store.does_experiment_file_exist(ws_name, run_group_type,
                                                    hp_config_cloud_path):
                errors.store_error(
                    "missing experiment hp_config file (ws={}, exper={}, fn={})"
                    .format(ws_name, run_group_type, hp_config_cloud_path))
            store.download_file_from_experiment(ws_name, run_group_type,
                                                hp_config_cloud_path,
                                                local_config_file_path)

            # read ALLRUNS info aggregated in EXPERIMENT
            allrun_records = store.get_all_runs(run_group_name, ws_name,
                                                run_group_type)
        else:
            console.print(
                "downloading runs for JOB={}...".format(run_group_type))
            # files are at JOB LEVEL
            # read SWEEPS file
            if not store.does_job_file_exist(run_group_type,
                                             hp_config_cloud_path):
                errors.store_error(
                    "missing job hp_config file (job={}, fn={})".format(
                        run_group_type, hp_config_cloud_path))
            store.download_file_from_job(run_group_type, hp_config_cloud_path,
                                         local_config_file_path)

            # read ALLRUNS info aggregated in JOB
            allrun_records = store.get_all_runs(run_group_name, ws_name,
                                                run_group_type)

        console.diag("after downloading all runs")
        return local_config_file_path, allrun_records
def get_client_cs(core, ws, run_name):

    cs = None
    box_secret = None

    filter = {"_id": run_name}
    runs = core.store.mongo.get_info_for_runs(ws, filter, {"run_logs": 0})
    if not runs:
        errors.store_error("Unknown run: {}/{}".format(ws, run_name))

    if runs:
        from xtlib import job_helper

        run = runs[0]
        job_id = utils.safe_value(run, "job_id")
        node_index = utils.safe_value(run, "node_index")

        cs_plus = job_helper.get_client_cs(core, job_id, node_index)
        cs = cs_plus["cs"]
        box_secret = cs_plus["box_secret"]

    return cs, box_secret
def validate_run_name(store,
                      ws,
                      run_name,
                      error_if_invalid=True,
                      parse_only=False):
    run_name = correct_slash(run_name)
    if "/" in run_name:
        parts = run_name.split("/")
        if len(parts) != 2:
            errors.syntax_error("invalid format for run name: " + run_name)
        ws, run_name = parts

    run_name = run_name.lower()
    if not parse_only and "*" not in run_name:
        if not store.mongo.does_run_exist(ws, run_name):
            if error_if_invalid:
                errors.store_error(
                    "run '{}' does not exist in workspace '{}'".format(
                        run_name, ws))
            else:
                return None, None, None
    return ws, run_name, ws + "/" + run_name
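
# A standalone sketch of the "ws/run" parsing in validate_run_name() above;
# note that only the run part is lowercased, matching the method.
def split_run_name(ws, run_name):
    if "/" in run_name:
        parts = run_name.split("/")
        if len(parts) != 2:
            raise ValueError("invalid format for run name: " + run_name)
        ws, run_name = parts
    return ws, run_name.lower()

assert split_run_name("ws1", "run23") == ("ws1", "run23")
assert split_run_name("ws1", "ws2/Run23") == ("ws2", "run23")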
    def download(self,
                 store_path,
                 local_path,
                 share,
                 workspace,
                 experiment,
                 job,
                 run,
                 feedback,
                 snapshot,
                 show_output=True):

        use_blobs = True
        use_multi = True  # default until we test if store_path exists as a file/blob
        download_count = 0

        fs = self.create_file_accessor(use_blobs, share, workspace, experiment,
                                       job, run)

        # test for existence of store_path as a blob/file
        if "*" not in store_path and "?" not in store_path:
            if fs.does_file_exist(store_path):
                use_multi = False

        if local_path:
            # expand ~/ in front of local path
            local_path = os.path.expanduser(local_path)
        else:
            # path not specified for local
            if use_multi:
                local_path = "."
            else:
                local_path = "./" + os.path.basename(store_path)

        uri = fs.get_uri(store_path)

        # default store folder to recursive
        if use_multi and "*" not in store_path and "?" not in store_path:
            store_path += "/**"

        use_snapshot = snapshot

        feedback_progress = FeedbackProgress(feedback, show_output)
        progress_callback = feedback_progress.progress if feedback else None

        if use_multi:
            # download MULTI blobs/files

            what = "blobs" if use_blobs else "files"
            single_what = what[0:-1]

            if show_output:
                console.print("collecting {} names from: {}...".format(
                    single_what, uri),
                              end="")

            _, blob_names = fs.get_filenames(store_path, full_paths=False)

            if show_output:
                console.print()

            if len(blob_names) == 0:
                console.print("no matching {} found in: {}".format(what, uri))
                return 0
            elif len(blob_names) == 1:
                what = "blob" if use_blobs else "file"

            if show_output:
                console.print("\ndownloading {} {}...:".format(
                    len(blob_names), what))

            file_utils.ensure_dir_exists(local_path)
            max_name_len = max(
                [len(local_path + "/" + name) for name in blob_names])
            name_width = 1 + max_name_len
            #console.print("max_name_len=", max_name_len, ", name_width=", name_width)

            for f, bn in enumerate(blob_names):
                dest_fn = file_utils.fix_slashes(local_path + "/" + bn)

                if show_output:
                    file_msg = "file {}/{}".format(1 + f, len(blob_names))
                    console.print("  {2:}: {1:<{0:}} ".format(
                        name_width, dest_fn + ":", file_msg),
                                  end="",
                                  flush=True)

                feedback_progress.start()
                full_bn = uri + "/" + bn if uri else bn
                fs.download_file(full_bn,
                                 dest_fn,
                                 progress_callback=progress_callback,
                                 use_snapshot=use_snapshot)
                feedback_progress.end()

                download_count += 1
        else:
            # download SINGLE blobs/files
            what = "blob" if use_blobs else "file"

            if not fs.does_file_exist(store_path):
                errors.store_error("{} not found: {}".format(what, uri))

            local_path = file_utils.fix_slashes(local_path)

            if show_output:
                console.print("\nfrom {}, downloading {}:".format(uri, what))
                console.print("  {}:    ".format(local_path),
                              end="",
                              flush=True)

            feedback_progress.start()
            fs.download_file(store_path,
                             local_path,
                             progress_callback=progress_callback,
                             use_snapshot=use_snapshot)
            feedback_progress.end()

            download_count += 1

        return download_count
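
# A standalone sketch of the single-vs-multi decision in download() above:
# a store path is treated as a multi-file download when it contains
# wildcards (or, in the method, when it does not exist as a single
# blob/file).
def has_wildcard(store_path):
    return "*" in store_path or "?" in store_path

assert has_wildcard("runs/run1/**")
assert not has_wildcard("runs/run1/console.txt")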
def get_job_record(store, job_id, fields_dict=None):
    job_records = get_job_records(store, [job_id], fields_dict)
    if not job_records:
        errors.store_error("job {} does not exist".format(job_id))
    jr = job_records[0]
    return jr