def list_artifacts(self, path=None):
    from azure.storage.blob._models import BlobPrefix

    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    container_client = self.client.get_container_client(container)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    results = container_client.walk_blobs(name_starts_with=prefix)
    for r in results:
        if not r.name.startswith(artifact_path):
            raise MlflowException(
                "The name of the listed Azure blob does not begin with the specified"
                " artifact path. Artifact path: {artifact_path}. Blob name:"
                " {blob_name}".format(artifact_path=artifact_path, blob_name=r.name))
        if isinstance(r, BlobPrefix):  # This is a prefix for items in a subdirectory
            subdir = posixpath.relpath(path=r.name, start=artifact_path)
            if subdir.endswith("/"):
                subdir = subdir[:-1]
            infos.append(FileInfo(subdir, True, None))
        else:  # Just a plain old blob
            file_name = posixpath.relpath(path=r.name, start=artifact_path)
            infos.append(FileInfo(file_name, False, r.size))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    (bucket, artifact_path) = self.parse_oss_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/" if dest_path else ""
    self._get_oss_bucket(bucket)
    results = self.oss_bucket.list_objects(prefix=prefix, delimiter='/')
    for obj in results.object_list:  # is file
        file_path = obj.key
        self._verify_listed_object_contains_artifact_path_prefix(
            listed_object_path=file_path, artifact_path=artifact_path)
        file_rel_path = posixpath.relpath(path=file_path, start=artifact_path)
        file_size = obj.size
        infos.append(FileInfo(file_rel_path, False, file_size))
    for subdir_path in results.prefix_list:  # is dir
        self._verify_listed_object_contains_artifact_path_prefix(
            listed_object_path=subdir_path, artifact_path=artifact_path)
        subdir_rel_path = posixpath.relpath(path=subdir_path, start=artifact_path)
        infos.append(FileInfo(subdir_rel_path, True, None))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = build_path(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    s3_client = self._get_s3_client()
    paginator = s3_client.get_paginator("list_objects_v2")
    results = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/')
    for result in results:
        # Subdirectories will be listed as "common prefixes" due to the way we made the request
        for obj in result.get("CommonPrefixes", []):
            subdir = obj.get("Prefix")[len(artifact_path) + 1:]
            if subdir.endswith("/"):
                subdir = subdir[:-1]
            infos.append(FileInfo(subdir, True, None))
        # Objects listed directly will be files
        for obj in result.get('Contents', []):
            name = obj.get("Key")[len(artifact_path) + 1:]
            size = int(obj.get('Size'))
            infos.append(FileInfo(name, False, size))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    from azure.storage.blob.models import BlobPrefix

    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        # Separator needs to be fixed as '/' because of azure blob storage pattern.
        # Do not change to os.path.join because in Windows system path separator is '\'
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    marker = None  # Used to make next list request if this one exceeded the result limit
    while True:
        results = self.client.list_blobs(container, prefix=prefix, delimiter='/',
                                         marker=marker)
        for r in results:
            if isinstance(r, BlobPrefix):  # This is a prefix for items in a subdirectory
                subdir = r.name[len(artifact_path) + 1:]
                if subdir.endswith("/"):
                    subdir = subdir[:-1]
                infos.append(FileInfo(subdir, True, None))
            else:  # Just a plain old blob
                file_name = r.name[len(artifact_path) + 1:]
                infos.append(FileInfo(file_name, False, r.properties.content_length))
        # Check whether a new marker is returned, meaning we have to make another request
        if results.next_marker:
            marker = results.next_marker
        else:
            break
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/" if dest_path else ""
    s3_client = self._get_s3_client()
    paginator = s3_client.get_paginator("list_objects_v2")
    results = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter="/")
    for result in results:
        # Subdirectories will be listed as "common prefixes" due to the way we made the request
        for obj in result.get("CommonPrefixes", []):
            subdir_path = obj.get("Prefix")
            self._verify_listed_object_contains_artifact_path_prefix(
                listed_object_path=subdir_path, artifact_path=artifact_path)
            subdir_rel_path = posixpath.relpath(path=subdir_path, start=artifact_path)
            if subdir_rel_path.endswith("/"):
                subdir_rel_path = subdir_rel_path[:-1]
            infos.append(FileInfo(subdir_rel_path, True, None))
        # Objects listed directly will be files
        for obj in result.get("Contents", []):
            file_path = obj.get("Key")
            self._verify_listed_object_contains_artifact_path_prefix(
                listed_object_path=file_path, artifact_path=artifact_path)
            file_rel_path = posixpath.relpath(path=file_path, start=artifact_path)
            file_size = int(obj.get("Size"))
            infos.append(FileInfo(file_rel_path, False, file_size))
    return sorted(infos, key=lambda f: f.path)
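# A minimal standalone sketch of the boto3 pattern used above: paginating
# "list_objects_v2" with Delimiter="/" makes S3 report immediate "subdirectories"
# as CommonPrefixes and direct children as Contents. The bucket name and prefix
# below are hypothetical, for illustration only.
import boto3

s3 = boto3.client("s3")
paginator = s3.get_paginator("list_objects_v2")
for page in paginator.paginate(Bucket="my-bucket", Prefix="runs/1/artifacts/", Delimiter="/"):
    for common_prefix in page.get("CommonPrefixes", []):
        print("dir: ", common_prefix["Prefix"])
    for obj in page.get("Contents", []):
        print("file:", obj["Key"], obj["Size"])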
def test_list_artifacts_nested(hdfs_system_mock):
    repo = HdfsArtifactRepository('hdfs://host/some/path')
    expected = [
        FileInfo('model/conda.yaml', False, 33),
        FileInfo('model/model.pkl', False, 33),
        FileInfo('model/MLmodel', False, 33)
    ]
    hdfs_system_mock.return_value.ls.return_value = [{
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/conda.yaml',
        'size': 33,
    }, {
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/model.pkl',
        'size': 33,
    }, {
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/MLmodel',
        'size': 33,
    }]
    actual = repo.list_artifacts('model')
    assert actual == expected
def list_artifacts(self, path=None): """Returns saved artifacts for current artifact uri""" artifacts_info = self.get_artifacts_info(self.artifact_uri) artifacts_under_path_info = None if path: artifacts_under_path_info = list( filter(lambda a: a[0].startswith(path), artifacts_info)) else: artifacts_under_path_info = artifacts_info already_seen_paths, file_infos = [], [] path_len = 0 if path is None else len(path) + 1 for artifact_under_path in artifacts_under_path_info: file_size = artifact_under_path[2] relative_path = artifact_under_path[0][path_len:] relative_path_steps = relative_path.split('/') next_step = relative_path_steps[0] if next_step in already_seen_paths: continue already_seen_paths.append(next_step) file_info_path = next_step if path is None else '/'.join( [path, next_step]) if len(relative_path_steps) == 1: file_infos.append(FileInfo(file_info_path, False, file_size)) else: file_infos.append(FileInfo(file_info_path, True, None)) return file_infos
def test_list_artifacts_nested(hdfs_system_mock): repo = HdfsArtifactRepository("hdfs:://host/some/path") expected = [ FileInfo("model/conda.yaml", False, 33), FileInfo("model/model.pkl", False, 33), FileInfo("model/MLmodel", False, 33), ] hdfs_system_mock.return_value.ls.return_value = [ { "kind": "file", "name": "hdfs://host/some/path/model/conda.yaml", "size": 33 }, { "kind": "file", "name": "hdfs://host/some/path/model/model.pkl", "size": 33 }, { "kind": "file", "name": "hdfs://host/some/path/model/MLmodel", "size": 33 }, ] actual = repo.list_artifacts("model") assert actual == expected
def list_artifacts(path): fullpath = posixpath.join(base_uri, path) if fullpath.endswith("model") or fullpath.endswith("model/"): return [FileInfo(item, False, 123) for item in list_return_val] elif fullpath.endswith("12345") or fullpath.endswith("12345/"): return [FileInfo(posixpath.join(path, "model"), True, 0)] else: return []
def list_artifacts(path):
    fullpath = posixpath.join(base_uri, path)
    if fullpath.endswith(_MODEL_DIR) or fullpath.endswith(_MODEL_DIR + "/"):
        return [FileInfo(item, False, _DUMMY_FILE_SIZE) for item in list_return_val]
    elif fullpath.endswith(_PARENT_MODEL_DIR) or fullpath.endswith(_PARENT_MODEL_DIR + "/"):
        return [FileInfo(posixpath.join(path, _MODEL_DIR), True, _EMPTY_FILE_SIZE)]
    else:
        return []
def test_file_info_to_json():
    file_infos = [
        FileInfo("/my/file", False, 123),
        FileInfo("/my/dir", True, None),
    ]
    info_str = _file_infos_to_json(file_infos)
    # Note: file_size round-trips as a string here, consistent with proto3's JSON
    # mapping, which encodes int64 fields as decimal strings.
    assert json.loads(info_str) == [
        {"path": "/my/file", "is_dir": False, "file_size": "123"},
        {"path": "/my/dir", "is_dir": True},
    ]
def list_artifacts(path):
    if path.endswith(_MODEL_DIR):
        return [
            FileInfo(item, item.endswith(_EMPTY_DIR), _DUMMY_FILE_SIZE)
            for item in list_return_val
        ]
    elif path.endswith(_PARENT_DIR) or path.endswith(_PARENT_DIR + "/"):
        return [FileInfo(_PARENT_MODEL_DIR, True, _EMPTY_FILE_SIZE)]
    else:
        return []
def list_artifacts(path): if path.endswith("model"): return [ FileInfo(item, item.endswith("emptydir"), 123) for item in list_return_val ] elif path.endswith("12345") or path.endswith("12345/"): return [FileInfo("12345/model", True, 0)] else: return []
def get_file_info(path, rel_path):
    """
    Returns file metadata: location, size, etc.

    :param path: Path to artifact
    :param rel_path: Relative path to use for the returned `FileInfo`
    :return: `FileInfo` object
    """
    if is_directory(path):
        return FileInfo(rel_path, True, None)
    else:
        return FileInfo(rel_path, False, os.path.getsize(path))
def list_artifacts(self, path=None):
    artifact_dir = self.path
    list_dir = os.path.join(artifact_dir, path) if path else artifact_dir
    artifact_files = self.sftp.listdir(list_dir)
    infos = []
    for file_name in artifact_files:
        file_path = file_name if path is None else os.path.join(path, file_name)
        full_file_path = os.path.join(list_dir, file_name)
        if self.sftp.isdir(full_file_path):
            infos.append(FileInfo(file_path, True, None))
        else:
            infos.append(FileInfo(file_path, False, self.sftp.stat(full_file_path).st_size))
    return infos
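# A minimal sketch of the pysftp calls the SFTP repository above relies on
# (listdir, isdir, stat). The host, credentials, and remote directory are
# hypothetical, for illustration only.
import pysftp

with pysftp.Connection("sftp.example.com", username="user") as sftp:
    for name in sftp.listdir("/artifacts"):
        full_path = "/artifacts/" + name
        if sftp.isdir(full_path):
            print("dir: ", name)
        else:
            print("file:", name, sftp.stat(full_path).st_size)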
def list_artifacts(self, path): """ Return all the artifacts for this run_id directly under path. If path is a file, returns an empty list. Will error if path is neither a file nor directory. Note that list_artifacts will not return valid artifact sizes from Azure. :param path: Relative source path that contain desired artifacts :type path: str :return: List of artifacts as FileInfo listed directly under path. """ # get and filter by paths if path and self.artifacts.path and not path.startswith( self.artifacts.path): path = self._get_full_artifact_path( path ) # Adds prefix if called directly and it is not already set path_tokens = path.split("/") if path else [] path_depth = len(path_tokens) artifacts = [] for file_path in self.artifacts.get_file_paths(): if path is None or file_path[:len(path)] == path and len( file_path) > len(path): artifacts.append(file_path) file_infos = [] for artifact in artifacts: artifact_tokens = artifact.split("/") if len(artifact_tokens) == path_depth + 1: # is a file file_infos.append( FileInfo( path=artifact, is_dir=False, file_size= -1 # TODO: artifact size retrieval is not supported in Azure )) else: # is a directory file_infos.append( FileInfo( path="/".join(artifact_tokens[:path_depth + 1]), is_dir=True, file_size= -1 # TODO: artifact size retrieval is not supported in Azure )) return file_infos
def test_creation_and_hydration(self):
    path = random_str(random_int(10, 50))
    is_dir = random_int(10, 2500) % 2 == 0
    size_in_bytes = random_int(1, 10000)
    fi1 = FileInfo(path, is_dir, size_in_bytes)
    self._check(fi1, path, is_dir, size_in_bytes)

    as_dict = {"path": path, "is_dir": is_dir, "file_size": size_in_bytes}
    self.assertEqual(dict(fi1), as_dict)

    proto = fi1.to_proto()
    fi2 = FileInfo.from_proto(proto)
    self._check(fi2, path, is_dir, size_in_bytes)

    fi3 = FileInfo.from_dictionary(as_dict)
    self._check(fi3, path, is_dir, size_in_bytes)
def list_artifacts(self, path=None):
    infos = []
    page_token = None
    if not path:
        path = ""
    while True:
        json_body = self._make_json_body(path, page_token)
        response = self._call_endpoint(json_body, REGISTRY_LIST_ARTIFACTS_ENDPOINT)
        try:
            response.raise_for_status()
            json_response = json.loads(response.text)
        except Exception:
            raise MlflowException(
                "API request to list files under path `%s` failed with status code %s. "
                "Response body: %s" % (path, response.status_code, response.text))
        artifact_list = json_response.get("files", [])
        next_page_token = json_response.get("next_page_token", None)
        # If `path` is a file, ListArtifacts returns a single list element with the
        # same name as `path`. The list_artifacts API expects us to return an empty
        # list in this case, so we do so here.
        if (len(artifact_list) == 1 and artifact_list[0]["path"] == path
                and not artifact_list[0]["is_dir"]):
            return []
        for output_file in artifact_list:
            artifact_size = None if output_file["is_dir"] else output_file["file_size"]
            infos.append(FileInfo(output_file["path"], output_file["is_dir"], artifact_size))
        if len(artifact_list) == 0 or not next_page_token:
            break
        page_token = next_page_token
    return infos
def list_artifacts(self, path=None):
    if path:
        dbfs_path = self._get_dbfs_path(path)
    else:
        dbfs_path = self._get_dbfs_path('')
    dbfs_list_json = {'path': dbfs_path}
    response = self._dbfs_list_api(dbfs_list_json)
    try:
        json_response = json.loads(response.text)
    except ValueError:
        raise MlflowException(
            "API request to list files under DBFS path %s failed with status code %s. "
            "Response body: %s" % (dbfs_path, response.status_code, response.text))
    # /api/2.0/dbfs/list will not have the 'files' key in the response for empty directories.
    infos = []
    artifact_prefix = strip_prefix(self.artifact_uri, 'dbfs:')
    if json_response.get('error_code', None) == RESOURCE_DOES_NOT_EXIST:
        return []
    dbfs_files = json_response.get('files', [])
    for dbfs_file in dbfs_files:
        stripped_path = strip_prefix(dbfs_file['path'], artifact_prefix + '/')
        # If `path` is a file, the DBFS list API returns a single list element with the
        # same name as `path`. The list_artifacts API expects us to return an empty
        # list in this case, so we do so here.
        if stripped_path == path:
            return []
        is_dir = dbfs_file['is_dir']
        artifact_size = None if is_dir else dbfs_file['file_size']
        infos.append(FileInfo(stripped_path, is_dir, artifact_size))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None): """ Lists files and directories under artifacts directory for the current run_id. (self.path contains the base path - hdfs:/some/path/run_id/artifacts) :param path: Relative source path. Possible subdirectory existing under hdfs:/some/path/run_id/artifacts :return: List of FileInfos under given path """ hdfs_base_path = _resolve_base_path(self.path, path) with hdfs_system(scheme=self.scheme, host=self.host, port=self.port) as hdfs: paths = [] if hdfs.exists(hdfs_base_path): for file_detail in hdfs.ls(hdfs_base_path, detail=True): file_name = file_detail.get("name") # Strip off anything that comes before the artifact root e.g. hdfs://name offset = file_name.index(self.path) rel_path = _relative_path_remote(self.path, file_name[offset:]) is_dir = file_detail.get("kind") == "directory" size = file_detail.get("size") paths.append(FileInfo(rel_path, is_dir, size)) return sorted(paths, key=lambda f: paths)
def _list_artifacts_for_proxied_run_artifact_root(proxied_artifact_root, relative_path=None):
    """
    Lists artifacts from the specified ``relative_path`` within the specified proxied
    Run artifact root (i.e. a Run artifact root with scheme ``http``, ``https``, or
    ``mlflow-artifacts``).

    :param proxied_artifact_root: The Run artifact root location (URI) with scheme
                                  ``http``, ``https``, or ``mlflow-artifacts`` that can be
                                  resolved by the MLflow server to a concrete storage
                                  location.
    :param relative_path: The relative path within the specified ``proxied_artifact_root``
                          under which to list artifact contents. If ``None``, artifacts are
                          listed from the ``proxied_artifact_root`` directory.
    """
    parsed_proxied_artifact_root = urllib.parse.urlparse(proxied_artifact_root)
    assert parsed_proxied_artifact_root.scheme in ["http", "https", "mlflow-artifacts"]

    artifact_destination_repo = _get_artifact_repo_mlflow_artifacts()
    artifact_destination_path = _get_proxied_run_artifact_destination_path(
        proxied_artifact_root=proxied_artifact_root,
        relative_path=relative_path,
    )

    artifact_entities = []
    for file_info in artifact_destination_repo.list_artifacts(artifact_destination_path):
        basename = posixpath.basename(file_info.path)
        run_relative_artifact_path = (
            posixpath.join(relative_path, basename) if relative_path else basename
        )
        artifact_entities.append(
            FileInfo(run_relative_artifact_path, file_info.is_dir, file_info.file_size))

    return artifact_entities
def _list_folders(self, bkt, prefix, artifact_path):
    results = bkt.list_blobs(prefix=prefix, delimiter="/")
    dir_paths = set()
    for page in results.pages:
        dir_paths.update(page.prefixes)
    return [FileInfo(path[len(artifact_path) + 1:-1], True, None) for path in dir_paths]
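# A minimal sketch of the google-cloud-storage behavior _list_folders depends on:
# listing with delimiter="/" surfaces immediate "subdirectories" via each result
# page's `prefixes` attribute. The bucket name and prefix below are hypothetical,
# for illustration only.
from google.cloud import storage

client = storage.Client()
bucket = client.bucket("my-bucket")
blobs = bucket.list_blobs(prefix="runs/1/artifacts/", delimiter="/")
for page in blobs.pages:
    for dir_prefix in page.prefixes:  # e.g. "runs/1/artifacts/model/"
        print("dir:", dir_prefix)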
def list_artifacts(self, path=None):
    if path:
        run_relative_path = posixpath.join(self.run_relative_artifact_repo_root_path, path)
    else:
        run_relative_path = self.run_relative_artifact_repo_root_path
    json_body = message_to_json(ListArtifacts(run_id=self.run_id, path=run_relative_path))
    response = self._call_endpoint(MlflowService, ListArtifacts, json_body)
    artifact_list = response.files
    # If `path` is a file, ListArtifacts returns a single list element with the
    # same name as `path`. The list_artifacts API expects us to return an empty
    # list in this case, so we do so here.
    if (len(artifact_list) == 1 and artifact_list[0].path == run_relative_path
            and not artifact_list[0].is_dir):
        return []
    infos = []
    for output_file in artifact_list:
        file_rel_path = posixpath.relpath(
            path=output_file.path, start=self.run_relative_artifact_repo_root_path)
        artifact_size = None if output_file.is_dir else output_file.file_size
        infos.append(FileInfo(file_rel_path, output_file.is_dir, artifact_size))
    return infos
def list_artifacts(self, path=None):
    # TODO: pagination
    bucket, ns, artifact_path = self.parse_os_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = os.path.join(dest_path, path)
    infos = []
    prefix = dest_path + "/" if dest_path else ""
    os_client = self._get_os_client()
    results = os_client.list_objects(ns, bucket, prefix=prefix, delimiter='/').data
    for subdir_path in results.prefixes:
        subdir_rel_path = os.path.relpath(path=subdir_path, start=artifact_path)
        infos.append(FileInfo(subdir_rel_path, True, None))
    for obj in results.objects:
        file_rel_path = os.path.relpath(path=obj.name, start=artifact_path)
        infos.append(FileInfo(file_rel_path, False, obj.size))
    return sorted(infos, key=lambda f: f.path)
def test_faculty_object_to_mlflow_file_info(
    datasets_path, artifact_path, is_directory, artifact_root
):
    obj = FacultyObject(datasets_path, 1234, "an etag", DATETIME)
    expected = FileInfo(artifact_path, is_directory, None if is_directory else 1234)
    assert faculty_object_to_mlflow_file_info(obj, artifact_root) == expected
def _list_artifacts_mlflow_artifacts():
    """
    A request handler for `GET /mlflow-artifacts/artifacts?path=<value>` to list artifacts
    in `path` (a relative path from the root artifact directory).
    """
    request_message = _get_request_message(ListArtifactsMlflowArtifacts())
    path = request_message.path if request_message.HasField("path") else None
    artifact_repo = _get_artifact_repo_mlflow_artifacts()
    files = []
    for file_info in artifact_repo.list_artifacts(path):
        basename = posixpath.basename(file_info.path)
        new_file_info = FileInfo(basename, file_info.is_dir, file_info.file_size)
        files.append(new_file_info.to_proto())
    response_message = ListArtifacts.Response()
    response_message.files.extend(files)
    response = Response(mimetype="application/json")
    response.set_data(message_to_json(response_message))
    return response
def to_file_info(self):
    """
    Convert DB model to corresponding FileInfo object.

    :return: :py:class:`mlflow.entities.FileInfo`.
    """
    return FileInfo(
        path=os.path.join(self.group_path, self.artifact_name),
        is_dir=False,
        file_size=self.artifact_initial_size)
def test_list_artifacts(hdfs_system_mock):
    repo = HdfsArtifactRepository('hdfs:/some/path')
    expected = [
        FileInfo('conda.yaml', False, 33),
        FileInfo('model.pkl', False, 33),
        FileInfo('MLmodel', False, 33)
    ]
    hdfs_system_mock.return_value.walk.return_value = [
        ('/some/path', False, ['conda.yaml', 'model.pkl', 'MLmodel'])
    ]
    hdfs_system_mock.return_value.info.return_value.get.return_value = 33
    hdfs_system_mock.return_value.isdir.side_effect = [True, False, False, False]
    actual = repo.list_artifacts()
    assert actual == expected
def list_artifacts(self, path=None):
    # Newer versions of `azure-storage-blob` (>= 12.4.0) provide a public
    # `azure.storage.blob.BlobPrefix` object to signify that a blob is a directory,
    # while older versions only expose this API internally as
    # `azure.storage.blob._models.BlobPrefix`
    try:
        from azure.storage.blob import BlobPrefix
    except ImportError:
        from azure.storage.blob._models import BlobPrefix

    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    container_client = self.client.get_container_client(container)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path if dest_path.endswith("/") else dest_path + "/"
    results = container_client.walk_blobs(name_starts_with=prefix)
    for r in results:
        if not r.name.startswith(artifact_path):
            raise MlflowException(
                "The name of the listed Azure blob does not begin with the specified"
                " artifact path. Artifact path: {artifact_path}. Blob name:"
                " {blob_name}".format(artifact_path=artifact_path, blob_name=r.name))
        if isinstance(r, BlobPrefix):  # This is a prefix for items in a subdirectory
            subdir = posixpath.relpath(path=r.name, start=artifact_path)
            if subdir.endswith("/"):
                subdir = subdir[:-1]
            infos.append(FileInfo(subdir, True, None))
        else:  # Just a plain old blob
            file_name = posixpath.relpath(path=r.name, start=artifact_path)
            infos.append(FileInfo(file_name, False, r.size))
    # The list_artifacts API expects us to return an empty list if the path
    # references a single file.
    rel_path = dest_path[len(artifact_path) + 1:]
    if len(infos) == 1 and not infos[0].is_dir and infos[0].path == rel_path:
        return []
    return sorted(infos, key=lambda f: f.path)
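# A minimal sketch of the azure-storage-blob (>= 12.x) call the function above is
# built on: `walk_blobs` with a name prefix yields BlobPrefix items for virtual
# subdirectories and blob items (with a `size`) for blobs at that level. The
# connection string, container, and prefix are hypothetical, for illustration only.
from azure.storage.blob import BlobPrefix, BlobServiceClient

service = BlobServiceClient.from_connection_string("<connection-string>")
container_client = service.get_container_client("my-container")
for item in container_client.walk_blobs(name_starts_with="runs/1/artifacts/"):
    if isinstance(item, BlobPrefix):
        print("dir: ", item.name)
    else:
        print("file:", item.name, item.size)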
def list_artifacts(self, path=None):
    from azure.storage.blob.models import BlobPrefix

    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    marker = None  # Used to make next list request if this one exceeded the result limit
    while True:
        results = self.client.list_blobs(container, prefix=prefix, delimiter='/',
                                         marker=marker)
        for r in results:
            if not r.name.startswith(artifact_path):
                raise MlflowException(
                    "The name of the listed Azure blob does not begin with the specified"
                    " artifact path. Artifact path: {artifact_path}. Blob name:"
                    " {blob_name}".format(artifact_path=artifact_path, blob_name=r.name))
            if isinstance(r, BlobPrefix):  # This is a prefix for items in a subdirectory
                subdir = posixpath.relpath(path=r.name, start=artifact_path)
                if subdir.endswith("/"):
                    subdir = subdir[:-1]
                infos.append(FileInfo(subdir, True, None))
            else:  # Just a plain old blob
                file_name = posixpath.relpath(path=r.name, start=artifact_path)
                infos.append(FileInfo(file_name, False, r.properties.content_length))
        # Check whether a new marker is returned, meaning we have to make another request
        if results.next_marker:
            marker = results.next_marker
        else:
            break
    return sorted(infos, key=lambda f: f.path)