def list_artifacts(self, path=None):
    """
    List files and directories directly under the artifact root (or under
    ``path``, when given) in the backing S3 bucket.

    :param path: Relative path under the artifact root to list; lists the
                 root itself when omitted.
    :return: List of ``FileInfo`` objects sorted by path. Subdirectories are
             reported with ``is_dir=True`` and size ``None``.
    """
    (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
    dest_path = posixpath.join(artifact_path, path) if path else artifact_path
    prefix = dest_path + "/" if dest_path else ""
    infos = []
    paginator = self._get_s3_client().get_paginator("list_objects_v2")
    for page in paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/'):
        # Because we pass Delimiter='/', immediate subdirectories come back
        # as "CommonPrefixes" rather than as objects.
        for prefix_entry in page.get("CommonPrefixes", []):
            subdir_path = prefix_entry.get("Prefix")
            self._verify_listed_object_contains_artifact_path_prefix(
                listed_object_path=subdir_path, artifact_path=artifact_path)
            subdir_rel_path = posixpath.relpath(path=subdir_path, start=artifact_path)
            # S3 prefixes end with the delimiter; drop that trailing slash.
            if subdir_rel_path.endswith("/"):
                subdir_rel_path = subdir_rel_path[:-1]
            infos.append(FileInfo(subdir_rel_path, True, None))
        # Objects listed directly under the prefix are plain files.
        for blob in page.get('Contents', []):
            blob_key = blob.get("Key")
            self._verify_listed_object_contains_artifact_path_prefix(
                listed_object_path=blob_key, artifact_path=artifact_path)
            blob_rel_path = posixpath.relpath(path=blob_key, start=artifact_path)
            infos.append(FileInfo(blob_rel_path, False, int(blob.get('Size'))))
    return sorted(infos, key=lambda f: f.path)
def test_list_artifacts_nested(hdfs_system_mock):
    """Listing a subdirectory returns one FileInfo per file, with paths
    relative to the artifact root.

    The comparison is made order-insensitive so the test does not depend on
    the ordering in which the underlying filesystem returns entries.
    """
    repo = HdfsArtifactRepository('hdfs:://host/some/path')
    expected = [FileInfo('model/MLmodel', False, 33),
                FileInfo('model/conda.yaml', False, 33),
                FileInfo('model/model.pkl', False, 33)]
    hdfs_system_mock.return_value.ls.return_value = [{
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/conda.yaml',
        'size': 33,
    }, {
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/model.pkl',
        'size': 33,
    }, {
        'kind': 'file',
        'name': 'hdfs://host/some/path/model/MLmodel',
        'size': 33,
    }]
    actual = repo.list_artifacts('model')
    # `expected` is pre-sorted by path; sort `actual` the same way so the
    # assertion holds whether or not the repository sorts its results.
    assert sorted(actual, key=lambda f: f.path) == expected
def list_artifacts(self, path=None):
    """
    List files and directories directly under the artifact root (or under
    ``path``, when given) in the backing Azure Blob Storage container.

    :param path: Relative path under the artifact root to list; lists the
                 root itself when omitted.
    :return: List of ``FileInfo`` objects sorted by path, or an empty list
             when ``path`` refers to a single file rather than a directory.
    :raises MlflowException: If a listed blob name does not start with the
             expected artifact path prefix.
    """
    # Import the public name rather than the private `_models` module, which
    # is an internal implementation detail of azure-storage-blob.
    from azure.storage.blob import BlobPrefix
    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    container_client = self.client.get_container_client(container)
    dest_path = artifact_path
    if path:
        dest_path = posixpath.join(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    results = container_client.walk_blobs(name_starts_with=prefix)
    for r in results:
        if not r.name.startswith(artifact_path):
            raise MlflowException(
                "The name of the listed Azure blob does not begin with the specified"
                " artifact path. Artifact path: {artifact_path}. Blob name:"
                " {blob_name}".format(artifact_path=artifact_path, blob_name=r.name))
        if isinstance(r, BlobPrefix):   # This is a prefix for items in a subdirectory
            subdir = posixpath.relpath(path=r.name, start=artifact_path)
            if subdir.endswith("/"):
                subdir = subdir[:-1]
            infos.append(FileInfo(subdir, True, None))
        else:  # Just a plain old blob
            file_name = posixpath.relpath(path=r.name, start=artifact_path)
            infos.append(FileInfo(file_name, False, r.size))
    # The list_artifacts API expects us to return an empty list if the
    # the path references a single file.
    rel_path = dest_path[len(artifact_path)+1:]
    if (len(infos) == 1) and not infos[0].is_dir and (infos[0].path == rel_path):
        return []
    return sorted(infos, key=lambda f: f.path)
def get_file_info(path, rel_path):
    """
    Returns file meta data : location, size, ... etc

    :param path: Path to artifact on the local filesystem
    :param rel_path: Path relative to the artifact root; recorded verbatim in
                     the returned ``FileInfo``
    :return: `FileInfo` object. Directories are reported with ``is_dir=True``
             and size ``None``; files carry their on-disk size in bytes.
    """
    if is_directory(path):
        return FileInfo(rel_path, True, None)
    return FileInfo(rel_path, False, os.path.getsize(path))
def test_file_info_to_json():
    """A FileInfo list serializes to the expected JSON: directories omit
    ``file_size``, and file sizes are rendered as strings."""
    infos = [
        FileInfo("/my/file", False, 123),
        FileInfo("/my/dir", True, None),
    ]
    parsed = json.loads(_file_infos_to_json(infos))
    assert parsed == [
        {"path": "/my/file", "is_dir": False, "file_size": "123"},
        {"path": "/my/dir", "is_dir": True},
    ]
def list_artifacts(self, path=None):
    """
    List files and directories under the DBFS artifact root (or under
    ``path``, when given) via the DBFS list API.

    :param path: Relative path under the artifact root to list; lists the
                 root itself when omitted.
    :return: List of ``FileInfo`` objects sorted by path; an empty list when
             the path does not exist or refers to a single file.
    :raises MlflowException: If the API response body is not valid JSON.
    """
    dbfs_path = self._get_dbfs_path(path) if path else self._get_dbfs_path('')
    response = self._dbfs_list_api({'path': dbfs_path})
    try:
        json_response = json.loads(response.text)
    except ValueError:
        raise MlflowException(
            "API request to list files under DBFS path %s failed with status code %s. "
            "Response body: %s" % (dbfs_path, response.status_code, response.text))
    infos = []
    artifact_prefix = strip_prefix(self.artifact_uri, 'dbfs:')
    if json_response.get('error_code', None) == RESOURCE_DOES_NOT_EXIST:
        return []
    # /api/2.0/dbfs/list will not have the 'files' key in the response for
    # empty directories, so default to an empty list.
    for dbfs_file in json_response.get('files', []):
        stripped_path = strip_prefix(dbfs_file['path'], artifact_prefix + '/')
        # If `path` is a file, the DBFS list API returns a single list element
        # with the same name as `path`. The list_artifacts API expects us to
        # return an empty list in this case, so we do so here.
        if stripped_path == path:
            return []
        is_dir = dbfs_file['is_dir']
        size = None if is_dir else dbfs_file['file_size']
        infos.append(FileInfo(stripped_path, is_dir, size))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    """
    List files and directories under the SFTP artifact root (or under
    ``path``, when given).

    :param path: Relative path under the artifact root to list; lists the
                 root itself when omitted.
    :return: List of ``FileInfo`` objects (in the order returned by the
             remote listing), or an empty list if the target is not a
             directory.
    """
    listing_dir = posixpath.join(self.path, path) if path else self.path
    if not self.sftp.isdir(listing_dir):
        return []
    infos = []
    for entry_name in self.sftp.listdir(listing_dir):
        # Report paths relative to the artifact root; note the explicit
        # `is None` check so an empty-string `path` is still joined.
        rel_path = entry_name if path is None else posixpath.join(path, entry_name)
        abs_path = posixpath.join(listing_dir, entry_name)
        if self.sftp.isdir(abs_path):
            infos.append(FileInfo(rel_path, True, None))
        else:
            infos.append(FileInfo(rel_path, False, self.sftp.stat(abs_path).st_size))
    return infos
def _list_folders(self, bkt, prefix, artifact_path):
    """
    Return ``FileInfo`` entries for the immediate subdirectories under
    ``prefix`` in the given GCS bucket.

    :param bkt: GCS bucket handle to list from.
    :param prefix: Prefix (ending with '/') whose children to enumerate.
    :param artifact_path: Artifact root path, stripped from each result.
    :return: List of directory ``FileInfo`` objects (size ``None``).
    """
    pages = bkt.list_blobs(prefix=prefix, delimiter="/").pages
    subdir_prefixes = set()
    # With delimiter='/', each page exposes subdirectories via `prefixes`.
    for page in pages:
        subdir_prefixes.update(page.prefixes)
    trim = len(artifact_path) + 1
    # Strip the artifact root and the trailing '/' from each prefix.
    return [FileInfo(p[trim:-1], True, None) for p in subdir_prefixes]
def test_list_artifacts_root(hdfs_system_mock):
    """Listing the artifact root reports a directory entry with a path
    relative to the root (size passed through from the listing)."""
    repo = HdfsArtifactRepository('hdfs://host/some/path')
    hdfs_system_mock.return_value.ls.return_value = [{
        'kind': 'directory',
        'name': 'hdfs://host/some/path/model',
        'size': 0,
    }]
    assert repo.list_artifacts() == [FileInfo('model', True, 0)]
def list_artifacts(self, path=None):
    """
    List files and directories directly under the artifact root (or under
    ``path``, when given) in the backing GCS bucket.

    :param path: Relative path under the artifact root to list; lists the
                 root itself when omitted.
    :return: List of ``FileInfo`` objects sorted by path.
    """
    (bucket, artifact_path) = self.parse_gcs_uri(self.artifact_uri)
    dest_path = posixpath.join(artifact_path, path) if path else artifact_path
    prefix = dest_path + "/"
    bkt = self._get_bucket(bucket)
    # Subdirectories first, then the blobs found directly under the prefix.
    infos = self._list_folders(bkt, prefix, artifact_path)
    trim = len(artifact_path) + 1
    for blob in bkt.list_blobs(prefix=prefix, delimiter="/"):
        infos.append(FileInfo(blob.name[trim:], False, blob.size))
    return sorted(infos, key=lambda f: f.path)
def list_artifacts(self, path=None):
    """
    Lists files and directories under artifacts directory for the current run_id.
    (self.path contains the base path - hdfs:/some/path/run_id/artifacts)

    :param path: Relative source path. Possible subdirectory existing under
                 hdfs:/some/path/run_id/artifacts
    :return: List of FileInfos under given path, sorted by path
    """
    hdfs_base_path = _resolve_base_path(self.path, path)

    with hdfs_system(scheme=self.scheme, host=self.host, port=self.port) as hdfs:
        paths = []
        if hdfs.exists(hdfs_base_path):
            for file_detail in hdfs.ls(hdfs_base_path, detail=True):
                file_name = file_detail.get("name")
                # Strip off anything that comes before the artifact root
                # e.g. hdfs://name
                offset = file_name.index(self.path)
                rel_path = _relative_path_remote(self.path, file_name[offset:])
                is_dir = file_detail.get("kind") == "directory"
                size = file_detail.get("size")
                paths.append(FileInfo(rel_path, is_dir, size))
        # BUG FIX: the key was previously `lambda f: paths`, which returned
        # the same list object for every element and made the sort a no-op.
        # Sort by each entry's path, consistent with the other repositories.
        return sorted(paths, key=lambda f: f.path)
def list_artifacts(self, path=None):
    """
    List artifacts for the current run via the MlflowService ListArtifacts
    endpoint, following pagination tokens until the listing is exhausted.

    :param path: Relative path under the run's artifact root to list; lists
                 the root itself when omitted.
    :return: List of ``FileInfo`` objects with paths relative to the run's
             artifact root, or an empty list when ``path`` refers to a single
             file rather than a directory.
    """
    if path:
        run_relative_path = posixpath.join(
            self.run_relative_artifact_repo_root_path, path)
    else:
        run_relative_path = self.run_relative_artifact_repo_root_path
    infos = []
    page_token = None
    while True:
        # Only set page_token on the request once the server has handed one
        # back; the first request omits the field entirely.
        if page_token:
            json_body = message_to_json(
                ListArtifacts(run_id=self.run_id,
                              path=run_relative_path,
                              page_token=page_token))
        else:
            json_body = message_to_json(
                ListArtifacts(run_id=self.run_id, path=run_relative_path))
        response = self._call_endpoint(MlflowService, ListArtifacts, json_body)
        artifact_list = response.files
        # If `path` is a file, ListArtifacts returns a single list element with the
        # same name as `path`. The list_artifacts API expects us to return an empty list in this
        # case, so we do so here.
        if len(artifact_list) == 1 and artifact_list[0].path == run_relative_path \
                and not artifact_list[0].is_dir:
            return []
        for output_file in artifact_list:
            file_rel_path = posixpath.relpath(
                path=output_file.path,
                start=self.run_relative_artifact_repo_root_path)
            # Directories carry no meaningful size; report None for them.
            artifact_size = None if output_file.is_dir else output_file.file_size
            infos.append(
                FileInfo(file_rel_path, output_file.is_dir, artifact_size))
        # Stop when the server returns no items or no further page token.
        if len(artifact_list) == 0 or not response.next_page_token:
            break
        page_token = response.next_page_token
    return infos