Example #1
0
 def list_artifacts(self, path=None):
     """List artifacts stored in S3 under ``path``.

     :param path: Relative path under this repository's artifact root;
                  lists the root when omitted.
     :return: ``FileInfo`` entries sorted by path.
     """
     (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
     dest_path = posixpath.join(artifact_path, path) if path else artifact_path
     listing_prefix = dest_path + "/" if dest_path else ""
     client = self._get_s3_client()
     pages = client.get_paginator("list_objects_v2").paginate(
         Bucket=bucket, Prefix=listing_prefix, Delimiter='/')
     listed = []
     for page in pages:
         # The "/" delimiter makes S3 report subdirectories as common prefixes.
         for entry in page.get("CommonPrefixes", []):
             dir_key = entry.get("Prefix")
             self._verify_listed_object_contains_artifact_path_prefix(
                 listed_object_path=dir_key, artifact_path=artifact_path)
             rel_dir = posixpath.relpath(path=dir_key, start=artifact_path)
             if rel_dir.endswith("/"):
                 rel_dir = rel_dir[:-1]
             listed.append(FileInfo(rel_dir, True, None))
         # Regular object keys correspond to files.
         for entry in page.get('Contents', []):
             file_key = entry.get("Key")
             self._verify_listed_object_contains_artifact_path_prefix(
                 listed_object_path=file_key, artifact_path=artifact_path)
             rel_file = posixpath.relpath(path=file_key, start=artifact_path)
             listed.append(FileInfo(rel_file, False, int(entry.get('Size'))))
     return sorted(listed, key=lambda info: info.path)
Example #2
0
def test_list_artifacts_nested(hdfs_system_mock):
    """Listing a subdirectory returns FileInfos for each contained file."""
    repo = HdfsArtifactRepository('hdfs:://host/some/path')

    file_names = ['conda.yaml', 'model.pkl', 'MLmodel']
    expected = [FileInfo('model/' + name, False, 33) for name in file_names]

    # Mock the HDFS listing for the 'model' subdirectory.
    hdfs_system_mock.return_value.ls.return_value = [
        {
            'kind': 'file',
            'name': 'hdfs://host/some/path/model/' + name,
            'size': 33,
        }
        for name in file_names
    ]

    actual = repo.list_artifacts('model')

    assert actual == expected
 def list_artifacts(self, path=None):
     """List Azure blob artifacts under ``path``.

     :param path: Relative path under the artifact root; lists the root
                  when omitted.
     :return: Sorted ``FileInfo`` entries, or ``[]`` when ``path`` refers
              to a single file.
     """
     from azure.storage.blob._models import BlobPrefix
     (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
     container_client = self.client.get_container_client(container)
     dest_path = posixpath.join(artifact_path, path) if path else artifact_path
     prefix = dest_path + "/"
     infos = []
     for blob in container_client.walk_blobs(name_starts_with=prefix):
         if not blob.name.startswith(artifact_path):
             raise MlflowException(
                 "The name of the listed Azure blob does not begin with the specified"
                 " artifact path. Artifact path: {artifact_path}. Blob name:"
                 " {blob_name}".format(artifact_path=artifact_path, blob_name=blob.name))
         rel_path = posixpath.relpath(path=blob.name, start=artifact_path)
         if isinstance(blob, BlobPrefix):
             # A BlobPrefix marks a subdirectory of items sharing the prefix.
             if rel_path.endswith("/"):
                 rel_path = rel_path[:-1]
             infos.append(FileInfo(rel_path, True, None))
         else:
             # Anything else is an ordinary blob, i.e. a file.
             infos.append(FileInfo(rel_path, False, blob.size))
     # If the query path itself resolved to a single file, the
     # list_artifacts contract requires an empty list.
     rel_dest = dest_path[len(artifact_path) + 1:]
     if len(infos) == 1 and not infos[0].is_dir and infos[0].path == rel_dest:
         return []
     return sorted(infos, key=lambda f: f.path)
Example #4
0
def get_file_info(path, rel_path):
    """
    Returns file meta data : location, size, ... etc

    :param path: Path to artifact
    :param rel_path: Relative path to report in the returned ``FileInfo``

    :return: `FileInfo` object
    """
    is_dir = is_directory(path)
    # Directories carry no size; only stat files.
    size = None if is_dir else os.path.getsize(path)
    return FileInfo(rel_path, is_dir, size)
Example #5
0
def test_file_info_to_json():
    """FileInfos serialize to JSON with path/is_dir and optional file_size."""
    infos = [
        FileInfo("/my/file", False, 123),
        FileInfo("/my/dir", True, None),
    ]
    serialized = _file_infos_to_json(infos)
    expected = [
        {"path": "/my/file", "is_dir": False, "file_size": "123"},
        {"path": "/my/dir", "is_dir": True},
    ]
    assert json.loads(serialized) == expected
Example #6
0
 def list_artifacts(self, path=None):
     """List DBFS artifacts under ``path``.

     :param path: Relative path under the artifact root; lists the root
                  when omitted.
     :return: Sorted ``FileInfo`` entries; ``[]`` when ``path`` is a file
              or does not exist.
     :raises MlflowException: when the DBFS list API response is not JSON.
     """
     dbfs_path = self._get_dbfs_path(path) if path else self._get_dbfs_path('')
     response = self._dbfs_list_api({'path': dbfs_path})
     try:
         json_response = json.loads(response.text)
     except ValueError:
         raise MlflowException(
             "API request to list files under DBFS path %s failed with status code %s. "
             "Response body: %s" %
             (dbfs_path, response.status_code, response.text))
     infos = []
     artifact_prefix = strip_prefix(self.artifact_uri, 'dbfs:')
     if json_response.get('error_code', None) == RESOURCE_DOES_NOT_EXIST:
         return []
     # /api/2.0/dbfs/list omits the 'files' key for empty directories.
     for dbfs_file in json_response.get('files', []):
         stripped_path = strip_prefix(dbfs_file['path'], artifact_prefix + '/')
         # Listing a file yields one entry named like `path`; the
         # list_artifacts contract maps that case to an empty list.
         if stripped_path == path:
             return []
         is_dir = dbfs_file['is_dir']
         size = None if is_dir else dbfs_file['file_size']
         infos.append(FileInfo(stripped_path, is_dir, size))
     return sorted(infos, key=lambda f: f.path)
Example #7
0
 def list_artifacts(self, path=None):
     """List artifacts under ``path`` over SFTP.

     :param path: Relative path under the artifact root; lists the root
                  when omitted.
     :return: ``FileInfo`` entries (unsorted); ``[]`` when the target is
              not a directory.
     """
     list_dir = posixpath.join(self.path, path) if path else self.path
     if not self.sftp.isdir(list_dir):
         return []
     infos = []
     for entry_name in self.sftp.listdir(list_dir):
         rel_path = posixpath.join(path, entry_name) if path is not None else entry_name
         remote_path = posixpath.join(list_dir, entry_name)
         if self.sftp.isdir(remote_path):
             infos.append(FileInfo(rel_path, True, None))
         else:
             size = self.sftp.stat(remote_path).st_size
             infos.append(FileInfo(rel_path, False, size))
     return infos
Example #8
0
    def _list_folders(self, bkt, prefix, artifact_path):
        """Return directory ``FileInfo``s for GCS prefixes under ``prefix``."""
        listing = bkt.list_blobs(prefix=prefix, delimiter="/")
        found = set()
        for page in listing.pages:
            found.update(page.prefixes)
        # Strip the artifact root plus "/" from the front and the trailing
        # "/" delimiter from the back of each reported prefix.
        start = len(artifact_path) + 1
        return [FileInfo(p[start:-1], True, None) for p in found]
Example #9
0
def test_list_artifacts_root(hdfs_system_mock):
    """Listing the repository root reports the 'model' directory."""
    repo = HdfsArtifactRepository('hdfs://host/some/path')

    # Mock a single directory entry directly under the artifact root.
    hdfs_system_mock.return_value.ls.return_value = [{
        'kind': 'directory',
        'name': 'hdfs://host/some/path/model',
        'size': 0,
    }]

    expected = [FileInfo('model', True, 0)]
    assert repo.list_artifacts() == expected
Example #10
0
    def list_artifacts(self, path=None):
        """List GCS artifacts under ``path``.

        :param path: Relative path under the artifact root; lists the root
                     when omitted.
        :return: Sorted ``FileInfo`` entries (subdirectories and files).
        """
        (bucket, artifact_path) = self.parse_gcs_uri(self.artifact_uri)
        dest_path = posixpath.join(artifact_path, path) if path else artifact_path
        prefix = dest_path + "/"

        bkt = self._get_bucket(bucket)

        # Subdirectories first, then the blobs (files) directly under prefix.
        infos = self._list_folders(bkt, prefix, artifact_path)
        start = len(artifact_path) + 1
        for blob in bkt.list_blobs(prefix=prefix, delimiter="/"):
            infos.append(FileInfo(blob.name[start:], False, blob.size))

        return sorted(infos, key=lambda f: f.path)
Example #11
0
    def list_artifacts(self, path=None):
        """
            Lists files and directories under artifacts directory for the current run_id.
            (self.path contains the base path - hdfs:/some/path/run_id/artifacts)

            :param path: Relative source path. Possible subdirectory existing under
                         hdfs:/some/path/run_id/artifacts
            :return: List of FileInfos under given path, sorted by path
        """
        hdfs_base_path = _resolve_base_path(self.path, path)

        with hdfs_system(scheme=self.scheme, host=self.host, port=self.port) as hdfs:
            paths = []
            if hdfs.exists(hdfs_base_path):
                for file_detail in hdfs.ls(hdfs_base_path, detail=True):
                    file_name = file_detail.get("name")
                    # Strip off anything that comes before the artifact root e.g. hdfs://name
                    offset = file_name.index(self.path)
                    rel_path = _relative_path_remote(self.path, file_name[offset:])
                    is_dir = file_detail.get("kind") == "directory"
                    size = file_detail.get("size")
                    paths.append(FileInfo(rel_path, is_dir, size))
            # Fix: the previous key, `lambda f: paths`, returned the same list
            # object for every element, making the sort a no-op. Sort by each
            # entry's path, consistent with the other artifact repositories.
            return sorted(paths, key=lambda f: f.path)
 def list_artifacts(self, path=None):
     """List artifacts for this run via the MLflow REST ListArtifacts API.

     Pages through results using the response's ``next_page_token``.

     :param path: Relative path under the run's artifact repo root; lists
                  the root when omitted.
     :return: List of ``FileInfo`` objects in the order the API returned
              them (not re-sorted); empty list when ``path`` refers to a
              single file.
     """
     if path:
         run_relative_path = posixpath.join(
             self.run_relative_artifact_repo_root_path, path)
     else:
         run_relative_path = self.run_relative_artifact_repo_root_path
     infos = []
     page_token = None
     while True:
         # Only include page_token in the request body once the server has
         # handed one back on a previous page.
         if page_token:
             json_body = message_to_json(
                 ListArtifacts(run_id=self.run_id,
                               path=run_relative_path,
                               page_token=page_token))
         else:
             json_body = message_to_json(
                 ListArtifacts(run_id=self.run_id, path=run_relative_path))
         response = self._call_endpoint(MlflowService, ListArtifacts,
                                        json_body)
         artifact_list = response.files
         # If `path` is a file, ListArtifacts returns a single list element with the
         # same name as `path`. The list_artifacts API expects us to return an empty list in this
         # case, so we do so here.
         if len(artifact_list) == 1 and artifact_list[0].path == run_relative_path \
                 and not artifact_list[0].is_dir:
             return []
         for output_file in artifact_list:
             # Report paths relative to the artifact repo root, not the
             # run-relative query path.
             file_rel_path = posixpath.relpath(
                 path=output_file.path,
                 start=self.run_relative_artifact_repo_root_path)
             artifact_size = None if output_file.is_dir else output_file.file_size
             infos.append(
                 FileInfo(file_rel_path, output_file.is_dir, artifact_size))
         # Stop on an empty page or when the server reports no further pages.
         if len(artifact_list) == 0 or not response.next_page_token:
             break
         page_token = response.next_page_token
     return infos