def list_artifacts(self, path=None):
    """List ``FileInfo`` entries for artifacts under ``path``.

    :param path: Relative artifact path to list; when falsy, the repository
                 root is listed.
    :return: Sorted list of ``FileInfo`` objects. Returns ``[]`` when the DBFS
             path does not exist, or when ``path`` refers to a single file.
    :raises MlflowException: if the DBFS list API response is not valid JSON.
    """
    dbfs_path = self._get_dbfs_path(path) if path else self._get_dbfs_path('')
    response = self._dbfs_list_api({'path': dbfs_path})
    try:
        listing = json.loads(response.text)
    except ValueError:
        raise MlflowException(
            "API request to list files under DBFS path %s failed with status code %s. "
            "Response body: %s" % (dbfs_path, response.status_code, response.text))
    artifact_prefix = strip_prefix(self.artifact_uri, 'dbfs:')
    if listing.get('error_code', None) == RESOURCE_DOES_NOT_EXIST:
        return []
    # /api/2.0/dbfs/list will not have the 'files' key in the response for empty directories.
    infos = []
    for entry in listing.get('files', []):
        rel_path = strip_prefix(entry['path'], artifact_prefix + '/')
        # If `path` is a file, the DBFS list API returns a single list element with
        # the same name as `path`. The list_artifacts API expects an empty list in
        # that case, so return [] here.
        if rel_path == path:
            return []
        is_dir = entry['is_dir']
        size = None if is_dir else entry['file_size']
        infos.append(FileInfo(rel_path, is_dir, size))
    return sorted(infos, key=lambda f: f.path)
def copy_artifacts(artifact_uri, artifact_path):
    """Copy every file found under the DBFS FUSE mount for ``artifact_path``
    into the artifact store rooted at ``artifact_uri``.

    :param artifact_uri: DBFS root artifact URI (``dbfs:/...``).
    :param artifact_path: Relative path under the root; may be None/empty,
                          meaning the root itself.
    """
    # BUGFIX: normalize `artifact_path` *before* using it — the original built
    # `local_dir` from a possibly-None `artifact_path`, which would crash
    # strip_prefix, and only defaulted it to '' afterwards.
    artifact_path = artifact_path or ''
    local_dir = "/dbfs/%s/%s" % (
        strip_prefix(artifact_uri.rstrip('/'), 'dbfs:/'),
        strip_prefix(artifact_path, '/'))
    for (dirpath, _, filenames) in os.walk(local_dir):
        artifact_subdir = artifact_path
        if dirpath != local_dir:
            # Preserve the directory structure relative to the walk root,
            # converted to POSIX-style artifact paths.
            rel_path = os.path.relpath(dirpath, local_dir)
            rel_path = relative_path_to_artifact_path(rel_path)
            artifact_subdir = posixpath.join(artifact_path, rel_path)
        for name in filenames:
            file_path = os.path.join(dirpath, name)
            _copy_artifact(file_path, artifact_uri, artifact_subdir)
def dbfs_artifact_repo_factory(artifact_uri):
    """
    Returns an ArtifactRepository subclass for storing artifacts on DBFS.

    This factory method is used with URIs of the form ``dbfs:/<path>``.
    DBFS-backed artifact storage can only be used together with the RestStore.
    In the special case where the URI is of the form
    `dbfs:/databricks/mlflow-tracking/<Exp-ID>/<Run-ID>/<path>', a
    DatabricksArtifactRepository is returned, which can store access
    controlled artifacts.

    :param artifact_uri: DBFS root artifact URI (string).
    :return: Subclass of ArtifactRepository capable of storing artifacts on DBFS.
    """
    cleaned_artifact_uri = artifact_uri.rstrip('/')
    if get_uri_scheme(artifact_uri) != 'dbfs':
        raise MlflowException(
            "DBFS URI must be of the form "
            "dbfs:/<path>, but received {uri}".format(uri=artifact_uri))
    if is_databricks_acled_artifacts_uri(artifact_uri):
        return DatabricksArtifactRepository(cleaned_artifact_uri)
    fuse_usable = (
        mlflow.utils.databricks_utils.is_dbfs_fuse_available()
        and os.environ.get(USE_FUSE_ENV_VAR, "").lower() != "false"
        and not artifact_uri.startswith("dbfs:/databricks/mlflow-registry")
    )
    if fuse_usable:
        # With the DBFS FUSE mount available, write artifacts directly to
        # /dbfs/... using local filesystem APIs.
        fuse_uri = "file:///dbfs/{}".format(
            strip_prefix(cleaned_artifact_uri, "dbfs:/"))
        return LocalArtifactRepository(fuse_uri)
    return DbfsRestArtifactRepository(cleaned_artifact_uri)
def list_artifacts(self, path=None):
    """List ``FileInfo`` entries for artifacts under ``path``.

    :param path: Relative artifact path to list; when falsy, the repository
                 root is listed.
    :return: Sorted list of ``FileInfo`` objects; ``[]`` when the DBFS path
             does not exist.
    """
    listing_request = {'path': self._get_dbfs_path(path or '')}
    response = _dbfs_list_api(listing_request, self.http_request_kwargs)
    listing = json.loads(response.text)
    prefix = strip_prefix(self.artifact_uri, 'dbfs:')
    if listing.get('error_code', None) == RESOURCE_DOES_NOT_EXIST:
        return []
    # /api/2.0/dbfs/list will not have the 'files' key in the response for empty directories.
    infos = []
    for entry in listing.get('files', []):
        is_dir = entry['is_dir']
        size = None if is_dir else entry['file_size']
        rel_path = strip_prefix(entry['path'], prefix + '/')
        infos.append(FileInfo(rel_path, is_dir, size))
    return sorted(infos, key=lambda f: f.path)
def dbfs_artifact_repo_factory(artifact_uri):
    """
    Returns an ArtifactRepository subclass for storing artifacts on DBFS.

    This factory method is used with URIs of the form ``dbfs:/<path>``.
    DBFS-backed artifact storage can only be used together with the RestStore.

    :param artifact_uri: DBFS root artifact URI (string).
    :return: Subclass of ArtifactRepository capable of storing artifacts on DBFS.
    """
    cleaned_artifact_uri = artifact_uri.rstrip('/')
    fuse_usable = (
        mlflow.utils.databricks_utils.is_dbfs_fuse_available()
        and os.environ.get(USE_FUSE_ENV_VAR, "").lower() != "false"
    )
    if not fuse_usable:
        return DbfsRestArtifactRepository(cleaned_artifact_uri)
    # With the DBFS FUSE mount available, write artifacts directly to /dbfs/...
    # using local filesystem APIs.
    fuse_uri = "file:///dbfs/{}".format(strip_prefix(cleaned_artifact_uri, "dbfs:/"))
    return LocalArtifactRepository(fuse_uri)
def dbfs_artifact_repo_factory(artifact_uri):
    """
    Returns an ArtifactRepository subclass for storing artifacts on DBFS.

    This factory method is used with URIs of the form ``dbfs:/<path>``.
    DBFS-backed artifact storage can only be used together with the RestStore.
    In the special case where the URI is of the form
    `dbfs:/databricks/mlflow-tracking/<Exp-ID>/<Run-ID>/<path>', a
    DatabricksArtifactRepository is returned, which can store access
    controlled artifacts.

    :param artifact_uri: DBFS root artifact URI (string).
    :return: Subclass of ArtifactRepository capable of storing artifacts on DBFS.
    :raises MlflowException: if ``artifact_uri`` is not a valid DBFS URI.
    """
    if not is_valid_dbfs_uri(artifact_uri):
        raise MlflowException(
            "DBFS URI must be of the form dbfs:/<path> or " +
            "dbfs://profile@databricks/<path>, but received " + artifact_uri
        )
    cleaned_artifact_uri = artifact_uri.rstrip("/")
    db_profile_uri = get_databricks_profile_uri_from_artifact_uri(cleaned_artifact_uri)
    if is_databricks_acled_artifacts_uri(artifact_uri):
        return DatabricksArtifactRepository(cleaned_artifact_uri)
    fuse_usable = (
        mlflow.utils.databricks_utils.is_dbfs_fuse_available()
        and os.environ.get(USE_FUSE_ENV_VAR, "").lower() != "false"
        and not is_databricks_model_registry_artifacts_uri(artifact_uri)
        and (db_profile_uri is None or db_profile_uri == "databricks")
    )
    if not fuse_usable:
        return DbfsRestArtifactRepository(cleaned_artifact_uri)
    # With the DBFS FUSE mount available, write artifacts directly to /dbfs/...
    # using local filesystem APIs.
    # Note: it is possible for a named Databricks profile to point to the current
    # workspace, but we avoid a complex check and assume users will use `databricks`
    # to mean the current workspace. Using `DbfsRestArtifactRepository` to access
    # the current workspace's DBFS should still work; it just may be slower.
    stripped_uri = remove_databricks_profile_info_from_artifact_uri(cleaned_artifact_uri)
    fuse_uri = "file:///dbfs/{}".format(strip_prefix(stripped_uri, "dbfs:/"))
    return LocalArtifactRepository(fuse_uri)
def test_strip_prefix(original, prefix, expected):
    # Parametrized: verify strip_prefix removes `prefix` from `original`.
    actual = strip_prefix(original, prefix)
    assert actual == expected
def _get_dbfs_path(self, artifact_path):
    """Return the absolute DBFS path for ``artifact_path`` under this
    repository's artifact root (the ``dbfs:/`` scheme prefix is removed)."""
    root = strip_prefix(self.artifact_uri, 'dbfs:/')
    relative = strip_prefix(artifact_path, '/')
    return '/%s/%s' % (root, relative)
def _get_dbfs_endpoint(artifact_uri, artifact_path):
    """Return the ``/dbfs/...`` location for ``artifact_path`` under
    ``artifact_uri`` (trailing slash and scheme prefix stripped)."""
    base = strip_prefix(artifact_uri.rstrip('/'), 'dbfs:/')
    relative = strip_prefix(artifact_path, '/')
    return "/dbfs/%s/%s" % (base, relative)
def _get_dbfs_path(self, artifact_path):
    """Return the absolute DBFS path for ``artifact_path`` under this
    repository's artifact root (the ``dbfs:/`` scheme prefix is removed)."""
    root = strip_prefix(self.artifact_uri, "dbfs:/")
    relative = strip_prefix(artifact_path, "/")
    return "/%s/%s" % (root, relative)