コード例 #1
0
 def _get_run_files(self, run_uuid, resource_type):
     """Locate the on-disk files backing a run's metrics, params, or tags.

     :param run_uuid: ID of the run whose files are listed.
     :param resource_type: One of "metric", "param", or "tag".
     :return: ``(parent_dir, relative_file_names)``; the file list is empty
         when the resource subfolder does not exist yet.
     :raises MlflowException: if the run's metadata cannot be loaded.
     """
     _validate_run_id(run_uuid)
     run_info = self._get_run_info(run_uuid)
     if run_info is None:
         raise MlflowException(
             "Run '%s' metadata is in invalid state." % run_uuid,
             databricks_pb2.INVALID_STATE)
     # Map the requested resource type onto its per-run storage subfolder.
     if resource_type == "metric":
         subfolder_name = FileStore.METRICS_FOLDER_NAME
     elif resource_type == "param":
         subfolder_name = FileStore.PARAMS_FOLDER_NAME
     elif resource_type == "tag":
         subfolder_name = FileStore.TAGS_FOLDER_NAME
     else:
         raise Exception("Looking for unknown resource under run.")
     _, run_dir = self._find_run_root(run_uuid)
     # run_dir exists since run validity has been confirmed above.
     source_dirs = find(run_dir, subfolder_name, full_path=True)
     if len(source_dirs) == 0:
         return run_dir, []
     file_names = []
     # Collect every file under the subfolder as a path relative to it.
     for root, _, files in os.walk(source_dirs[0]):
         for name in files:
             abspath = os.path.join(root, name)
             file_names.append(os.path.relpath(abspath, source_dirs[0]))
     if sys.platform == "win32":
         # Turn metric relative path into metric name.
         # Metrics can have '/' in the name. On windows, '/' is interpreted as a separator.
         # When the metric is read back the path will use '\' for separator.
         # We need to translate the path into posix path.
         from mlflow.utils.file_utils import relative_path_to_artifact_path
         file_names = [
             relative_path_to_artifact_path(x) for x in file_names
         ]
     return source_dirs[0], file_names
コード例 #2
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """
        Parallelized upload of all files under ``local_dir`` to the run's
        artifact root on Databricks.

        The original docstring incorrectly described this as an
        implementation of ``download_artifacts``; it uploads.

        :param local_dir: Local directory whose contents are uploaded.
        :param artifact_path: Optional artifact-root-relative destination
            directory; defaults to the artifact root itself.
        :raises MlflowException: if any individual upload fails; the message
            lists every failed source path with its error.
        """
        artifact_path = artifact_path or ""
        # Submit one upload task per file, keyed by local source path.
        inflight_uploads = {}
        for (dirpath, _, filenames) in os.walk(local_dir):
            artifact_subdir = artifact_path
            if dirpath != local_dir:
                # Translate the OS-specific relative path into a posix
                # artifact path before joining.
                rel_path = os.path.relpath(dirpath, local_dir)
                rel_path = relative_path_to_artifact_path(rel_path)
                artifact_subdir = posixpath.join(artifact_path, rel_path)
            for name in filenames:
                file_path = os.path.join(dirpath, name)
                upload_future = self.thread_pool.submit(
                    self.log_artifact, file_path, artifact_subdir)
                inflight_uploads[file_path] = upload_future

        # Join futures to ensure that all artifacts have been uploaded prior to returning
        failed_uploads = {}
        for (src_file_path, upload_future) in inflight_uploads.items():
            try:
                upload_future.result()
            except Exception as e:
                failed_uploads[src_file_path] = repr(e)

        if len(failed_uploads) > 0:
            raise MlflowException(message=(
                "The following failures occurred while uploading one or more artifacts"
                " to {artifact_root}: {failures}".format(
                    artifact_root=self.artifact_uri,
                    failures=failed_uploads,
                )))
コード例 #3
0
 def log_artifacts(self, local_dir, artifact_path=None):
     """Upload every file under ``local_dir``, preserving its directory layout
     below ``artifact_path`` (the artifact root when omitted)."""
     base = artifact_path or ""
     for current_dir, _, names in os.walk(local_dir):
         if current_dir == local_dir:
             subdir = base
         else:
             # Posix-ify the relative directory before joining.
             posix_rel = relative_path_to_artifact_path(
                 os.path.relpath(current_dir, local_dir))
             subdir = posixpath.join(base, posix_rel)
         for entry in names:
             self.log_artifact(os.path.join(current_dir, entry), subdir)
コード例 #4
0
 def log_artifacts(self, local_dir, artifact_path=None):
     """Recursively log every file below ``local_dir`` as run artifacts,
     mirroring the local directory structure under ``artifact_path``."""
     top = os.path.abspath(local_dir)
     for folder, _, entries in os.walk(top):
         if folder != top:
             rel = relative_path_to_artifact_path(os.path.relpath(folder, top))
             dest = posixpath.join(artifact_path, rel) if artifact_path else rel
         else:
             # Top-level files go straight into artifact_path (possibly None).
             dest = artifact_path
         for entry in entries:
             self.log_artifact(os.path.join(folder, entry), dest)
コード例 #5
0
def copy_artifacts(artifact_uri, artifact_path):
    """Copy the files under ``artifact_path`` from the local ``/dbfs`` fuse
    mount of ``artifact_uri`` into the artifact store, one file at a time."""
    root = strip_prefix(artifact_uri.rstrip('/'), 'dbfs:/')
    sub = strip_prefix(artifact_path, '/')
    local_dir = "/dbfs/%s/%s" % (root, sub)
    dest_base = artifact_path or ''
    for folder, _, names in os.walk(local_dir):
        if folder == local_dir:
            dest = dest_base
        else:
            # Posix-ify the relative subdirectory before joining.
            rel = relative_path_to_artifact_path(
                os.path.relpath(folder, local_dir))
            dest = posixpath.join(dest_base, rel)
        for name in names:
            _copy_artifact(os.path.join(folder, name), artifact_uri, dest)
コード例 #6
0
ファイル: s3_artifact_repo.py プロジェクト: zied2/mlflow
 def log_artifacts(self, local_dir, artifact_path=None):
     """Upload the tree rooted at ``local_dir`` to this repository's S3
     location, optionally below ``artifact_path``."""
     bucket, dest_path = data.parse_s3_uri(self.artifact_uri)
     if artifact_path:
         dest_path = posixpath.join(dest_path, artifact_path)
     s3_client = self._get_s3_client()
     base_dir = os.path.abspath(local_dir)
     for folder, _, names in os.walk(base_dir):
         if folder == base_dir:
             prefix = dest_path
         else:
             # S3 keys always use '/' separators.
             rel = relative_path_to_artifact_path(
                 os.path.relpath(folder, base_dir))
             prefix = posixpath.join(dest_path, rel)
         for name in names:
             s3_client.upload_file(
                 os.path.join(folder, name), bucket,
                 posixpath.join(prefix, name))
コード例 #7
0
 def list_artifacts(self, path=None):
     """Return ``FileInfo``-style entries for artifacts under ``path``
     (posix-style, relative to the artifact dir), sorted by path."""
     # Posix paths also work on Windows, but normalize defensively anyway.
     if path:
         target = os.path.join(self.artifact_dir, os.path.normpath(path))
     else:
         target = self.artifact_dir
     if not os.path.isdir(target):
         return []
     infos = []
     for entry in list_all(target, full_path=True):
         rel = os.path.relpath(entry, self.artifact_dir)
         infos.append(get_file_info(entry, relative_path_to_artifact_path(rel)))
     infos.sort(key=lambda info: info.path)
     return infos
コード例 #8
0
 def log_artifacts(self, local_dir, artifact_path=None):
     """Upload the directory tree at ``local_dir`` to the OSS bucket backing
     this repository, optionally below ``artifact_path``."""
     bucket, dest_path = self.parse_oss_uri(self.artifact_uri)
     if artifact_path:
         dest_path = posixpath.join(dest_path, artifact_path)
     # Called for its side effect: prepares self.oss_bucket for the uploads.
     self._get_oss_bucket(bucket)
     base_dir = os.path.abspath(local_dir)
     for folder, _, names in os.walk(base_dir):
         prefix = dest_path
         if folder != base_dir:
             rel = relative_path_to_artifact_path(
                 os.path.relpath(folder, base_dir))
             prefix = posixpath.join(dest_path, rel)
         for name in names:
             self.oss_bucket.put_object_from_file(
                 posixpath.join(prefix, name), os.path.join(folder, name))
コード例 #9
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """Upload every file under ``local_dir`` to the GCS bucket backing
        this repository, optionally below ``artifact_path``."""
        bucket_name, dest_path = self.parse_gcs_uri(self.artifact_uri)
        if artifact_path:
            dest_path = posixpath.join(dest_path, artifact_path)
        bucket = self.gcs.Client().get_bucket(bucket_name)

        base_dir = os.path.abspath(local_dir)
        for folder, _, names in os.walk(base_dir):
            if folder == base_dir:
                prefix = dest_path
            else:
                # GCS object names always use '/' separators.
                rel = relative_path_to_artifact_path(
                    os.path.relpath(folder, base_dir))
                prefix = posixpath.join(dest_path, rel)
            for name in names:
                blob_path = posixpath.join(prefix, name)
                bucket.blob(blob_path).upload_from_filename(
                    os.path.join(folder, name))
コード例 #10
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """Upload the tree under ``local_dir`` to the FTP server, creating
        remote directories for local directories that contain no files.

        :param local_dir: Local directory whose files are uploaded.
        :param artifact_path: Optional destination directory below the
            repository root.
        """
        dest_path = posixpath.join(self.path, artifact_path) \
            if artifact_path else self.path

        local_dir = os.path.abspath(local_dir)
        for (root, _, filenames) in os.walk(local_dir):
            upload_path = dest_path
            if root != local_dir:
                rel_path = os.path.relpath(root, local_dir)
                # NOTE(review): unlike the top-level case (which includes
                # self.path and artifact_path), this branch keeps only the
                # relative subdirectory — confirm that log_artifact/_mkdir
                # re-prefix the repository root, otherwise files in
                # subdirectories may resolve to a different location.
                upload_path = relative_path_to_artifact_path(rel_path)
            if not filenames:
                # Preserve empty local directories on the remote side.
                with self.get_ftp_client() as ftp:
                    self._mkdir(ftp, posixpath.join(self.path, upload_path))
            for f in filenames:
                # Only regular files (or links to them) are uploaded.
                if os.path.isfile(os.path.join(root, f)):
                    self.log_artifact(os.path.join(root, f), upload_path)
コード例 #11
0
    def _get_resource_files(self, root_dir, subfolder_name):
        """Return ``(resource_dir, relative_file_names)`` for the first
        ``subfolder_name`` directory found under ``root_dir``; the name list
        is empty when no such directory exists."""
        matches = find(root_dir, subfolder_name, full_path=True)
        if not matches:
            return root_dir, []
        resource_dir = matches[0]
        names = []
        for folder, _, files in os.walk(resource_dir):
            names.extend(
                os.path.relpath(os.path.join(folder, f), resource_dir)
                for f in files)
        if sys.platform == "win32":
            # Metric names may contain '/', which Windows writes back as '\'.
            # Translate to posix form so names round-trip unchanged.
            from mlflow.utils.file_utils import relative_path_to_artifact_path

            names = [relative_path_to_artifact_path(n) for n in names]
        return resource_dir, names
コード例 #12
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """Upload the directory tree at ``local_dir`` to GCS using chunked,
        timeout-bounded blob uploads."""
        bucket_name, dest_path = self.parse_gcs_uri(self.artifact_uri)
        if artifact_path:
            dest_path = posixpath.join(dest_path, artifact_path)
        bucket = self._get_bucket(bucket_name)

        base_dir = os.path.abspath(local_dir)
        for folder, _, names in os.walk(base_dir):
            if folder == base_dir:
                prefix = dest_path
            else:
                # GCS object names always use '/' separators.
                rel = relative_path_to_artifact_path(
                    os.path.relpath(folder, base_dir))
                prefix = posixpath.join(dest_path, rel)
            for name in names:
                blob = bucket.blob(
                    posixpath.join(prefix, name),
                    chunk_size=self._GCS_UPLOAD_CHUNK_SIZE)
                blob.upload_from_filename(
                    os.path.join(folder, name),
                    timeout=self._GCS_DEFAULT_TIMEOUT)
コード例 #13
0
def log_artifacts_minio(
    run: mlflow.entities.Run,
    local_dir: str,
    artifact_path: str = None,
    delete_local: bool = True,
) -> None:
    """Upload local artifacts to a Minio-backed artifact store.

    boto3 and Minio disagree on empty files, so this uploads with the Minio
    client directly. See:

    - https://github.com/minio/minio/issues/5150
    - https://github.com/boto/botocore/pull/1328

    :param run: an active Mlflow Run
    :type run: mlflow.entities.Run
    :param local_dir: local directory containing the artifacts to log
    :type local_dir: str
    :param artifact_path: relative destination path within the run's assets
    :type artifact_path: str
    :param delete_local: delete the local copies once they have been logged
    :type delete_local: bool
    """
    bucket, dest_path = parse_s3_uri(run.info.artifact_uri)
    if artifact_path:
        dest_path = posixpath.join(dest_path, artifact_path)
    endpoint = urlparse(os.environ["MLFLOW_S3_ENDPOINT_URL"]).netloc
    client = Minio(
        endpoint,
        access_key=os.environ["AWS_ACCESS_KEY_ID"],
        secret_key=os.environ["AWS_SECRET_ACCESS_KEY"],
        secure=False,
    )
    base_dir = os.path.abspath(local_dir)
    for folder, _, names in os.walk(base_dir):
        if folder == base_dir:
            prefix = dest_path
        else:
            rel = relative_path_to_artifact_path(
                os.path.relpath(folder, base_dir))
            prefix = posixpath.join(dest_path, rel)
        for name in names:
            client.fput_object(bucket, posixpath.join(prefix, name),
                               os.path.join(folder, name))
    if delete_local:
        shutil.rmtree(base_dir)
コード例 #14
0
 def log_artifacts(self, local_dir, artifact_path=None):
     """Upload the directory tree at ``local_dir`` to the object-storage
     location identified by ``self.artifact_uri``.

     :param local_dir: Local directory whose files are uploaded.
     :param artifact_path: Optional destination subdirectory below the
         artifact root.
     """
     # Remote object names must always use '/' separators. The previous
     # version built them with os.path.join, which produces '\' on Windows;
     # use posixpath like the sibling artifact repositories do.
     import posixpath
     bucket, ns, dest_path = self.parse_os_uri(self.artifact_uri)
     if artifact_path:
         dest_path = posixpath.join(dest_path, artifact_path)
     os_client = self._get_os_client()
     local_dir = os.path.abspath(local_dir)
     for (root, _, filenames) in os.walk(local_dir):
         upload_path = dest_path
         if root != local_dir:
             rel_path = os.path.relpath(root, local_dir)
             rel_path = relative_path_to_artifact_path(rel_path)
             upload_path = posixpath.join(dest_path, rel_path)
         for f in filenames:
             self._upload_file(
                 os_client,
                 local_file=os.path.join(root, f),
                 bucket=bucket,
                 ns=ns,
                 dest_path=posixpath.join(upload_path, f),
             )
コード例 #15
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """Upload the tree under ``local_dir`` into a remote directory named
        after ``local_dir``'s basename, optionally below ``artifact_path``.

        :param local_dir: Local directory whose files are uploaded.
        :param artifact_path: Optional destination directory below the
            repository root.
        """
        # dest_path is an absolute remote path (rooted at self.path) used for
        # top-level files; dest_path_re is the same destination expressed
        # relative to self.path, used for files in subdirectories.
        # NOTE(review): one prefix includes self.path and the other does not
        # — confirm that log_artifact/_mkdir re-prefix the repository root,
        # otherwise top-level and nested files resolve inconsistently.
        dest_path = posixpath.join(self.path, artifact_path) \
            if artifact_path else self.path

        # NOTE(review): the basename is taken before abspath(); a trailing
        # slash on local_dir yields an empty basename — verify callers.
        dest_path = posixpath.join(dest_path, os.path.split(local_dir)[1])
        dest_path_re = os.path.split(local_dir)[1]
        if artifact_path:
            dest_path_re = posixpath.join(artifact_path,
                                          os.path.split(local_dir)[1])

        local_dir = os.path.abspath(local_dir)
        for (root, _, filenames) in os.walk(local_dir):
            upload_path = dest_path
            if root != local_dir:
                rel_path = os.path.relpath(root, local_dir)
                rel_path = relative_path_to_artifact_path(rel_path)
                upload_path = posixpath.join(dest_path_re, rel_path)
            if not filenames:
                # Preserve empty local directories on the remote side.
                self._mkdir(posixpath.join(self.path, upload_path))
            for f in filenames:
                # Only regular files (or links to them) are uploaded.
                if os.path.isfile(os.path.join(root, f)):
                    self.log_artifact(os.path.join(root, f), upload_path)
コード例 #16
0
def _relative_path_local(base_dir, subdir_path):
    """Return ``subdir_path`` relative to ``base_dir`` as a posix artifact
    path, or ``None`` when ``_relative_path`` yields no relative path."""
    rel = _relative_path(base_dir, subdir_path, os.path)
    if rel is None:
        return None
    return relative_path_to_artifact_path(rel)
コード例 #17
0
    def log_artifacts(self, local_dir, artifact_path=None):
        """
        Parallelized upload of all files under ``local_dir`` to the run's
        artifact storage on Databricks.

        The original docstring incorrectly described this as an
        implementation of ``download_artifacts``; it uploads.

        :param local_dir: Local directory whose files are uploaded.
        :param artifact_path: Optional artifact-root-relative destination
            directory; defaults to the artifact root.
        :raises MlflowException: if any individual upload fails; the message
            lists every failed source path with its error.
        """
        StagedArtifactUpload = namedtuple(
            "StagedArtifactUpload",
            [
                # Local filesystem path of the source file to upload
                "src_file_path",
                # Run-relative artifact path specifying the upload destination
                "dst_run_relative_artifact_path",
            ],
        )

        artifact_path = artifact_path or ""

        # Stage every file with its run-relative destination first so that
        # write credentials can be requested in a single batch call.
        staged_uploads = []
        for (dirpath, _, filenames) in os.walk(local_dir):
            artifact_subdir = artifact_path
            if dirpath != local_dir:
                rel_path = os.path.relpath(dirpath, local_dir)
                rel_path = relative_path_to_artifact_path(rel_path)
                artifact_subdir = posixpath.join(artifact_path, rel_path)
            for name in filenames:
                file_path = os.path.join(dirpath, name)
                dst_run_relative_artifact_path = self._get_run_relative_artifact_path_for_upload(
                    src_file_path=file_path,
                    dst_artifact_dir=artifact_subdir,
                )
                staged_uploads.append(
                    StagedArtifactUpload(
                        src_file_path=file_path,
                        dst_run_relative_artifact_path=dst_run_relative_artifact_path,
                    ))

        # One credential per staged path, in the same order as staged_uploads.
        write_credential_infos = self._get_write_credential_infos(
            run_id=self.run_id,
            paths=[
                staged_upload.dst_run_relative_artifact_path
                for staged_upload in staged_uploads
            ],
        )

        # Submit one upload task per staged file, keyed by local source path.
        inflight_uploads = {}
        for staged_upload, write_credential_info in zip(
                staged_uploads, write_credential_infos):
            upload_future = self.thread_pool.submit(
                self._upload_to_cloud,
                cloud_credential_info=write_credential_info,
                src_file_path=staged_upload.src_file_path,
                dst_run_relative_artifact_path=staged_upload.dst_run_relative_artifact_path,
            )
            inflight_uploads[staged_upload.src_file_path] = upload_future

        # Join futures to ensure that all artifacts have been uploaded prior to returning
        failed_uploads = {}
        for (src_file_path, upload_future) in inflight_uploads.items():
            try:
                upload_future.result()
            except Exception as e:
                failed_uploads[src_file_path] = repr(e)

        if len(failed_uploads) > 0:
            raise MlflowException(message=(
                "The following failures occurred while uploading one or more artifacts"
                " to {artifact_root}: {failures}".format(
                    artifact_root=self.artifact_uri,
                    failures=failed_uploads,
                )))