Example 1
 def log_artifacts(self, local_dir, artifact_path=None):
     if artifact_path:
         root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
     else:
         root_http_endpoint = self._get_dbfs_endpoint(
             os.path.basename(local_dir))
     for (dirpath, _, filenames) in os.walk(local_dir):
         dir_http_endpoint = root_http_endpoint
         if dirpath != local_dir:
             rel_path = get_relative_path(local_dir, dirpath)
             dir_http_endpoint = build_path(root_http_endpoint, rel_path)
         for name in filenames:
             endpoint = build_path(dir_http_endpoint, name)
             with open(build_path(dirpath, name), 'rb') as f:
                 response = http_request(endpoint=endpoint,
                                         method='POST',
                                         data=f,
                                         allow_redirects=False,
                                         **self.http_request_kwargs)
             if response.status_code == 409:
                 raise MlflowException(
                     'File already exists at {} and can\'t be overwritten.'.
                     format(endpoint))
             elif response.status_code != 200:
                 raise MlflowException(
                     'log_artifacts to "{}" returned a non-200 status code.'
                     .format(endpoint))
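These examples lean on two path helpers from mlflow.utils.file_utils in older MLflow releases, build_path and get_relative_path. A minimal sketch of equivalent behaviour, assuming they are thin wrappers over os.path (the real helpers may add extra validation):

import os


def build_path(*path_segments):
    # Join path segments into a single path, e.g. build_path("a", "b/c") -> "a/b/c".
    return os.path.join(*path_segments)


def get_relative_path(root_path, target_path):
    # Express target_path relative to root_path,
    # e.g. get_relative_path("/tmp/run", "/tmp/run/plots/roc.png") -> "plots/roc.png".
    return os.path.relpath(target_path, root_path)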
Example 2
    def log_artifact(self, local_file, artifact_path=None):
        (bucket, dest_path) = self.parse_s3_uri(self.artifact_uri)
        if artifact_path:
            dest_path = build_path(dest_path, artifact_path)
        dest_path = build_path(dest_path, os.path.basename(local_file))

        boto3.client('s3').upload_file(local_file, bucket, dest_path)
Example 3
 def log_artifact(self, local_file, artifact_path=None):
     (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
     if artifact_path:
         dest_path = build_path(dest_path, artifact_path)
     dest_path = build_path(dest_path, os.path.basename(local_file))
     s3_client = self._get_s3_client()
     s3_client.upload_file(local_file, bucket, dest_path)
Example 4
 def log_artifact(self, local_file, artifact_path=None):
     (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
     if artifact_path:
         dest_path = build_path(dest_path, artifact_path)
     dest_path = build_path(dest_path, os.path.basename(local_file))
     s3_endpoint_url = os.environ.get('MLFLOW_S3_ENDPOINT_URL')
     boto3.client('s3', endpoint_url=s3_endpoint_url).upload_file(
         local_file, bucket, dest_path)
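Examples 3 and 4 pick up the endpoint from the MLFLOW_S3_ENDPOINT_URL environment variable so the artifact store can target any S3-compatible service. A usage sketch, assuming a hypothetical local MinIO-style endpoint, bucket, and credentials:

import os

import boto3

# Hypothetical endpoint and credentials; substitute your own values.
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"
os.environ["AWS_ACCESS_KEY_ID"] = "minio"
os.environ["AWS_SECRET_ACCESS_KEY"] = "minio123"

s3_endpoint_url = os.environ.get("MLFLOW_S3_ENDPOINT_URL")
s3 = boto3.client("s3", endpoint_url=s3_endpoint_url)
s3.upload_file("model.pkl", "my-bucket", "experiments/0/artifacts/model.pkl")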
Example 5
    def log_artifact(self, local_file, artifact_path=None):
        (bucket, dest_path) = self.parse_gcs_uri(self.artifact_uri)
        if artifact_path:
            dest_path = build_path(dest_path, artifact_path)
        dest_path = build_path(dest_path, os.path.basename(local_file))

        gcs_bucket = self.gcs.Client().get_bucket(bucket)
        blob = gcs_bucket.blob(dest_path)
        blob.upload_from_filename(local_file)
Example 6
 def log_artifacts(self, local_dir, artifact_path=None):
     (container, _, dest_path) = self.parse_wasbs_uri(self.artifact_uri)
     if artifact_path:
         dest_path = build_path(dest_path, artifact_path)
     local_dir = os.path.abspath(local_dir)
     for (root, _, filenames) in os.walk(local_dir):
         upload_path = dest_path
         if root != local_dir:
             rel_path = get_relative_path(local_dir, root)
             upload_path = build_path(dest_path, rel_path)
         for f in filenames:
             path = build_path(upload_path, f)
             self.client.create_blob_from_path(container, path, build_path(root, f))
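Example 6 unpacks a wasbs:// URI into a container, a storage account, and a blob path. A rough sketch of what parse_wasbs_uri likely does, assuming URIs of the form wasbs://container@account.blob.core.windows.net/path (the real parser may be stricter):

from urllib.parse import urlparse


def parse_wasbs_uri(uri):
    # Split "wasbs://<container>@<account>.blob.core.windows.net/<path>"
    # into (container, account, path).
    parsed = urlparse(uri)
    if parsed.scheme != "wasbs":
        raise Exception("Not a WASBS URI: %s" % uri)
    container, _, host = parsed.netloc.partition("@")
    account = host.split(".")[0]
    return container, account, parsed.path.lstrip("/")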
Example 7
 def log_artifacts(self, local_dir, artifact_path=None):
     (bucket, dest_path) = self.parse_s3_uri(self.artifact_uri)
     if artifact_path:
         dest_path = build_path(dest_path, artifact_path)
     s3 = boto3.client('s3')
     local_dir = os.path.abspath(local_dir)
     for (root, _, filenames) in os.walk(local_dir):
         upload_path = dest_path
         if root != local_dir:
             rel_path = get_relative_path(local_dir, root)
             upload_path = build_path(dest_path, rel_path)
         for f in filenames:
             s3.upload_file(build_path(root, f), bucket, build_path(upload_path, f))
Example 8
 def log_artifacts(self, local_dir, artifact_path=None):
     (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
     if artifact_path:
         dest_path = build_path(dest_path, artifact_path)
     s3_endpoint_url = os.environ.get('MLFLOW_S3_ENDPOINT_URL')
     s3 = boto3.client('s3', endpoint_url=s3_endpoint_url)
     local_dir = os.path.abspath(local_dir)
     for (root, _, filenames) in os.walk(local_dir):
         upload_path = dest_path
         if root != local_dir:
             rel_path = get_relative_path(local_dir, root)
             upload_path = build_path(dest_path, rel_path)
         for f in filenames:
             s3.upload_file(build_path(root, f), bucket,
                            build_path(upload_path, f))
Example 9
 def log_artifacts(self, local_dir, artifact_path=None):
     if artifact_path:
         root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
     else:
         root_http_endpoint = self._get_dbfs_endpoint(os.path.basename(local_dir))
     for (dirpath, _, filenames) in os.walk(local_dir):
         dir_http_endpoint = root_http_endpoint
         if dirpath != local_dir:
             rel_path = get_relative_path(local_dir, dirpath)
             dir_http_endpoint = build_path(root_http_endpoint, rel_path)
         for name in filenames:
             endpoint = build_path(dir_http_endpoint, name)
             with open(build_path(dirpath, name), 'rb') as f:
                 http_request(endpoint=endpoint, method='POST', data=f,
                              **self.http_request_kwargs)
Example 10
 def log_artifacts(self, local_dir, artifact_path=None):
     if artifact_path:
         root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
     else:
         root_http_endpoint = self._get_dbfs_endpoint('')
     for (dirpath, _, filenames) in os.walk(local_dir):
         dir_http_endpoint = root_http_endpoint
         if dirpath != local_dir:
             rel_path = get_relative_path(local_dir, dirpath)
             dir_http_endpoint = build_path(root_http_endpoint, rel_path)
         for name in filenames:
             endpoint = build_path(dir_http_endpoint, name)
             with open(build_path(dirpath, name), 'rb') as f:
                 self._databricks_api_request(
                     endpoint=endpoint, method='POST', data=f, allow_redirects=False)
Example 11
    def download_files(self, path, destination):
        with self.get_ftp_client() as ftp:
            ftp.cwd(path)
            if not os.path.isdir(destination):
                os.makedirs(destination)

            filelist = ftp.nlst()

            for ftp_file in filelist:
                if self._is_dir(build_path(path, ftp_file)):
                    self.download_files(build_path(path, ftp_file),
                                        build_path(destination, ftp_file))
                else:
                    with open(os.path.join(destination, ftp_file), "wb") as f:
                        ftp.retrbinary("RETR " + ftp_file, f.write)
Example 12
 def _download_artifacts_into(self, artifact_path, dest_dir):
     """Private version of download_artifacts that takes a destination directory."""
     basename = os.path.basename(artifact_path)
     local_path = build_path(dest_dir, basename)
     listing = self.list_artifacts(artifact_path)
     if len(listing) > 0:
         # Artifact_path is a directory, so make a directory for it and download everything
         os.mkdir(local_path)
         for file_info in listing:
             self._download_artifacts_into(file_info.path, local_path)
     else:
         (bucket, s3_path) = self.parse_s3_uri(self.artifact_uri)
         s3_path = build_path(s3_path, artifact_path)
         boto3.client('s3').download_file(bucket, s3_path, local_path)
     return local_path
Example 13
 def _download_artifacts_into(self, artifact_path, dest_dir):
     """Private version of download_artifacts that takes a destination directory."""
     basename = os.path.basename(artifact_path)
     local_path = build_path(dest_dir, basename)
     listing = self.list_artifacts(artifact_path)
     if len(listing) > 0:
         # Artifact_path is a directory, so make a directory for it and download everything
         os.mkdir(local_path)
         for file_info in listing:
             self._download_artifacts_into(file_info.path, local_path)
     else:
         (container, _, remote_path) = self.parse_wasbs_uri(self.artifact_uri)
         remote_path = build_path(remote_path, artifact_path)
         self.client.get_blob_to_path(container, remote_path, local_path)
     return local_path
Example 14
    def log_artifacts(self, local_dir, artifact_path=None):
        (bucket, dest_path) = self.parse_gcs_uri(self.artifact_uri)
        if artifact_path:
            dest_path = build_path(dest_path, artifact_path)
        gcs_bucket = self.gcs.Client().get_bucket(bucket)

        local_dir = os.path.abspath(local_dir)
        for (root, _, filenames) in os.walk(local_dir):
            upload_path = dest_path
            if root != local_dir:
                rel_path = get_relative_path(local_dir, root)
                upload_path = build_path(dest_path, rel_path)
            for f in filenames:
                path = build_path(upload_path, f)
                gcs_bucket.blob(path).upload_from_filename(build_path(root, f))
Example 15
def test_yaml_read_and_write(tmpdir):
    temp_dir = str(tmpdir)
    yaml_file = random_file("yaml")
    long_value = long(1) if six.PY2 else 1  # pylint: disable=undefined-variable
    data = {"a": random_int(), "B": random_int(), "text_value": u"中文",
            "long_value": long_value, "int_value": 32, "text_value_2": u"hi"}
    file_utils.write_yaml(temp_dir, yaml_file, data)
    read_data = file_utils.read_yaml(temp_dir, yaml_file)
    assert data == read_data
    yaml_path = file_utils.build_path(temp_dir, yaml_file)
    with codecs.open(yaml_path, encoding="utf-8") as handle:
        contents = handle.read()
    assert "!!python" not in contents
    # Check that UTF-8 strings are written properly to the file (rather than as ASCII
    # representations of their byte sequences).
    assert u"中文" in contents

    def edit_func(old_dict):
        old_dict["more_text"] = u"西班牙语"
        return old_dict

    assert "more_text" not in file_utils.read_yaml(temp_dir, yaml_file)
    with safe_edit_yaml(temp_dir, yaml_file, edit_func):
        edited_dict = file_utils.read_yaml(temp_dir, yaml_file)
        assert "more_text" in edited_dict
        assert edited_dict["more_text"] == u"西班牙语"
    assert "more_text" not in file_utils.read_yaml(temp_dir, yaml_file)
Example 16
    def _upload_project_to_dbfs(self, project_dir, experiment_id):
        """
        Tars a project directory into an archive in a temp dir and uploads it to DBFS, returning
        the HDFS-style URI of the tarball in DBFS (e.g. dbfs:/path/to/tar).

        :param project_dir: Path to a directory containing an MLflow project to upload to DBFS (e.g.
                            a directory containing an MLproject file).
        """
        temp_tarfile_dir = tempfile.mkdtemp()
        temp_tar_filename = file_utils.build_path(temp_tarfile_dir, "project.tar.gz")

        def custom_filter(x):
            return None if os.path.basename(x.name) == "mlruns" else x

        try:
            file_utils.make_tarfile(temp_tar_filename, project_dir, DB_TARFILE_ARCHIVE_NAME,
                                    custom_filter=custom_filter)
            with open(temp_tar_filename, "rb") as tarred_project:
                tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
            # TODO: Get subdirectory for experiment from the tracking server
            dbfs_fuse_uri = os.path.join("/dbfs", DBFS_EXPERIMENT_DIR_BASE, str(experiment_id),
                                         "projects-code", "%s.tar.gz" % tarfile_hash)
            if not self._dbfs_path_exists(dbfs_fuse_uri):
                self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
                eprint("=== Finished uploading project to %s ===" % dbfs_fuse_uri)
            else:
                eprint("=== Project already exists in DBFS ===")
        finally:
            shutil.rmtree(temp_tarfile_dir)
        return dbfs_fuse_uri
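make_tarfile above receives a custom_filter that strips any mlruns directory out of the archive. A rough sketch of a compatible helper, assuming it simply forwards the filter to tarfile (TarFile.add excludes any member for which the filter returns None); the real helper may also normalize metadata so the tarball hash is reproducible:

import tarfile


def make_tarfile(output_filename, source_dir, archive_name, custom_filter=None):
    # Pack source_dir into a gzipped tarball, stored under archive_name inside the archive.
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_name, filter=custom_filter)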
Example 17
 def _get_run_dir(self, experiment_id, run_uuid):
     _validate_run_id(run_uuid)
     if not self._has_experiment(experiment_id):
         return None
     return build_path(
         self._get_experiment_path(experiment_id, assert_exists=True),
         run_uuid)
Example 18
 def list_artifacts(self, path=None):
     from azure.storage.blob.models import BlobPrefix
     (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
     dest_path = artifact_path
     if path:
         dest_path = build_path(dest_path, path)
     infos = []
     prefix = dest_path + "/"
     marker = None  # Used to make next list request if this one exceeded the result limit
     while True:
         results = self.client.list_blobs(container,
                                          prefix=prefix,
                                          delimiter='/',
                                          marker=marker)
         for r in results:
              if isinstance(r, BlobPrefix):
                  # This is a prefix for items in a subdirectory
                  subdir = r.name[len(artifact_path) + 1:]
                  if subdir.endswith("/"):
                      subdir = subdir[:-1]
                  infos.append(FileInfo(subdir, True, None))
              else:  # Just a plain old blob
                  file_name = r.name[len(artifact_path) + 1:]
                  infos.append(FileInfo(file_name, False, r.properties.content_length))
         # Check whether a new marker is returned, meaning we have to make another request
         if results.next_marker:
             marker = results.next_marker
         else:
             break
     return sorted(infos, key=lambda f: f.path)
Example 19
 def _create_experiment_with_id(self, name, experiment_id, artifact_uri):
     self._check_root_dir()
     meta_dir = mkdir(self.root_directory, experiment_id)
     artifact_uri = artifact_uri or build_path(self.artifact_root_uri, experiment_id)
     experiment = Experiment(experiment_id, name, artifact_uri, LifecycleStage.ACTIVE)
     write_yaml(meta_dir, FileStore.META_DATA_FILE_NAME, dict(experiment))
     return experiment_id
Example 20
 def _create_experiment_with_id(self, name, experiment_id):
     self._check_root_dir()
     meta_dir = mkdir(self.root_directory, str(experiment_id))
     artifact_uri = build_path(self.artifact_root_uri, str(experiment_id))
     experiment = Experiment(experiment_id, name, artifact_uri)
     write_yaml(meta_dir, FileStore.META_DATA_FILE_NAME, dict(experiment))
     return experiment_id
Example 21
 def _download_artifacts_into(self, artifact_path, dest_dir):
     """Private version of download_artifacts that takes a destination directory."""
     basename = os.path.basename(artifact_path)
     local_path = build_path(dest_dir, basename)
     listing = self.list_artifacts(artifact_path)
     if len(listing) > 0:
         # Artifact_path is a directory, so make a directory for it and download everything
         os.mkdir(local_path)
         for file_info in listing:
             self._download_artifacts_into(file_info.path, local_path)
     else:
         (bucket, remote_path) = self.parse_gcs_uri(self.artifact_uri)
         remote_path = build_path(remote_path, artifact_path)
         gcs_bucket = self.gcs.Client().get_bucket(bucket)
         gcs_bucket.get_blob(remote_path).download_to_filename(local_path)
     return local_path
Example 22
 def list_artifacts(self, path=None):
     (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
     dest_path = artifact_path
     if path:
         dest_path = build_path(dest_path, path)
     infos = []
     prefix = dest_path + "/"
     s3_client = self._get_s3_client()
     paginator = s3_client.get_paginator("list_objects_v2")
     results = paginator.paginate(Bucket=bucket,
                                  Prefix=prefix,
                                  Delimiter='/')
     for result in results:
         # Subdirectories will be listed as "common prefixes" due to the way we made the request
         for obj in result.get("CommonPrefixes", []):
             subdir = obj.get("Prefix")[len(artifact_path) + 1:]
             if subdir.endswith("/"):
                 subdir = subdir[:-1]
             infos.append(FileInfo(subdir, True, None))
         # Objects listed directly will be files
         for obj in result.get('Contents', []):
             name = obj.get("Key")[len(artifact_path) + 1:]
             size = int(obj.get('Size'))
             infos.append(FileInfo(name, False, size))
     return sorted(infos, key=lambda f: f.path)
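Several of the S3 examples split the artifact URI into a bucket and key prefix via parse_s3_uri. A minimal sketch of that behaviour, assuming URIs of the form s3://bucket/path:

from urllib.parse import urlparse


def parse_s3_uri(uri):
    # Split "s3://my-bucket/some/prefix" into ("my-bucket", "some/prefix").
    parsed = urlparse(uri)
    if parsed.scheme != "s3":
        raise Exception("Not an S3 URI: %s" % uri)
    return parsed.netloc, parsed.path.lstrip("/")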
Example 23
    def create_run(self, experiment_id, user_id, run_name, source_type, source_name,
                   entry_point_name, start_time, source_version, tags, parent_run_id):
        experiment = self.get_experiment(experiment_id)

        if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
            raise MlflowException('Experiment id={} must be active'.format(experiment_id),
                                  INVALID_STATE)

        run_uuid = uuid.uuid4().hex
        artifact_location = build_path(experiment.artifact_location, run_uuid,
                                       SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
        run = SqlRun(name=run_name or "", artifact_uri=artifact_location, run_uuid=run_uuid,
                     experiment_id=experiment_id, source_type=SourceType.to_string(source_type),
                     source_name=source_name, entry_point_name=entry_point_name,
                     user_id=user_id, status=RunStatus.to_string(RunStatus.RUNNING),
                     start_time=start_time, end_time=None,
                     source_version=source_version, lifecycle_stage=LifecycleStage.ACTIVE)

        for tag in tags:
            run.tags.append(SqlTag(key=tag.key, value=tag.value))
        if parent_run_id:
            run.tags.append(SqlTag(key=MLFLOW_PARENT_RUN_ID, value=parent_run_id))
        if run_name:
            run.tags.append(SqlTag(key=MLFLOW_RUN_NAME, value=run_name))

        self._save_to_db([run])

        return run.to_mlflow_entity()
Example 24
 def list_artifacts(self, path=None):
     artifact_dir = self.artifact_uri
     list_dir = build_path(artifact_dir, path) if path else artifact_dir
     artifact_files = list_all(list_dir, full_path=True)
     return [
         get_file_info(f, get_relative_path(artifact_dir, f))
         for f in artifact_files
     ]
Example 25
 def log_artifacts(self, local_dir, artifact_path=None):
     if artifact_path and path_not_unique(artifact_path):
         raise Exception("Invalid artifact path: '%s'. %s" %
                         (artifact_path, bad_path_message(artifact_path)))
     artifact_dir = build_path(self.artifact_uri, artifact_path) \
         if artifact_path else self.artifact_uri
     if not exists(artifact_dir):
         mkdir(artifact_dir)
     dir_util.copy_tree(src=local_dir, dst=artifact_dir)
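path_not_unique and bad_path_message in Example 25 guard against artifact paths that would escape the artifact root. A rough sketch of that kind of check (the exact rules in MLflow are an assumption here):

import os
import posixpath


def path_not_unique(name):
    # Reject absolute paths and paths whose normalized form differs from the original,
    # e.g. anything containing "..", ".", or redundant separators.
    norm = posixpath.normpath(name)
    return norm != name or norm == "." or norm.startswith("..") or os.path.isabs(name)


def bad_path_message(name):
    # Human-readable reason used in the exception raised above.
    return "Path must be a relative path that does not escape the artifact root (got '%s')." % name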
Example 26
 def list_artifacts(self, path=None):
     artifact_dir = self.artifact_uri
     list_dir = build_path(artifact_dir, path) if path else artifact_dir
     artifact_files = list_all(list_dir, full_path=True)
     infos = [
         get_file_info(f, get_relative_path(artifact_dir, f))
         for f in artifact_files
     ]
     return sorted(infos, key=lambda f: f.path)
Example 27
 def download_artifacts_into(self, artifact_path, dest_dir):
     basename = os.path.basename(artifact_path)
     local_path = build_path(dest_dir, basename)
     listing = self.list_artifacts(artifact_path)
     if len(listing) > 0:
         # Artifact_path is a directory, so make a directory for it and download everything
         if not os.path.exists(local_path):
             os.mkdir(local_path)
         for file_info in listing:
             self.download_artifacts_into(artifact_path=file_info.path, dest_dir=local_path)
     else:
         self._download_file(remote_file_path=artifact_path, local_path=local_path)
     return local_path
Example 28
 def _download_artifacts_into(self, artifact_path, dest_dir):
     """Private version of download_artifacts that takes a destination directory."""
     basename = os.path.basename(artifact_path)
     local_path = build_path(dest_dir, basename)
     dbfs_path = self._get_dbfs_path(artifact_path)
     if _dbfs_is_dir(dbfs_path, self.http_request_kwargs):
         # Artifact_path is a directory, so make a directory for it and download everything
         if not os.path.exists(local_path):
             os.mkdir(local_path)
         for file_info in self.list_artifacts(artifact_path):
             self._download_artifacts_into(file_info.path, local_path)
     else:
         _dbfs_download(output_path=local_path, endpoint=self._get_dbfs_endpoint(artifact_path),
                        http_request_kwargs=self.http_request_kwargs)
     return local_path
Example 29
    def list_artifacts(self, path=None):
        (bucket, artifact_path) = self.parse_gcs_uri(self.artifact_uri)
        dest_path = artifact_path
        if path:
            dest_path = build_path(dest_path, path)
        prefix = dest_path + "/"

        bkt = self.gcs.Client().get_bucket(bucket)

        infos = self._list_folders(bkt, prefix, artifact_path)

        results = bkt.list_blobs(prefix=prefix, delimiter="/")
        for result in results:
            blob_path = result.name[len(artifact_path) + 1:]
            infos.append(FileInfo(blob_path, False, result.size))

        return sorted(infos, key=lambda f: f.path)
Example 30
 def __init__(self, root_directory=None, artifact_root_uri=None):
     """
     Create a new FileStore with the given root directory and a given default artifact root URI.
     """
     super(FileStore, self).__init__()
     self.root_directory = root_directory or _default_root_dir()
     self.artifact_root_uri = artifact_root_uri or self.root_directory
     self.trash_folder = build_path(self.root_directory, FileStore.TRASH_FOLDER_NAME)
     # Create root directory if needed
     if not exists(self.root_directory):
         mkdir(self.root_directory)
         self._create_experiment_with_id(name="Default",
                                         experiment_id=Experiment.DEFAULT_EXPERIMENT_ID,
                                         artifact_uri=None)
     # Create trash folder if needed
     if not exists(self.trash_folder):
         mkdir(self.trash_folder)
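Given the constructor in Example 30, a minimal usage sketch (the import path is the one used by older MLflow releases and is an assumption here):

import tempfile

from mlflow.store.file_store import FileStore  # module path in older MLflow versions

root = tempfile.mkdtemp()
store = FileStore(root_directory=root, artifact_root_uri=root)
# The constructor creates the root directory, the trash folder, and a "Default"
# experiment whose metadata is written as YAML under the experiment directory.
print(store.root_directory, store.trash_folder)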