def log_artifacts(self, local_dir, artifact_path=None):
    if artifact_path:
        root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
    else:
        root_http_endpoint = self._get_dbfs_endpoint(os.path.basename(local_dir))
    for (dirpath, _, filenames) in os.walk(local_dir):
        dir_http_endpoint = root_http_endpoint
        if dirpath != local_dir:
            rel_path = get_relative_path(local_dir, dirpath)
            dir_http_endpoint = build_path(root_http_endpoint, rel_path)
        for name in filenames:
            endpoint = build_path(dir_http_endpoint, name)
            with open(build_path(dirpath, name), 'rb') as f:
                response = http_request(endpoint=endpoint, method='POST', data=f,
                                        allow_redirects=False, **self.http_request_kwargs)
            if response.status_code == 409:
                raise MlflowException('File already exists at {} and can\'t be overwritten.'
                                      .format(endpoint))
            elif response.status_code != 200:
                raise MlflowException('log_artifacts to "{}" returned a non-200 status code.'
                                      .format(endpoint))

def log_artifact(self, local_file, artifact_path=None):
    (bucket, dest_path) = self.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    dest_path = build_path(dest_path, os.path.basename(local_file))
    boto3.client('s3').upload_file(local_file, bucket, dest_path)

def log_artifact(self, local_file, artifact_path=None):
    (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    dest_path = build_path(dest_path, os.path.basename(local_file))
    s3_client = self._get_s3_client()
    s3_client.upload_file(local_file, bucket, dest_path)

def log_artifact(self, local_file, artifact_path=None):
    (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    dest_path = build_path(dest_path, os.path.basename(local_file))
    s3_endpoint_url = os.environ.get('MLFLOW_S3_ENDPOINT_URL')
    boto3.client('s3', endpoint_url=s3_endpoint_url).upload_file(
        local_file, bucket, dest_path)

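# A hedged usage sketch of the endpoint override above: setting
# MLFLOW_S3_ENDPOINT_URL lets boto3 target any S3-compatible store such as
# MinIO. The server address, bucket, and file names below are illustrative
# assumptions, not taken from the source.
import os
import boto3

os.environ["MLFLOW_S3_ENDPOINT_URL"] = "http://localhost:9000"  # hypothetical MinIO server
s3 = boto3.client("s3", endpoint_url=os.environ.get("MLFLOW_S3_ENDPOINT_URL"))
s3.upload_file("model.pkl", "my-bucket", "artifacts/model.pkl")  # hypothetical paths
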
def log_artifact(self, local_file, artifact_path=None):
    (bucket, dest_path) = self.parse_gcs_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    dest_path = build_path(dest_path, os.path.basename(local_file))
    gcs_bucket = self.gcs.Client().get_bucket(bucket)
    blob = gcs_bucket.blob(dest_path)
    blob.upload_from_filename(local_file)

def log_artifacts(self, local_dir, artifact_path=None):
    (container, _, dest_path) = self.parse_wasbs_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = get_relative_path(local_dir, root)
            upload_path = build_path(dest_path, rel_path)
        for f in filenames:
            path = build_path(upload_path, f)
            self.client.create_blob_from_path(container, path, build_path(root, f))

def log_artifacts(self, local_dir, artifact_path=None):
    (bucket, dest_path) = self.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    s3 = boto3.client('s3')
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = get_relative_path(local_dir, root)
            upload_path = build_path(dest_path, rel_path)
        for f in filenames:
            s3.upload_file(build_path(root, f), bucket, build_path(upload_path, f))

def log_artifacts(self, local_dir, artifact_path=None):
    (bucket, dest_path) = data.parse_s3_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    s3_endpoint_url = os.environ.get('MLFLOW_S3_ENDPOINT_URL')
    s3 = boto3.client('s3', endpoint_url=s3_endpoint_url)
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = get_relative_path(local_dir, root)
            upload_path = build_path(dest_path, rel_path)
        for f in filenames:
            s3.upload_file(build_path(root, f), bucket, build_path(upload_path, f))

def log_artifacts(self, local_dir, artifact_path=None):
    if artifact_path:
        root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
    else:
        root_http_endpoint = self._get_dbfs_endpoint(os.path.basename(local_dir))
    for (dirpath, _, filenames) in os.walk(local_dir):
        dir_http_endpoint = root_http_endpoint
        if dirpath != local_dir:
            rel_path = get_relative_path(local_dir, dirpath)
            dir_http_endpoint = build_path(root_http_endpoint, rel_path)
        for name in filenames:
            endpoint = build_path(dir_http_endpoint, name)
            # Open in binary mode ('rb', not 'r') so non-text artifacts upload intact.
            with open(build_path(dirpath, name), 'rb') as f:
                http_request(endpoint=endpoint, method='POST', data=f,
                             **self.http_request_kwargs)

def log_artifacts(self, local_dir, artifact_path=None):
    if artifact_path:
        root_http_endpoint = self._get_dbfs_endpoint(artifact_path)
    else:
        root_http_endpoint = self._get_dbfs_endpoint('')
    for (dirpath, _, filenames) in os.walk(local_dir):
        dir_http_endpoint = root_http_endpoint
        if dirpath != local_dir:
            rel_path = get_relative_path(local_dir, dirpath)
            dir_http_endpoint = build_path(root_http_endpoint, rel_path)
        for name in filenames:
            endpoint = build_path(dir_http_endpoint, name)
            with open(build_path(dirpath, name), 'rb') as f:
                self._databricks_api_request(endpoint=endpoint, method='POST',
                                             data=f, allow_redirects=False)

def download_files(self, path, destination):
    with self.get_ftp_client() as ftp:
        ftp.cwd(path)
        if not os.path.isdir(destination):
            os.makedirs(destination)
        filelist = ftp.nlst()
        for ftp_file in filelist:
            if self._is_dir(build_path(path, ftp_file)):
                self.download_files(build_path(path, ftp_file),
                                    build_path(destination, ftp_file))
            else:
                with open(os.path.join(destination, ftp_file), "wb") as f:
                    # retrbinary expects a per-chunk callback, not a file object,
                    # so pass the file's write method.
                    ftp.retrbinary("RETR " + ftp_file, f.write)

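# For reference on the callback fix above: ftplib.FTP.retrbinary hands each
# retrieved data chunk to the callable it is given. A minimal standalone
# sketch; the host, credentials, and filename are illustrative assumptions.
from ftplib import FTP

ftp = FTP("ftp.example.com")     # hypothetical host
ftp.login("user", "password")    # hypothetical credentials
with open("remote.bin", "wb") as f:
    # Each chunk of the remote file is passed to f.write as it arrives.
    ftp.retrbinary("RETR remote.bin", f.write)
ftp.quit()
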
def _download_artifacts_into(self, artifact_path, dest_dir):
    """Private version of download_artifacts that takes a destination directory."""
    basename = os.path.basename(artifact_path)
    local_path = build_path(dest_dir, basename)
    listing = self.list_artifacts(artifact_path)
    if len(listing) > 0:
        # Artifact_path is a directory, so make a directory for it and download everything
        os.mkdir(local_path)
        for file_info in listing:
            self._download_artifacts_into(file_info.path, local_path)
    else:
        (bucket, s3_path) = self.parse_s3_uri(self.artifact_uri)
        s3_path = build_path(s3_path, artifact_path)
        boto3.client('s3').download_file(bucket, s3_path, local_path)
    return local_path

def _download_artifacts_into(self, artifact_path, dest_dir):
    """Private version of download_artifacts that takes a destination directory."""
    basename = os.path.basename(artifact_path)
    local_path = build_path(dest_dir, basename)
    listing = self.list_artifacts(artifact_path)
    if len(listing) > 0:
        # Artifact_path is a directory, so make a directory for it and download everything
        os.mkdir(local_path)
        for file_info in listing:
            self._download_artifacts_into(file_info.path, local_path)
    else:
        (container, _, remote_path) = self.parse_wasbs_uri(self.artifact_uri)
        remote_path = build_path(remote_path, artifact_path)
        self.client.get_blob_to_path(container, remote_path, local_path)
    return local_path

def log_artifacts(self, local_dir, artifact_path=None):
    (bucket, dest_path) = self.parse_gcs_uri(self.artifact_uri)
    if artifact_path:
        dest_path = build_path(dest_path, artifact_path)
    gcs_bucket = self.gcs.Client().get_bucket(bucket)
    local_dir = os.path.abspath(local_dir)
    for (root, _, filenames) in os.walk(local_dir):
        upload_path = dest_path
        if root != local_dir:
            rel_path = get_relative_path(local_dir, root)
            upload_path = build_path(dest_path, rel_path)
        for f in filenames:
            path = build_path(upload_path, f)
            gcs_bucket.blob(path).upload_from_filename(build_path(root, f))

def test_yaml_read_and_write(tmpdir):
    temp_dir = str(tmpdir)
    yaml_file = random_file("yaml")
    long_value = long(1) if six.PY2 else 1  # pylint: disable=undefined-variable
    data = {"a": random_int(), "B": random_int(), "text_value": u"中文",
            "long_value": long_value, "int_value": 32, "text_value_2": u"hi"}
    file_utils.write_yaml(temp_dir, yaml_file, data)
    read_data = file_utils.read_yaml(temp_dir, yaml_file)
    assert data == read_data
    yaml_path = file_utils.build_path(temp_dir, yaml_file)
    with codecs.open(yaml_path, encoding="utf-8") as handle:
        contents = handle.read()
    assert "!!python" not in contents
    # Check that UTF-8 strings are written properly to the file (rather than as ASCII
    # representations of their byte sequences).
    assert u"中文" in contents

    def edit_func(old_dict):
        old_dict["more_text"] = u"西班牙语"
        return old_dict

    assert "more_text" not in file_utils.read_yaml(temp_dir, yaml_file)
    with safe_edit_yaml(temp_dir, yaml_file, edit_func):
        edited_dict = file_utils.read_yaml(temp_dir, yaml_file)
        assert "more_text" in edited_dict
        assert edited_dict["more_text"] == u"西班牙语"
    assert "more_text" not in file_utils.read_yaml(temp_dir, yaml_file)

def _upload_project_to_dbfs(self, project_dir, experiment_id):
    """
    Tars a project directory into an archive in a temp dir and uploads it to DBFS,
    returning the HDFS-style URI of the tarball in DBFS (e.g. dbfs:/path/to/tar).

    :param project_dir: Path to a directory containing an MLflow project to upload
                        to DBFS (e.g. a directory containing an MLproject file).
    """
    temp_tarfile_dir = tempfile.mkdtemp()
    temp_tar_filename = file_utils.build_path(temp_tarfile_dir, "project.tar.gz")

    def custom_filter(x):
        return None if os.path.basename(x.name) == "mlruns" else x

    try:
        file_utils.make_tarfile(temp_tar_filename, project_dir, DB_TARFILE_ARCHIVE_NAME,
                                custom_filter=custom_filter)
        with open(temp_tar_filename, "rb") as tarred_project:
            tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
        # TODO: Get subdirectory for experiment from the tracking server
        dbfs_fuse_uri = os.path.join("/dbfs", DBFS_EXPERIMENT_DIR_BASE, str(experiment_id),
                                     "projects-code", "%s.tar.gz" % tarfile_hash)
        if not self._dbfs_path_exists(dbfs_fuse_uri):
            self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
            eprint("=== Finished uploading project to %s ===" % dbfs_fuse_uri)
        else:
            eprint("=== Project already exists in DBFS ===")
    finally:
        shutil.rmtree(temp_tarfile_dir)
    return dbfs_fuse_uri

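# For reference: custom_filter above follows the tarfile "filter" contract,
# where returning None drops a member from the archive. A minimal standalone
# sketch of the same exclusion pattern; the paths are illustrative, and it is
# assumed make_tarfile forwards such a filter to TarFile.add.
import os
import tarfile

def exclude_mlruns(member):
    # Returning None tells tarfile to skip this member entirely.
    return None if os.path.basename(member.name) == "mlruns" else member

with tarfile.open("project.tar.gz", "w:gz") as tar:
    tar.add("my_project", arcname="project", filter=exclude_mlruns)
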
def _get_run_dir(self, experiment_id, run_uuid):
    _validate_run_id(run_uuid)
    if not self._has_experiment(experiment_id):
        return None
    return build_path(self._get_experiment_path(experiment_id, assert_exists=True),
                      run_uuid)

def list_artifacts(self, path=None):
    from azure.storage.blob.models import BlobPrefix
    (container, _, artifact_path) = self.parse_wasbs_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = build_path(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    marker = None  # Used to make next list request if this one exceeded the result limit
    while True:
        results = self.client.list_blobs(container, prefix=prefix, delimiter='/',
                                         marker=marker)
        for r in results:
            if isinstance(r, BlobPrefix):  # This is a prefix for items in a subdirectory
                subdir = r.name[len(artifact_path) + 1:]
                if subdir.endswith("/"):
                    subdir = subdir[:-1]
                infos.append(FileInfo(subdir, True, None))
            else:  # Just a plain old blob
                file_name = r.name[len(artifact_path) + 1:]
                infos.append(FileInfo(file_name, False, r.properties.content_length))
        # Check whether a new marker is returned, meaning we have to make another request
        if results.next_marker:
            marker = results.next_marker
        else:
            break
    return sorted(infos, key=lambda f: f.path)

def _create_experiment_with_id(self, name, experiment_id, artifact_uri):
    self._check_root_dir()
    meta_dir = mkdir(self.root_directory, experiment_id)
    artifact_uri = artifact_uri or build_path(self.artifact_root_uri, experiment_id)
    experiment = Experiment(experiment_id, name, artifact_uri, LifecycleStage.ACTIVE)
    write_yaml(meta_dir, FileStore.META_DATA_FILE_NAME, dict(experiment))
    return experiment_id

def _create_experiment_with_id(self, name, experiment_id):
    self._check_root_dir()
    meta_dir = mkdir(self.root_directory, str(experiment_id))
    artifact_uri = build_path(self.artifact_root_uri, str(experiment_id))
    experiment = Experiment(experiment_id, name, artifact_uri)
    write_yaml(meta_dir, FileStore.META_DATA_FILE_NAME, dict(experiment))
    return experiment_id

def _download_artifacts_into(self, artifact_path, dest_dir):
    """Private version of download_artifacts that takes a destination directory."""
    basename = os.path.basename(artifact_path)
    local_path = build_path(dest_dir, basename)
    listing = self.list_artifacts(artifact_path)
    if len(listing) > 0:
        # Artifact_path is a directory, so make a directory for it and download everything
        os.mkdir(local_path)
        for file_info in listing:
            self._download_artifacts_into(file_info.path, local_path)
    else:
        (bucket, remote_path) = self.parse_gcs_uri(self.artifact_uri)
        remote_path = build_path(remote_path, artifact_path)
        gcs_bucket = self.gcs.Client().get_bucket(bucket)
        gcs_bucket.get_blob(remote_path).download_to_filename(local_path)
    return local_path

def list_artifacts(self, path=None):
    (bucket, artifact_path) = data.parse_s3_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = build_path(dest_path, path)
    infos = []
    prefix = dest_path + "/"
    s3_client = self._get_s3_client()
    paginator = s3_client.get_paginator("list_objects_v2")
    results = paginator.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/')
    for result in results:
        # Subdirectories will be listed as "common prefixes" due to the way we made the request
        for obj in result.get("CommonPrefixes", []):
            subdir = obj.get("Prefix")[len(artifact_path) + 1:]
            if subdir.endswith("/"):
                subdir = subdir[:-1]
            infos.append(FileInfo(subdir, True, None))
        # Objects listed directly will be files
        for obj in result.get('Contents', []):
            name = obj.get("Key")[len(artifact_path) + 1:]
            size = int(obj.get('Size'))
            infos.append(FileInfo(name, False, size))
    return sorted(infos, key=lambda f: f.path)

def create_run(self, experiment_id, user_id, run_name, source_type, source_name,
               entry_point_name, start_time, source_version, tags, parent_run_id):
    experiment = self.get_experiment(experiment_id)
    if experiment.lifecycle_stage != LifecycleStage.ACTIVE:
        raise MlflowException('Experiment id={} must be active'.format(experiment_id),
                              INVALID_STATE)
    run_uuid = uuid.uuid4().hex
    artifact_location = build_path(experiment.artifact_location, run_uuid,
                                   SqlAlchemyStore.ARTIFACTS_FOLDER_NAME)
    run = SqlRun(name=run_name or "", artifact_uri=artifact_location, run_uuid=run_uuid,
                 experiment_id=experiment_id,
                 source_type=SourceType.to_string(source_type),
                 source_name=source_name, entry_point_name=entry_point_name,
                 user_id=user_id, status=RunStatus.to_string(RunStatus.RUNNING),
                 start_time=start_time, end_time=None, source_version=source_version,
                 lifecycle_stage=LifecycleStage.ACTIVE)
    for tag in tags:
        run.tags.append(SqlTag(key=tag.key, value=tag.value))
    if parent_run_id:
        run.tags.append(SqlTag(key=MLFLOW_PARENT_RUN_ID, value=parent_run_id))
    if run_name:
        run.tags.append(SqlTag(key=MLFLOW_RUN_NAME, value=run_name))
    self._save_to_db([run])
    return run.to_mlflow_entity()

def list_artifacts(self, path=None):
    artifact_dir = self.artifact_uri
    list_dir = build_path(artifact_dir, path) if path else artifact_dir
    artifact_files = list_all(list_dir, full_path=True)
    return [get_file_info(f, get_relative_path(artifact_dir, f))
            for f in artifact_files]

def log_artifacts(self, local_dir, artifact_path=None):
    if artifact_path and path_not_unique(artifact_path):
        raise Exception("Invalid artifact path: '%s'. %s"
                        % (artifact_path, bad_path_message(artifact_path)))
    artifact_dir = build_path(self.artifact_uri, artifact_path) \
        if artifact_path else self.artifact_uri
    if not exists(artifact_dir):
        mkdir(artifact_dir)
    dir_util.copy_tree(src=local_dir, dst=artifact_dir)

def list_artifacts(self, path=None):
    artifact_dir = self.artifact_uri
    list_dir = build_path(artifact_dir, path) if path else artifact_dir
    artifact_files = list_all(list_dir, full_path=True)
    infos = [get_file_info(f, get_relative_path(artifact_dir, f))
             for f in artifact_files]
    return sorted(infos, key=lambda f: f.path)

# Nested helper: defined inside a method, so it closes over `self` from the
# enclosing scope rather than taking it as a parameter.
def download_artifacts_into(artifact_path, dest_dir):
    basename = os.path.basename(artifact_path)
    local_path = build_path(dest_dir, basename)
    listing = self.list_artifacts(artifact_path)
    if len(listing) > 0:
        # Artifact_path is a directory, so make a directory for it and download everything
        if not os.path.exists(local_path):
            os.mkdir(local_path)
        for file_info in listing:
            download_artifacts_into(artifact_path=file_info.path, dest_dir=local_path)
    else:
        self._download_file(remote_file_path=artifact_path, local_path=local_path)
    return local_path

def _download_artifacts_into(self, artifact_path, dest_dir):
    """Private version of download_artifacts that takes a destination directory."""
    basename = os.path.basename(artifact_path)
    local_path = build_path(dest_dir, basename)
    dbfs_path = self._get_dbfs_path(artifact_path)
    if _dbfs_is_dir(dbfs_path, self.http_request_kwargs):
        # Artifact_path is a directory, so make a directory for it and download everything
        if not os.path.exists(local_path):
            os.mkdir(local_path)
        for file_info in self.list_artifacts(artifact_path):
            self._download_artifacts_into(file_info.path, local_path)
    else:
        _dbfs_download(output_path=local_path,
                       endpoint=self._get_dbfs_endpoint(artifact_path),
                       http_request_kwargs=self.http_request_kwargs)
    return local_path

def list_artifacts(self, path=None):
    (bucket, artifact_path) = self.parse_gcs_uri(self.artifact_uri)
    dest_path = artifact_path
    if path:
        dest_path = build_path(dest_path, path)
    prefix = dest_path + "/"
    bkt = self.gcs.Client().get_bucket(bucket)
    infos = self._list_folders(bkt, prefix, artifact_path)
    results = bkt.list_blobs(prefix=prefix, delimiter="/")
    for result in results:
        blob_path = result.name[len(artifact_path) + 1:]
        infos.append(FileInfo(blob_path, False, result.size))
    return sorted(infos, key=lambda f: f.path)

def __init__(self, root_directory=None, artifact_root_uri=None):
    """
    Create a new FileStore with the given root directory and a given default
    artifact root URI.
    """
    super(FileStore, self).__init__()
    self.root_directory = root_directory or _default_root_dir()
    self.artifact_root_uri = artifact_root_uri or self.root_directory
    self.trash_folder = build_path(self.root_directory, FileStore.TRASH_FOLDER_NAME)
    # Create root directory if needed
    if not exists(self.root_directory):
        mkdir(self.root_directory)
        self._create_experiment_with_id(name="Default",
                                        experiment_id=Experiment.DEFAULT_EXPERIMENT_ID,
                                        artifact_uri=None)
    # Create trash folder if needed
    if not exists(self.trash_folder):
        mkdir(self.trash_folder)