def test_upload_project_to_dbfs(
    dbfs_root_mock, tmpdir, dbfs_path_exists_mock, upload_to_dbfs_mock
):  # pylint: disable=unused-argument
    # Upload project to a mock directory
    dbfs_path_exists_mock.return_value = False
    runner = DatabricksJobRunner(databricks_profile="DEFAULT")
    dbfs_uri = runner._upload_project_to_dbfs(project_dir=TEST_PROJECT_DIR, experiment_id=0)
    # Build the expected tarball locally
    local_tar_path = os.path.join(dbfs_root_mock, dbfs_uri.split("/dbfs/")[1])
    expected_tar_path = str(tmpdir.join("expected.tar.gz"))
    file_utils.make_tarfile(
        output_filename=expected_tar_path,
        source_dir=TEST_PROJECT_DIR,
        archive_name=databricks.DB_TARFILE_ARCHIVE_NAME,
    )
    # Compare the uploaded tarball against the expected one, byte for byte
    assert filecmp.cmp(local_tar_path, expected_tar_path, shallow=False)
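# The test above relies on pytest fixtures that fake out DBFS. A minimal sketch of what
# those fixtures might look like, assuming `_upload_to_dbfs(self, src_path, dbfs_fuse_uri)`
# as the patched signature; the fixture names come from the test, but their bodies here
# are assumptions, not the project's actual conftest.
import os
import shutil
from unittest import mock

import pytest


@pytest.fixture()
def dbfs_root_mock(tmpdir):
    # Hypothetical: a local directory standing in for the DBFS root.
    yield str(tmpdir.join("dbfs"))


@pytest.fixture()
def dbfs_path_exists_mock():
    # Hypothetical: let the test control whether the tarball "already exists" in DBFS.
    with mock.patch.object(DatabricksJobRunner, "_dbfs_path_exists") as path_exists_mock:
        yield path_exists_mock


@pytest.fixture()
def upload_to_dbfs_mock(dbfs_root_mock):
    # Hypothetical: redirect "uploads" into the fake DBFS root so the test can compare
    # the uploaded tarball byte-for-byte with a locally built one.
    def upload_mock_fn(_, src_path, dbfs_fuse_uri):
        mock_dbfs_dst = os.path.join(dbfs_root_mock, dbfs_fuse_uri.split("/dbfs/")[1])
        os.makedirs(os.path.dirname(mock_dbfs_dst), exist_ok=True)
        shutil.copy(src_path, mock_dbfs_dst)

    with mock.patch.object(DatabricksJobRunner, "_upload_to_dbfs", new=upload_mock_fn):
        yield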
def _create_docker_build_ctx(work_dir, dockerfile_contents):
    """
    Creates a build-context tarfile containing the Dockerfile and project code, returning
    the path to the tarfile.
    """
    directory = tempfile.mkdtemp()
    try:
        dst_path = os.path.join(directory, "mlflow-project-contents")
        shutil.copytree(src=work_dir, dst=dst_path)
        with open(os.path.join(dst_path, _GENERATED_DOCKERFILE_NAME), "w") as handle:
            handle.write(dockerfile_contents)
        # mkstemp returns an open OS-level file descriptor; close it, since only the
        # path is needed here (leaving it open leaks the descriptor)
        fd, result_path = tempfile.mkstemp()
        os.close(fd)
        file_utils.make_tarfile(
            output_filename=result_path,
            source_dir=dst_path,
            archive_name=_PROJECT_TAR_ARCHIVE_NAME,
        )
    finally:
        shutil.rmtree(directory)
    return result_path
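# A hedged usage sketch: feeding the build-context tarball produced above to the Docker
# SDK for Python (docker-py). The helper name `_build_image_from_context` and `image_uri`
# are assumptions; docker-py's images.build() does accept a gzipped custom context via
# `fileobj`, `custom_context`, and `dockerfile`, and returns an (image, logs) tuple.
import os
import posixpath

import docker


def _build_image_from_context(work_dir, dockerfile_contents, image_uri):
    client = docker.from_env()
    build_ctx_path = _create_docker_build_ctx(work_dir, dockerfile_contents)
    with open(build_ctx_path, "rb") as docker_build_ctx:
        # The generated Dockerfile lives inside the tarball, under the archive root
        image, _ = client.images.build(
            tag=image_uri,
            forcerm=True,
            dockerfile=posixpath.join(_PROJECT_TAR_ARCHIVE_NAME, _GENERATED_DOCKERFILE_NAME),
            fileobj=docker_build_ctx,
            custom_context=True,
            encoding="gzip",
        )
    os.remove(build_ctx_path)
    return image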
def _upload_project_to_dbfs(self, project_dir, experiment_id):
    """
    Tars a project directory into an archive in a temp dir and uploads it to DBFS,
    returning the path of the tarball under the /dbfs FUSE mount
    (e.g. /dbfs/path/to/tar).

    :param project_dir: Path to a directory containing an MLflow project to upload to
                        DBFS (e.g. a directory containing an MLproject file).
    """
    temp_tarfile_dir = tempfile.mkdtemp()
    temp_tar_filename = os.path.join(temp_tarfile_dir, "project.tar.gz")

    def custom_filter(x):
        return None if os.path.basename(x.name) == "mlruns" else x

    try:
        directory_size = file_utils._get_local_project_dir_size(project_dir)
        _logger.info(
            f"=== Creating tarball from {project_dir} in temp directory {temp_tarfile_dir} ==="
        )
        _logger.info(f"=== Total file size to compress: {directory_size} KB ===")
        file_utils.make_tarfile(
            temp_tar_filename, project_dir, DB_TARFILE_ARCHIVE_NAME, custom_filter=custom_filter
        )
        with open(temp_tar_filename, "rb") as tarred_project:
            tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
        # TODO: Get subdirectory for experiment from the tracking server
        dbfs_path = posixpath.join(
            DBFS_EXPERIMENT_DIR_BASE,
            str(experiment_id),
            "projects-code",
            "%s.tar.gz" % tarfile_hash,
        )
        tar_size = file_utils._get_local_file_size(temp_tar_filename)
        dbfs_fuse_uri = posixpath.join("/dbfs", dbfs_path)
        if not self._dbfs_path_exists(dbfs_path):
            _logger.info(
                f"=== Uploading project tarball (size: {tar_size} KB) to {dbfs_fuse_uri} ==="
            )
            self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
            _logger.info("=== Finished uploading project to %s ===", dbfs_fuse_uri)
        else:
            _logger.info("=== Project already exists in DBFS ===")
    finally:
        shutil.rmtree(temp_tarfile_dir)
    return dbfs_fuse_uri
def _upload_project_to_dbfs(self, project_dir, experiment_id):
    """
    Tars a project directory into an archive in a temp dir and uploads it to DBFS,
    returning the path of the tarball under the /dbfs FUSE mount
    (e.g. /dbfs/path/to/tar).

    :param project_dir: Path to a directory containing an MLflow project to upload to
                        DBFS (e.g. a directory containing an MLproject file).
    """
    temp_tarfile_dir = tempfile.mkdtemp()
    temp_tar_filename = file_utils.build_path(temp_tarfile_dir, "project.tar.gz")

    def custom_filter(x):
        return None if os.path.basename(x.name) == "mlruns" else x

    try:
        file_utils.make_tarfile(
            temp_tar_filename, project_dir, DB_TARFILE_ARCHIVE_NAME, custom_filter=custom_filter
        )
        with open(temp_tar_filename, "rb") as tarred_project:
            tarfile_hash = hashlib.sha256(tarred_project.read()).hexdigest()
        # TODO: Get subdirectory for experiment from the tracking server
        dbfs_fuse_uri = os.path.join(
            "/dbfs",
            DBFS_EXPERIMENT_DIR_BASE,
            str(experiment_id),
            "projects-code",
            "%s.tar.gz" % tarfile_hash,
        )
        if not self._dbfs_path_exists(dbfs_fuse_uri):
            self._upload_to_dbfs(temp_tar_filename, dbfs_fuse_uri)
            eprint("=== Finished uploading project to %s ===" % dbfs_fuse_uri)
        else:
            eprint("=== Project already exists in DBFS ===")
    finally:
        shutil.rmtree(temp_tarfile_dir)
    return dbfs_fuse_uri
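# Both versions above delegate to file_utils.make_tarfile with a `custom_filter` that
# drops any "mlruns" directory from the archive. A minimal sketch of such a helper,
# assuming `custom_filter` maps onto the `filter` argument of TarFile.add; the real
# file_utils implementation may differ.
import tarfile


def make_tarfile(output_filename, source_dir, archive_name, custom_filter=None):
    # Tar the contents of `source_dir` under a single top-level directory named
    # `archive_name`. TarFile.add calls `filter` once per entry and skips any entry
    # (including an entire directory subtree) for which it returns None.
    with tarfile.open(output_filename, "w:gz") as tar:
        tar.add(source_dir, arcname=archive_name, filter=custom_filter)


# One caveat with this naive sketch: the SHA-256-based deduplication in
# _upload_project_to_dbfs only avoids re-uploads if identical project contents produce
# byte-identical tarballs, and the gzip header embeds a modification timestamp by
# default. A production implementation would likely normalize the archive (e.g. zero
# out the gzip mtime) to make it reproducible.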