def deploy(client: ApiClient, job_conf: Dict[str, Any], task_args: Dict[str, Any]):
    """Upload the job jar to DBFS, create the job and optionally run and trace it.

    :param client: API client used for the DBFS copy and for the raw
        ``/jobs/*`` REST calls issued through ``perform_query``.
    :param job_conf: Job settings posted to ``/jobs/create``. The first entry of
        ``job_conf['libraries']`` must carry a ``'jar'`` key; its value is used
        as the DBFS destination of the uploaded jar.
    :param task_args: Task options. ``'jar'`` is the source path handed to
        ``DbfsApi.cp``, ``'run_now'`` launches the job right after creation and
        ``'trace'`` (only meaningful together with ``'run_now'``) polls the run
        every 4 seconds until it finishes.
    :raises Exception: If a traced run finishes with anything but ``SUCCESS``.
    """
    dbfs_new_jar_name = job_conf['libraries'][0]['jar']
    logging.info("Submitting job with configuration %s and jar file %s",
                 job_conf, dbfs_new_jar_name)

    DbfsApi(client).cp(recursive=False, overwrite=True,
                       src=task_args["jar"], dst=dbfs_new_jar_name)

    job_data = client.perform_query('POST', '/jobs/create', data=job_conf, headers=None)
    logging.info("Job creation data %s", job_data)

    if task_args["run_now"]:
        logging.info("Requested to launch job immediately")
        run_data = client.perform_query('POST', '/jobs/run-now', data=job_data, headers=None)
        logging.info("Job launched with run data: %s", run_data)

        # Tracing needs run_data, so it can only happen for a run started above.
        if task_args["trace"]:
            logging.info("Requested to trace the job status")
            # A run can end without ever reporting a result_state; treating
            # these life-cycle states as final keeps the loop from spinning
            # forever. NOTE(review): state names taken from the Jobs API docs.
            terminal_life_cycle_states = ("TERMINATED", "SKIPPED", "INTERNAL_ERROR")
            while True:
                time.sleep(4)
                run_status = client.perform_query(
                    'GET', '/jobs/runs/get',
                    data={"run_id": run_data["run_id"]}, headers=None)
                logging.info(run_status)
                run_state = run_status["state"]
                result_state = run_state.get("result_state", None)
                life_cycle_state = run_state.get("life_cycle_state", None)
                if result_state or life_cycle_state in terminal_life_cycle_states:
                    break

            if result_state == "SUCCESS":
                logging.info("Job successfully finished!")
            else:
                # Fall back to the life-cycle state when no result state exists.
                exception_text = "Job finished with result state %s. Please check run UI!" % (
                    result_state or life_cycle_state)
                raise Exception(exception_text)

    logging.info("All deployment actions successfully performed")
def __init__(self, user, token, workspaceUrl):
    """Build the authenticated API client and the DBFS wrapper on top of it.

    :param user: User name passed as the first argument to ``ApiClient``.
    :param token: Secret passed to ``ApiClient`` as ``password``.
    :param workspaceUrl: Workspace URL passed to ``ApiClient`` as ``host``.
    """
    client = ApiClient(user,
                       password=token,
                       host=workspaceUrl,
                       verify=True,
                       command_name='Python Client')
    self.dbcli_apiclient = client
    # The DBFS wrapper shares the very same client instance.
    self.dbfs_api_client = DbfsApi(client)
def cp_cli(api_client, recursive, overwrite, src, dst):
    """
    Copy files to and from DBFS.

    Note that this function will fail if the src and dst are both on the local filesystem
    or if they are both DBFS paths.

    For non-recursive copies, if the dst is a directory, the file will be placed inside the
    directory. For example ``dbfs cp dbfs:/apple.txt .`` will create a file at `./apple.txt`.

    For recursive copies, files inside of the src directory will be copied inside the dst directory
    with the same name. If the dst path does not exist, a directory will be created. For example
    ``dbfs cp -r dbfs:/foo foo`` will create a directory foo and place the files ``dbfs:/foo/a`` at
    ``foo/a``. If ``foo/a`` already exists, the file will not be overriden unless the --overwrite
    flag is provided -- however, dbfs cp --recursive will continue to try and copy other files.
    """
    dbfs_api = DbfsApi(api_client)
    # Exactly one side must be a DBFS path; classify both ends once up front.
    src_on_dbfs = DbfsPath.is_valid(src)
    dst_on_dbfs = DbfsPath.is_valid(dst)

    if not src_on_dbfs and dst_on_dbfs:
        # Copy to DBFS in this case
        if not os.path.exists(src):
            error_and_quit('The local file {} does not exist.'.format(src))
        if not recursive:
            if os.path.isdir(src):
                error_and_quit((
                    'The local file {} is a directory. You must provide --recursive'
                ).format(src))
            copy_to_dbfs_non_recursive(dbfs_api, src, DbfsPath(dst), overwrite)
        else:
            if not os.path.isdir(src):
                copy_to_dbfs_non_recursive(dbfs_api, src, DbfsPath(dst), overwrite)
                return
            copy_to_dbfs_recursive(dbfs_api, src, DbfsPath(dst), overwrite)
    elif src_on_dbfs and not dst_on_dbfs:
        # Copy from DBFS in this case
        if not recursive:
            copy_from_dbfs_non_recursive(dbfs_api, DbfsPath(src), dst, overwrite)
        else:
            dbfs_path_src = DbfsPath(src)
            if not dbfs_api.get_status(dbfs_path_src).is_dir:
                copy_from_dbfs_non_recursive(dbfs_api, dbfs_path_src, dst, overwrite)
                # A single file is fully handled above; mirror the upload
                # branch and stop instead of also starting a recursive copy.
                return
            copy_from_dbfs_recursive(dbfs_api, dbfs_path_src, dst, overwrite)
    elif not src_on_dbfs:
        # Neither path is on DBFS.
        error_and_quit(
            'Both paths provided are from your local filesystem. '
            'To use this utility, one of the src or dst must be prefixed '
            'with dbfs:/')
    else:
        # Both paths are on DBFS.
        error_and_quit(
            'Both paths provided are from the DBFS filesystem. '
            'To copy between the DBFS filesystem, you currently must copy the '
            'file from DBFS to your local filesystem and then back.')
def main():
    """Upload the locally built test wheel to DBFS.

    The workspace URL and the access token are read from the ``ENDPOINT`` and
    ``TOKEN`` environment variables.
    """
    endpoint = os.getenv('ENDPOINT')
    token = os.getenv('TOKEN')

    databricks_client = ApiClient(host=endpoint, token=token)
    dbfs_client = DbfsApi(databricks_client)

    # '~' is not expanded by the file system APIs, so resolve it explicitly.
    src_path = os.path.expanduser(
        "~/dev/deploy_whl_cluster_test/dist/test_package-0.0.1-py3-none-any.whl")

    # NOTE(review): dst_path is not defined inside this function; it must be
    # provided at module level (a dbfs:/ destination) before main() is called.
    dbfs_client.cp(src=src_path, dst=dst_path, overwrite=True, recursive=False)
def rm_cli(api_client, recursive, dbfs_path):
    """
    Remove files from dbfs.

    To remove a directory you must provide the --recursive flag.
    """
    # Thin wrapper: all the work is done by DbfsApi.delete.
    dbfs_api = DbfsApi(api_client)
    dbfs_api.delete(dbfs_path, recursive)
def mkdirs_cli(api_client, dbfs_path):
    """
    Make directories in DBFS.

    Mkdirs will create directories along the path to the argument directory.
    """
    # Thin wrapper: all the work is done by DbfsApi.mkdirs.
    dbfs_api = DbfsApi(api_client)
    dbfs_api.mkdirs(dbfs_path)
class ApiClient():
    """Small facade bundling the DBFS and Runs API wrappers behind one object."""

    def __init__(self, profile=None):
        """Create both API wrappers from the client returned by ``get_api_client``.

        :param profile: Value forwarded unchanged to ``get_api_client``.
        """
        client = get_api_client(profile)
        self.dbfs_client = DbfsApi(client)
        self.runs_client = RunsApi(client)

    def mkdirs(self, dbfs_path):
        """Create ``dbfs_path`` (given as a string) through the DBFS wrapper."""
        target = DbfsPath(dbfs_path)
        return self.dbfs_client.mkdirs(target)

    def list_files(self, dbfs_path):
        """Return whatever the DBFS wrapper lists for ``dbfs_path`` (a string)."""
        target = DbfsPath(dbfs_path)
        return self.dbfs_client.list_files(target)

    def put_file(self, src_path, dbfs_path, overwrite=True):
        """Upload ``src_path`` to ``dbfs_path`` (a string) through the DBFS wrapper."""
        target = DbfsPath(dbfs_path)
        return self.dbfs_client.put_file(src_path, target, overwrite)

    def submit_run(self, json_data):
        """Forward ``json_data`` to ``RunsApi.submit_run`` and return its result."""
        return self.runs_client.submit_run(json_data)

    def get_run(self, run_id):
        """Forward ``run_id`` to ``RunsApi.get_run`` and return its result."""
        return self.runs_client.get_run(run_id)
def test_tgt_objects_exist(tgt_policy_service: PolicyService, tgt_pool_api: InstancePoolsApi, src_dbfs_api: DbfsApi, tgt_dbfs_api: DbfsApi, tgt_workspace_api: WorkspaceApi, env):
    """Check that the target workspace holds the expected number of objects.

    For every object kind the count reported by the target API is compared with
    the ``import_object_count`` recorded in ``db_objects``.
    """
    # Cluster policies
    policies = tgt_policy_service.list_policies()["policies"]
    expected_policies = db_objects['cluster-policies']['import_object_count']
    assert len(policies) == expected_policies

    # Instance pools
    pools = tgt_pool_api.list_instance_pools()["instance_pools"]
    expected_pools = db_objects['instance-pools']['import_object_count']
    assert len(pools) == expected_pools

    # Notebooks under /Shared
    notebooks = tgt_workspace_api.list_objects("/Shared")
    expected_notebooks = db_objects['notebooks']['import_object_count']
    assert len(notebooks) == expected_notebooks

    # DBFS file
    dbfs_files = tgt_dbfs_api.list_files(DbfsPath("dbfs:/example_notebook.py"))
    expected_files = db_objects['dbfs']['import_object_count']
    assert len(dbfs_files) == expected_files
def ls_cli(api_client, l, absolute, dbfs_path):  # NOQA
    """
    List files in DBFS.
    """
    # dbfs_path arrives as a sequence holding zero or one path.
    path_count = len(dbfs_path)
    if path_count > 1:
        error_and_quit('ls can take a maximum of one path.')
    elif path_count == 1:
        dbfs_path = dbfs_path[0]
    else:
        # No path given: list the DBFS root.
        dbfs_path = DbfsPath('dbfs:/')

    listing = DbfsApi(api_client).list_files(dbfs_path)
    rows = [entry.to_row(is_long_form=l, is_absolute=absolute) for entry in listing]
    click.echo(tabulate(rows, tablefmt='plain'))
class DatabricksAPIClient(object):
    """Helper around the Databricks CLI clients for handling global init scripts."""

    def __init__(self, user, token, workspaceUrl):
        """Build the authenticated API client and the DBFS wrapper on top of it.

        :param user: User name passed as the first argument to ``ApiClient``.
        :param token: Secret passed to ``ApiClient`` as ``password``.
        :param workspaceUrl: Workspace URL passed to ``ApiClient`` as ``host``.
        """
        client = ApiClient(user,
                           password=token,
                           host=workspaceUrl,
                           verify=True,
                           command_name='Python Client')
        self.dbcli_apiclient = client
        self.dbfs_api_client = DbfsApi(client)

    def _list_init_script_dir(self, srcPath="dbfs:/databricks/init"):
        """Return the absolute DBFS paths of all entries listed under ``srcPath``."""
        print("Starting to list the legacy global init scripts folder")
        entries = self.dbfs_api_client.list_files(dbfs_path=DbfsPath(srcPath))
        absolute_paths = []
        for entry in entries:
            absolute_paths.append(entry.dbfs_path.absolute_path)
        return absolute_paths

    def _cp_legacy_gis_to_local(self,
                                srcPath="dbfs:/databricks/init",
                                destPath="./dbx_gis_v1"):
        """Recursively copy ``srcPath`` from DBFS to ``destPath``, overwriting files."""
        start_message = "Starting to copy the legacy global init scripts to path {}".format(
            destPath)
        print(start_message)
        self.dbfs_api_client.cp(recursive=True,
                                overwrite=True,
                                src=srcPath,
                                dst=destPath)
        done_message = "Copied the legacy global init scripts to path {}".format(destPath)
        print(done_message)

    def _copy_test_file(self):
        """Upload the local test script into the DBFS init script folder."""
        self.dbfs_api_client.cp(recursive=False,
                                overwrite=True,
                                src="./dbx_test_src/random.sh",
                                dst="dbfs:/databricks/init")
        print("copied test file")

    def _remove_test_file(self):
        """Delete the test script from the DBFS init script folder."""
        test_file = DbfsPath("dbfs:/databricks/init/random.sh")
        self.dbfs_api_client.delete(dbfs_path=test_file, recursive=False)
        print("removed test file")

    def _upload_init_script_as_gis_v2(self, script_name,
                                      base64_encoded_content):
        """POST a script to the ``/global-init-scripts`` endpoint.

        Only ``name`` and ``script`` are sent, so ordering and enablement are
        left to the server's defaults.

        :param script_name: Name under which the script is registered.
        :param base64_encoded_content: Script body, already base64 encoded.
        """
        payload = {"name": script_name, "script": base64_encoded_content}
        self.dbcli_apiclient.perform_query(method='POST',
                                           path='/global-init-scripts',
                                           data=payload)
        print("Script uploaded as GIS v2 - {}".format(script_name))
def prepare_for_operationalization(cluster_id, api_client, dbfs_path, overwrite, spark_version):
    """
    Installs appropriate versions of several libraries to support operationalization.

    The CosmosDB jar matching ``spark_version`` is downloaded (unless a local copy
    exists and ``overwrite`` is false), uploaded to DBFS and installed on the
    cluster together with the PyPI packages listed in ``PYPI_O16N_LIBS``.

    Args:
        cluster_id (str): cluster_id representing the cluster to prepare for operationalization
        api_client (ApiClient): the ApiClient object used to authenticate to the workspace
        dbfs_path (str): the path on dbfs to upload libraries to
        overwrite (bool): whether to overwrite existing files on dbfs with new files
            of the same name
        spark_version (str): str version indicating which version of spark is
            installed on the databricks cluster

    Returns:
        A dictionary of libraries installed
    """
    print("Preparing for operationlization...")

    jar_url = COSMOSDB_JAR_FILE_OPTIONS[spark_version]
    jar_file = os.path.basename(jar_url)

    # Skip the download only when a local copy exists and must not be replaced.
    if not overwrite and os.path.exists(jar_file):
        print("File {} already downloaded.".format(jar_file))
    else:
        print("Downloading {}...".format(jar_url))
        jar_file, _ = urlretrieve(jar_url, jar_file)

    # Upload the jar to DBFS.
    upload_path = Path(dbfs_path, jar_file).as_posix()
    print("Uploading CosmosDB driver to databricks at {}".format(upload_path))
    if dbfs_file_exists(api_client, upload_path) and overwrite:
        print("Overwriting file at {}".format(upload_path))
    dbfs_api = DbfsApi(api_client)
    dbfs_api.cp(recursive=False, src=jar_file, dst=upload_path, overwrite=overwrite)

    # The jar comes first, followed by one entry per required PyPI package.
    jar_libs = [{"jar": upload_path}]
    pypi_libs = [{"pypi": {"package": package}} for package in PYPI_O16N_LIBS]
    libs2install = jar_libs + pypi_libs

    print(
        "Installing jar and pypi libraries required for operationalization...")
    LibrariesApi(api_client).install_libraries(cluster_id, libs2install)
    return libs2install
def cp_cli(api_client, recursive, overwrite, src, dst):
    """
    Copy files to and from DBFS.

    Note that this function will fail if the src and dst are both on the local filesystem.

    For non-recursive copies, if the dst is a directory, the file will be placed inside the
    directory. For example ``dbfs cp dbfs:/apple.txt .`` will create a file at `./apple.txt`.

    For recursive copies, files inside of the src directory will be copied inside the dst directory
    with the same name. If the dst path does not exist, a directory will be created. For example
    ``dbfs cp -r dbfs:/foo foo`` will create a directory foo and place the files ``dbfs:/foo/a`` at
    ``foo/a``. If ``foo/a`` already exists, the file will not be overriden unless the --overwrite
    flag is provided -- however, dbfs cp --recursive will continue to try and copy other files.
    """
    # The copy direction is worked out inside DbfsApi.cp from the two paths.
    dbfs_api = DbfsApi(api_client)
    dbfs_api.cp(recursive, overwrite, src, dst)
def dbfs_file_exists(api_client, dbfs_path):
    """
    Checks to determine whether a file exists.

    Existence is probed by listing the path; any error raised by the listing is
    interpreted as "does not exist".

    Args:
        api_client (ApiClient object): Object used for authenticating to the workspace
        dbfs_path (str): Path to check

    Returns:
        True if file exists on dbfs, False otherwise.
    """
    try:
        DbfsApi(api_client).list_files(dbfs_path=DbfsPath(dbfs_path))
    except Exception:
        # Deliberately broad, but no longer a bare ``except``: Ctrl-C and
        # interpreter shutdown (KeyboardInterrupt / SystemExit) must propagate.
        return False
    return True
def __init__(self, api_client):
    """Create the jobs, workspace and DBFS API wrappers from one shared client.

    :param api_client: Client object handed unchanged to each wrapper.
    """
    # One wrapper per service a stack resource can target.
    self.jobs_client = JobsApi(api_client)
    self.workspace_client = WorkspaceApi(api_client)
    self.dbfs_client = DbfsApi(api_client)
def tgt_dbfs_api(tgt_api_client: ApiClient):
    """Return a ``DbfsApi`` wrapper bound to the target workspace client."""
    dbfs_api = DbfsApi(tgt_api_client)
    return dbfs_api
def mv_cli(api_client, src, dst):
    """
    Moves a file between two DBFS paths.
    """
    # Thin wrapper: all the work is done by DbfsApi.move.
    dbfs_api = DbfsApi(api_client)
    dbfs_api.move(src, dst)
upload_path = Path(args.dbfs_path, args.eggname).as_posix() # Check if file exists to alert user. print("Uploading {} to databricks at {}".format(args.eggname, upload_path)) if dbfs_file_exists(my_api_client, upload_path): if args.overwrite: print("Overwriting file at {}".format(upload_path)) else: raise IOError(""" {} already exists on databricks cluster. This is likely an older version of the library. Please use the '--overwrite' flag to proceed. """.format(upload_path)) DbfsApi(my_api_client).cp(recursive=False, src=myegg, dst=upload_path, overwrite=args.overwrite) # steps below require the cluster to be running. Check status try: status = ClusterApi(my_api_client).get_cluster(args.cluster_id) except HTTPError as e: print(e) print(textwrap.dedent(CLUSTER_NOT_FOUND_MSG.format(args.cluster_id))) raise if status["state"] == "TERMINATED": print( textwrap.dedent( CLUSTER_NOT_RUNNING_MSG.format(args.cluster_id, status["state"])))
def __init__(self, profile=None):
    """Create the DBFS and Runs API wrappers from one shared client.

    :param profile: Value forwarded unchanged to ``get_api_client``.
    """
    client = get_api_client(profile)
    self.dbfs_client = DbfsApi(client)
    self.runs_client = RunsApi(client)
class StackApi(object):
    """Deploys and downloads "stacks": sets of job, workspace and DBFS resources
    described by a JSON configuration template."""

    def __init__(self, api_client):
        """Create the jobs, workspace and DBFS API wrappers from one shared client.

        :param api_client: Client object handed unchanged to each wrapper.
        """
        self.jobs_client = JobsApi(api_client)
        self.workspace_client = WorkspaceApi(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def deploy(self, config_path, **kwargs):
        """
        Deploys a stack given stack JSON configuration template at path config_path.

        Loads the JSON template as well as status JSON if stack has been deployed before.

        The working directory is changed to that where the JSON template is contained
        so that paths within the stack configuration are relative to the directory of the
        JSON template instead of the directory where this function is called. The original
        working directory is restored afterwards, even if the deployment fails.

        :param config_path: Path to stack JSON configuration template. Must have the fields of
            'name', the name of the stack and 'resources', a list of stack resources.
        :return: None.
        """
        stack_config = self._load_json(config_path)
        status_path = self._generate_stack_status_path(config_path)
        stack_status = self._load_json(status_path)
        config_dir = os.path.dirname(os.path.abspath(config_path))
        cli_dir = os.getcwd()
        os.chdir(config_dir)  # Switch current working directory to where json config is stored
        try:
            new_stack_status = self.deploy_config(stack_config, stack_status, **kwargs)
        finally:
            # Never leave the process stranded in the config directory.
            os.chdir(cli_dir)
        click.echo("Saving stack status to {}".format(status_path))
        self._save_json(status_path, new_stack_status)

    def download(self, config_path, **kwargs):
        """
        Downloads a stack given stack JSON configuration template at path config_path.

        The working directory is changed to that where the JSON template is contained
        so that paths within the stack configuration are relative to the directory of the
        JSON template instead of the directory where this function is called. The original
        working directory is restored afterwards, even if the download fails.

        :param config_path: Path to stack JSON configuration template. Must have the fields of
            'name', the name of the stack and 'resources', a list of stack resources.
        :return: None.
        """
        stack_config = self._load_json(config_path)
        config_dir = os.path.dirname(os.path.abspath(config_path))
        cli_dir = os.getcwd()
        os.chdir(config_dir)  # Switch current working directory to where json config is stored
        try:
            self.download_from_config(stack_config, **kwargs)
        finally:
            os.chdir(cli_dir)

    def deploy_config(self, stack_config, stack_status=None, **kwargs):
        """
        Deploys a stack given a dict of the stack configuration.

        After going through each of the resources and deploying them, returns the status
        of the deployment with the deploy status of each resource. For each resource
        deployment, stack_status is used to get the associated resource status of a
        resource from the last deployment.

        :param stack_config: Must have the fields of 'name', the name of the stack and
            'resources', a list of stack resources.
        :param stack_status: Status dict of a previous deployment, or a falsy value when
            there is none. When given it is checked by ``_validate_status``.
        :return: dict- copy of stack_config extended with the list of resource statuses
            and the CLI version.
        """
        click.echo('#' * 80)
        self._validate_config(stack_config)
        if stack_status:
            click.echo('#' * 80)
            self._validate_status(stack_status)
            resource_id_to_status = self._get_resource_to_status_map(stack_status)
        else:
            resource_id_to_status = {}

        stack_name = stack_config.get(STACK_NAME)
        click.echo('#' * 80)
        click.echo('Deploying stack {}'.format(stack_name))

        # List of statuses, One for each resource in stack_config[STACK_RESOURCES]
        resource_statuses = []
        click.echo('#' * 80)
        for resource_config in stack_config.get(STACK_RESOURCES):
            # Retrieve resource deployment info from the last deployment
            # (None when the resource has not been deployed before).
            resource_map_key = (resource_config.get(RESOURCE_ID),
                                resource_config.get(RESOURCE_SERVICE))
            resource_status = resource_id_to_status.get(resource_map_key)
            # Deploy resource, get resource_status
            new_resource_status = self._deploy_resource(resource_config, resource_status,
                                                        **kwargs)
            resource_statuses.append(new_resource_status)
            click.echo('#' * 80)

        # stack deploy status is original config with deployed resource statuses added
        new_stack_status = copy.deepcopy(stack_config)
        new_stack_status.update({STACK_DEPLOYED: resource_statuses})
        new_stack_status.update({CLI_VERSION_KEY: CLI_VERSION})

        # Validate that the status has been created correctly
        self._validate_status(new_stack_status)
        click.echo('#' * 80)

        return new_stack_status

    def download_from_config(self, stack_config, **kwargs):
        """
        Downloads a stack given a dict of the stack configuration.

        :param stack_config: dict of stack configuration. Must contain 'name' and 'resources'
            field.
        :return: None.
        """
        self._validate_config(stack_config)
        stack_name = stack_config.get(STACK_NAME)
        click.echo('Downloading stack {}'.format(stack_name))

        click.echo('#' * 80)
        for resource_config in stack_config.get(STACK_RESOURCES):
            self._download_resource(resource_config, **kwargs)
            click.echo('#' * 80)

    def _deploy_resource(self, resource_config, resource_status=None, **kwargs):
        """
        Deploys a resource given a resource information extracted from the stack JSON
        configuration template.

        :param resource_config: A dict of the resource with fields of 'id', 'service' and
            'properties'.
            ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
        :param resource_status: A dict of the resource's deployment info from the last
            deployment. Will be None if this is the first deployment.
            ex. {'id': 'example-resource', 'service': 'jobs', 'physical_id': {...}}
        :return: dict resource_status- A dictionary of deployment information of the
            resource to be stored at deploy time. It includes the resource id of the resource
            along with the physical id and deploy output of the resource.
            ex. {'id': 'example-resource', 'service': 'jobs', 'physical_id': {'job_id': 123},
            'timestamp': 123456789, 'deploy_output': {..}}
        :raises StackError: If the resource service is not supported.
        """
        resource_id = resource_config.get(RESOURCE_ID)
        resource_service = resource_config.get(RESOURCE_SERVICE)
        resource_properties = resource_config.get(RESOURCE_PROPERTIES)
        physical_id = resource_status.get(RESOURCE_PHYSICAL_ID) if resource_status else None

        if resource_service == JOBS_SERVICE:
            click.echo("Deploying job '{}' with properties: \n{}".format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            new_physical_id, deploy_output = self._deploy_job(resource_properties,
                                                              physical_id)
        elif resource_service == WORKSPACE_SERVICE:
            click.echo("Deploying workspace asset '{}' with properties \n{}".format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            new_physical_id, deploy_output = self._deploy_workspace(resource_properties,
                                                                    physical_id,
                                                                    overwrite)
        elif resource_service == DBFS_SERVICE:
            click.echo("Deploying DBFS asset '{}' with properties \n{}".format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            new_physical_id, deploy_output = self._deploy_dbfs(resource_properties,
                                                               physical_id,
                                                               overwrite)
        else:
            raise StackError("Resource service '{}' not supported".format(resource_service))

        # Milliseconds since epoch.
        deploy_timestamp = int(time.mktime(datetime.now().timetuple()) * MS_SEC)
        new_resource_status = {RESOURCE_ID: resource_id,
                               RESOURCE_SERVICE: resource_service,
                               RESOURCE_DEPLOY_TIMESTAMP: deploy_timestamp,
                               RESOURCE_PHYSICAL_ID: new_physical_id,
                               RESOURCE_DEPLOY_OUTPUT: deploy_output}
        return new_resource_status

    def _download_resource(self, resource_config, **kwargs):
        """
        Downloads a resource given a resource information extracted from the stack JSON
        configuration template. Only workspace resources are downloaded; every other
        service is reported and skipped.

        :param resource_config: A dict of the resource with fields of 'id', 'service' and
            'properties'.
            ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
        """
        resource_id = resource_config.get(RESOURCE_ID)
        resource_service = resource_config.get(RESOURCE_SERVICE)
        resource_properties = resource_config.get(RESOURCE_PROPERTIES)

        if resource_service == WORKSPACE_SERVICE:
            click.echo("Downloading workspace asset '{}' with properties \n{}".format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            self._download_workspace(resource_properties, overwrite)
        else:
            click.echo("Resource service '{}' not supported for download. "
                       "skipping.".format(resource_service))

    def _deploy_job(self, resource_properties, physical_id=None):
        """
        Deploys a job resource by either creating a job if the job isn't kept track of through
        the physical_id of the job or updating an existing job. The job is created or updated
        using the the settings specified in the inputted job_settings.

        :param resource_properties: A dict of the Databricks JobSettings data structure
        :param physical_id: A dict object containing 'job_id' field of job identifier in
            Databricks server
        :return: tuple of (physical_id, deploy_output), where physical_id contains a 'job_id'
            field of the physical job_id of the job on databricks. deploy_output is the output
            of the job from databricks when a GET request is called for it.
        """
        job_settings = resource_properties  # resource_properties of jobs are solely job settings.

        if physical_id:
            job_id = physical_id.get(JOBS_RESOURCE_JOB_ID)
            self._update_job(job_settings, job_id)
        else:
            job_id = self._put_job(job_settings)
        click.echo("Job deployed on Databricks with Job ID {}".format(job_id))
        physical_id = {JOBS_RESOURCE_JOB_ID: job_id}
        deploy_output = self.jobs_client.get_job(job_id)
        return physical_id, deploy_output

    def _put_job(self, job_settings):
        """
        Given settings of the job in job_settings, create a new job. For purposes of idempotency
        and to reduce leaked resources in alpha versions of stack deployment, if a job exists
        with the same name, that job will be updated. If multiple jobs are found with the same
        name, the deployment will abort.

        :param job_settings: dict of job settings; its name field is used for the lookup.
        :return: job_id, Physical ID of job on Databricks server.
        :raises StackError: If more than one job with the same name already exists.
        """
        job_name = job_settings.get(JOBS_RESOURCE_NAME)
        jobs_same_name = self.jobs_client._list_jobs_by_name(job_name)
        if len(jobs_same_name) > 1:
            raise StackError("Multiple jobs with the same name '{}' already exist, aborting"
                             " stack deployment".format(job_name))
        elif len(jobs_same_name) == 1:
            existing_job = jobs_same_name[0]
            creator_name = existing_job.get('creator_user_name')
            timestamp = existing_job.get('created_time') / MS_SEC  # Convert to readable date.
            date_created = datetime.fromtimestamp(timestamp).strftime('%Y-%m-%d %H:%M:%S')
            click.echo("Warning: Job exists with same name '{}' created by {} on {}. Job will "
                       "be overwritten".format(job_name, creator_name, date_created))
            # Calling jobs_client.reset_job directly so as to not call same level function.
            self.jobs_client.reset_job({'job_id': existing_job.get('job_id'),
                                        'new_settings': job_settings})
            return existing_job.get('job_id')
        else:
            job_id = self.jobs_client.create_job(job_settings).get('job_id')
            return job_id

    def _update_job(self, job_settings, job_id):
        """
        Given job settings and an existing job_id of a job, update the job settings on
        databricks.

        :param job_settings: job settings to update the job with.
        :param job_id: physical job_id of job in databricks server.
        """
        self.jobs_client.reset_job({'job_id': job_id, 'new_settings': job_settings})

    def _deploy_workspace(self, resource_properties, physical_id, overwrite):
        """
        Deploy workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
            'source_path', 'path' and 'object_type' fields.
        :param physical_id: dict containing physical identifier of workspace asset on
            databricks. Should contain the field 'path'.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :return: (dict, dict) of (physical_id, deploy_output). physical_id is the physical ID
            for the stack status that contains the workspace path of the notebook or directory
            on datbricks. deploy_output is the initial information about the asset on
            databricks at deploy time returned by the REST API.
        :raises StackError: If 'object_type' does not match the local source, or if the
            notebook language and format cannot be inferred from the file name.
        """
        local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)

        actual_object_type = DIRECTORY if os.path.isdir(local_path) else NOTEBOOK
        if object_type != actual_object_type:
            raise StackError("Field '{}' ({}) not consistent "
                             "with actual object type ({})"
                             .format(WORKSPACE_RESOURCE_OBJECT_TYPE, object_type,
                                     actual_object_type))

        click.echo('Uploading {} from {} to Databricks workspace at {}'.format(
            object_type, local_path, workspace_path))
        if object_type == NOTEBOOK:
            # Inference of notebook language and format
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                raise StackError("Workspace notebook language and format cannot be inferred. "
                                 "Please check file extension of notebook file.")
            language, fmt = language_fmt
            # Create needed directories in workspace.
            self.workspace_client.mkdirs(os.path.dirname(workspace_path))
            self.workspace_client.import_workspace(local_path, workspace_path, language, fmt,
                                                   overwrite)
        elif object_type == DIRECTORY:
            self.workspace_client.import_workspace_dir(local_path, workspace_path, overwrite,
                                                       exclude_hidden_files=True)
        else:
            # Shouldn't reach here because of verification of object_type above; raise
            # instead of assert so the guard survives ``python -O``.
            raise StackError("Invalid value for '{}' field: {}"
                             .format(WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))

        if physical_id and physical_id[WORKSPACE_RESOURCE_PATH] != workspace_path:
            # physical_id['path'] is the workspace path from the last deployment. Alert when
            # changed
            click.echo("Workspace asset had path changed from {} to {}"
                       .format(physical_id[WORKSPACE_RESOURCE_PATH], workspace_path))
        new_physical_id = {WORKSPACE_RESOURCE_PATH: workspace_path}
        deploy_output = self.workspace_client.client.get_status(workspace_path)

        return new_physical_id, deploy_output

    def _download_workspace(self, resource_properties, overwrite):
        """
        Download workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
            'source_path', 'path' and 'object_type' fields.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :raises StackError: If 'object_type' is neither a notebook nor a directory, or if the
            notebook format cannot be inferred from 'source_path'.
        """
        local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)
        click.echo('Downloading {} from Databricks path {} to {}'.format(
            object_type, workspace_path, local_path))
        if object_type == NOTEBOOK:
            # Inference of notebook language and format. A tuple of (language, fmt) or Nonetype.
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                raise StackError("Workspace Notebook language and format cannot be inferred. "
                                 "Please check file extension of notebook 'source_path'.")
            (_, fmt) = language_fmt
            local_dir = os.path.dirname(os.path.abspath(local_path))
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            self.workspace_client.export_workspace(workspace_path, local_path, fmt, overwrite)
        elif object_type == DIRECTORY:
            self.workspace_client.export_workspace_dir(workspace_path, local_path, overwrite)
        else:
            raise StackError("Invalid value for '{}' field: {}"
                             .format(WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))

    def _deploy_dbfs(self, resource_properties, physical_id, overwrite):
        """
        Deploy dbfs asset.

        :param resource_properties: dict of properties for the dbfs asset. Must contain the
            'source_path', 'path' and 'is_dir' fields.
        :param physical_id: dict containing physical identifier of dbfs asset on Databricks.
            Should contain the field 'path'.
        :param overwrite: Whether or not to overwrite the contents of dbfs files.
        :return: (dict, dict) of (physical_id, deploy_output). physical_id is a dict that
            contains the dbfs path of the file on Databricks.
            ex.{"path":"dbfs:/path/in/dbfs"}
            deploy_output is the initial information about the dbfs asset at deploy time
            returned by the REST API.
        :raises StackError: If 'is_dir' does not match what 'source_path' actually is.
        """
        local_path = resource_properties.get(DBFS_RESOURCE_SOURCE_PATH)
        dbfs_path = resource_properties.get(DBFS_RESOURCE_PATH)
        is_dir = resource_properties.get(DBFS_RESOURCE_IS_DIR)

        if is_dir != os.path.isdir(local_path):
            dir_or_file = 'directory' if os.path.isdir(local_path) else 'file'
            raise StackError("local source_path '{}' is found to be a {}, but is not specified"
                             " as one with is_dir: {}."
                             .format(local_path, dir_or_file, str(is_dir).lower()))
        if is_dir:
            click.echo('Uploading directory from {} to DBFS at {}'.format(local_path,
                                                                          dbfs_path))
            self.dbfs_client.cp(recursive=True, overwrite=overwrite, src=local_path,
                                dst=dbfs_path)
        else:
            click.echo('Uploading file from {} to DBFS at {}'.format(local_path, dbfs_path))
            self.dbfs_client.cp(recursive=False, overwrite=overwrite, src=local_path,
                                dst=dbfs_path)

        if physical_id and physical_id[DBFS_RESOURCE_PATH] != dbfs_path:
            # physical_id['path'] is the dbfs path from the last deployment. Alert when changed
            click.echo("Dbfs asset had path changed from {} to {}"
                       .format(physical_id[DBFS_RESOURCE_PATH], dbfs_path))
        new_physical_id = {DBFS_RESOURCE_PATH: dbfs_path}
        deploy_output = self.dbfs_client.client.get_status(dbfs_path)

        return new_physical_id, deploy_output

    def _validate_config(self, stack_config):
        """
        Validate fields within a stack configuration. This ensures that an inputted
        configuration has the necessary fields for stack deployment to function well.

        :param stack_config: dict- stack config that is inputted by the user.
        :return: None. Raises errors to stop deployment if there is a problem.
        """
        click.echo('Validating fields in stack configuration...')
        self._assert_fields_in_dict([STACK_NAME, STACK_RESOURCES], stack_config)

        seen_resource_ids = set()  # Store seen resources to restrict duplicates.
        for resource in stack_config.get(STACK_RESOURCES):
            # Validate that the resource ID exists, then get it.
            self._assert_fields_in_dict([RESOURCE_ID], resource)
            resource_id = resource.get(RESOURCE_ID)

            click.echo('Validating fields in resource with ID "{}"'.format(resource_id))
            self._assert_fields_in_dict([RESOURCE_SERVICE, RESOURCE_PROPERTIES], resource)

            resource_service = resource.get(RESOURCE_SERVICE)
            resource_properties = resource.get(RESOURCE_PROPERTIES)

            # Error on duplicate resource ID's
            if resource_id in seen_resource_ids:
                raise StackError('Duplicate resource ID "{}" found, please resolve.'.format(
                    resource_id))
            seen_resource_ids.add(resource_id)

            # Resource service-specific validations
            click.echo('Validating fields in "{}" of {} resource.'
                       .format(RESOURCE_PROPERTIES, resource_service))
            if resource_service == JOBS_SERVICE:
                self._assert_fields_in_dict([JOBS_RESOURCE_NAME], resource_properties)
            elif resource_service == WORKSPACE_SERVICE:
                self._assert_fields_in_dict(
                    [WORKSPACE_RESOURCE_PATH, WORKSPACE_RESOURCE_SOURCE_PATH,
                     WORKSPACE_RESOURCE_OBJECT_TYPE], resource_properties)
            elif resource_service == DBFS_SERVICE:
                self._assert_fields_in_dict(
                    [DBFS_RESOURCE_PATH, DBFS_RESOURCE_SOURCE_PATH,
                     DBFS_RESOURCE_IS_DIR], resource_properties)
            else:
                raise StackError("Resource service '{}' not supported".format(resource_service))

    def _validate_status(self, stack_status):
        """
        Validate fields within a stack status. This ensures that a stack status has the
        necessary fields for stack deployment to function well.

        If there is an error here, then it is either an implementation error that must be fixed
        by a developer or the User edited the stack status file created by the program.

        :param stack_status: dict- stack status that is created by the program.
        :return: None. Raises errors to stop deployment if there is a problem.
        """
        click.echo('Validating fields in stack status...')
        self._assert_fields_in_dict([STACK_NAME, STACK_RESOURCES, STACK_DEPLOYED], stack_status)

        for resource_status in stack_status.get(STACK_DEPLOYED):
            self._assert_fields_in_dict([RESOURCE_ID], resource_status)
            resource_id = resource_status.get(RESOURCE_ID)
            click.echo('Validating fields in resource status of resource with ID "{}"'
                       .format(resource_id))
            self._assert_fields_in_dict([RESOURCE_SERVICE, RESOURCE_PHYSICAL_ID,
                                         RESOURCE_DEPLOY_OUTPUT], resource_status)

            resource_service = resource_status.get(RESOURCE_SERVICE)
            resource_physical_id = resource_status.get(RESOURCE_PHYSICAL_ID)

            click.echo('Validating fields in "{}" of {} resource status'
                       .format(RESOURCE_PHYSICAL_ID, resource_service))
            if resource_service == JOBS_SERVICE:
                self._assert_fields_in_dict([JOBS_RESOURCE_JOB_ID], resource_physical_id)
            elif resource_service == WORKSPACE_SERVICE:
                self._assert_fields_in_dict([WORKSPACE_RESOURCE_PATH], resource_physical_id)
            elif resource_service == DBFS_SERVICE:
                self._assert_fields_in_dict([DBFS_RESOURCE_PATH], resource_physical_id)
            else:
                raise StackError("{} not a valid resource status service".format(
                    resource_service))

    def _assert_fields_in_dict(self, fields, dictionary):
        """
        Raise StackError for the first entry of fields that is not a key of dictionary.

        :param fields: iterable of required keys.
        :param dictionary: dict (or other container) to check the keys against.
        :return: None.
        """
        for field in fields:
            if field not in dictionary:
                raise StackError('Required field "{}" not found'.format(field))

    def _get_resource_to_status_map(self, stack_status):
        """
        Returns a dictionary that maps a resource's (id, service) to the resource's status
        from the last deployment

        The key for this dictionary is the resource's (id, service) so that we don't load
        persisted resources with the wrong resource service.
        """
        return {
            (resource_status.get(RESOURCE_ID), resource_status.get(RESOURCE_SERVICE)):
                resource_status
            for resource_status in stack_status.get(STACK_DEPLOYED)
        }

    def _generate_stack_status_path(self, stack_path):
        """
        Given a path to the stack configuration template JSON file, generates a path to where
        the deployment status JSON will be stored after successful deployment of the stack.

        'deployed' is inserted in front of the file extension. Only the extension of the
        final path component is considered, so dots in directory names (including the
        leading './') are left alone and a path without extension simply gets
        '.deployed' appended.

        :param stack_path: Path to the stack config template JSON file
        :return: The path to the stack status file.

        >>> self._generate_stack_status_path('./stack.json')
        './stack.deployed.json'
        """
        stack_status_insert = 'deployed'
        root, extension = os.path.splitext(stack_path)
        return '{}.{}{}'.format(root, stack_status_insert, extension)

    def _load_json(self, path):
        """
        Parse a json file to a readable dict format.
        Returns an empty dictionary if the path doesn't exist.

        :param path: File path of the JSON stack configuration template.
        :return: dict of parsed JSON stack config template.
        """
        stack_conf = {}
        if os.path.exists(path):
            with open(path, 'r') as f:
                stack_conf = json.load(f)
        return stack_conf

    def _save_json(self, path, data):
        """
        Writes data to a JSON file.

        :param path: Path of JSON file.
        :param data: dict- data that wants to by written to JSON file
        :return: None
        """
        with open(path, 'w') as f:
            json.dump(data, f, indent=2, sort_keys=True)
def src_dbfs_api(src_api_client: ApiClient):
    """
    Build a ``DbfsApi`` wrapper around the given source API client.

    :param src_api_client: client object handed unchanged to the ``DbfsApi`` constructor.
    :return: the newly constructed ``DbfsApi`` instance.
    """
    dbfs_api = DbfsApi(src_api_client)
    return dbfs_api
class StackApi(object):
    """
    Deploys and downloads "stacks": collections of job, workspace and DBFS resources
    described by a stack configuration dict.
    """

    def __init__(self, api_client):
        """
        Create the per-service API wrappers used by the stack operations.

        :param api_client: client object passed unchanged to JobsApi, WorkspaceApi and DbfsApi.
        """
        self.jobs_client = JobsApi(api_client)
        self.workspace_client = WorkspaceApi(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def deploy(self, stack_config, stack_status=None, headers=None, **kwargs):
        """
        Deploys a stack given a stack configuration dict. After going through each of the
        resources and deploying them, builds the status dict of the deployment with the deploy
        status of each resource. For each resource deployment, stack_status is used to get the
        associated resource status of a resource from the last deployment.

        :param stack_config: Must have the STACK_NAME field (the name of the stack) and the
        STACK_RESOURCES field (a list of stack resources).
        :param stack_status: Optional status from a previous deployment. When given, it must
        have the STACK_NAME field and the STACK_DEPLOYED field (a list of resource statuses);
        these are the fields checked by _validate_status.
        :param headers: Passed through to every underlying API call.
        :param kwargs: Passed through to _deploy_resource (which reads 'overwrite').
        :return: new_stack_status: The new stack status generated from the deployment of
        the given stack_config. Holds STACK_NAME, CLI_VERSION_KEY and STACK_DEPLOYED.
        """
        click.echo('#' * 80)
        self._validate_config(stack_config)
        if stack_status:
            click.echo('#' * 80)
            self._validate_status(stack_status)
            resource_id_to_status = self._get_resource_to_status_map(
                stack_status)
        else:
            resource_id_to_status = {}

        stack_name = stack_config.get(STACK_NAME)
        click.echo('#' * 80)
        click.echo('Deploying stack {}'.format(stack_name))

        # List of statuses, one for each resource in stack_config[STACK_RESOURCES]
        resource_statuses = []
        click.echo('#' * 80)
        for resource_config in stack_config.get(STACK_RESOURCES):
            # Retrieve resource deployment info from the last deployment.
            # dict.get already yields None for an unknown key, so no membership test is needed.
            resource_map_key = (resource_config.get(RESOURCE_ID),
                                resource_config.get(RESOURCE_SERVICE))
            resource_status = resource_id_to_status.get(resource_map_key)
            # Deploy resource, get resource_status
            new_resource_status = self._deploy_resource(resource_config,
                                                        resource_status,
                                                        headers=headers,
                                                        **kwargs)
            # A resource can opt out of being recorded in the status via RESOURCE_WRITE_STATUS.
            if resource_config.get(RESOURCE_WRITE_STATUS, True):
                resource_statuses.append(new_resource_status)
            click.echo('#' * 80)

        new_stack_status = {
            STACK_NAME: stack_name,
            CLI_VERSION_KEY: CLI_VERSION,
            STACK_DEPLOYED: resource_statuses
        }
        # Validate that the status has been created correctly
        self._validate_status(new_stack_status)
        click.echo('#' * 80)
        return new_stack_status

    def download(self, stack_config, headers=None, **kwargs):
        """
        Downloads a stack given a dict of the stack configuration.

        :param stack_config: dict of stack configuration. Must contain the STACK_NAME and
        STACK_RESOURCES fields.
        :param headers: Passed through to every underlying API call.
        :param kwargs: Passed through to _download_resource (which reads 'overwrite').
        :return: None.
        """
        self._validate_config(stack_config)
        stack_name = stack_config.get(STACK_NAME)
        click.echo('Downloading stack {}'.format(stack_name))
        click.echo('#' * 80)
        for resource_config in stack_config.get(STACK_RESOURCES):
            # Download the resource; unsupported services are skipped by _download_resource.
            self._download_resource(resource_config, headers=headers, **kwargs)
            click.echo('#' * 80)

    def _deploy_resource(self, resource_config, resource_status=None, headers=None, **kwargs):
        """
        Deploys a resource given a resource information extracted from the stack JSON
        configuration template.

        :param resource_config: A dict of the resource with fields of 'id', 'service' and
        'properties'.
        ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
        :param resource_status: A dict of the resource's deployment info from the last
        deployment. Will be None if this is the first deployment.
        ex. {'id': 'example-resource', 'service': 'jobs', 'databricks_id': {...}}
        :param headers: Passed through to the service-specific deploy method.
        :param kwargs: Only 'overwrite' is read (default False), for workspace and DBFS
        resources.
        :return: dict resource_status- A dictionary of deployment information of the
        resource to be stored at deploy time. It holds the resource id, the resource service
        and the databricks id of the resource.
        ex. {'id': 'example-resource', 'service': 'jobs', 'databricks_id': {'job_id': 123}}
        :raises StackError: if the resource service is not jobs, workspace or dbfs.
        """
        resource_id = resource_config.get(RESOURCE_ID)
        resource_service = resource_config.get(RESOURCE_SERVICE)
        resource_properties = resource_config.get(RESOURCE_PROPERTIES)
        databricks_id = resource_status.get(
            RESOURCE_DATABRICKS_ID) if resource_status else None

        if resource_service == JOBS_SERVICE:
            click.echo('Deploying job "{}" with properties: \n{}'.format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            new_databricks_id = self._deploy_job(resource_properties,
                                                 databricks_id,
                                                 headers=headers)
        elif resource_service == WORKSPACE_SERVICE:
            click.echo(
                'Deploying workspace asset "{}" with properties \n{}'.format(
                    resource_id,
                    json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            new_databricks_id = self._deploy_workspace(resource_properties,
                                                       databricks_id,
                                                       overwrite,
                                                       headers=headers)
        elif resource_service == DBFS_SERVICE:
            click.echo('Deploying DBFS asset "{}" with properties \n{}'.format(
                resource_id,
                json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            new_databricks_id = self._deploy_dbfs(resource_properties,
                                                  databricks_id,
                                                  overwrite,
                                                  headers=headers)
        else:
            raise StackError(
                'Resource service "{}" not supported'.format(resource_service))

        new_resource_status = {
            RESOURCE_ID: resource_id,
            RESOURCE_SERVICE: resource_service,
            RESOURCE_DATABRICKS_ID: new_databricks_id
        }
        return new_resource_status

    def _download_resource(self, resource_config, headers=None, **kwargs):
        """
        Downloads a resource given a resource information extracted from the stack JSON
        configuration template. Only workspace resources are downloaded; any other service
        is reported and skipped.

        :param resource_config: A dict of the resource with fields of 'id', 'service' and
        'properties'.
        ex. {'id': 'example-resource', 'service': 'jobs', 'properties': {...}}
        :param headers: Passed through to the workspace download.
        :param kwargs: Only 'overwrite' is read (default False).
        """
        resource_id = resource_config.get(RESOURCE_ID)
        resource_service = resource_config.get(RESOURCE_SERVICE)
        resource_properties = resource_config.get(RESOURCE_PROPERTIES)

        if resource_service == WORKSPACE_SERVICE:
            click.echo(
                'Downloading workspace asset "{}" with properties \n{}'.format(
                    resource_id,
                    json.dumps(resource_properties, indent=2, separators=(',', ': '))))
            overwrite = kwargs.get('overwrite', False)
            self._download_workspace(resource_properties, overwrite, headers=headers)
        else:
            click.echo('Resource service "{}" not supported for download. '
                       'skipping.'.format(resource_service))

    def _deploy_job(self, resource_properties, databricks_id=None, headers=None):
        """
        Deploys a job resource by either creating a job if the job isn't kept track of through
        the databricks_id of the job or updating an existing job. The job is created or updated
        using the settings specified in the inputted job_settings.

        :param resource_properties: A dict of the Databricks JobSettings data structure
        :param databricks_id: A dict object containing 'job_id' field of job identifier in
        Databricks server
        :param headers: Passed through to the jobs API calls.
        :return: databricks_id: dict containing a 'job_id' field of the physical job_id of
        the job on databricks.
        """
        job_settings = resource_properties  # resource_properties of jobs are solely job settings.

        if databricks_id:
            job_id = databricks_id.get(JOBS_RESOURCE_JOB_ID)
            self._update_job(job_settings, job_id, headers=headers)
        else:
            job_id = self._put_job(job_settings, headers=headers)
        click.echo("Job deployed on Databricks with Job ID {}".format(job_id))
        databricks_id = {JOBS_RESOURCE_JOB_ID: job_id}
        return databricks_id

    def _put_job(self, job_settings, headers=None):
        """
        Given settings of the job in job_settings, create a new job. For purposes of idempotency
        and to reduce leaked resources in alpha versions of stack deployment, if a job exists
        with the same name, that job will be updated. If multiple jobs are found with the same
        name, the deployment will abort.

        :param job_settings: dict of job settings; its JOBS_RESOURCE_NAME field is used to
        look for existing jobs with the same name.
        :param headers: Passed through to the jobs API calls.
        :return: job_id, Physical ID of job on Databricks server.
        :raises StackError: if more than one job with the same name already exists.
        """
        job_name = job_settings.get(JOBS_RESOURCE_NAME)
        jobs_same_name = self.jobs_client._list_jobs_by_name(job_name, headers=headers)
        if len(jobs_same_name) > 1:
            raise StackError(
                'Multiple jobs with the same name "{}" already exist, aborting'
                ' stack deployment'.format(job_name))
        if len(jobs_same_name) == 1:
            existing_job = jobs_same_name[0]
            creator_name = existing_job.get('creator_user_name')
            timestamp = existing_job.get(
                'created_time') / MS_SEC  # Convert to readable date.
            date_created = datetime.fromtimestamp(timestamp).strftime(
                '%Y-%m-%d %H:%M:%S')
            click.echo(
                'Warning: Job exists with same name "{}" created by {} on {}. Job will '
                'be overwritten'.format(job_name, creator_name, date_created))
            # Calling jobs_client.reset_job directly so as to not call same level function.
            self.jobs_client.reset_job(
                {
                    'job_id': existing_job.get('job_id'),
                    'new_settings': job_settings
                },
                headers=headers)
            return existing_job.get('job_id')
        else:
            job_id = self.jobs_client.create_job(job_settings,
                                                 headers=headers).get('job_id')
            return job_id

    def _update_job(self, job_settings, job_id, headers=None):
        """
        Given job settings and an existing job_id of a job, update the job settings on
        databricks.

        :param job_settings: job settings to update the job with.
        :param job_id: physical job_id of job in databricks server.
        :param headers: Passed through to the jobs API call.
        :raises StackError: if the reset call fails with an HTTPError.
        """
        try:
            self.jobs_client.reset_job(
                {
                    'job_id': job_id,
                    'new_settings': job_settings
                },
                headers=headers)
        except HTTPError:
            raise StackError(
                'Job ID {} in stack status could not be found in the workspace. '
                'Please remove or make necessary changes to the current stack status '
                'to resolve this inconsistency before proceeding. Aborting '
                'stack deployment ...'.format(job_id))

    def _deploy_workspace(self, resource_properties, databricks_id, overwrite, headers=None):
        """
        Deploy workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
        'source_path', 'path' and 'object_type' fields.
        :param databricks_id: dict containing physical identifier of workspace asset on
        databricks. Should contain the field 'path'.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :param headers: Passed through to the workspace API calls.
        :return: databricks_id: dict containing the physical ID for the stack status, i.e. the
        workspace path of the notebook or directory on databricks.
        :raises StackError: if 'object_type' disagrees with what is found at the local path, or
        if the notebook language/format cannot be inferred from the local path.
        """
        local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)

        actual_object_type = DIRECTORY if os.path.isdir(
            local_path) else NOTEBOOK
        if object_type != actual_object_type:
            raise StackError('Field "{}" ({}) not consistent '
                             'with actual object type ({})'.format(
                                 WORKSPACE_RESOURCE_OBJECT_TYPE, object_type,
                                 actual_object_type))

        click.echo('Uploading {} from {} to Databricks workspace at {}'.format(
            object_type, local_path, workspace_path))
        if object_type == NOTEBOOK:
            # Inference of notebook language and format
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                raise StackError(
                    "Workspace notebook language and format cannot be inferred. "
                    "Please check file extension of notebook file.")
            language, fmt = language_fmt
            # Create needed directories in workspace.
            self.workspace_client.mkdirs(os.path.dirname(workspace_path),
                                         headers=headers)
            self.workspace_client.import_workspace(local_path,
                                                   workspace_path,
                                                   language,
                                                   fmt,
                                                   overwrite,
                                                   headers=headers)
        elif object_type == DIRECTORY:
            self.workspace_client.import_workspace_dir(
                local_path,
                workspace_path,
                overwrite,
                exclude_hidden_files=True,
                headers=headers)
        else:
            # Shouldn't reach here because of verification of object_type above. Raise
            # explicitly rather than `assert False`: asserts are stripped under `python -O`,
            # which would let execution fall through silently.
            raise StackError('Invalid value for "{}" field: {}'.format(
                WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))

        if databricks_id and databricks_id[
                WORKSPACE_RESOURCE_PATH] != workspace_path:
            # databricks_id['path'] is the workspace path from the last deployment. Alert when
            # changed
            click.echo("Workspace asset had path changed from {} to {}".format(
                databricks_id[WORKSPACE_RESOURCE_PATH], workspace_path))
        new_databricks_id = {WORKSPACE_RESOURCE_PATH: workspace_path}
        return new_databricks_id

    def _download_workspace(self, resource_properties, overwrite, headers=None):
        """
        Download workspace asset.

        :param resource_properties: dict of properties for the workspace asset. Must contain the
        'source_path', 'path' and 'object_type' fields.
        :param overwrite: Whether or not to overwrite the contents of workspace notebooks.
        :param headers: Passed through to the workspace API calls.
        :raises StackError: if the notebook language/format cannot be inferred from
        'source_path', or if 'object_type' is neither a notebook nor a directory.
        """
        local_path = resource_properties.get(WORKSPACE_RESOURCE_SOURCE_PATH)
        workspace_path = resource_properties.get(WORKSPACE_RESOURCE_PATH)
        object_type = resource_properties.get(WORKSPACE_RESOURCE_OBJECT_TYPE)
        click.echo('Downloading {} from Databricks path {} to {}'.format(
            object_type, workspace_path, local_path))
        if object_type == NOTEBOOK:
            # Inference of notebook language and format. A tuple of (language, fmt) or Nonetype.
            language_fmt = WorkspaceLanguage.to_language_and_format(local_path)
            if language_fmt is None:
                raise StackError(
                    "Workspace Notebook language and format cannot be inferred. "
                    "Please check file extension of notebook 'source_path'.")
            (_, fmt) = language_fmt
            # Make sure the local parent directory exists before exporting into it.
            local_dir = os.path.dirname(os.path.abspath(local_path))
            if not os.path.exists(local_dir):
                os.makedirs(local_dir)
            self.workspace_client.export_workspace(workspace_path,
                                                   local_path,
                                                   fmt,
                                                   overwrite,
                                                   headers=headers)
        elif object_type == DIRECTORY:
            self.workspace_client.export_workspace_dir(workspace_path,
                                                       local_path,
                                                       overwrite,
                                                       headers=headers)
        else:
            raise StackError('Invalid value for "{}" field: {}'.format(
                WORKSPACE_RESOURCE_OBJECT_TYPE, object_type))

    def _deploy_dbfs(self, resource_properties, databricks_id, overwrite, headers=None):
        """
        Deploy dbfs asset.

        :param resource_properties: dict of properties for the dbfs asset. Must contain the
        'source_path', 'path' and 'is_dir' fields.
        :param databricks_id: dict containing physical identifier of dbfs asset on Databricks.
        Should contain the field 'path'.
        :param overwrite: Whether or not to overwrite the contents of dbfs files.
        :param headers: Passed through to the DBFS copy call.
        :return: databricks_id: a dict that contains the dbfs path of the file on Databricks.
        ex.{"path":"dbfs:/path/in/dbfs"}
        :raises StackError: if 'is_dir' disagrees with what is found at the local path.
        """
        local_path = resource_properties.get(DBFS_RESOURCE_SOURCE_PATH)
        dbfs_path = resource_properties.get(DBFS_RESOURCE_PATH)
        is_dir = resource_properties.get(DBFS_RESOURCE_IS_DIR)

        if is_dir != os.path.isdir(local_path):
            dir_or_file = 'directory' if os.path.isdir(local_path) else 'file'
            raise StackError(
                'local source_path "{}" is found to be a {}, but is not specified'
                ' as one with is_dir: {}.'.format(local_path, dir_or_file,
                                                  str(is_dir).lower()))
        if is_dir:
            click.echo('Uploading directory from {} to DBFS at {}'.format(
                local_path, dbfs_path))
            self.dbfs_client.cp(recursive=True,
                                overwrite=overwrite,
                                src=local_path,
                                dst=dbfs_path,
                                headers=headers)
        else:
            click.echo('Uploading file from {} to DBFS at {}'.format(
                local_path, dbfs_path))
            self.dbfs_client.cp(recursive=False,
                                overwrite=overwrite,
                                src=local_path,
                                dst=dbfs_path,
                                headers=headers)

        if databricks_id and databricks_id[DBFS_RESOURCE_PATH] != dbfs_path:
            # databricks_id['path'] is the dbfs path from the last deployment. Alert when changed
            click.echo("Dbfs asset had path changed from {} to {}".format(
                databricks_id[DBFS_RESOURCE_PATH], dbfs_path))
        new_databricks_id = {DBFS_RESOURCE_PATH: dbfs_path}
        return new_databricks_id

    def _validate_config(self, stack_config):
        """
        Validate fields within a stack configuration. This ensures that an inputted
        configuration has the necessary fields for stack deployment to function well.

        :param stack_config: dict- stack config that is inputted by the user.
        :return: None. Raises errors to stop deployment if there is a problem.
        :raises StackError: on a missing required field, a duplicate resource ID or an
        unsupported resource service.
        """
        click.echo('Validating fields in stack configuration...')
        self._assert_fields_in_dict([STACK_NAME, STACK_RESOURCES], stack_config)

        seen_resource_ids = set()  # Store seen resources to restrict duplicates.
        for resource in stack_config.get(STACK_RESOURCES):
            # Validate that the resource ID exists, then get it.
            self._assert_fields_in_dict([RESOURCE_ID], resource)
            resource_id = resource.get(RESOURCE_ID)

            click.echo('Validating fields in resource with ID "{}"'.format(
                resource_id))
            self._assert_fields_in_dict(
                [RESOURCE_SERVICE, RESOURCE_PROPERTIES], resource)

            resource_service = resource.get(RESOURCE_SERVICE)
            resource_properties = resource.get(RESOURCE_PROPERTIES)

            # Error on duplicate resource ID's
            if resource_id in seen_resource_ids:
                raise StackError(
                    'Duplicate resource ID "{}" found, please resolve.'.format(
                        resource_id))
            seen_resource_ids.add(resource_id)

            # Resource service-specific validations
            click.echo('Validating fields in "{}" of {} resource.'.format(
                RESOURCE_PROPERTIES, resource_service))
            if resource_service == JOBS_SERVICE:
                self._assert_fields_in_dict([JOBS_RESOURCE_NAME],
                                            resource_properties)
            elif resource_service == WORKSPACE_SERVICE:
                self._assert_fields_in_dict([
                    WORKSPACE_RESOURCE_PATH, WORKSPACE_RESOURCE_SOURCE_PATH,
                    WORKSPACE_RESOURCE_OBJECT_TYPE
                ], resource_properties)
            elif resource_service == DBFS_SERVICE:
                self._assert_fields_in_dict([
                    DBFS_RESOURCE_PATH, DBFS_RESOURCE_SOURCE_PATH,
                    DBFS_RESOURCE_IS_DIR
                ], resource_properties)
            else:
                raise StackError('Resource service "{}" not supported'.format(
                    resource_service))

    def _validate_status(self, stack_status):
        """
        Validate fields within a stack status. This ensures that a stack status has the
        necessary fields for stack deployment to function well.

        If there is an error here, then it is either an implementation error that must be fixed by
        a developer or the User edited the stack status file created by the program.

        :param stack_status: dict- stack status that is created by the program.
        :return: None. Raises errors to stop deployment if there is a problem.
        :raises StackError: on a missing required field or an unknown resource service.
        """
        click.echo('Validating fields in stack status...')
        self._assert_fields_in_dict([STACK_NAME, STACK_DEPLOYED], stack_status)

        for resource_status in stack_status.get(STACK_DEPLOYED):
            self._assert_fields_in_dict([RESOURCE_ID], resource_status)
            resource_id = resource_status.get(RESOURCE_ID)
            click.echo(
                'Validating fields in resource status of resource with ID "{}"'
                .format(resource_id))
            self._assert_fields_in_dict(
                [RESOURCE_SERVICE, RESOURCE_DATABRICKS_ID], resource_status)

            resource_service = resource_status.get(RESOURCE_SERVICE)
            resource_databricks_id = resource_status.get(
                RESOURCE_DATABRICKS_ID)

            click.echo(
                'Validating fields in "{}" of {} resource status'.format(
                    RESOURCE_DATABRICKS_ID, resource_service))
            if resource_service == JOBS_SERVICE:
                self._assert_fields_in_dict([JOBS_RESOURCE_JOB_ID],
                                            resource_databricks_id)
            elif resource_service == WORKSPACE_SERVICE:
                self._assert_fields_in_dict([WORKSPACE_RESOURCE_PATH],
                                            resource_databricks_id)
            elif resource_service == DBFS_SERVICE:
                self._assert_fields_in_dict([DBFS_RESOURCE_PATH],
                                            resource_databricks_id)
            else:
                raise StackError(
                    "{} not a valid resource status service".format(
                        resource_service))

    def _assert_fields_in_dict(self, fields, dictionary):
        """
        Check that every name in fields is a key of dictionary.

        :param fields: iterable of required keys.
        :param dictionary: container checked with the ``in`` operator.
        :raises StackError: for the first field that is not found.
        """
        for field in fields:
            if field not in dictionary:
                raise StackError('Required field "{}" not found'.format(field))

    def _get_resource_to_status_map(self, stack_status):
        """
        Returns a dictionary that maps a resource's (id, service) to the resource's status from
        the last deployment

        The key for this dictionary is the resource's (id, service) so that we don't load
        persisted resources with the wrong resource service.
        """
        return {(resource_status.get(RESOURCE_ID),
                 resource_status.get(RESOURCE_SERVICE)): resource_status
                for resource_status in stack_status.get(STACK_DEPLOYED)}
class PipelinesApi(object):
    """
    Pipeline operations built on a DeltaPipelinesService, plus a DbfsApi used to upload
    local libraries referenced by a pipeline spec.
    """

    def __init__(self, api_client):
        """
        :param api_client: client object passed unchanged to DeltaPipelinesService and DbfsApi.
        """
        self.client = DeltaPipelinesService(api_client)
        self.dbfs_client = DbfsApi(api_client)

    def create(self, spec, allow_duplicate_names, headers=None):
        """
        Upload the spec's local libraries, then POST the updated spec to '/pipelines'.

        :param spec: pipeline spec dict; it is deep-copied, not modified.
        :param allow_duplicate_names: stored in the payload under 'allow_duplicate_names'.
        :return: whatever the underlying perform_query call returns.
        """
        payload = self._upload_libraries_and_update_spec(spec)
        payload['allow_duplicate_names'] = allow_duplicate_names
        http_client = self.client.client
        return http_client.perform_query('POST', '/pipelines',
                                         data=payload, headers=headers)

    def deploy(self, spec, allow_duplicate_names, headers=None):
        """
        Upload the spec's local libraries, then PUT the updated spec to '/pipelines/<id>',
        where <id> is the spec's 'id' entry.

        :param spec: pipeline spec dict; must contain 'id'. It is deep-copied, not modified.
        :param allow_duplicate_names: stored in the payload under 'allow_duplicate_names'.
        :return: None.
        """
        payload = self._upload_libraries_and_update_spec(spec)
        payload['allow_duplicate_names'] = allow_duplicate_names
        endpoint = '/pipelines/{}'.format(payload['id'])
        self.client.client.perform_query('PUT', endpoint,
                                         data=payload, headers=headers)

    def delete(self, pipeline_id, headers=None):
        """Forward to the service's delete call; returns None."""
        service = self.client
        service.delete(pipeline_id, headers)

    def get(self, pipeline_id, headers=None):
        """Forward to the service's get call and return its result."""
        service = self.client
        return service.get(pipeline_id, headers)

    def list(self, headers=None):
        """
        Collect the 'statuses' entries of every page returned by GET '/pipelines'.

        Pages are followed for as long as the response's 'pagination' dict carries a
        'next_page_token'.

        :return: list of the collected entries (the first page's list, extended in place).
        """
        def fetch_page(page_token=None, max_results=None, order_by=None):
            # Only truthy pagination arguments are sent to the server.
            candidates = (
                ("pagination.page_token", page_token),
                ("pagination.max_results", max_results),
                ("pagination.order_by", order_by),
            )
            query = {key: value for key, value in candidates if value}
            return self.client.client.perform_query('GET', '/pipelines',
                                                    data=query, headers=headers)

        page = fetch_page()
        collected = page.get("statuses", [])
        while True:
            pagination = page.get("pagination", {})
            if "next_page_token" not in pagination:
                break
            page = fetch_page(page_token=pagination["next_page_token"])
            collected.extend(page.get("statuses", []))
        return collected

    def reset(self, pipeline_id, headers=None):
        """Forward to the service's reset call; returns None."""
        service = self.client
        service.reset(pipeline_id, headers)

    def run(self, pipeline_id, headers=None):
        """Forward to the service's run call; returns None."""
        service = self.client
        service.run(pipeline_id, headers)

    def stop(self, pipeline_id, headers=None):
        """Forward to the service's stop call; returns None."""
        service = self.client
        service.stop(pipeline_id, headers)

    def _upload_libraries_and_update_spec(self, spec):
        """
        Return a deep copy of spec whose 'libraries' entry lists the external libraries
        followed by the uploaded replacements of the local ones.

        :param spec: pipeline spec dict; 'libraries' defaults to an empty list.
        :return: the updated copy of the spec.
        """
        updated_spec = copy.deepcopy(spec)
        declared_libs = LibraryObject.from_json(updated_spec.get('libraries', []))
        local_libs, external_libs = self._identify_local_libraries(declared_libs)
        uploaded_libs = self._upload_local_libraries(local_libs)
        updated_spec['libraries'] = LibraryObject.to_json(external_libs + uploaded_libs)
        return updated_spec

    @staticmethod
    def _identify_local_libraries(lib_objects):
        """
        Partitions the given set of libraries into local and those already present in dbfs/s3
        etc. Local libraries are those of a supported type whose path has a file scheme or no
        scheme at all. All other libraries should be present in a supported external source.

        :param lib_objects: List[LibraryObject]
        :return: List[List[LibraryObject], List[LibraryObject]] ([Local, External])
        :raises RuntimeError: for a file URI that is neither of the file:/ nor file:/// form.
        """
        local_libs = []
        external_libs = []
        for lib in lib_objects:
            # Maven libraries are always external; their path is never parsed.
            if lib.lib_type == 'maven':
                external_libs.append(lib)
                continue

            uri = urllib.parse.urlparse(lib.path)
            supported = lib.lib_type in supported_lib_types
            if supported and uri.scheme == '':
                local_libs.append(lib)
            elif supported and uri.scheme.lower() == 'file':
                # Exactly 1 or 3 slashes may follow "file:"; anything else shows up as a
                # network location or as a path starting with '//'.
                malformed = uri.path.startswith('//') or uri.netloc != ''
                if malformed:
                    raise RuntimeError(
                        'invalid file uri scheme, '
                        'did you mean to use file:/ or file:///')
                local_libs.append(LibraryObject(lib.lib_type, uri.path))
            else:
                external_libs.append(lib)
        return local_libs, external_libs

    def _upload_local_libraries(self, local_lib_objects):
        """
        Upload each local library to its content-hashed DBFS location unless a file already
        exists there.

        All hashes are computed first, then all existence checks run, and only then are the
        missing files uploaded.

        :param local_lib_objects: List[LibraryObject] with local paths.
        :return: List[LibraryObject] carrying the hashed remote paths.
        """
        hashed_libs = []
        for local_lib in local_lib_objects:
            hashed_libs.append(
                LibraryObject(local_lib.lib_type, self._get_hashed_path(local_lib.path)))

        dbfs_libs = []
        for hashed_lib in hashed_libs:
            dbfs_libs.append(LibraryObject(hashed_lib.lib_type, DbfsPath(hashed_lib.path)))

        pending = []
        for local_lib, dbfs_lib in zip(local_lib_objects, dbfs_libs):
            if not self.dbfs_client.file_exists(dbfs_lib.path):
                pending.append((local_lib, dbfs_lib))

        for local_lib, dbfs_lib in pending:
            self.dbfs_client.put_file(local_lib.path, dbfs_lib.path, False)
        return hashed_libs

    @staticmethod
    def _get_hashed_path(path):
        """
        Finds the corresponding dbfs file path for the file located at the supplied path by
        calculating its hash using SHA1.

        :param path: Local File Path
        :return: Remote Path (pipeline_base_dir + file_hash (dot) file_extension); for wheels,
        pipeline_base_dir / file_hash / original wheel file name.
        """
        digest = sha1()
        with open(path, 'rb') as stream:
            # Hash the file in BUFFER_SIZE chunks until read() returns empty bytes.
            for chunk in iter(lambda: stream.read(BUFFER_SIZE), b''):
                digest.update(chunk)
        file_hash = digest.hexdigest()

        # splitext includes the period in the extension
        _, dotted_extension = os.path.splitext(path)
        extension = dotted_extension[1:]
        if extension == 'whl':
            # Wheels need to follow the format described in the PEP, so we simply
            # pre-pend the content hash to the wheel_name
            # basename in Python returns the extension as well
            return '{}/{}/{}'.format(base_pipelines_dir, file_hash, os.path.basename(path))
        return '{}/{}.{}'.format(base_pipelines_dir, file_hash, extension)
def cat_cli(api_client, src):
    """
    Show the contents of a file. Does not work for directories.
    """
    # Wrap the client in a DbfsApi and delegate to its cat call.
    dbfs_api = DbfsApi(api_client)
    dbfs_api.cat(src)
def __init__(self, api_client):
    """
    Create the service wrappers this object works through.

    :param api_client: client object passed unchanged to DeltaPipelinesService and DbfsApi.
    """
    pipelines_service = DeltaPipelinesService(api_client)
    dbfs_api = DbfsApi(api_client)
    self.client = pipelines_service
    self.dbfs_client = dbfs_api
def test_src_dbfs_files(src_dbfs_api: DbfsApi):
    """List the example notebook path on DBFS, print the listing, and check a listing is returned."""
    notebook_uri = "dbfs:/example_notebook.py"

    # First call: show the listing in the test output.
    shown_listing = src_dbfs_api.list_files(DbfsPath(notebook_uri))
    print(shown_listing)

    # Second call: the value actually asserted on.
    checked_listing = src_dbfs_api.list_files(DbfsPath(notebook_uri))
    assert checked_listing is not None