def test_list_jobs():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = JobsApi(api_client_mock)

        api.list_jobs()
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/list', data={}, headers=None, version=None)

        api.list_jobs(version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/list', data={}, headers=None, version='3.0')
class SdkClient:
    def __init__(self, profile=None):
        client = utils.get_api_client(profile)
        self.cluster_client = ClusterApi(client)
        self.jobs_client = JobsApi(client)

    def list_clusters(self):
        return self.cluster_client.list_clusters()

    def get_cluster(self, cluster_id):
        return self.cluster_client.get_cluster(cluster_id)

    def list_jobs(self):
        return self.jobs_client.list_jobs()
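# Minimal usage sketch for a wrapper like SdkClient, assuming the databricks-cli package is
# installed and DATABRICKS_HOST / DATABRICKS_TOKEN are set. Constructing ApiClient directly
# here stands in for the project-local utils.get_api_client helper, which is not shown.
import os

from databricks_cli.clusters.api import ClusterApi
from databricks_cli.jobs.api import JobsApi
from databricks_cli.sdk.api_client import ApiClient

client = ApiClient(host=os.environ["DATABRICKS_HOST"], token=os.environ["DATABRICKS_TOKEN"])
print(JobsApi(client).list_jobs())         # raw /jobs/list response
print(ClusterApi(client).list_clusters())  # raw /clusters/list response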
def export_cli(dry_run, tag, delete, git_ssh_url, api_client: ApiClient, hcl, pattern_matches):
    block_key_map = {
        "new_cluster": handle_block,
        "notebook_task": handle_block,
        "aws_attributes": handle_block,
        "spark_env_vars": handle_block,
        "autoscale": handle_block,
        "spark_submit_task": handle_block,
        "libraries": handle_libraries,
        "email_notifications": handle_map,
        "custom_tags": handle_map,
    }
    ignore_attribute_key = {
        "created_time", "creator_user_name", "job_id"
    }
    required_attributes_key = {
        "max_concurrent_runs", "name"
    }
    if hcl:
        job_api = JobsApi(api_client)
        jobs = job_api.list_jobs()["jobs"]
        log.info(jobs)
        with GitExportHandler(git_ssh_url, "jobs", delete_not_found=delete,
                              dry_run=dry_run, tag=tag) as gh:
            for job in jobs:
                if not pattern_matches(job["settings"]["name"]):
                    log.debug(f"{job['settings']['name']} did not match pattern function {pattern_matches}")
                    continue
                log.debug(f"{job['settings']['name']} matched the pattern function {pattern_matches}")

                job_resource_data = prep_json(block_key_map, ignore_attribute_key,
                                              job['settings'], required_attributes_key)
                base_name = normalize_identifier(job['settings']['name'])
                name = "databricks_job"
                identifier = f"databricks_job-{base_name}"

                # Need to escape quotes in the name.
                job_resource_data['name'] = job_resource_data['name'].replace('"', '\\"')

                instance_job_hcl = create_resource_from_dict(name, identifier, job_resource_data, False)
                file_name_identifier = f"{identifier}.tf"
                gh.add_file(file_name_identifier, instance_job_hcl)
                log.debug(instance_job_hcl)
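# Hypothetical sketch of the normalize_identifier helper used above. Terraform resource names
# must match [a-zA-Z_][a-zA-Z0-9_-]*, so anything else in the job name has to be replaced.
# Illustrative only; the real normalize_identifier / prep_json helpers live in the exporting
# project and may behave differently.
import re

def normalize_identifier(name: str) -> str:
    cleaned = re.sub(r"[^a-zA-Z0-9_-]", "_", name)
    return cleaned if re.match(r"[a-zA-Z_]", cleaned) else f"_{cleaned}"

print(normalize_identifier("nightly ETL (prod)"))  # nightly_ETL__prod_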
def list_cli(api_client, output):
    """
    Lists the jobs in the Databricks Job Service.

    By default the output format will be a human readable table with the following fields

      - Job ID
      - Job name

    A JSON formatted output can also be requested by setting the --output parameter to "JSON"

    In table mode, the jobs are sorted by their name.
    """
    jobs_api = JobsApi(api_client)
    jobs_json = jobs_api.list_jobs()
    if OutputClickType.is_json(output):
        click.echo(pretty_format(jobs_json))
    else:
        click.echo(tabulate(_jobs_to_table(jobs_json), tablefmt='plain', disable_numparse=True))
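# Sketch of the _jobs_to_table helper assumed above: it presumably flattens the /jobs/list
# response into (job_id, name) rows sorted by name, which is the shape tabulate expects.
# Illustrative only; the real helper in databricks-cli may differ (e.g. it may truncate names).
def _jobs_to_table(jobs_json):
    rows = [(job['job_id'], job['settings']['name']) for job in jobs_json.get('jobs', [])]
    return sorted(rows, key=lambda row: row[1].lower())

print(_jobs_to_table({'jobs': [
    {'job_id': 2, 'settings': {'name': 'beta'}},
    {'job_id': 1, 'settings': {'name': 'Alpha'}},
]}))  # [(1, 'Alpha'), (2, 'beta')]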
def list_cli(api_client, output, job_type, version, expand_tasks, offset, limit, _all):
    """
    Lists the jobs in the Databricks Job Service.

    By default the output format will be a human readable table with the following fields

      - Job ID
      - Job name

    A JSON formatted output can also be requested by setting the --output parameter to "JSON"

    In table mode, the jobs are sorted by their name.
    """
    check_version(api_client, version)
    api_version = version or api_client.jobs_api_version
    if api_version != '2.1' and (expand_tasks or offset or limit or _all):
        click.echo(click.style('ERROR', fg='red') + ': the options --expand-tasks, ' +
                   '--offset, --limit, and --all are only available in API 2.1', err=True)
        return

    jobs_api = JobsApi(api_client)
    has_more = True
    jobs = []
    if _all:
        offset = 0
        limit = 20
    while has_more:
        jobs_json = jobs_api.list_jobs(job_type=job_type, expand_tasks=expand_tasks,
                                       offset=offset, limit=limit, version=version)
        jobs += jobs_json['jobs'] if 'jobs' in jobs_json else []
        has_more = jobs_json.get('has_more', False) and _all
        if has_more:
            offset = offset + \
                (len(jobs_json['jobs']) if 'jobs' in jobs_json else 20)

    out = {'jobs': jobs}
    if OutputClickType.is_json(output):
        click.echo(pretty_format(out))
    else:
        click.echo(tabulate(_jobs_to_table(out), tablefmt='plain', disable_numparse=True))
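# Standalone sketch of the same Jobs API 2.1 pagination loop for use outside the CLI command,
# assuming an already-configured ApiClient pointed at a workspace with 2.1 enabled. The
# list_all_jobs name is ours; the list_jobs keyword arguments mirror the call above.
from databricks_cli.jobs.api import JobsApi

def list_all_jobs(api_client, page_size=20):
    jobs_api = JobsApi(api_client)
    jobs, offset = [], 0
    while True:
        page = jobs_api.list_jobs(offset=offset, limit=page_size, version='2.1')
        jobs.extend(page.get('jobs', []))
        if not page.get('has_more', False):
            return jobs
        offset += len(page.get('jobs', []))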
class DeployToDatabricks(Step):
    def __init__(self, env: ApplicationVersion, config: dict):
        super().__init__(env, config)
        self.vault_name, self.vault_client = KeyVaultClient.vault_and_client(self.config, self.env)
        self.databricks_client = Databricks(self.vault_name, self.vault_client).api_client(self.config)
        self.jobs_api = JobsApi(self.databricks_client)
        self.runs_api = RunsApi(self.databricks_client)

    def schema(self) -> vol.Schema:
        return SCHEMA

    def run(self):
        self.deploy_to_databricks()

    @staticmethod
    def _job_is_streaming(job_config: dict):
        """
        If there is no schedule, the job will not run periodically, so we assume it is a streaming job.

        :param job_config: the configuration of the Databricks job
        :return: (bool) whether it is a streaming job
        """
        return "schedule" not in job_config.keys()

    def deploy_to_databricks(self):
        """
        The application parameters (cosmos and eventhub) will eventually be removed from this file,
        as they will be set as Databricks secrets.

        If the job is a streaming job, this directly starts a new job_run with the new configuration.
        If the job is a batch job, it is not started manually, assuming the schedule has been set
        correctly.
        """
        for job in self.config["jobs"]:
            app_name = self._construct_name(job["name"])
            job_name = f"{app_name}-{self.env.artifact_tag}"
            job_config = self.create_config(job_name, job)
            is_streaming = self._job_is_streaming(job_config)

            logger.info("Removing old job")
            self.remove_job(self.env.artifact_tag, is_streaming=is_streaming)

            logger.info("Submitting new job with configuration:")
            logger.info(pprint.pformat(job_config))
            self._submit_job(job_config, is_streaming)

    def create_config(self, job_name: str, job_config: dict):
        common_arguments = dict(
            config_file=job_config["config_file"],
            application_name=job_name,
            log_destination=job_name,
            parameters=self._construct_arguments(job_config["arguments"]),
            schedule=self._get_schedule(job_config),
            environment=self.env.environment_formatted,
        )

        root_library_folder = self.config["common"]["databricks_fs_libraries_mount_path"]
        storage_base_path = f"{root_library_folder}/{self.application_name}"
        artifact_path = f"{storage_base_path}/{self.application_name}-{self.env.artifact_tag}"

        if job_config["lang"] == "python":
            wheel_name = get_whl_name(self.application_name, self.env.artifact_tag, ".whl")
            py_main_name = get_main_py_name(self.application_name, self.env.artifact_tag, ".py")
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                whl_file=f"{root_library_folder}/{wheel_name}",
                python_file=f"{root_library_folder}/{py_main_name}",
            )
        else:  # java/scala jobs
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                class_name=job_config["main_name"],
                jar_file=f"{artifact_path}.jar",
            )
        return run_config

    def _get_schedule(self, job_config: dict) -> Optional[dict]:
        schedule = job_config.get("schedule", None)
        if schedule:
            if "quartz_cron_expression" in schedule:
                return schedule
            return schedule.get(self.env.environment.lower(), None)
        return schedule

    def _construct_name(self, name: str) -> str:
        postfix = f"-{name}" if name else ""
        return f"{self.application_name}{postfix}"

    @staticmethod
    def _construct_arguments(args: List[dict]) -> list:
        params = []
        for named_arguments_pair in args:
            for k, v in named_arguments_pair.items():
                params.extend([f"--{k}", v])
        return params

    @staticmethod
    def _construct_job_config(config_file: str, **kwargs) -> dict:
        return util.render_file_with_jinja(config_file, kwargs, json.loads)

    def remove_job(self, branch: str, is_streaming: bool):
        """
        Removes the existing job and cancels any running job_run if the application is streaming.
        If the application is batch, the running batch job is left to finish, but the job itself
        is removed so that no new job_runs can start for the old job.
        """
        job_configs = [
            JobConfig(_["settings"]["name"], _["job_id"]) for _ in self.jobs_api.list_jobs()["jobs"]
        ]
        job_ids = self._application_job_id(self.application_name, branch, job_configs)

        if not job_ids:
            logger.info(f"Could not find jobs in list of {pprint.pformat(job_configs)}")

        for job_id in job_ids:
            logger.info(f"Found Job with ID {job_id}")
            if is_streaming:
                self._kill_it_with_fire(job_id)
            logger.info(f"Deleting Job with ID {job_id}")
            self.jobs_api.delete_job(job_id)

    @staticmethod
    def _application_job_id(application_name: str, branch: str, jobs: List[JobConfig]) -> List[int]:
        snapshot = "SNAPSHOT"
        tag = r"\d+\.\d+\.\d+"
        pattern = re.compile(rf"^({application_name})-({snapshot}|{tag}|{branch})$")
        return [_.job_id for _ in jobs if has_prefix_match(_.name, application_name, pattern)]

    def _kill_it_with_fire(self, job_id):
        logger.info(f"Finding runs for job_id {job_id}")
        runs = self.runs_api.list_runs(job_id, active_only=True, completed_only=None,
                                       offset=None, limit=None)
        # If the response has no "runs" key, there are no runs at all for this job.
        # TODO: Check the has_more flag; if it is true we need to page through the remaining runs.
        if "runs" in runs:
            active_run_ids = [_["run_id"] for _ in runs["runs"]]
            logger.info(f"Canceling active runs {active_run_ids}")
            for run_id in active_run_ids:
                self.runs_api.cancel_run(run_id)

    def _submit_job(self, job_config: dict, is_streaming: bool):
        job_resp = self.jobs_api.create_job(job_config)
        logger.info(f"Created Job with ID {job_resp['job_id']}")

        if is_streaming:
            resp = self.jobs_api.run_now(
                job_id=job_resp["job_id"],
                jar_params=None,
                notebook_params=None,
                python_params=None,
                spark_submit_params=None,
            )
            logger.info(f"Created run with ID {resp['run_id']}")
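# Illustration of the job-name pattern built in _application_job_id above: it matches
# "<application>-SNAPSHOT", "<application>-<semver tag>" and "<application>-<branch>".
# Hypothetical standalone check; the real has_prefix_match helper may add more logic.
import re

application_name, branch = "myapp", "feature-x"
pattern = re.compile(rf"^({application_name})-(SNAPSHOT|\d+\.\d+\.\d+|{branch})$")

for name in ["myapp-SNAPSHOT", "myapp-1.2.3", "myapp-feature-x", "otherapp-1.2.3"]:
    print(name, bool(pattern.match(name)))
# myapp-SNAPSHOT True / myapp-1.2.3 True / myapp-feature-x True / otherapp-1.2.3 False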