def test_delete_job():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = JobsApi(api_client_mock)

        api.delete_job('1')
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/delete', data={'job_id': '1'}, headers=None, version=None
        )

        api.delete_job('1', version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/delete', data={'job_id': '1'}, headers=None, version='3.0'
        )
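
# Illustrative only: a minimal sketch of how the static helpers on DeployToDatabricks
# (defined below) could be unit-tested. It assumes the class is in scope for this test
# module; the argument names and cron expressions are made-up example values.
def test_construct_arguments_builds_cli_style_params():
    # _construct_arguments turns a list of single-entry dicts into ["--key", "value"] pairs.
    args = [{"eventhub-namespace": "some-namespace"}, {"batch-size": "100"}]
    assert DeployToDatabricks._construct_arguments(args) == [
        "--eventhub-namespace", "some-namespace", "--batch-size", "100",
    ]


def test_job_without_schedule_is_treated_as_streaming():
    # A job config without a "schedule" key is considered a streaming job.
    assert DeployToDatabricks._job_is_streaming({"arguments": [{"foo": "bar"}]}) is True
    assert DeployToDatabricks._job_is_streaming(
        {"schedule": {"quartz_cron_expression": "0 0 3 * * ?"}}
    ) is False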
class DeployToDatabricks(Step):
    def __init__(self, env: ApplicationVersion, config: dict):
        super().__init__(env, config)
        self.vault_name, self.vault_client = KeyVaultClient.vault_and_client(self.config, self.env)
        self.databricks_client = Databricks(self.vault_name, self.vault_client).api_client(self.config)
        self.jobs_api = JobsApi(self.databricks_client)
        self.runs_api = RunsApi(self.databricks_client)

    def schema(self) -> vol.Schema:
        return SCHEMA

    def run(self):
        self.deploy_to_databricks()

    @staticmethod
    def _job_is_streaming(job_config: dict) -> bool:
        """
        If there is no schedule, the job will not run periodically, so we assume it is a
        streaming job.

        :param job_config: the configuration of the Databricks job
        :return: (bool) True if it is a streaming job
        """
        return "schedule" not in job_config

    def deploy_to_databricks(self):
        """
        The application parameters (cosmos and eventhub) will be removed from this file, as they
        will eventually be set as Databricks secrets.

        If the job is a streaming job, this will directly start a new job run with the new
        configuration. If the job is a batch job, it will not be started manually, assuming the
        schedule has been set correctly.
        """
        for job in self.config["jobs"]:
            app_name = self._construct_name(job["name"])
            job_name = f"{app_name}-{self.env.artifact_tag}"
            job_config = self.create_config(job_name, job)
            is_streaming = self._job_is_streaming(job_config)

            logger.info("Removing old job")
            self.remove_job(self.env.artifact_tag, is_streaming=is_streaming)

            logger.info("Submitting new job with configuration:")
            logger.info(pprint.pformat(job_config))
            self._submit_job(job_config, is_streaming)

    def create_config(self, job_name: str, job_config: dict):
        common_arguments = dict(
            config_file=job_config["config_file"],
            application_name=job_name,
            log_destination=job_name,
            parameters=self._construct_arguments(job_config["arguments"]),
            schedule=self._get_schedule(job_config),
            environment=self.env.environment_formatted,
        )

        root_library_folder = self.config["common"]["databricks_fs_libraries_mount_path"]
        storage_base_path = f"{root_library_folder}/{self.application_name}"
        artifact_path = f"{storage_base_path}/{self.application_name}-{self.env.artifact_tag}"

        if job_config["lang"] == "python":
            wheel_name = get_whl_name(self.application_name, self.env.artifact_tag, ".whl")
            py_main_name = get_main_py_name(self.application_name, self.env.artifact_tag, ".py")
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                whl_file=f"{root_library_folder}/{wheel_name}",
                python_file=f"{root_library_folder}/{py_main_name}",
            )
        else:  # java/scala jobs
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                class_name=job_config["main_name"],
                jar_file=f"{artifact_path}.jar",
            )
        return run_config

    def _get_schedule(self, job_config: dict) -> Optional[dict]:
        schedule = job_config.get("schedule", None)
        if schedule:
            if "quartz_cron_expression" in schedule:
                return schedule
            return schedule.get(self.env.environment.lower(), None)
        return schedule

    def _construct_name(self, name: str) -> str:
        postfix = f"-{name}" if name else ""
        return f"{self.application_name}{postfix}"

    @staticmethod
    def _construct_arguments(args: List[dict]) -> list:
        params = []
        for named_arguments_pair in args:
            for k, v in named_arguments_pair.items():
                params.extend([f"--{k}", v])
        return params

    @staticmethod
    def _construct_job_config(config_file: str, **kwargs) -> dict:
        return util.render_file_with_jinja(config_file, kwargs, json.loads)

    def remove_job(self, branch: str, is_streaming: bool):
        """
        Removes the existing job and cancels any running job_run if the application is streaming.
        If the application is batch, it will let the batch job finish, but it will remove the job,
        making sure no other job_runs can start for that old job.
        """
        job_configs = [
            JobConfig(_["settings"]["name"], _["job_id"]) for _ in self.jobs_api.list_jobs()["jobs"]
        ]
        job_ids = self._application_job_id(self.application_name, branch, job_configs)
        if not job_ids:
            logger.info(f"Could not find jobs in list of {pprint.pformat(job_configs)}")

        for job_id in job_ids:
            logger.info(f"Found Job with ID {job_id}")
            if is_streaming:
                self._kill_it_with_fire(job_id)
            logger.info(f"Deleting Job with ID {job_id}")
            self.jobs_api.delete_job(job_id)

    @staticmethod
    def _application_job_id(application_name: str, branch: str, jobs: List[JobConfig]) -> List[int]:
        snapshot = "SNAPSHOT"
        tag = r"\d+\.\d+\.\d+"
        pattern = re.compile(rf"^({application_name})-({snapshot}|{tag}|{branch})$")
        return [_.job_id for _ in jobs if has_prefix_match(_.name, application_name, pattern)]

    def _kill_it_with_fire(self, job_id):
        logger.info(f"Finding runs for job_id {job_id}")
        runs = self.runs_api.list_runs(job_id, active_only=True, completed_only=None, offset=None, limit=None)
        # If there is no "runs" key in the response, there are no runs at all for this job.
        # TODO: Check if the has_more flag is true; if so we need to page through the results.
        if "runs" in runs:
            active_run_ids = [_["run_id"] for _ in runs["runs"]]
            logger.info(f"Canceling active runs {active_run_ids}")
            for run_id in active_run_ids:
                self.runs_api.cancel_run(run_id)

    def _submit_job(self, job_config: dict, is_streaming: bool):
        job_resp = self.jobs_api.create_job(job_config)
        logger.info(f"Created Job with ID {job_resp['job_id']}")

        if is_streaming:
            resp = self.jobs_api.run_now(
                job_id=job_resp["job_id"],
                jar_params=None,
                notebook_params=None,
                python_params=None,
                spark_submit_params=None,
            )
            logger.info(f"Created run with ID {resp['run_id']}")
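

# Illustrative only: a hypothetical value for self.config["jobs"] as this step reads it.
# The keys mirror the accesses above (name, config_file, lang, main_name, arguments, and a
# schedule that either carries quartz_cron_expression directly or is keyed by environment);
# every concrete value (template name, class name, cron expressions) is made up.
_EXAMPLE_JOBS_CONFIG = [
    {
        # No "schedule" key -> _job_is_streaming is True and the job is started via run_now.
        "name": "",  # empty name: the job is simply named after the application
        "config_file": "databricks.json.j2",
        "lang": "python",
        "arguments": [{"eventhub-namespace": "some-namespace"}],
    },
    {
        # "schedule" present -> batch job; _get_schedule picks the entry for the current environment.
        "name": "nightly",
        "config_file": "databricks.json.j2",
        "lang": "scala",
        "main_name": "com.example.NightlyJob",  # hypothetical main class
        "arguments": [{"batch-size": "100"}],
        "schedule": {
            "dev": {"quartz_cron_expression": "0 0 3 * * ?"},
            "prd": {"quartz_cron_expression": "0 0 1 * * ?"},
        },
    },
]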