Example #1
def __init__(self, env: ApplicationVersion, config: dict):
    super().__init__(env, config)
    self.vault_name, self.vault_client = KeyVaultClient.vault_and_client(
        self.config, self.env)
    self.databricks_client = Databricks(
        self.vault_name, self.vault_client).api_client(self.config)
    self.jobs_api = JobsApi(self.databricks_client)
    self.runs_api = RunsApi(self.databricks_client)
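The example above wires JobsApi and RunsApi through project-specific helpers (KeyVaultClient, Databricks). A minimal sketch of the same wiring against a plain databricks-cli ApiClient, assuming a workspace URL and token are available (the host and token values below are illustrative placeholders, not real credentials):

from databricks_cli.sdk.api_client import ApiClient
from databricks_cli.jobs.api import JobsApi
from databricks_cli.runs.api import RunsApi

# Build a raw API client; replace host and token with real workspace values.
api_client = ApiClient(host="https://example.cloud.databricks.com",
                       token="<personal-access-token>")
jobs_api = JobsApi(api_client)
runs_api = RunsApi(api_client)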
Example #2
def get_output_cli(api_client, run_id):
    """
    Gets the output of a run

    The output schema is documented at https://docs.databricks.com/api/latest/jobs.html#runs-get-output
    """
    click.echo(pretty_format(RunsApi(api_client).get_run_output(run_id)))
Example #3
def get_cli(api_client, run_id):
    """
    Gets the metadata about a run in json form.

    The output schema is documented at https://docs.databricks.com/api/latest/jobs.html#runs-get.
    """
    click.echo(pretty_format(RunsApi(api_client).get_run(run_id)))
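The two CLI helpers above (Examples #2 and #3) simply pretty-print the JSON returned by RunsApi. A small sketch of using the underlying calls directly, assuming api_client is an authenticated databricks-cli ApiClient and the run exists; field access follows the linked Runs API schema:

runs_api = RunsApi(api_client)

run = runs_api.get_run(run_id)            # run metadata as a dict
life_cycle_state = run["state"]["life_cycle_state"]

output = runs_api.get_run_output(run_id)  # output of a completed run
# "notebook_output" is only present for notebook task runs, per the linked schema.
notebook_result = output.get("notebook_output", {}).get("result")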
Example #4
def cancel_cli(api_client, run_id, version):
    """
    Cancels the run specified.
    """
    check_version(api_client, version)
    click.echo(pretty_format(
        RunsApi(api_client).cancel_run(run_id, version=version)))
Example #5
def submit_cli(api_client, json_file, json):
    """
    Submits a one-time run.

    The specification for the request JSON can be found at
    https://docs.databricks.com/api/latest/jobs.html#runs-submit
    """
    json_cli_base(json_file, json, lambda json: RunsApi(api_client).submit_run(json))
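The runs-submit request body is an arbitrary JSON document matching the linked specification. A hedged sketch of submitting a one-time Python run directly through RunsApi (cluster settings and file paths below are illustrative placeholders):

submit_payload = {
    "run_name": "one-time-example",
    "new_cluster": {
        "spark_version": "7.3.x-scala2.12",  # placeholder runtime version
        "node_type_id": "Standard_DS3_v2",   # placeholder node type
        "num_workers": 1,
    },
    "spark_python_task": {
        "python_file": "dbfs:/path/to/job.py",
    },
}
response = RunsApi(api_client).submit_run(submit_payload)
print(response["run_id"])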
Example #6
class ApiClient():
    def __init__(self, profile=None):
        api_client = get_api_client(profile)
        self.dbfs_client = DbfsApi(api_client)
        self.runs_client = RunsApi(api_client)

    def mkdirs(self, dbfs_path):
        return self.dbfs_client.mkdirs(DbfsPath(dbfs_path))

    def list_files(self, dbfs_path):
        return self.dbfs_client.list_files(DbfsPath(dbfs_path))

    def put_file(self, src_path, dbfs_path, overwrite=True):
        return self.dbfs_client.put_file(src_path, DbfsPath(dbfs_path), overwrite)

    def submit_run(self, json_data):
        return self.runs_client.submit_run(json_data)

    def get_run(self, run_id):
        return self.runs_client.get_run(run_id)
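A possible way to use this thin wrapper end to end, assuming a configured CLI profile and a cluster that can read the DBFS path (paths and payload below are illustrative):

client = ApiClient(profile="DEFAULT")
client.mkdirs("dbfs:/jobs/example")
client.put_file("local/job.py", "dbfs:/jobs/example/job.py", overwrite=True)

run = client.submit_run({
    "run_name": "wrapper-example",
    "new_cluster": {"spark_version": "7.3.x-scala2.12",
                    "node_type_id": "Standard_DS3_v2",
                    "num_workers": 1},
    "spark_python_task": {"python_file": "dbfs:/jobs/example/job.py"},
})
print(client.get_run(run["run_id"])["state"])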
Example #7
def test_get_run_output():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = RunsApi(api_client_mock)
        api.get_run_output('1')
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/runs/get-output', data={'run_id': '1'}, headers=None, version=None
        )
        api.get_run_output('1', version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/runs/get-output', data={'run_id': '1'}, headers=None, version='3.0'
        )
Example #8
def test_cancel_run():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = RunsApi(api_client_mock)
        api.cancel_run('1')
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/runs/cancel', data={'run_id': '1'},
            headers=None, version=None
        )

        api.cancel_run('1', version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/runs/cancel', data={'run_id': '1'},
            headers=None, version='3.0'
        )
Example #9
def test_submit_run():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = RunsApi(api_client_mock)
        api.submit_run('{"tasks": [], "run_name": "mock"}', version=None)
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/runs/submit', data='{"tasks": [], "run_name": "mock"}',
            version=None
        )

        api.submit_run('{"tasks": [], "run_name": "mock"}', version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/runs/submit', data='{"tasks": [], "run_name": "mock"}',
            version='3.0'
        )
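The test examples above repeat the same pattern for version=None and version='3.0'. If pytest is available, the same assertions can be expressed once with parametrization; a sketch based solely on the calls shown in Example #8:

import mock
import pytest

@pytest.mark.parametrize("version", [None, "3.0"])
def test_cancel_run_versions(version):
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        RunsApi(api_client_mock).cancel_run('1', version=version)
        api_client_mock.perform_query.assert_called_with(
            'POST', '/jobs/runs/cancel', data={'run_id': '1'},
            headers=None, version=version
        )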
Example #10
def list_cli(api_client, job_id, active_only, completed_only, offset, limit, output): # noqa
    """
    Lists job runs.

    The limit and offset determine which runs will be listed. Runs are always listed
    in descending order of run start time and run ID.

    In the TABLE output mode, the columns are as follows.

      - Run ID

      - Run name

      - Life cycle state

      - Result state (can be n/a)
    """
    runs_json = RunsApi(api_client).list_runs(job_id, active_only, completed_only, offset, limit)
    if OutputClickType.is_json(output):
        click.echo(pretty_format(runs_json))
    else:
        click.echo(tabulate(_runs_to_table(runs_json), tablefmt='plain'))
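The helper _runs_to_table is not shown here. A hypothetical row builder matching the four columns listed in the docstring (the real helper lives in databricks-cli and may differ) could look like this:

def _runs_to_table(runs_json):
    rows = []
    for run in runs_json.get("runs", []):
        state = run.get("state", {})
        rows.append((
            run.get("run_id"),
            run.get("run_name"),
            state.get("life_cycle_state"),
            state.get("result_state", "n/a"),
        ))
    return rows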
Example #11
def test_list_runs():
    with mock.patch('databricks_cli.sdk.ApiClient') as api_client_mock:
        api = RunsApi(api_client_mock)
        api.list_runs('1', True, False, 0, 20)
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/runs/list',
            data={'job_id': '1', 'active_only': True,
                  "completed_only": False, "offset": 0, "limit": 20},
            headers=None, version=None
        )

        api.list_runs('1', True, False, 0, 20, version='3.0')
        api_client_mock.perform_query.assert_called_with(
            'GET', '/jobs/runs/list',
            data={'job_id': '1', 'active_only': True,
                  "completed_only": False, "offset": 0, "limit": 20},
            headers=None, version='3.0'
        )
Example #12
def cancel_cli(api_client, run_id):
    """
    Cancels the run specified.
    """
    click.echo(pretty_format(RunsApi(api_client).cancel_run(run_id)))
Example #13
def __init__(self, profile=None):
    api_client = get_api_client(profile)
    self.dbfs_client = DbfsApi(api_client)
    self.runs_client = RunsApi(api_client)
Example #14
class DeployToDatabricks(Step):
    def __init__(self, env: ApplicationVersion, config: dict):
        super().__init__(env, config)
        self.vault_name, self.vault_client = KeyVaultClient.vault_and_client(self.config, self.env)
        self.databricks_client = Databricks(self.vault_name, self.vault_client).api_client(self.config)
        self.jobs_api = JobsApi(self.databricks_client)
        self.runs_api = RunsApi(self.databricks_client)

    def schema(self) -> vol.Schema:
        return SCHEMA

    def run(self):
        self.deploy_to_databricks()

    @staticmethod
    def _job_is_streaming(job_config: dict):
        """
        If there is no schedule, the job will not run periodically, so we assume it is a
        streaming job.
        :param job_config: the configuration of the Databricks job
        :return: (bool) whether it is a streaming job
        """
        return "schedule" not in job_config.keys()

    def deploy_to_databricks(self):
        """
        The application parameters (cosmos and eventhub) will eventually be removed from this file,
        as they will be set as Databricks secrets instead.
        If the job is a streaming job, this directly starts a new job run with the new
        configuration. If the job is a batch job, it is not started manually; the schedule is
        assumed to have been set correctly.
        """
        for job in self.config["jobs"]:
            app_name = self._construct_name(job["name"])
            job_name = f"{app_name}-{self.env.artifact_tag}"
            job_config = self.create_config(job_name, job)
            is_streaming = self._job_is_streaming(job_config)

            logger.info("Removing old job")
            self.remove_job(self.env.artifact_tag, is_streaming=is_streaming)

            logger.info("Submitting new job with configuration:")
            logger.info(pprint.pformat(job_config))
            self._submit_job(job_config, is_streaming)

    def create_config(self, job_name: str, job_config: dict):
        common_arguments = dict(
            config_file=job_config["config_file"],
            application_name=job_name,
            log_destination=job_name,
            parameters=self._construct_arguments(job_config["arguments"]),
            schedule=self._get_schedule(job_config),
            environment=self.env.environment_formatted,
        )

        root_library_folder = self.config["common"]["databricks_fs_libraries_mount_path"]
        storage_base_path = f"{root_library_folder}/{self.application_name}"
        artifact_path = f"{storage_base_path}/{self.application_name}-{self.env.artifact_tag}"

        if job_config["lang"] == "python":
            wheel_name = get_whl_name(self.application_name, self.env.artifact_tag, ".whl")
            py_main_name = get_main_py_name(self.application_name, self.env.artifact_tag, ".py")
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments,
                whl_file=f"{root_library_folder}/{wheel_name}",
                python_file=f"{root_library_folder}/{py_main_name}",
            )
        else:  # java/scala jobs
            run_config = DeployToDatabricks._construct_job_config(
                **common_arguments, class_name=job_config["main_name"], jar_file=f"{artifact_path}.jar"
            )
        return run_config

    def _get_schedule(self, job_config: dict) -> Optional[dict]:
        schedule = job_config.get("schedule", None)
        if schedule:
            if "quartz_cron_expression" in schedule:
                return schedule
            else:
                return schedule.get(self.env.environment.lower(), None)

        return schedule

    def _construct_name(self, name: str) -> str:
        postfix = f"-{name}" if name else ""
        return f"{self.application_name}{postfix}"

    @staticmethod
    def _construct_arguments(args: List[dict]) -> list:
        params = []
        for named_arguments_pair in args:
            for k, v in named_arguments_pair.items():
                params.extend([f"--{k}", v])

        return params

    @staticmethod
    def _construct_job_config(config_file: str, **kwargs) -> dict:
        return util.render_file_with_jinja(config_file, kwargs, json.loads)

    def remove_job(self, branch: str, is_streaming: bool):
        """
        Removes the existing job and cancels any running job_run if the application is streaming.
        If the application is a batch job, the running job run is allowed to finish, but the job is
        removed so that no new job_runs can start for that old job.
        """

        job_configs = [
            JobConfig(_["settings"]["name"], _["job_id"]) for _ in self.jobs_api.list_jobs()["jobs"]
        ]
        job_ids = self._application_job_id(self.application_name, branch, job_configs)

        if not job_ids:
            logger.info(f"Could not find jobs in list of {pprint.pformat(job_configs)}")

        for job_id in job_ids:
            logger.info(f"Found Job with ID {job_id}")
            if is_streaming:
                self._kill_it_with_fire(job_id)
            logger.info(f"Deleting Job with ID {job_id}")
            self.jobs_api.delete_job(job_id)

    @staticmethod
    def _application_job_id(application_name: str, branch: str, jobs: List[JobConfig]) -> List[int]:
        snapshot = "SNAPSHOT"
        tag = r"\d+\.\d+\.\d+"
        pattern = re.compile(rf"^({application_name})-({snapshot}|{tag}|{branch})$")

        return [_.job_id for _ in jobs if has_prefix_match(_.name, application_name, pattern)]

    def _kill_it_with_fire(self, job_id):
        logger.info(f"Finding runs for job_id {job_id}")
        runs = self.runs_api.list_runs(job_id, active_only=True, completed_only=None, offset=None, limit=None)
        # If "runs" is missing from the response, there are no active runs at all
        # TODO: Check if the has_more flag is true, this means we need to go over the pages
        if "runs" in runs:
            active_run_ids = [_["run_id"] for _ in runs["runs"]]
            logger.info(f"Canceling active runs {active_run_ids}")
            for run_id in active_run_ids:
                self.runs_api.cancel_run(run_id)

    def _submit_job(self, job_config: dict, is_streaming: bool):
        job_resp = self.jobs_api.create_job(job_config)
        logger.info(f"Created Job with ID {job_resp['job_id']}")

        if is_streaming:
            resp = self.jobs_api.run_now(
                job_id=job_resp["job_id"],
                jar_params=None,
                notebook_params=None,
                python_params=None,
                spark_submit_params=None,
            )
            logger.info(f"Created run with ID {resp['run_id']}")
Example #15
@pytest.fixture()
def runs_api():
    with mock.patch('databricks_cli.sdk.JobsService') as jobs_service_mock:
        jobs_service_mock.return_value = mock.MagicMock()
        runs_api_mock = RunsApi(None)
        yield runs_api_mock
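A minimal sketch of a test consuming this fixture: pytest injects the prepared RunsApi by matching the parameter name to the fixture above (the assertion here is deliberately weak; what exactly can be asserted depends on how the patched JobsService is reached):

def test_runs_api_fixture(runs_api):
    # runs_api is the RunsApi instance yielded by the fixture; real tests would
    # call methods such as runs_api.get_run("1") and assert on the mocked service.
    assert runs_api is not None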