Example #1
def _get_hook(databricks_conn_secret, databricks_retry_limit,
              databricks_retry_delay):
    # Build a DatabricksHook from the connection secret and retry settings.
    return DatabricksHook(
        databricks_conn_secret,
        retry_limit=databricks_retry_limit,
        retry_delay=databricks_retry_delay,
    )
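For context, a minimal sketch of how a helper like this might be used. Only `submit_multi_task_run` is taken from Example #3 below; the wrapper function itself is a hypothetical illustration:

# Hypothetical wrapper around the helper above; submit_multi_task_run is
# the hook method used in Example #3, everything else is illustrative.
def submit_job(databricks_conn_secret: dict, job_json: dict,
               databricks_retry_limit: int = 3,
               databricks_retry_delay: float = 1.0) -> str:
    hook = _get_hook(databricks_conn_secret, databricks_retry_limit,
                     databricks_retry_delay)
    return hook.submit_multi_task_run(job_json)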
Example #2
def get_hook(self):
    # Same hook construction as Example #1, reading the settings from
    # attributes set on the task instance.
    return DatabricksHook(
        self.databricks_conn_secret,
        retry_limit=self.databricks_retry_limit,
        retry_delay=self.databricks_retry_delay,
    )
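A hedged sketch of the class context this method assumes. The class name and default values are illustrative; only the three attribute names come from the method above:

class DatabricksJobTask:  # hypothetical class name
    def __init__(self, databricks_conn_secret: dict = None,
                 databricks_retry_limit: int = 3,
                 databricks_retry_delay: float = 1.0):
        # Attributes read back by get_hook() above.
        self.databricks_conn_secret = databricks_conn_secret
        self.databricks_retry_limit = databricks_retry_limit
        self.databricks_retry_delay = databricks_retry_delay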
Example #3
    def run(
        self,
        databricks_conn_secret: dict = None,
        tasks: List[JobTaskSettings] = None,
        run_name: str = None,
        timeout_seconds: int = None,
        idempotency_token: str = None,
        access_control_list: List[AccessControlRequest] = None,
        polling_period_seconds: int = None,
        databricks_retry_limit: int = None,
        databricks_retry_delay: float = None,
    ):
        """
        Task run method. Any values passed here will overwrite the values used when initializing the
        task.

        Args:
            - databricks_conn_secret (dict, optional): Dictionary representation of the Databricks
                Connection String. Structure must be a string of valid JSON. To use token based
                authentication, provide the key `token` in the string for the connection and create
                the key `host`. `PREFECT__CONTEXT__SECRETS__DATABRICKS_CONNECTION_STRING=
                '{"host": "abcdef.xyz", "login": "******", "password": "******"}'`
                OR
                `PREFECT__CONTEXT__SECRETS__DATABRICKS_CONNECTION_STRING=
                '{"host": "abcdef.xyz", "token": "ghijklmn"}'`
            - tasks (List[JobTaskSettings]): A list containing the Databricks task configuration. Should
                contain configuration for at least one task.
            - run_name (str, optional): An optional name for the run.
                The default value is "Job run created by Prefect flow run {flow_run_name}".
            - timeout_seconds (int, optional): An optional timeout applied to each run of this job.
                The default behavior is to have no timeout.
            - idempotency_token (str, optional): An optional token that can be used to guarantee
                the idempotency of job run requests. Defaults to the flow run ID.
            - access_control_list (List[AccessControlRequest], optional): List of permissions to set
                on the job.
            - polling_period_seconds (int, optional): Controls the rate at which we poll for the result
                of this run. By default the task will poll every 30 seconds.
            - databricks_retry_limit (int, optional): Number of times to retry if the Databricks backend
                is unreachable. Its value must be greater than or equal to 1.
            - databricks_retry_delay (float, optional): Number of seconds to wait between retries.

        Returns:
            - run_id (str): Run ID of the submitted run.

        """
        if databricks_conn_secret is None or not isinstance(
                databricks_conn_secret, dict):
            raise ValueError(
                "Databricks connection info must be supplied as a dictionary.")
        if tasks is None or len(tasks) < 1:
            raise ValueError(
                "Please supply at least one Databricks task to be run.")
        run_name = (
            run_name or
            f"Job run created by Prefect flow run {prefect.context.flow_run_name}"
        )
        # Ensures that multiple job runs are not created on retries
        idempotency_token = idempotency_token or prefect.context.flow_run_id

        # Set polling_period_seconds on task because _handle_databricks_task_execution expects it
        if polling_period_seconds:
            self.polling_period_seconds = polling_period_seconds

        databricks_client = DatabricksHook(
            databricks_conn_secret,
            retry_limit=databricks_retry_limit,
            retry_delay=databricks_retry_delay,
        )

        # Set json on task instance because _handle_databricks_task_execution expects it
        self.json = _deep_string_coerce(
            dict(
                tasks=[task.dict() for task in tasks],
                run_name=run_name,
                timeout_seconds=timeout_seconds,
                idempotency_token=idempotency_token,
                access_control_list=[
                    entry.json() for entry in access_control_list or []
                ],
            ))

        submitted_run_id = databricks_client.submit_multi_task_run(self.json)
        _handle_databricks_task_execution(self, databricks_client, self.logger,
                                          submitted_run_id)

        return submitted_run_id
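Finally, a hedged usage sketch showing how this run method might be invoked from a Prefect 1.x flow. The `DatabricksSubmitMultitaskRun` class name and its import path are assumptions inferred from the signature above, and `build_job_tasks` is a hypothetical helper, since the `JobTaskSettings` fields are not shown in the source:

# Hedged usage sketch (Prefect 1.x flow API). The import path below is an
# assumption; build_job_tasks() is a hypothetical helper returning
# List[JobTaskSettings].
from prefect import Flow
from prefect.tasks.databricks import DatabricksSubmitMultitaskRun  # assumed path

conn = {"host": "abcdef.xyz", "token": "ghijklmn"}  # shape from the docstring
submit = DatabricksSubmitMultitaskRun()

with Flow("databricks-demo") as flow:
    run_id = submit(
        databricks_conn_secret=conn,
        tasks=build_job_tasks(),  # at least one JobTaskSettings entry
        run_name="example run",
        timeout_seconds=3600,
        polling_period_seconds=30,
        databricks_retry_limit=3,
        databricks_retry_delay=1.0,
    )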