Example #1
0
    def execute(self, context: Dict) -> None:
        """
        Start the configured pipeline and, if requested, block until the run terminates.

        The run is submitted through ``AzureDataFactoryHook.run_pipeline``. Its ``run_id`` is stored on the
        operator and pushed to XCom under the ``run_id`` key before any waiting takes place.

        :param context: The task context; ``context["ti"]`` is used to push the XCom value.
        :raises AzureDataFactoryPipelineRunException: If ``wait_for_termination`` is set and the wait for the
            ``SUCCEEDED`` status returns a falsy result.
        """
        self.hook = AzureDataFactoryHook(azure_data_factory_conn_id=self.azure_data_factory_conn_id)
        self.log.info(f"Executing the {self.pipeline_name} pipeline.")

        run_arguments = {
            "pipeline_name": self.pipeline_name,
            "resource_group_name": self.resource_group_name,
            "factory_name": self.factory_name,
            "reference_pipeline_run_id": self.reference_pipeline_run_id,
            "is_recovery": self.is_recovery,
            "start_activity_name": self.start_activity_name,
            "start_from_failure": self.start_from_failure,
            "parameters": self.parameters,
        }
        pipeline_run = self.hook.run_pipeline(**run_arguments)
        self.run_id = vars(pipeline_run)["run_id"]

        # Publish the ``run_id`` to XCom before waiting, so it is available to downstream tasks whatever
        # happens afterwards. This matters most when the wait is performed asynchronously.
        task_instance = context["ti"]
        task_instance.xcom_push(key="run_id", value=self.run_id)

        # Nothing more to do when the caller opted out of a blocking wait.
        if not self.wait_for_termination:
            return

        self.log.info(f"Waiting for pipeline run {self.run_id} to terminate.")
        run_succeeded = self.hook.wait_for_pipeline_run_status(
            run_id=self.run_id,
            expected_statuses=AzureDataFactoryPipelineRunStatus.SUCCEEDED,
            check_interval=self.check_interval,
            timeout=self.timeout,
            resource_group_name=self.resource_group_name,
            factory_name=self.factory_name,
        )
        if not run_succeeded:
            raise AzureDataFactoryPipelineRunException(
                f"Pipeline run {self.run_id} has failed or has been cancelled."
            )

        self.log.info(f"Pipeline run {self.run_id} has completed successfully.")
Example #2
0
    def poke(self, context: "Context") -> bool:
        """
        Fetch the pipeline run's current status once and report whether it has succeeded.

        :param context: The task context (not used by this check).
        :return: ``True`` if the reported status equals ``SUCCEEDED``, ``False`` otherwise.
        :raises AzureDataFactoryPipelineRunException: If the reported status is ``FAILED`` or ``CANCELLED``.
        """
        self.hook = AzureDataFactoryHook(azure_data_factory_conn_id=self.azure_data_factory_conn_id)
        current_status = self.hook.get_pipeline_run_status(
            run_id=self.run_id,
            resource_group_name=self.resource_group_name,
            factory_name=self.factory_name,
        )

        # Failed and cancelled runs are reported as errors; any other status is compared with SUCCEEDED.
        if current_status == AzureDataFactoryPipelineRunStatus.FAILED:
            failure_message = f"Pipeline run {self.run_id} has failed."
        elif current_status == AzureDataFactoryPipelineRunStatus.CANCELLED:
            failure_message = f"Pipeline run {self.run_id} has been cancelled."
        else:
            return current_status == AzureDataFactoryPipelineRunStatus.SUCCEEDED

        raise AzureDataFactoryPipelineRunException(failure_message)
Example #3
0
class AzureDataFactoryPipelineRunStatusSensor(BaseSensorOperator):
    """
    Sensor that checks the status of an Azure Data Factory pipeline run.

    Each poke asks the ``AzureDataFactoryHook`` for the run's status. The poke succeeds once the status is
    ``SUCCEEDED`` and raises ``AzureDataFactoryPipelineRunException`` if it is ``FAILED`` or ``CANCELLED``.

    :param azure_data_factory_conn_id: The connection identifier for connecting to Azure Data Factory.
    :type azure_data_factory_conn_id: str
    :param run_id: The pipeline run identifier.
    :type run_id: str
    :param resource_group_name: The resource group name.
    :type resource_group_name: str
    :param factory_name: The data factory name.
    :type factory_name: str
    """

    template_fields: Sequence[str] = (
        "azure_data_factory_conn_id",
        "resource_group_name",
        "factory_name",
        "run_id",
    )

    ui_color = "#50e6ff"

    def __init__(
        self,
        *,
        run_id: str,
        azure_data_factory_conn_id: str = AzureDataFactoryHook.default_conn_name,
        resource_group_name: Optional[str] = None,
        factory_name: Optional[str] = None,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        # Which run to watch.
        self.run_id = run_id
        # Where to look for it.
        self.azure_data_factory_conn_id = azure_data_factory_conn_id
        self.factory_name = factory_name
        self.resource_group_name = resource_group_name

    def poke(self, context: "Context") -> bool:
        """
        Look up the run's status once.

        :param context: The task context (not used by this check).
        :return: ``True`` if the reported status equals ``SUCCEEDED``, ``False`` otherwise.
        :raises AzureDataFactoryPipelineRunException: If the reported status is ``FAILED`` or ``CANCELLED``.
        """
        self.hook = AzureDataFactoryHook(azure_data_factory_conn_id=self.azure_data_factory_conn_id)
        status_lookup = {
            "run_id": self.run_id,
            "resource_group_name": self.resource_group_name,
            "factory_name": self.factory_name,
        }
        run_status = self.hook.get_pipeline_run_status(**status_lookup)

        # Failed and cancelled runs are reported as errors rather than as "not done yet".
        if run_status == AzureDataFactoryPipelineRunStatus.FAILED:
            error_text = f"Pipeline run {self.run_id} has failed."
        elif run_status == AzureDataFactoryPipelineRunStatus.CANCELLED:
            error_text = f"Pipeline run {self.run_id} has been cancelled."
        else:
            return run_status == AzureDataFactoryPipelineRunStatus.SUCCEEDED

        raise AzureDataFactoryPipelineRunException(error_text)
Example #4
0
class AzureDataFactoryRunPipelineOperator(BaseOperator):
    """
    Executes a data factory pipeline.

    .. seealso::
        For more information on how to use this operator, take a look at the guide:
        :ref:`howto/operator:AzureDataFactoryRunPipelineOperator`

    :param azure_data_factory_conn_id: The connection identifier for connecting to Azure Data Factory.
    :param pipeline_name: The name of the pipeline to execute.
    :param wait_for_termination: Flag to wait on a pipeline run's termination.  By default, this feature is
        enabled but could be disabled to perform an asynchronous wait for a long-running pipeline execution
        using the ``AzureDataFactoryPipelineRunStatusSensor``.
    :param resource_group_name: The resource group name. If a value is not passed in to the operator, the
        ``AzureDataFactoryHook`` will attempt to use the resource group name provided in the corresponding
        connection.
    :param factory_name: The data factory name. If a value is not passed in to the operator, the
        ``AzureDataFactoryHook`` will attempt to use the factory name provided in the corresponding
        connection.
    :param reference_pipeline_run_id: The pipeline run identifier. If this run ID is specified the parameters
        of the specified run will be used to create a new run.
    :param is_recovery: Recovery mode flag. If recovery mode is set to `True`, the specified referenced
        pipeline run and the new run will be grouped under the same ``groupId``.
    :param start_activity_name: In recovery mode, the rerun will start from this activity. If not specified,
        all activities will run.
    :param start_from_failure: In recovery mode, if set to true, the rerun will start from failed activities.
        The property will be used only if ``start_activity_name`` is not specified.
    :param parameters: Parameters of the pipeline run. These parameters are referenced in a pipeline via
        ``@pipeline().parameters.parameterName`` and will be used only if the ``reference_pipeline_run_id`` is
        not specified.
    :param timeout: Time in seconds to wait for a pipeline to reach a terminal status for non-asynchronous
        waits. Used only if ``wait_for_termination`` is True.
    :param check_interval: Time in seconds to check on a pipeline run's status for non-asynchronous waits.
        Used only if ``wait_for_termination`` is True.
    """

    template_fields: Sequence[str] = (
        "azure_data_factory_conn_id",
        "resource_group_name",
        "factory_name",
        "pipeline_name",
        "reference_pipeline_run_id",
        "parameters",
    )
    template_fields_renderers = {"parameters": "json"}

    ui_color = "#0678d4"

    operator_extra_links = (AzureDataFactoryPipelineRunLink(),)

    def __init__(
        self,
        *,
        pipeline_name: str,
        azure_data_factory_conn_id: str = AzureDataFactoryHook.default_conn_name,
        wait_for_termination: bool = True,
        resource_group_name: Optional[str] = None,
        factory_name: Optional[str] = None,
        reference_pipeline_run_id: Optional[str] = None,
        is_recovery: Optional[bool] = None,
        start_activity_name: Optional[str] = None,
        start_from_failure: Optional[bool] = None,
        parameters: Optional[Dict[str, Any]] = None,
        timeout: int = 60 * 60 * 24 * 7,
        check_interval: int = 60,
        **kwargs,
    ) -> None:
        super().__init__(**kwargs)
        self.azure_data_factory_conn_id = azure_data_factory_conn_id
        self.pipeline_name = pipeline_name
        self.wait_for_termination = wait_for_termination
        self.resource_group_name = resource_group_name
        self.factory_name = factory_name
        self.reference_pipeline_run_id = reference_pipeline_run_id
        self.is_recovery = is_recovery
        self.start_activity_name = start_activity_name
        self.start_from_failure = start_from_failure
        self.parameters = parameters
        self.timeout = timeout
        self.check_interval = check_interval
        # Set by ``execute`` once a run has been submitted. Initialised here so that ``on_kill`` can be
        # called safely before (or without) a run having been started.
        self.run_id: Optional[str] = None

    def execute(self, context: "Context") -> None:
        """
        Start the pipeline run and, if ``wait_for_termination`` is set, wait for it to succeed.

        :param context: The task context; ``context["ti"]`` is used to push the ``run_id`` to XCom.
        :raises AzureDataFactoryPipelineRunException: If ``wait_for_termination`` is set and the wait for the
            ``SUCCEEDED`` status returns a falsy result.
        """
        self.hook = AzureDataFactoryHook(azure_data_factory_conn_id=self.azure_data_factory_conn_id)
        self.log.info(f"Executing the {self.pipeline_name} pipeline.")
        response = self.hook.run_pipeline(
            pipeline_name=self.pipeline_name,
            resource_group_name=self.resource_group_name,
            factory_name=self.factory_name,
            reference_pipeline_run_id=self.reference_pipeline_run_id,
            is_recovery=self.is_recovery,
            start_activity_name=self.start_activity_name,
            start_from_failure=self.start_from_failure,
            parameters=self.parameters,
        )
        self.run_id = vars(response)["run_id"]
        # Push the ``run_id`` value to XCom regardless of what happens during execution. This allows for
        # retrieval of the executed pipeline's ``run_id`` by downstream tasks, especially if performing an
        # asynchronous wait.
        context["ti"].xcom_push(key="run_id", value=self.run_id)

        if self.wait_for_termination:
            self.log.info(f"Waiting for pipeline run {self.run_id} to terminate.")

            if self.hook.wait_for_pipeline_run_status(
                run_id=self.run_id,
                expected_statuses=AzureDataFactoryPipelineRunStatus.SUCCEEDED,
                check_interval=self.check_interval,
                timeout=self.timeout,
                resource_group_name=self.resource_group_name,
                factory_name=self.factory_name,
            ):
                self.log.info(f"Pipeline run {self.run_id} has completed successfully.")
            else:
                raise AzureDataFactoryPipelineRunException(
                    f"Pipeline run {self.run_id} has failed or has been cancelled."
                )

    def on_kill(self) -> None:
        """
        Cancel the pipeline run submitted by ``execute``, if there is one, and confirm the cancellation.

        :raises AzureDataFactoryPipelineRunException: If the wait for the ``CANCELLED`` status returns a
            falsy result.
        """
        # No run was submitted (``execute`` has not reached ``run_pipeline`` yet), so there is nothing to
        # cancel. ``self.hook`` is always assigned before ``self.run_id``, so a set ``run_id`` implies the
        # hook exists.
        if not self.run_id:
            return

        self.hook.cancel_pipeline_run(
            run_id=self.run_id,
            resource_group_name=self.resource_group_name,
            factory_name=self.factory_name,
        )

        # Check to ensure the pipeline run was cancelled as expected.
        if self.hook.wait_for_pipeline_run_status(
            run_id=self.run_id,
            expected_statuses=AzureDataFactoryPipelineRunStatus.CANCELLED,
            check_interval=self.check_interval,
            timeout=self.timeout,
            resource_group_name=self.resource_group_name,
            factory_name=self.factory_name,
        ):
            self.log.info(f"Pipeline run {self.run_id} has been cancelled successfully.")
        else:
            raise AzureDataFactoryPipelineRunException(f"Pipeline run {self.run_id} was not cancelled.")