def transform(self, src_operator: BaseOperator,
              parent_fragment: DAGFragment,
              upstream_fragments: List[DAGFragment]) -> DAGFragment:
    op = LivyBatchSensor(batch_id="foo",
                         task_id=src_operator.task_id,
                         azure_conn_id="foo",
                         cluster_name="foo",
                         verify_in="yarn",
                         dag=self.dag)
    return DAGFragment([op])
    def test_transform_sub_dags_match_multi(self):
        """
            tests:
                finding multiple matching sub-dags and transforming them
                converting a sub-dag to another transformed sub-dag (with multiple roots)
                finding a sub-dag which isn't at the root
                returned sub-dag contains tasks which can be transformed
        """
        dag = self._get_subdag_test_dag()

        transformer = AirflowDagTransformer(
            DAG(dag_id='transformed_dag',
                default_args=DEFAULT_DAG_ARGS,
                dagrun_timeout=timedelta(hours=2),
                max_active_runs=1,
                schedule_interval=None),
            subdag_transformers=[TestSubDagTransformer1],
            transformer_resolvers=[
                ClassTransformerResolver(
                    {SparkSubmitOperator: TestTransformer5})
            ])

        src_dag = copy.deepcopy(dag)
        src_dag.dag_id = 'transformed_dag'
        transformer.transform_sub_dags(src_dag)

        exp_dag = self._get_expected_dag_sub_dags_match_multi(
            dag, TestSubDagTransformer1.op1)

        if self.show_graphs:
            rendering.show_multi_dag_graphviz([dag, exp_dag, src_dag])

        TestUtils.assert_dags_equals(self, exp_dag, src_dag)

        # transform operators in the transformed subdags
        transformer.transform_operators(src_dag)
        exp_dag = self._get_expected_dag_sub_dags_match_multi(
            dag,
            LivyBatchSensor(batch_id="foo",
                            task_id="foo",
                            azure_conn_id="foo",
                            cluster_name="foo",
                            verify_in="yarn",
                            dag=src_dag))

        if self.show_graphs:
            rendering.show_multi_dag_graphviz(
                [dag, exp_dag, transformer.target_dag])
        TestUtils.assert_dags_equals(self, exp_dag, transformer.target_dag)
    def transform(self, src_operator: BaseOperator,
                  parent_fragment: DAGFragment,
                  upstream_fragments: List[DAGFragment]) -> DAGFragment:
        TestTransformer2.tp1 = LivyBatchSensor(
            batch_id="foo",
            task_id="t2p1",
            azure_conn_id="foo",
            cluster_name=src_operator.dest_bucket_key,
            verify_in="yarn",
            dag=self.dag)

        TestTransformer2.tp2 = DummyOperator(task_id='t2p2', dag=self.dag)
        TestTransformer2.tp3 = DummyOperator(task_id='t2p3', dag=self.dag)
        TestTransformer2.tp4 = DummyOperator(task_id='t2p4', dag=self.dag)
        TestTransformer2.tp5 = PythonOperator(task_id='t2p5',
                                              python_callable=print,
                                              dag=self.dag)

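        # tp1 fans out to tp2 and tp3, which join at tp4; tp5 has no dependencies,
        # so the returned fragment has two roots (tp1 and tp5)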
        TestTransformer2.tp1 >> [TestTransformer2.tp2, TestTransformer2.tp3
                                 ] >> TestTransformer2.tp4

        return DAGFragment([TestTransformer2.tp1, TestTransformer2.tp5])
def create_dag():
    with DAG(dag_id='HDI_emr_job_flow_manual_steps_dag',
             default_args=DEFAULT_DAG_ARGS,
             dagrun_timeout=timedelta(hours=2),
             max_active_runs=1,
             schedule_interval=None) as dag:
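        # create the two HDInsight clusters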
        create_cluster1_op = ConnectedAzureHDInsightCreateClusterOperator(
            task_id="create_cluster_1",
            azure_conn_id=AZURE_CONN_ID,
            hdi_conn_id=HDI_CONN_ID,
            cluster_name='PiCalc')

        create_cluster2_op = ConnectedAzureHDInsightCreateClusterOperator(
            task_id="create_cluster_2",
            azure_conn_id=AZURE_CONN_ID,
            hdi_conn_id=HDI_CONN_ID,
            cluster_name='PiCalc2')

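        # provisioning-only sensors: block until each cluster has finished provisioning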
        monitor_prov_op_1 = AzureHDInsightClusterSensor(
            create_cluster1_op.cluster_name,
            azure_conn_id=create_cluster1_op.azure_conn_id,
            poke_interval=5,
            provisioning_only=True,
            task_id=f"{create_cluster1_op.task_id}_monitor_provisioning")

        monitor_prov_op_2 = AzureHDInsightClusterSensor(
            create_cluster2_op.cluster_name,
            azure_conn_id=create_cluster2_op.azure_conn_id,
            poke_interval=5,
            provisioning_only=True,
            task_id=f"{create_cluster2_op.task_id}_monitor_provisioning")

        monitor_cluster1_op = AzureHDInsightClusterSensor(
            create_cluster1_op.cluster_name,
            azure_conn_id=create_cluster1_op.azure_conn_id,
            poke_interval=5,
            task_id=f"{create_cluster1_op.task_id}_monitor_cluster")

        monitor_cluster2_op = AzureHDInsightClusterSensor(
            create_cluster2_op.cluster_name,
            azure_conn_id=create_cluster2_op.azure_conn_id,
            poke_interval=5,
            task_id=f"{create_cluster2_op.task_id}_monitor_cluster")

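        # submit the SparkPi example jobs to each cluster as Livy batches (two steps per cluster)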
        livy_submit_cluster1_step1_op = LivyBatchOperator(
            name='calculate_pi',
            file='s3://psm-poc-dmp-temp/spark-examples.jar',
            arguments=['10'],
            class_name='org.apache.spark.examples.SparkPi',
            azure_conn_id=create_cluster1_op.azure_conn_id,
            cluster_name=create_cluster1_op.cluster_name,
            proxy_user='******',
            conf={'spark.shuffle.compress': 'false'},
            task_id="add_steps_cluster_1_0",
            trigger_rule=TriggerRule.ALL_SUCCESS)

        livy_submit_cluster2_step1_op = LivyBatchOperator(
            name='calculate_pi',
            file='s3://psm-poc-dmp-temp/spark-examples.jar',
            arguments=['10'],
            class_name='org.apache.spark.examples.SparkPi',
            azure_conn_id=create_cluster2_op.azure_conn_id,
            cluster_name=create_cluster2_op.cluster_name,
            proxy_user='******',
            conf={'spark.shuffle.compress': 'false'},
            task_id="add_steps_cluster_2_0",
            trigger_rule=TriggerRule.ALL_SUCCESS)

        livy_submit_cluster1_step2_op = LivyBatchOperator(
            name='calculate_pi_2',
            file='s3://psm-poc-dmp-temp/spark-examples.jar',
            arguments=['10'],
            class_name='org.apache.spark.examples.SparkPi2',
            azure_conn_id=create_cluster1_op.azure_conn_id,
            cluster_name=create_cluster1_op.cluster_name,
            proxy_user='******',
            conf={'spark.shuffle.compress': 'false'},
            task_id="add_steps_cluster_1_1",
            trigger_rule=TriggerRule.ALL_SUCCESS)

        livy_submit_cluster2_step2_op = LivyBatchOperator(
            name='calculate_pi_2',
            file='s3://psm-poc-dmp-temp/spark-examples.jar',
            arguments=['10'],
            class_name='org.apache.spark.examples.SparkPi2',
            azure_conn_id=create_cluster2_op.azure_conn_id,
            cluster_name=create_cluster2_op.cluster_name,
            proxy_user='******',
            conf={'spark.shuffle.compress': 'false'},
            task_id="add_steps_cluster_2_1",
            trigger_rule=TriggerRule.ALL_SUCCESS)

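        # watch each submitted Livy batch to completion, pulling its batch id
        # from the corresponding submit task via XCom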
        livy_sensor_cluster1_step1_op = LivyBatchSensor(
            batch_id="{{ task_instance.xcom_pull('add_steps_cluster_1_0', key='return_value') }}",
            task_id='watch_step_cluster1_step0',
            azure_conn_id=create_cluster1_op.azure_conn_id,
            cluster_name=create_cluster1_op.cluster_name,
            verify_in="yarn")

        livy_sensor_cluster1_step2_op = LivyBatchSensor(
            batch_id="{{ task_instance.xcom_pull('add_steps_cluster_1_1', key='return_value') }}",
            task_id='watch_step_cluster1_step1',
            azure_conn_id=create_cluster1_op.azure_conn_id,
            cluster_name=create_cluster1_op.cluster_name,
            verify_in="yarn")

        livy_sensor_cluster2_step1_op = LivyBatchSensor(
            batch_id="{{ task_instance.xcom_pull('add_steps_cluster_2_0', key='return_value') }}",
            task_id='watch_step_cluster2_step0',
            azure_conn_id=create_cluster2_op.azure_conn_id,
            cluster_name=create_cluster2_op.cluster_name,
            verify_in="yarn")

        livy_sensor_cluster2_step2_op = LivyBatchSensor(
            batch_id="{{ task_instance.xcom_pull('add_steps_cluster_2_1', key='return_value') }}",
            task_id='watch_step_cluster2_step1',
            azure_conn_id=create_cluster2_op.azure_conn_id,
            cluster_name=create_cluster2_op.cluster_name,
            verify_in="yarn")

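        # tear both clusters down once the steps have finished, regardless of their outcome (ALL_DONE)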
        terminate_cluster1_op = AzureHDInsightDeleteClusterOperator(
            task_id="remove_cluster_1",
            azure_conn_id=create_cluster1_op.azure_conn_id,
            cluster_name=create_cluster1_op.cluster_name,
            trigger_rule=TriggerRule.ALL_DONE)

        terminate_cluster2_op = AzureHDInsightDeleteClusterOperator(
            task_id="remove_cluster_2",
            azure_conn_id=create_cluster2_op.azure_conn_id,
            cluster_name=create_cluster2_op.cluster_name,
            trigger_rule=TriggerRule.ALL_DONE)

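        # invoked when either cluster monitor fails (ONE_FAILED), so failures surface through a single handler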
        handle_failure_op = PythonOperator(
            task_id='handle_failure',
            python_callable=handle_failure_task,
            trigger_rule=TriggerRule.ONE_FAILED)

        clusters_created_op = DummyOperator(task_id='clusters_created')

        steps_added_op = DummyOperator(task_id='steps_added')
        steps_completed_op = DummyOperator(task_id='steps_completed')
        steps_cluster_1_added_op = DummyOperator(
            task_id='add_steps_cluster_1_added')
        steps_cluster_2_added_op = DummyOperator(
            task_id='add_steps_cluster_2_added')

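        # wiring: once both clusters are provisioned, start the cluster monitors and fan out
        # all Livy submissions; join on the step sensors before tearing the clusters down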
        create_cluster1_op >> monitor_prov_op_1
        create_cluster2_op >> monitor_prov_op_2
        [monitor_prov_op_1, monitor_prov_op_2] >> clusters_created_op
        clusters_created_op >> [monitor_cluster1_op, monitor_cluster2_op
                                ] >> handle_failure_op
        clusters_created_op >> [
            livy_submit_cluster1_step1_op, livy_submit_cluster1_step2_op,
            livy_submit_cluster2_step1_op, livy_submit_cluster2_step2_op
        ]
        [livy_submit_cluster1_step1_op, livy_submit_cluster1_step2_op
         ] >> steps_cluster_1_added_op
        [livy_submit_cluster2_step1_op, livy_submit_cluster2_step2_op
         ] >> steps_cluster_2_added_op
        [steps_cluster_1_added_op, steps_cluster_2_added_op] >> steps_added_op
        steps_added_op >> [
            livy_sensor_cluster1_step1_op, livy_sensor_cluster1_step2_op,
            livy_sensor_cluster2_step1_op, livy_sensor_cluster2_step2_op
        ] >> steps_completed_op
        steps_completed_op >> [terminate_cluster1_op, terminate_cluster2_op]

    return dag
    def transform(self, src_operator: BaseOperator,
                  parent_fragment: DAGFragment,
                  upstream_fragments: List[DAGFragment]) -> DAGFragment:
        """
        This transformer relies on an upstream transformation of an
        :class:`~airflow.contrib.operators.emr_create_job_flow_operator.EmrCreateJobFlowOperator`
        having already taken place, since it needs the output of that transformation
        (which should be an :class:`~airflowhdi.operators.AzureHDInsightCreateClusterOperator`)
        to obtain the `cluster_name` and `azure_conn_id`.

        It also requires that transformations of
        :class:`~airflow.contrib.operators.emr_add_steps_operator.EmrAddStepsOperator` to
        :class:`~airflowhdi.operators.LivyBatchOperator` or :class:`~airflowhdi.operators.AzureHDInsightSshOperator`
        already exist in `upstream_fragments`, so that the tasks emitted by this
        transformer can monitor them. It searches those upstream fragments for the
        transformed operators to find their task IDs.

        Emits a :class:`~airflowhdi.sensors.LivyBatchSensor` if the step was a Livy Spark job.
        No sensor is needed for a transformed :class:`~airflowhdi.operators.AzureHDInsightSshOperator`,
        as it runs synchronously.
        """
        create_op_task_id = TransformerUtils.get_task_id_from_xcom_pull(
            src_operator.job_flow_id)
        create_op: BaseOperator = \
            TransformerUtils.find_op_in_fragment_list(
                upstream_fragments,
                operator_type=ConnectedAzureHDInsightCreateClusterOperator,
                task_id=create_op_task_id)

        if not create_op:
            raise UpstreamOperatorNotFoundException(
                ConnectedAzureHDInsightCreateClusterOperator, EmrStepSensor)

        emr_step_sensor_op: EmrStepSensor = src_operator

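        # derive the task id of the transformed add-steps operator from the EMR step sensor's
        # step_id xcom pull (source task id plus step index), then look it up upstream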
        emr_add_step_task_id = TransformerUtils.get_task_id_from_xcom_pull(
            emr_step_sensor_op.step_id)
        emr_add_step_step_id = TransformerUtils.get_list_index_from_xcom_pull(
            emr_step_sensor_op.step_id)
        target_step_task_id = EmrAddStepsOperatorTransformer.get_target_step_task_id(
            emr_add_step_task_id, emr_add_step_step_id)

        add_step_op: BaseOperator = \
            TransformerUtils.find_op_in_fragment_list_strict(
                upstream_fragments,
                task_id=target_step_task_id)

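        # Livy batch jobs are asynchronous, so watch the transformed step with a LivyBatchSensor
        # polling the batch id the submit task returned via XCom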
        if isinstance(add_step_op, LivyBatchOperator):
            step_sensor_op = LivyBatchSensor(
                batch_id=
                f"{{{{ task_instance.xcom_pull('{target_step_task_id}', key='return_value') }}}}",
                task_id=emr_step_sensor_op.task_id,
                azure_conn_id=create_op.azure_conn_id,
                cluster_name=create_op.cluster_name,
                verify_in="yarn",
                dag=self.dag)
        else:
            # don't need a sensor for the ssh operator
            step_sensor_op = DummyOperator(task_id=emr_step_sensor_op.task_id,
                                           dag=self.dag)

        self.copy_op_attrs(step_sensor_op, emr_step_sensor_op)
        self.sign_op(step_sensor_op)

        return DAGFragment([step_sensor_op])
    livy_submit = LivyBatchOperator(
        task_id='livy_submit',
        num_executors=1,
        azure_conn_id=AZURE_CONN_ID,
        cluster_name=cluster_name,
        conf={
            'spark.shuffle.compress': 'false',
        },
        class_name='org.apache.spark.examples.SparkPi',
        proxy_user='******',
        trigger_rule=TriggerRule.ALL_SUCCESS,
        execution_timeout=timedelta(minutes=10))

    livy_sensor = LivyBatchSensor(
        batch_id=
        "{{ task_instance.xcom_pull('livy_submit', key='return_value') }}",
        task_id='livy_sensor',
        azure_conn_id=AZURE_CONN_ID,
        cluster_name=cluster_name,
        verify_in="yarn",
        poke_interval=20,
        timeout=600,
    )

    terminate_cluster_op = AzureHDInsightDeleteClusterOperator(
        task_id="delete_cluster",
        azure_conn_id=AZURE_CONN_ID,
        cluster_name=cluster_name,
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster_op >> monitor_cluster_provisioning_op >> monitor_cluster_op >> handle_failure_op
    monitor_cluster_provisioning_op >> livy_submit >> livy_sensor >> terminate_cluster_op

if __name__ == '__main__':