Example #1
0
 # NOTE(review): stray, mis-indented duplicate of TestDataFlowJavaOperator.setUp
 # (see the full class later in this file); no enclosing class is visible here
 # and the stray closing braces below it are paste residue — confirm and remove.
 def setUp(self):
     # Instantiate the operator under test from module-level test constants
     # (TASK_ID, JAR_FILE, JOB_NAME, ... presumably defined elsewhere in the
     # original test module — TODO confirm).
     self.dataflow = DataFlowJavaOperator(
         task_id=TASK_ID,
         jar=JAR_FILE,
         job_name=JOB_NAME,
         job_class=JOB_CLASS,
         dataflow_default_options=DEFAULT_OPTIONS_JAVA,
         options=ADDITIONAL_OPTIONS,
         poll_sleep=POLL_SLEEP)
    }
}

# Example DAG showing how to launch Dataflow jobs from Airflow.
# NOTE(review): this fragment is truncated below; default_args, GCS_JAR and
# GCS_OUTPUT are presumably defined in the elided part of the example file —
# TODO confirm against the full source.
with models.DAG(
    "example_gcp_dataflow",
    default_args=default_args,
    schedule_interval=None,  # Override to match your needs
) as dag:

    # [START howto_operator_start_java_job]
    # Launch the Beam WordCount jar on Dataflow. The templated job_name
    # embeds macros.uuid.uuid4() so repeated runs get distinct job names.
    start_java_job = DataFlowJavaOperator(
        task_id="start-java-job",
        jar=GCS_JAR,
        job_name='{{task.task_id}}22222255sss{{ macros.uuid.uuid4() }}',
        options={
            'output': GCS_OUTPUT,
        },
        poll_sleep=10,  # seconds between job-status polls
        job_class='org.apache.beam.examples.WordCount',
        # If a job with this name is already running, wait for it to finish
        # before starting a new one.
        check_if_running=CheckJobRunning.WaitForRun,
    )
    # [END howto_operator_start_java_job]

    # [START howto_operator_start_python_job]
    start_python_job = DataFlowPythonOperator(
        task_id="start-python-job",
        py_file='apache_beam.examples.wordcount',
        py_options=['-m'],
        job_name='{{task.task_id}}',
        options={
            'output': GCS_OUTPUT,
Example #3
0
class TestDataFlowJavaOperator(unittest.TestCase):
    """Unit tests for DataFlowJavaOperator (jar-based Dataflow jobs).

    The DataFlowHook and the GCS download helper are mocked in every test,
    so no Google Cloud calls are made.
    """

    def setUp(self):
        # One operator instance shared by all tests; individual tests tweak
        # attributes (check_if_running, multiple_jobs) before calling execute().
        self.dataflow = DataFlowJavaOperator(
            task_id=TASK_ID,
            jar=JAR_FILE,
            job_name=JOB_NAME,
            job_class=JOB_CLASS,
            dataflow_default_options=DEFAULT_OPTIONS_JAVA,
            options=ADDITIONAL_OPTIONS,
            poll_sleep=POLL_SLEEP)

    def test_init(self):
        """Test DataFlowJavaOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_JAVA)
        self.assertEqual(self.dataflow.job_class, JOB_CLASS)
        self.assertEqual(self.dataflow.jar, JAR_FILE)
        self.assertEqual(self.dataflow.options, EXPECTED_ADDITIONAL_OPTIONS)
        # WaitForRun is the operator default — setUp does not pass
        # check_if_running explicitly.
        self.assertEqual(self.dataflow.check_if_running,
                         CheckJobRunning.WaitForRun)

    @mock.patch('airflow.gcp.operators.dataflow.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_java_dataflow.
        """
        start_java_hook = dataflow_mock.return_value.start_java_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        # IgnoreJob: execute() must start the job without checking for an
        # already-running one.
        self.dataflow.check_if_running = CheckJobRunning.IgnoreJob
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        gcs_download_hook.assert_called_once_with(JAR_FILE)
        start_java_hook.assert_called_once_with(job_name=JOB_NAME,
                                                variables=mock.ANY,
                                                jar=mock.ANY,
                                                job_class=JOB_CLASS,
                                                append_job_name=True,
                                                multiple_jobs=None)

    @mock.patch('airflow.gcp.operators.dataflow.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_check_job_running_exec(self, gcs_hook, dataflow_mock):
        """Test that when check_if_running is set and a matching job is
        already running, no jar is downloaded and no new job is started.
        """
        dataflow_running = dataflow_mock.return_value.is_job_dataflow_running
        dataflow_running.return_value = True
        start_java_hook = dataflow_mock.return_value.start_java_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        self.dataflow.check_if_running = True
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        gcs_download_hook.assert_not_called()
        start_java_hook.assert_not_called()
        dataflow_running.assert_called_once_with(name=JOB_NAME,
                                                 variables=mock.ANY)

    @mock.patch('airflow.gcp.operators.dataflow.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_check_job_not_running_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_java_dataflow with option to check if job is running.
        """
        dataflow_running = dataflow_mock.return_value.is_job_dataflow_running
        dataflow_running.return_value = False
        start_java_hook = dataflow_mock.return_value.start_java_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        self.dataflow.check_if_running = True
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        gcs_download_hook.assert_called_once_with(JAR_FILE)
        start_java_hook.assert_called_once_with(job_name=JOB_NAME,
                                                variables=mock.ANY,
                                                jar=mock.ANY,
                                                job_class=JOB_CLASS,
                                                append_job_name=True,
                                                multiple_jobs=None)
        dataflow_running.assert_called_once_with(name=JOB_NAME,
                                                 variables=mock.ANY)

    @mock.patch('airflow.gcp.operators.dataflow.DataFlowHook')
    @mock.patch(GCS_HOOK_STRING.format('GoogleCloudBucketHelper'))
    def test_check_multiple_job_exec(self, gcs_hook, dataflow_mock):
        """Test DataFlowHook is created and the right args are passed to
        start_java_dataflow with option to check multiple jobs.
        """
        dataflow_running = dataflow_mock.return_value.is_job_dataflow_running
        dataflow_running.return_value = False
        start_java_hook = dataflow_mock.return_value.start_java_dataflow
        gcs_download_hook = gcs_hook.return_value.google_cloud_to_local
        self.dataflow.multiple_jobs = True
        self.dataflow.check_if_running = True
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        gcs_download_hook.assert_called_once_with(JAR_FILE)
        start_java_hook.assert_called_once_with(job_name=JOB_NAME,
                                                variables=mock.ANY,
                                                jar=mock.ANY,
                                                job_class=JOB_CLASS,
                                                append_job_name=True,
                                                multiple_jobs=True)
        dataflow_running.assert_called_once_with(name=JOB_NAME,
                                                 variables=mock.ANY)