Example #1
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP)
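The module-level constants used in these test snippets are not shown on this page. The dictionary below for DEFAULT_OPTIONS_TEMPLATE follows from the expected_options assertions in Examples #3 and #4; the remaining values are illustrative placeholders only, not the actual test fixtures.

TASK_ID = 'test-dataflow-operator'  # placeholder
TEMPLATE = 'gs://dataflow-templates/latest/Word_Count'  # placeholder
JOB_NAME = 'test-dataflow-job'  # placeholder
PARAMETERS = {
    'inputFile': 'gs://dataflow-samples/shakespeare/kinglear.txt',  # placeholder
    'output': 'gs://test/output',  # placeholder
}
DEFAULT_OPTIONS_TEMPLATE = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
}
POLL_SLEEP = 30  # placeholder, seconds between job-status polls
TEST_LOCATION = 'custom-location'  # placeholder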
Example #2
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            options=DEFAULT_OPTIONS_TEMPLATE,
            dataflow_default_options={"EXTRA_OPTION": "TEST_A"},
            poll_sleep=POLL_SLEEP,
            location=TEST_LOCATION,
            environment={"maxWorkers": 2},
        )
Example #3
class TestDataflowTemplateOperator(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            dataflow_default_options=DEFAULT_OPTIONS_TEMPLATE,
            poll_sleep=POLL_SLEEP,
            location=TEST_LOCATION)

    def test_init(self):
        """Test DataflowTemplateOperator instance is properly initialized."""
        self.assertEqual(self.dataflow.task_id, TASK_ID)
        self.assertEqual(self.dataflow.job_name, JOB_NAME)
        self.assertEqual(self.dataflow.template, TEMPLATE)
        self.assertEqual(self.dataflow.parameters, PARAMETERS)
        self.assertEqual(self.dataflow.poll_sleep, POLL_SLEEP)
        self.assertEqual(self.dataflow.dataflow_default_options,
                         DEFAULT_OPTIONS_TEMPLATE)

    @mock.patch(
        'airflow.providers.google.cloud.operators.dataflow.DataflowHook')
    def test_exec(self, dataflow_mock):
        """Test DataflowHook is created and the right args are passed to
        start_template_workflow.

        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f'
        }
        start_template_hook.assert_called_once_with(
            job_name=JOB_NAME,
            variables=expected_options,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE,
            on_new_job_id_callback=mock.ANY,
            project_id=None,
            location=TEST_LOCATION)
Example #4
class TestDataflowTemplateOperator(unittest.TestCase):
    def setUp(self):
        self.dataflow = DataflowTemplatedJobStartOperator(
            task_id=TASK_ID,
            template=TEMPLATE,
            job_name=JOB_NAME,
            parameters=PARAMETERS,
            options=DEFAULT_OPTIONS_TEMPLATE,
            dataflow_default_options={"EXTRA_OPTION": "TEST_A"},
            poll_sleep=POLL_SLEEP,
            location=TEST_LOCATION,
            environment={"maxWorkers": 2},
        )

    @mock.patch(
        'airflow.providers.google.cloud.operators.dataflow.DataflowHook')
    def test_exec(self, dataflow_mock):
        """Test DataflowHook is created and the right args are passed to
        start_template_workflow.

        """
        start_template_hook = dataflow_mock.return_value.start_template_dataflow
        self.dataflow.execute(None)
        self.assertTrue(dataflow_mock.called)
        expected_options = {
            'project': 'test',
            'stagingLocation': 'gs://test/staging',
            'tempLocation': 'gs://test/temp',
            'zone': 'us-central1-f',
            'EXTRA_OPTION': "TEST_A",
        }
        start_template_hook.assert_called_once_with(
            job_name=JOB_NAME,
            variables=expected_options,
            parameters=PARAMETERS,
            dataflow_template=TEMPLATE,
            on_new_job_id_callback=mock.ANY,
            project_id=None,
            location=TEST_LOCATION,
            environment={'maxWorkers': 2},
        )
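The expected_options assertion above illustrates how the operator folds dataflow_default_options into the explicitly passed options before calling the hook. A minimal sketch of that merge using plain dicts (a hypothetical illustration, not the operator's actual source; it assumes explicit options take precedence, which this overlap-free test does not exercise):

dataflow_default_options = {"EXTRA_OPTION": "TEST_A"}
options = {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
}
# Defaults first, explicit options last so they would win on key collisions.
merged = {**dataflow_default_options, **options}
assert merged == {
    'project': 'test',
    'stagingLocation': 'gs://test/staging',
    'tempLocation': 'gs://test/temp',
    'zone': 'us-central1-f',
    'EXTRA_OPTION': 'TEST_A',
}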
Example #5
        job_id="{{task_instance.xcom_pull('start-python-job-async')['job_id']}}",
        location='europe-west3',
        callback=check_autoscaling_event,
    )
    # [END howto_sensor_wait_for_job_autoscaling_event]

    start_python_job_async >> wait_for_python_job_async_done
    start_python_job_async >> wait_for_python_job_async_metric
    start_python_job_async >> wait_for_python_job_async_message
    start_python_job_async >> wait_for_python_job_async_autoscaling_event

with models.DAG(
        "example_gcp_dataflow_template",
        default_args=default_args,
        start_date=days_ago(1),
        schedule_interval=None,  # Override to match your needs
        tags=['example'],
) as dag_template:
    # [START howto_operator_start_template_job]
    start_template_job = DataflowTemplatedJobStartOperator(
        task_id="start-template-job",
        template='gs://dataflow-templates/latest/Word_Count',
        parameters={
            'inputFile': "gs://dataflow-samples/shakespeare/kinglear.txt",
            'output': GCS_OUTPUT
        },
        location='europe-west3',
    )
    # [END howto_operator_start_template_job]
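Example #5 is an excerpt from a larger DAG file, so its imports and constants are cut off. A hedged sketch of the header such a file would need (GCS_OUTPUT and the temp/staging paths are placeholders; the import path for DataflowTemplatedJobStartOperator matches the module patched in the tests above):

from airflow import models
from airflow.providers.google.cloud.operators.dataflow import (
    DataflowTemplatedJobStartOperator,
)
from airflow.utils.dates import days_ago

GCS_OUTPUT = 'gs://my-bucket/dataflow/wordcount/output'  # placeholder
default_args = {
    'dataflow_default_options': {
        'tempLocation': 'gs://my-bucket/dataflow/tmp',  # placeholder
        'stagingLocation': 'gs://my-bucket/dataflow/staging',  # placeholder
    }
}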
Example #6
with models.DAG(
        # The id you will see in the DAG airflow page
        "composer_dataflow_dag",
        default_args=default_args,
        # The interval with which to schedule the DAG
        schedule_interval=datetime.timedelta(days=1),  # Override to match your needs
) as dag:

    start_template_job = DataflowTemplatedJobStartOperator(
        # The task id of your job
        task_id="dataflow_operator_transform_csv_to_bq",
        # The name of the template that you're using.
        # Below is a list of all the templates you can use.
        # For versions in non-production environments, use the subfolder 'latest'
        # https://cloud.google.com/dataflow/docs/guides/templates/provided-batch#gcstexttobigquery
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        # Use the link above to specify the correct parameters for your template.
        parameters={
            "javascriptTextTransformFunctionName": "transformCSVtoJSON",
            "JSONPath": bucket_path + "/jsonSchema.json",
            "javascriptTextTransformGcsPath":
            bucket_path + "/transformCSVtoJSON.js",
            "inputFilePattern": bucket_path + "/inputFile.txt",
            "outputTable": project_id + ":average_weather.average_weather",
            "bigQueryLoadingTemporaryDirectory": bucket_path + "/tmp/",
        },
    )

# [END composer_dataflow_dag]
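The Cloud Composer DAG above references datetime, bucket_path, project_id, and default_args that are defined earlier in the tutorial file. A hedged reconstruction of those definitions, assuming the values come from Airflow Variables (names and paths are placeholders):

import datetime

from airflow.utils.dates import days_ago

bucket_path = "{{var.value.bucket_path}}"  # assumed Airflow Variable
project_id = "{{var.value.project_id}}"  # assumed Airflow Variable

default_args = {
    "start_date": days_ago(1),
    # Default Dataflow options applied to every Dataflow task in the DAG.
    "dataflow_default_options": {
        "project": project_id,
        "tempLocation": bucket_path + "/tmp/",
    },
}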
                default_args=default_args,
                schedule_interval=None) as dag:
    start = dummy.DummyOperator(task_id='start', trigger_rule='all_success')

    end = dummy.DummyOperator(task_id='end', trigger_rule='all_success')

    # BigQuery tables are created automatically for demo purposes.
    # Consider a dedicated pipeline or tool for a real-life scenario.
    customers_import = DataflowTemplatedJobStartOperator(
        task_id="dataflow_customers_import",
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        project_id=LOD_PRJ,
        location=DF_REGION,
        parameters={
            "javascriptTextTransformFunctionName": "transform",
            "JSONPath": ORC_GCS + "/customers_schema.json",
            "javascriptTextTransformGcsPath": ORC_GCS + "/customers_udf.js",
            "inputFilePattern": DRP_GCS + "/customers.csv",
            "outputTable":
            DWH_LAND_PRJ + ":" + DWH_LAND_BQ_DATASET + ".customers",
            "bigQueryLoadingTemporaryDirectory": LOD_GCS_STAGING + "/tmp/bq/",
        },
    )

    purchases_import = DataflowTemplatedJobStartOperator(
        task_id="dataflow_purchases_import",
        template="gs://dataflow-templates/latest/GCS_Text_to_BigQuery",
        project_id=LOD_PRJ,
        location=DF_REGION,
        parameters={
            "javascriptTextTransformFunctionName": "transform",