Example no. 1
0
    def test_hook_correct_region():
        """Verify that the operator passes the configured GCP region to submit."""
        with patch(HOOK) as mock_hook:
            task = DataProcPigOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                region=GCP_REGION,
            )
            task.execute(None)
        # The region must be forwarded as the third positional argument.
        submit_mock = mock_hook.return_value.submit
        submit_mock.assert_called_once_with(mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
Example no. 2
0
    def test_dataproc_job_id_is_set():
        """Check that executing the operator records a Dataproc job id."""
        with patch(HOOK) as hook_mock:
            task = DataProcPigOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                region=GCP_REGION,
            )
            # Shared helper asserts the job id bookkeeping for all operators.
            _assert_dataproc_job_id(hook_mock, task)
Example no. 3
0
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        """Ensure the operator builds the expected Dataproc Pig job payload."""
        # Expected payload: the shared template with the Pig-specific fields set.
        expected = deepcopy(DATAPROC_JOB_TO_SUBMIT)
        expected['job']['pigJob'] = {'queryFileUri': None}
        reference = expected['job']['reference']
        reference['projectId'] = None
        reference['jobId'] = "{}_test".format(DATAPROC_JOB_ID)

        # Build and run the operator so it prepares its job template.
        operator = DataProcPigOperator(
            task_id=TASK_ID,
            region=GCP_REGION,
            cluster_name=CLUSTER_NAME,
            job_name=DATAPROC_JOB_ID,
            labels=LABELS,
        )
        operator.execute(context=None)

        self.assertDictEqual(expected, operator.job_template.job)
    DataprocClusterDeleteOperator)

# Default arguments applied to every task in the DAG below.
default_args = {"start_date": airflow.utils.dates.days_ago(1)}

# Connection/config values read from the environment, with local fallbacks.
CLUSTER_NAME = os.environ.get('GCP_DATAPROC_CLUSTER_NAME', 'example-project')
PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'an-id')
REGION = os.environ.get('GCP_LOCATION', 'europe-west1')

# Example DAG: create a Dataproc cluster, run a Pig query on it, then tear
# the cluster down. schedule_interval=None means it only runs when triggered.
with models.DAG(
        "example_gcp_dataproc_pig_operator",
        default_args=default_args,
        schedule_interval=None,
) as dag:
    # Provision the Dataproc cluster the Pig job will run on.
    create_task = DataprocClusterCreateOperator(task_id="create_task",
                                                cluster_name=CLUSTER_NAME,
                                                project_id=PROJECT_ID,
                                                region=REGION,
                                                num_workers=2)

    # Submit a Pig query to the cluster created above.
    pig_task = DataProcPigOperator(task_id="pig_task",
                                   query="define sin HiveUDF('sin');",
                                   region=REGION,
                                   cluster_name=CLUSTER_NAME)

    # Tear the cluster down once the query has finished.
    delete_task = DataprocClusterDeleteOperator(task_id="delete_task",
                                                project_id=PROJECT_ID,
                                                cluster_name=CLUSTER_NAME,
                                                region=REGION)

    # Linear ordering: create -> run Pig -> delete.
    create_task >> pig_task >> delete_task