Example #1
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHadoopOperator(task_id=TASK_ID,
                                                   region=GCP_REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
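These test snippets rely on module-level imports and constants defined elsewhere in the test file. A minimal sketch of that scaffolding, assuming placeholder values for TASK_ID and the region constants (the HOOK patch target matches the literal string used in Example #2 below; later examples use REGION instead of GCP_REGION for the same purpose):

    from unittest import mock
    from unittest.mock import patch

    from airflow.contrib.operators.dataproc_operator import DataProcHadoopOperator

    # Patch target for the hook the operator creates (see Example #2).
    HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'

    # Placeholder values; the real test module defines its own.
    TASK_ID = 'test-dataproc-hadoop-task'
    GCP_REGION = REGION = 'us-central1'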
Example #2
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook'
                   ) as mock_hook:
            dataproc_task = DataProcHadoopOperator(task_id=TASK_ID,
                                                   region=REGION)

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(
                mock.ANY, mock.ANY, REGION)
Example #3
    def test_hook_correct_region(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            dataproc_task = DataProcHadoopOperator(
                task_id=TASK_ID,
                region=REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)
Example #4
    def test_hook_correct_region():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHadoopOperator(
                task_id=TASK_ID,
                region=GCP_REGION
            )

            dataproc_task.execute(None)
            mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY,
                                                                  GCP_REGION, mock.ANY)
Example #5
    def test_dataproc_job_id_is_set():
        with patch(HOOK) as mock_hook:
            dataproc_task = DataProcHadoopOperator(
                task_id=TASK_ID
            )

            _assert_dataproc_job_id(mock_hook, dataproc_task)
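The `_assert_dataproc_job_id` helper used here is not shown. A plausible sketch, assuming it simply executes the task against the patched hook and then checks that the operator recorded a Dataproc job id:

    def _assert_dataproc_job_id(mock_hook, dataproc_task):
        # Hypothetical helper: run the operator with the mocked hook and
        # verify that execute() populated dataproc_job_id on the task.
        dataproc_task.execute(None)
        assert dataproc_task.dataproc_job_id is not None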
Example #6
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        # Expected job
        job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
        job_definition['job']['hadoopJob'] = {'mainClass': None}
        job_definition['job']['reference']['projectId'] = None
        job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"

        # Prepare job using operator
        task = DataProcHadoopOperator(task_id=TASK_ID,
                                      region=GCP_REGION,
                                      cluster_name=CLUSTER_NAME,
                                      job_name=DATAPROC_JOB_ID,
                                      labels=LABELS)

        task.execute(context=None)
        self.assertDictEqual(job_definition, task.job_template.job)
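The `mock_hook` and `mock_uuid` arguments imply two stacked `mock.patch` decorators on this method, and `DATAPROC_JOB_TO_SUBMIT` is a job-body template defined at module level. A rough sketch of that setup, assuming a typical Dataproc job body and a uuid patched to return "test" (matching the `_test` suffix asserted above); all values are placeholders:

    from unittest import mock

    DATAPROC_JOB_ID = 'example-job'
    CLUSTER_NAME = 'example-cluster'
    GCP_REGION = 'us-central1'
    LABELS = {'example-label': 'example-value'}

    # Assumed shape of the job body the operator builds; the test above
    # overwrites projectId, jobId and hadoopJob before comparing.
    DATAPROC_JOB_TO_SUBMIT = {
        'job': {
            'reference': {'projectId': 'example-project', 'jobId': DATAPROC_JOB_ID},
            'placement': {'clusterName': CLUSTER_NAME},
            'labels': LABELS,
        }
    }

    # Inside the test class: the bottom decorator maps to the first mock argument.
    @mock.patch('uuid.uuid4', return_value='test')
    @mock.patch('airflow.contrib.operators.dataproc_operator.DataProcHook')
    def test_correct_job_definition(self, mock_hook, mock_uuid):
        ...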
Example #7
    'retry_delay': timedelta(minutes=5),
    'project_id': Variable.get('gcp_project')
}

with DAG('composer-quickstart-geh', schedule_interval=timedelta(days=1),
         default_args=DEFAULT_DAG_ARGS) as dag:

    # Create a Cloud Dataproc cluster.
    CREATE_DATAPROC_CLUSTER = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=Variable.get('gce_zone')
    )

    # Run the Hadoop wordcount example installed on the Cloud Dataproc
    # cluster master node.
    RUN_DATAPROC_HADOOP = DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=WORDCOUNT_ARGS
    )

    # Delete the Cloud Dataproc cluster.
    DELETE_DATAPROC_CLUSTER = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}'
    )
    # Define DAG dependencies (sequencing).
    CREATE_DATAPROC_CLUSTER >> RUN_DATAPROC_HADOOP >> DELETE_DATAPROC_CLUSTER
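This quickstart DAG is shown without its preamble: the head of the DEFAULT_DAG_ARGS dict, the imports, and the wordcount constants are omitted. A hedged sketch of that missing preamble, with an illustrative output bucket (the jar path and input file match the values visible in Example #8):

    from datetime import timedelta

    from airflow import DAG
    from airflow.models import Variable
    from airflow.contrib.operators.dataproc_operator import (
        DataprocClusterCreateOperator,
        DataProcHadoopOperator,
        DataprocClusterDeleteOperator,
    )

    # Wordcount example jar shipped on Dataproc nodes; output path is illustrative.
    WORDCOUNT_JAR = 'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'
    WORDCOUNT_ARGS = ['wordcount',
                      'gs://pub/shakespeare/rose.txt',
                      'gs://<your-output-bucket>/wordcount/{{ ds_nodash }}']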
Example #8
        main=PYSPARK_URI,
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hive_task = DataProcHiveOperator(
        task_id="hive_task",
        query="SHOW DATABASES;",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    hadoop_task = DataProcHadoopOperator(
        task_id="hadoop_task",
        main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
        arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_cluster",
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        region=REGION,
    )

    create_cluster >> scale_cluster
    scale_cluster >> hive_task >> delete_cluster
    scale_cluster >> pig_task >> delete_cluster
    scale_cluster >> spark_sql_task >> delete_cluster
    scale_cluster >> hadoop_task >> delete_cluster
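Example #8 is also missing its top: PROJECT_ID, REGION, CLUSTER_NAME, OUTPUT_PATH and PYSPARK_URI, as well as the create_cluster, scale_cluster, pig_task and spark_sql_task tasks referenced in the dependency chain, are defined earlier in the file. Illustrative placeholder values for the constants:

    import os

    PROJECT_ID = os.environ.get('GCP_PROJECT_ID', 'example-project')
    REGION = 'europe-west1'
    CLUSTER_NAME = 'example-cluster'
    BUCKET = 'example-bucket'
    OUTPUT_PATH = 'gs://{}/wordcount-output/'.format(BUCKET)
    PYSPARK_URI = 'gs://{}/scripts/example_pyspark.py'.format(BUCKET)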