def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcHadoopOperator(task_id=TASK_ID, region=GCP_REGION)

        dataproc_task.execute(None)
        # The configured region must be forwarded verbatim to the hook's submit() call.
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcHadoopOperator(task_id=TASK_ID, region=REGION)

        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, REGION)
def test_hook_correct_region(self):
    with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
        dataproc_task = DataProcHadoopOperator(
            task_id=TASK_ID,
            region=REGION
        )
        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(mock.ANY, mock.ANY, REGION)
def test_hook_correct_region():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcHadoopOperator(
            task_id=TASK_ID,
            region=GCP_REGION
        )
        dataproc_task.execute(None)
        mock_hook.return_value.submit.assert_called_once_with(
            mock.ANY, mock.ANY, GCP_REGION, mock.ANY)
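# The four test variants above assume module-level imports and constants that
# the snippets do not show. A minimal sketch of those fixtures; the concrete
# values of TASK_ID and the region constants are assumptions:
from unittest import mock
from unittest.mock import patch

from airflow.contrib.operators.dataproc_operator import DataProcHadoopOperator

HOOK = 'airflow.contrib.operators.dataproc_operator.DataProcHook'  # patch target
TASK_ID = 'test-dataproc-hadoop-task'  # hypothetical task id
REGION = GCP_REGION = 'us-central1'    # hypothetical Dataproc region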
def test_dataproc_job_id_is_set():
    with patch(HOOK) as mock_hook:
        dataproc_task = DataProcHadoopOperator(
            task_id=TASK_ID
        )
        _assert_dataproc_job_id(mock_hook, dataproc_task)
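# _assert_dataproc_job_id is a shared helper not shown in the snippet. A
# plausible sketch of what it checks, assuming the operator builds its job
# through the hook's job template and then records the submitted job id on
# itself; this body and TEST_JOB_ID are assumptions, not the original helper:
from unittest import mock

TEST_JOB_ID = 'test-job-id'  # hypothetical

def _assert_dataproc_job_id(mock_hook, dataproc_task):
    job_template = mock.MagicMock()
    job_template.build.return_value = {'job': {'reference': {'jobId': TEST_JOB_ID}}}
    mock_hook.return_value.create_job_template.return_value = job_template

    dataproc_task.execute(None)
    # The id of the submitted job should be propagated to the operator.
    assert dataproc_task.dataproc_job_id == TEST_JOB_ID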
# mock_hook and mock_uuid are injected by @mock.patch decorators elided above.
def test_correct_job_definition(self, mock_hook, mock_uuid):
    # Expected job
    job_definition = deepcopy(DATAPROC_JOB_TO_SUBMIT)
    job_definition['job']['hadoopJob'] = {'mainClass': None}
    job_definition['job']['reference']['projectId'] = None
    job_definition['job']['reference']['jobId'] = DATAPROC_JOB_ID + "_test"
    # Prepare job using operator
    task = DataProcHadoopOperator(
        task_id=TASK_ID,
        region=GCP_REGION,
        cluster_name=CLUSTER_NAME,
        job_name=DATAPROC_JOB_ID,
        labels=LABELS
    )
    task.execute(context=None)
    self.assertDictEqual(job_definition, task.job_template.job)
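# test_correct_job_definition deep-copies a shared DATAPROC_JOB_TO_SUBMIT
# fixture. A minimal sketch of a shape consistent with the keys the test
# overrides; the concrete values here are assumptions:
DATAPROC_JOB_ID = 'dataproc-job'
CLUSTER_NAME = 'cluster-1'
LABELS = {'airflow-version': 'v1-10-x'}
DATAPROC_JOB_TO_SUBMIT = {
    'job': {
        'reference': {'projectId': 'project-id', 'jobId': DATAPROC_JOB_ID},
        'placement': {'clusterName': CLUSTER_NAME},
        'labels': LABELS,
    }
}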
DEFAULT_DAG_ARGS = {
    # ... other default args elided in the excerpt ...
    'retry_delay': timedelta(minutes=5),
    'project_id': Variable.get('gcp_project')
}

with DAG('composer-quickstart-geh',
         schedule_interval=timedelta(days=1),
         default_args=DEFAULT_DAG_ARGS) as dag:
    # Create a Cloud Dataproc cluster.
    CREATE_DATAPROC_CLUSTER = DataprocClusterCreateOperator(
        task_id='create_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        num_workers=2,
        zone=Variable.get('gce_zone')
    )

    # Run the Hadoop wordcount example installed on the Cloud Dataproc
    # cluster master node.
    RUN_DATAPROC_HADOOP = DataProcHadoopOperator(
        task_id='run_dataproc_hadoop',
        main_jar=WORDCOUNT_JAR,
        cluster_name='quickstart-cluster-{{ ds_nodash }}',
        arguments=WORDCOUNT_ARGS
    )

    # Delete the Cloud Dataproc cluster.
    DELETE_DATAPROC_CLUSTER = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        cluster_name='quickstart-cluster-{{ ds_nodash }}'
    )

    # Define DAG dependencies (sequencing).
    CREATE_DATAPROC_CLUSTER >> RUN_DATAPROC_HADOOP >> DELETE_DATAPROC_CLUSTER
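# The quickstart DAG above relies on imports and wordcount constants defined
# earlier in its file. A sketch of those definitions; the wordcount jar and
# input file match the example below, but the output path is an assumption:
from datetime import timedelta

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator, DataProcHadoopOperator,
    DataprocClusterDeleteOperator)
from airflow.models import Variable

WORDCOUNT_JAR = 'file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar'
WORDCOUNT_ARGS = ['wordcount',
                  'gs://pub/shakespeare/rose.txt',
                  'gs://{{ var.value.gcs_bucket }}/wordcount/{{ ds_nodash }}']  # output path is an assumption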
pyspark_task = DataProcPySparkOperator(
    task_id="pyspark_task",
    main=PYSPARK_URI,
    region=REGION,
    cluster_name=CLUSTER_NAME,
)
hive_task = DataProcHiveOperator(
    task_id="hive_task",
    query="SHOW DATABASES;",
    region=REGION,
    cluster_name=CLUSTER_NAME,
)
hadoop_task = DataProcHadoopOperator(
    task_id="hadoop_task",
    main_jar="file:///usr/lib/hadoop-mapreduce/hadoop-mapreduce-examples.jar",
    arguments=["wordcount", "gs://pub/shakespeare/rose.txt", OUTPUT_PATH],
    region=REGION,
    cluster_name=CLUSTER_NAME,
)
delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    project_id=PROJECT_ID,
    cluster_name=CLUSTER_NAME,
    region=REGION,
)

create_cluster >> scale_cluster
scale_cluster >> hive_task >> delete_cluster
scale_cluster >> pig_task >> delete_cluster
scale_cluster >> spark_sql_task >> delete_cluster
scale_cluster >> pyspark_task >> delete_cluster
scale_cluster >> hadoop_task >> delete_cluster
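# The example above also references module-level constants and the
# create/scale tasks it chains from (pig_task and spark_sql_task are defined
# the same way elsewhere in the example). A plausible sketch; the constant
# values and operator arguments here are assumptions:
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator, DataprocClusterScaleOperator)

PROJECT_ID = 'example-project'                       # hypothetical
REGION = 'us-central1'                               # hypothetical
CLUSTER_NAME = 'example-cluster'                     # hypothetical
OUTPUT_PATH = 'gs://example-bucket/wordcount/'       # hypothetical
PYSPARK_URI = 'gs://example-bucket/hello_world.py'   # hypothetical

create_cluster = DataprocClusterCreateOperator(
    task_id="create_cluster",
    project_id=PROJECT_ID,
    cluster_name=CLUSTER_NAME,
    num_workers=2,
    region=REGION,
)
scale_cluster = DataprocClusterScaleOperator(
    task_id="scale_cluster",
    project_id=PROJECT_ID,
    cluster_name=CLUSTER_NAME,
    num_workers=3,
    region=REGION,
)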