Example #1
    def test_update_cluster(self):
        with patch(HOOK) as mock_hook:
            hook = mock_hook()
            hook.get_conn.return_value = self.mock_conn
            hook.wait.return_value = None

            dataproc_task = DataprocClusterScaleOperator(
                task_id=TASK_ID,
                region=GCP_REGION,
                project_id=GCP_PROJECT_ID,
                cluster_name=CLUSTER_NAME,
                num_workers=NUM_WORKERS,
                num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                dag=self.dag
            )
            dataproc_task.execute(None)

            self.mock_clusters.patch.assert_called_once_with(
                region=GCP_REGION,
                projectId=GCP_PROJECT_ID,
                clusterName=CLUSTER_NAME,
                requestId=mock.ANY,
                updateMask="config.worker_config.num_instances,"
                           "config.secondary_worker_config.num_instances",
                body={
                    'config': {
                        'workerConfig': {
                            'numInstances': NUM_WORKERS
                        },
                        'secondaryWorkerConfig': {
                            'numInstances': NUM_PREEMPTIBLE_WORKERS
                        }
                    }
                })
            hook.wait.assert_called_once_with(self.operation)
    def test_cluster_name_log_no_sub(self):
        with patch('airflow.contrib.hooks.gcp_dataproc_hook.DataProcHook') as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterScaleOperator(
                task_id=TASK_ID,
                cluster_name=CLUSTER_NAME,
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                dag=self.dag
            )
            with patch.object(dataproc_task.log, 'info') as mock_info:
                with self.assertRaises(TypeError):
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Scaling cluster: %s', CLUSTER_NAME)
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterScaleOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=GCP_PROJECT_ID,
                num_workers=NUM_WORKERS,
                num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                dag=self.dag
            )

            with patch.object(dataproc_task.log, 'info') as mock_info:
                context = {'ts_nodash': 'testnodash'}

                rendered = dataproc_task.render_template(
                    'cluster_name',
                    getattr(dataproc_task, 'cluster_name'), context)
                setattr(dataproc_task, 'cluster_name', rendered)
                with self.assertRaises(TypeError):
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Scaling cluster: %s', u'smoke-cluster-testnodash')
Example #5
    def test_cluster_name_log_sub(self):
        with patch('airflow.contrib.operators.dataproc_operator.DataProcHook') \
                as mock_hook:
            mock_hook.return_value.get_conn = self.mock_conn
            dataproc_task = DataprocClusterScaleOperator(
                task_id=TASK_ID,
                cluster_name='smoke-cluster-{{ ts_nodash }}',
                project_id=PROJECT_ID,
                num_workers=NUM_WORKERS,
                num_preemptible_workers=NUM_PREEMPTIBLE_WORKERS,
                dag=self.dag)

            with patch.object(dataproc_task.log, 'info') as mock_info:
                context = {'ts_nodash': 'testnodash'}

                rendered = dataproc_task.render_template(
                    'cluster_name', getattr(dataproc_task, 'cluster_name'),
                    context)
                setattr(dataproc_task, 'cluster_name', rendered)
                with self.assertRaises(TypeError):
                    dataproc_task.execute(None)
                mock_info.assert_called_with('Scaling cluster: %s',
                                             u'smoke-cluster-testnodash')
        region=my_region,
        query=create_external_src_table
    )

    create_external_dst_table = DataProcHiveOperator(
        task_id='create_external_dst_table',
        job_name='create_external_dst_table_job_name',
        cluster_name=my_cluster_name,
        region=my_region,
        query=create_external_dst_table
    )

    dataproc_scale_out = DataprocClusterScaleOperator(
        task_id='dataproc_scale_out',
        cluster_name=my_cluster_name,
        region=my_region,
        num_workers=2,
        num_preemptible_workers=num_preemptible_vms,
        graceful_decommission_timeout='1h',
        dag=dag)

    # Note: the insert-overwrite statement is concatenated with the
    # set_dynamic_partitions settings; the result is held in the
    # insert_overwrite_with_transformation_query variable used below.
    insert_overwrite_with_transformation_query = DataProcSparkSqlOperator(
        task_id='insert_overwrite_with_transformation_query',
        job_name='insert_overwrite_with_transformation_query_job_name',
        cluster_name=my_cluster_name,
        region=my_region,
        query=insert_overwrite_with_transformation_query
        # query=evya_query  # dummy query for dev purposes
    )
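The comment in the block above says that insert_overwrite_with_transformation_query is the insert-overwrite statement concatenated with the dynamic-partition settings before being handed to DataProcSparkSqlOperator. A minimal sketch of that concatenation pattern follows; the table names and SET options are illustrative assumptions, not taken from the original DAG.

# Hypothetical sketch only: how a concatenated query string like the one
# passed above could be assembled. Table names and SET options are
# assumptions for illustration, not from the original DAG.
set_dynamic_partitions = (
    "SET hive.exec.dynamic.partition=true;\n"
    "SET hive.exec.dynamic.partition.mode=nonstrict;\n"
)
insert_overwrite = (
    "INSERT OVERWRITE TABLE dst_table PARTITION (dt)\n"
    "SELECT col_a, col_b, dt FROM src_table;"
)
insert_overwrite_with_transformation_query = set_dynamic_partitions + insert_overwrite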
Example #7
    "example_gcp_dataproc",
    default_args={"start_date": airflow.utils.dates.days_ago(1)},
    schedule_interval=None,
) as dag:
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_cluster",
        cluster_name=CLUSTER_NAME,
        project_id=PROJECT_ID,
        num_workers=2,
        region=REGION,
    )

    scale_cluster = DataprocClusterScaleOperator(
        task_id="scale_cluster",
        num_workers=3,
        cluster_name=CLUSTER_NAME,
        project_id=PROJECT_ID,
        region=REGION,
    )

    pig_task = DataProcPigOperator(
        task_id="pig_task",
        query="define sin HiveUDF('sin');",
        region=REGION,
        cluster_name=CLUSTER_NAME,
    )

    spark_sql_task = DataProcSparkSqlOperator(
        task_id="spark_sql_task",
        query="SHOW DATABASES;",
        region=REGION,