Example #1
    def test_execute(self, mock_hook):
        op = DataprocDeleteClusterOperator(
            task_id=TASK_ID,
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name=CLUSTER_NAME,
            request_id=REQUEST_ID,
            gcp_conn_id=GCP_CONN_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
            impersonation_chain=IMPERSONATION_CHAIN,
        )
        op.execute(context={})
        mock_hook.assert_called_once_with(gcp_conn_id=GCP_CONN_ID, impersonation_chain=IMPERSONATION_CHAIN)
        mock_hook.return_value.delete_cluster.assert_called_once_with(
            region=GCP_LOCATION,
            project_id=GCP_PROJECT,
            cluster_name=CLUSTER_NAME,
            cluster_uuid=None,
            request_id=REQUEST_ID,
            retry=RETRY,
            timeout=TIMEOUT,
            metadata=METADATA,
        )
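The test above receives mock_hook as an argument, so in the original file it sits inside a test class under a @mock.patch decorator that replaces DataprocHook where the operator module imports it. A minimal sketch of that surrounding scaffolding, with hypothetical constant values standing in for the ones the real test defines:

from unittest import mock

# Hypothetical stand-ins for the constants referenced in the test above.
TASK_ID = "delete_cluster"
GCP_LOCATION = "us-central1"
GCP_PROJECT = "my-project"
CLUSTER_NAME = "my-cluster"
REQUEST_ID = "unique-request-id"
GCP_CONN_ID = "google_cloud_default"
RETRY = mock.MagicMock()
TIMEOUT = 120
METADATA = [("key", "value")]
IMPERSONATION_CHAIN = ["impersonated-sa@my-project.iam.gserviceaccount.com"]


class TestDataprocDeleteClusterOperator:
    # Patch DataprocHook where the operator module looks it up, so the mock
    # is injected into the test method as mock_hook.
    @mock.patch("airflow.providers.google.cloud.operators.dataproc.DataprocHook")
    def test_execute(self, mock_hook):
        ...  # body as in the snippet above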
Example #2
    pyspark_task = DataprocSubmitJobOperator(task_id="pyspark_task",
                                             job=PYSPARK_JOB,
                                             location=REGION,
                                             project_id=PROJECT_ID)

    hive_task = DataprocSubmitJobOperator(task_id="hive_task",
                                          job=HIVE_JOB,
                                          location=REGION,
                                          project_id=PROJECT_ID)

    hadoop_task = DataprocSubmitJobOperator(task_id="hadoop_task",
                                            job=HADOOP_JOB,
                                            location=REGION,
                                            project_id=PROJECT_ID)

    delete_cluster = DataprocDeleteClusterOperator(
        task_id="delete_cluster",
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME,
        region=REGION,
    )

    create_cluster >> scale_cluster
    scale_cluster >> hive_task >> delete_cluster
    scale_cluster >> pig_task >> delete_cluster
    scale_cluster >> spark_sql_task >> delete_cluster
    scale_cluster >> spark_task >> delete_cluster
    scale_cluster >> pyspark_task >> delete_cluster
    scale_cluster >> hadoop_task >> delete_cluster
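PYSPARK_JOB, HIVE_JOB, and HADOOP_JOB (like the create_cluster, scale_cluster, pig_task, spark_task, and spark_sql_task tasks referenced in the dependency lines) are defined earlier in the same DAG file. For DataprocSubmitJobOperator, job is a Dataproc Job dict; a sketch of the PySpark one, with placeholder values:

# Placeholder URI; the real DAG points at its own script in GCS.
PYSPARK_URI = "gs://my-bucket/jobs/my_job.py"

PYSPARK_JOB = {
    "reference": {"project_id": PROJECT_ID},
    "placement": {"cluster_name": CLUSTER_NAME},
    "pyspark_job": {"main_python_file_uri": PYSPARK_URI},
}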
Example #3
        cluster_name=CLUSTER_NAME,
        cluster_config=CLUSTER_CONFIG,
        labels={"tenant": TENANT, "created-by": USER, },
    )

    pr_producer_pyspark_task = DataprocSubmitJobOperator(
        task_id="run_cee_pr_producer",
        impersonation_chain=CONNECT_SA,
        job=PR_PYSPARK_JOB,
        location=REGION,
        cluster_name=CLUSTER_NAME,
        cluster_config=CLUSTER_CONFIG,
        labels={"tenant": TENANT, "created-by": USER, },
    )

    # Delete Dataproc cluster.
    delete_dataproc_cluster = DataprocDeleteClusterOperator(
        task_id="delete_dataproc_cluster",
        impersonation_chain=CONNECT_SA,
        region=REGION,
        cluster_name=CLUSTER_NAME,
        labels={"tenant": TENANT, "created-by": USER, },
    )

    (
            create_dataproc_cluster
            >> assign_permissions
            >> [cg_producer_pyspark_task, pr_producer_pyspark_task]
            >> delete_dataproc_cluster
    )
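CONNECT_SA, passed as impersonation_chain to every task above, is defined elsewhere in the file. In the Google provider, impersonation_chain names the service account to impersonate using short-lived credentials, given either as a single account or as a delegation chain; a hypothetical value:

# Assumption: a single service-account email. A list of emails (a chain
# ending in the account to impersonate) is also accepted.
CONNECT_SA = "dataproc-runner@my-project.iam.gserviceaccount.com"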
Example #4
default_args = {
    'owner': 'airflow',
    'depends_on_past': True,
    'start_date': datetime(2021, 3, 9, tzinfo=local_tz),
    'email': ['*****@*****.**'],
    'email_on_failure': True,
    'email_on_retry': True,
    'retries': 1,
    'retry_delay': timedelta(minutes=1), 
    'project_id': gcp_config['project_id'], 
    'region': gcp_config['region'], 
    'gcp_conn_id': gcp_config['conn_id']
}


with DAG(
    'delete_dataproc', 
    default_args=default_args,
    description='delete_dataproc',
    schedule_interval='@once'
    ) as dag:


    delete_dataproc = DataprocDeleteClusterOperator(
        task_id='delete_dataproc',
        cluster_name=dataproc_config['cluster_name']
    )

delete_dataproc
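The delete task here passes no project_id, region, or gcp_conn_id of its own; it picks them up from default_args, and the remaining values come from config mappings loaded elsewhere in the file. A sketch of what the snippet assumes, with placeholder values:

import pendulum

# Hypothetical stand-ins for the objects the DAG file defines elsewhere.
local_tz = pendulum.timezone("Europe/Paris")

gcp_config = {
    "project_id": "my-project",
    "region": "europe-west1",
    "conn_id": "google_cloud_default",
}

dataproc_config = {"cluster_name": "my-cluster"}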
Example #5
    )

    pyspark_job_task_predict = DataprocSubmitJobOperator(
        task_id="pyspark_job_task",
        job=get_pyspark_job_config(
            predict_config_script['script_path'],
            predict_config_script['config_path_daily'],
            CLUSTER_NAME_DAILY
        ),
        location=REGION,
        project_id=PROJECT_ID
    )

    delete_cluster = DataprocDeleteClusterOperator(
        task_id="delete_cluster",
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME_DAILY,
        region=REGION
    )

    delete_cluster_on_failure = DataprocDeleteClusterOperator(
        task_id="delete_cluster",
        project_id=PROJECT_ID,
        cluster_name=CLUSTER_NAME_DAILY,
        region=REGION,
        trigger_rule=TriggerRule.ONE_FAILED
    )
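    # A sketch (not in the original snippet) of how these two delete tasks
    # are typically wired: the default-rule task ends the success path, while
    # the ONE_FAILED task tears the cluster down as soon as the job fails.
    pyspark_job_task_predict >> delete_cluster
    pyspark_job_task_predict >> delete_cluster_on_failure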

    bq_delete = BigQueryOperator(
        task_id="delete_record_of_execution_date",
        sql=delete_query,
        use_legacy_sql=False,
    )  # the remaining arguments of this call are not shown in the snippet
Example #6
    # The appended tasks are built once per entry in DF_PIPELINES; the list
    # initialization and loop header are not shown in the original snippet.
    start_pipelines = []
    for x in range(len(DF_PIPELINES)):
        start_pipelines.append(
            PythonOperator(
                task_id=str(x) + "_" + DF_PIPELINES[x],
                python_callable=start_pipeline_function,
                dag=dag,
                pool=DF_PIPELINE_POOL,
                op_kwargs={"pipeline": DF_PIPELINES[x]},
                trigger_rule="all_done",
            ))

    delete_firewall_rule = PythonOperator(
        task_id="delete_firewall_rule",
        provide_context=True,
        python_callable=remove_firewall_function,
        dag=dag,
        trigger_rule="all_done",
    )

    delete_dataproc_cluster = DataprocDeleteClusterOperator(
        task_id="delete_dataproc_cluster",
        project_id="data-analytics-webinar",
        cluster_name="etl-cluster",
        region="us-central1",
        trigger_rule="all_done",
    )

    # Define DAG dependencies.
    for x in range(len(DF_PIPELINES)):
        (
            create_dataproc_cluster
            >> create_firewall_rule
            >> start_pipelines[x]
            >> delete_firewall_rule
            >> delete_dataproc_cluster
        )
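The same dependencies can be declared without indexing by iterating over the task objects directly; a sketch of equivalent wiring:

    # Equivalent to the indexed loop above.
    for pipeline_task in start_pipelines:
        create_dataproc_cluster >> create_firewall_rule >> pipeline_task
        pipeline_task >> delete_firewall_rule >> delete_dataproc_cluster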