    job_name=dag_name + 'bike_share_retention_d7',
    cluster_name='{{ ti.xcom_pull(key="cluster_name", task_ids="push-cluster-name") }}' + '4',
    execution_timeout=timedelta(minutes=180),
    arguments=args)

unique_user_sensor = GoogleCloudStorageObjectSensor(
    task_id='unique_user_sensor',
    bucket='jiuzhangsuanfa',
    object='bike/unique-user/_SUCCESS',
    poke_interval=30,
    timeout=2700)

unique_user.set_upstream(dataproc_create_cluster_1)
unique_user.set_downstream(bike_share_aggregator)
bike_share_aggregator.set_downstream(dataproc_destroy_cluster_1)
bike_share_retention_d1.set_upstream(dataproc_create_cluster_2)
bike_share_retention_d1.set_downstream(dataproc_destroy_cluster_2)
bike_share_retention_d3.set_upstream(dataproc_create_cluster_3)
bike_share_retention_d3.set_downstream(dataproc_destroy_cluster_3)
bike_share_retention_d7.set_upstream(dataproc_create_cluster_4)
bike_share_retention_d7.set_downstream(dataproc_destroy_cluster_4)
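The templated cluster_name above pulls a base name from XCom, pushed by an upstream task with task_id 'push-cluster-name' (the push_unique_cluster_name task wired up below), and appends a numeric suffix per cluster. A minimal sketch of what that push task could look like; the callable name and the uuid-based naming scheme are assumptions, not part of the original DAG:

import uuid

from airflow.operators.python_operator import PythonOperator

def push_cluster_name(**context):
    # ASSUMPTION: push a unique base name to XCom; each Dataproc task
    # then appends a suffix ('1'..'4') to derive its own cluster name.
    context['ti'].xcom_push(
        key='cluster_name',
        value='bike-share-' + uuid.uuid4().hex[:8] + '-')

push_unique_cluster_name = PythonOperator(
    task_id='push-cluster-name',
    python_callable=push_cluster_name,
    provide_context=True,  # required on Airflow 1.x to receive **context
    dag=dag)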
    },
    {
        'name': 'retention',
        'type': 'INTEGER',
        'mode': 'NULLABLE'
    },
    ],
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    dag=dag)

# =================
# == tasks flow ===
# =================

# dataproc upstream & downstream for both create and delete dataproc
create_cluster_1.set_upstream(push_unique_cluster_name)
create_cluster_1.set_upstream(sensor_task)
create_cluster_2.set_upstream(push_unique_cluster_name)
create_cluster_2.set_upstream(delete_cluster_1)

# job upstream & downstream
calc_unique_users.set_upstream(create_cluster_1)
calc_unique_users.set_downstream(bq_load_user)
calc_unique_users.set_downstream(calc_agg)
calc_agg.set_downstream(bq_load_agg)
calc_retention_day1.set_upstream(create_cluster_2)
calc_retention_day1.set_downstream(bq_load_retention)

# gcs_to_bq upstream & downstream
bq_load_agg.set_downstream(delete_cluster_1)
bq_load_retention.set_downstream(delete_cluster_2)
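On Airflow 1.8+, the same graph can be written more compactly with the bitshift operators, which are equivalent to set_downstream (>>) and set_upstream (<<). A sketch of the first cluster's branch:

# Equivalent wiring for the first cluster using bitshift composition.
[push_unique_cluster_name, sensor_task] >> create_cluster_1 >> calc_unique_users
calc_unique_users >> [bq_load_user, calc_agg]
calc_agg >> bq_load_agg >> delete_cluster_1 >> create_cluster_2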
    # in YYYYMMDD format. See the macros docs:
    # https://airflow.apache.org/code.html?highlight=macros#macros
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    image_version='1.5-debian10',
    num_workers=2,
    storage_bucket=CLUSTER_STORAGE_BUCKET,
    region=REGION,
    zone=ZONE)

# Submit our Spark job.
submit_scalaspark = DataProcSparkOperator(
    task_id=TASK_ID,
    region=REGION,
    main_class=MAIN_CLASS,
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    dataproc_spark_jars=MAIN_JAR)

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region=REGION,
    # Must match the name of the cluster created by the prior two operators.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # Tear down the cluster even if there are failures in upstream tasks.
    trigger_rule=TriggerRule.ALL_DONE)

create_cluster.dag = dag
create_cluster.set_downstream(submit_scalaspark)
submit_scalaspark.set_downstream(delete_cluster)
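This snippet assumes the dag object and the operator imports are defined earlier in the file. A minimal sketch of that boilerplate, assuming Airflow 1.10-era contrib import paths; the dag_id, schedule, and default_args values here are placeholders, not from the original:

from datetime import datetime, timedelta

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator,
    DataProcSparkOperator,
    DataprocClusterDeleteOperator)
from airflow.utils.trigger_rule import TriggerRule

default_args = {
    'owner': 'airflow',
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

dag = DAG(
    dag_id='ephemeral_dataproc_spark',  # placeholder dag_id
    default_args=default_args,
    start_date=datetime(2020, 1, 1),
    schedule_interval='@daily')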