submit_sqoop = BashOperator(
    task_id="sqoop_full_table_import",
    bash_command='bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}')

bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
    task_id="bq_load_flight_delays",
    bucket="spark-etl-1",
    source_objects=["sqoop-output/flights/*.avro"],
    destination_project_dataset_table="bigdata-etl-20201027.data_analysis.flight_delays",
    autodetect=True,
    source_format="AVRO",
    create_disposition="CREATE_IF_NEEDED",
    skip_leading_rows=0,
    write_disposition="WRITE_APPEND",
    max_bad_records=0)

delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc_cluster",
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="us-central1",
    trigger_rule=TriggerRule.ALL_DONE)

create_cluster.dag = dag
create_cluster.set_downstream(submit_sqoop)
submit_sqoop.set_downstream(bq_load_flight_delays)
bq_load_flight_delays.set_downstream(delete_cluster)
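# The wiring above assumes a create_cluster task and several operator imports
# that are not shown in this snippet. A minimal sketch of what they might look
# like under Airflow 1.10-style contrib imports; the task_id, region, zone,
# and worker count below are illustrative assumptions, not the original
# configuration:
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator, DataprocClusterDeleteOperator)
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.operators.bash_operator import BashOperator
from airflow.utils.trigger_rule import TriggerRule

create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc_cluster",              # hypothetical task_id
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    num_workers=2,                                  # assumed worker count
    region="us-central1",                           # assumed region
    zone="us-central1-a")                           # assumed zone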
    # This will tear down the cluster even if there are failures in upstream
    # tasks.
    trigger_rule=TriggerRule.ALL_DONE)

# Delete gcs files in the timestamped transformed folder.
delete_transformed_files = BashOperator(
    task_id='delete_transformed_files',
    bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
    "/{{ dag_run.conf['transformed_path'] }}/")

# If the spark job or BQ Load fails we rename the timestamped raw path to
# a timestamped failed path.
move_failed_files = BashOperator(
    task_id='move_failed_files',
    bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
    "/{{ dag_run.conf['raw_path'] }}/ " +
    "gs://{{ var.value.gcs_bucket }}" +
    "/{{ dag_run.conf['failed_path'] }}/",
    trigger_rule=TriggerRule.ONE_FAILED)

# Set the dag property on the first Operator; it will be inherited by
# downstream Operators.
create_cluster.dag = dag
create_cluster.set_downstream(submit_pyspark)
submit_pyspark.set_downstream([delete_cluster, bq_load])
bq_load.set_downstream(delete_transformed_files)
move_failed_files.set_upstream([bq_load, submit_pyspark])
task_id="hdfs_to_gcs", bash_command= "gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'", dag=dag) bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator( task_id="bq_load_flight_delays", bucket="bigdata-etl-2_flights", source_objects=["sqoop_output/part.20190515_*.avro"], destination_project_dataset_table=PROJECT_ID + ".data_flights.flights_delays", autodetect=True, source_format="AVRO", create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0, write_disposition="WRITE_APPEND", max_bad_records=0) # delete_cluster = DataprocClusterDeleteOperator( # task_id='delete_dataproc_cluster', # cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", # region='asia-east1', # trigger_rule=TriggerRule.ALL_DONE # ) create_cluster.dag = dag create_cluster.set_downstream(sqoop_inc_import) sqoop_inc_import.set_downstream(hdfs_to_gcs) hdfs_to_gcs.set_downstream(bq_load_flight_delays) # bq_load_delays_by_distance.set_downstream(delete_cluster)
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    num_workers=2,
    worker_machine_type='n1-standard-1',
    properties={
        'spark:spark.executor.cores': '1',
        'spark:spark.executor.memory': '1g',
        # The maximum number of bytes to pack into a single partition
        # when reading files: 256MB.
        'spark:spark.files.maxPartitionBytes': '268435456'
    },
    zone=Variable.get('gcp_zone'),
    dag=SPARK_DAG)

logger.debug('Submitting spark job on cluster')

# Submit Spark Job
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark_job',
    main=PYSPARK_JOB,
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    arguments=[CONFIG_FILE_ARG],
    dag=SPARK_DAG)

logger.debug('Deleting cluster from GCP')

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=SPARK_DAG)

start_cluster.set_downstream(submit_pyspark)
submit_pyspark.set_downstream(delete_cluster)
region="us-east1") bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator( task_id="bq_load_csv_profeco", bucket='gnp-storage', source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"], destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table", autodetect=True, source_format="CSV", field_delimiter=',', create_disposition="CREATE_IF_NEEDED", skip_leading_rows=0, write_disposition="WRITE_APPEND", max_bad_records=0) delete_cluster = DataprocClusterDeleteOperator( task_id="delete_dataproc_cluster", cluster_name="ephemeral-spark-cluster-{{ds_nodash}}", region="us-east1", trigger_rule=TriggerRule.ALL_DONE) unzip_files.dag = dag unzip_files.set_downstream(create_cluster) create_cluster.set_downstream(PythonOperator) PythonOperator.set_downstream([submit_pyspark, bq_load_profeco_data]) submit_pyspark.set_downstream(delete_cluster)
    # in YYYYMMDD format. See docs
    # https://airflow.apache.org/code.html?highlight=macros#macros
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    image_version='1.5-debian10',
    num_workers=2,
    storage_bucket=CLUSTER_STORAGE_BUCKET,
    region=REGION,
    zone=ZONE)

# Submit our Spark Job
submit_scalaspark = DataProcSparkOperator(
    task_id=TASK_ID,
    region=REGION,
    main_class=MAIN_CLASS,
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    dataproc_spark_jars=MAIN_JAR)

# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    region=REGION,
    # Obviously needs to match the name of cluster created in the prior two
    # Operators.
    cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
    # This will tear down the cluster even if there are failures in upstream
    # tasks.
    trigger_rule=TriggerRule.ALL_DONE)

create_cluster.dag = dag
create_cluster.set_downstream(submit_scalaspark)
submit_scalaspark.set_downstream(delete_cluster)
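# The snippet above references several module-level constants defined elsewhere
# in its DAG file. Purely illustrative placeholder values (not the original
# configuration) might look like:
REGION = 'us-central1'                          # assumed Dataproc region
ZONE = 'us-central1-a'                          # assumed zone
CLUSTER_STORAGE_BUCKET = 'my-dataproc-staging'  # hypothetical staging bucket
TASK_ID = 'submit_scala_spark_job'              # hypothetical task_id
MAIN_CLASS = 'com.example.etl.FlightsJob'       # hypothetical Spark main class
MAIN_JAR = 'gs://my-dataproc-staging/jars/flights-etl.jar'  # hypothetical jar path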