Code Example #1
    submit_sqoop = BashOperator(
        task_id="sqoop_full_table_import",
        bash_command=
        'bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
    )

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="spark-etl-1",
        source_objects=["sqoop-output/flights/*.avro"],
        destination_project_dataset_table=
        "bigdata-etl-20201027.data_analysis.flight_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-central1",
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag
    create_cluster.set_downstream(submit_sqoop)
    submit_sqoop.set_downstream(bq_load_flight_delays)
    bq_load_flight_delays.set_downstream(delete_cluster)
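
These excerpts reference operator classes without showing their imports. A minimal sketch of the Airflow 1.10.x import paths they appear to assume (Airflow 2 moved these operators into provider packages, so the paths differ there):

# Assumed Airflow 1.10.x import paths; Airflow 2 ships these operators in
# provider packages instead (e.g. apache-airflow-providers-google).
from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
from airflow.contrib.operators.dataproc_operator import (
    DataprocClusterCreateOperator,
    DataprocClusterDeleteOperator,
)
from airflow.utils.trigger_rule import TriggerRule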
Code Example #2
        # This will tear down the cluster even if there are failures in upstream
        # tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    # Delete GCS files in the timestamped transformed folder.
    delete_transformed_files = BashOperator(
        task_id='delete_transformed_files',
        bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['transformed_path'] }}/")

    # If the spark job or BQ Load fails we rename the timestamped raw path to
    # a timestamped failed path.
    move_failed_files = BashOperator(
        task_id='move_failed_files',
        bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['raw_path'] }}/ " +
        "gs://{{ var.value.gcs_bucket}}" +
        "/{{ dag_run.conf['failed_path'] }}/",
        trigger_rule=TriggerRule.ONE_FAILED)
    # Set the dag property on the first Operator; it will be inherited by
    # downstream Operators.

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream([delete_cluster, bq_load])

    bq_load.set_downstream(delete_transformed_files)

    move_failed_files.set_upstream([bq_load, submit_pyspark])
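
For reference, the same dependencies can also be expressed with Airflow's bitshift operators. A short sketch that is equivalent to the set_downstream/set_upstream calls above:

# Equivalent wiring of this example's tasks using >> composition.
create_cluster >> submit_pyspark >> [delete_cluster, bq_load]
bq_load >> delete_transformed_files
[bq_load, submit_pyspark] >> move_failed_files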
        task_id="hdfs_to_gcs",
        bash_command=
        "gcloud compute ssh ephemeral-spark-cluster-{{ds_nodash}}-m --zone='asia-southeast2-a' -- -T 'hadoop distcp /incremental_buckets/*.avro gs://bigdata-etl-2_flights/sqoop_output/'",
        dag=dag)

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="bigdata-etl-2_flights",
        source_objects=["sqoop_output/part.20190515_*.avro"],
        destination_project_dataset_table=PROJECT_ID +
        ".data_flights.flights_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    # delete_cluster = DataprocClusterDeleteOperator(
    #     task_id='delete_dataproc_cluster',
    #     cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    #     region='asia-east1',
    #     trigger_rule=TriggerRule.ALL_DONE
    # )

    create_cluster.dag = dag

    create_cluster.set_downstream(sqoop_inc_import)
    sqoop_inc_import.set_downstream(hdfs_to_gcs)
    hdfs_to_gcs.set_downstream(bq_load_flight_delays)
#    bq_load_delays_by_distance.set_downstream(delete_cluster)
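
As a hedged alternative to SSH-ing into the master node, the distcp step could be submitted as a Dataproc Hadoop job through Airflow 1.x's contrib DataProcHadoopOperator. This is a sketch only: the jar path is an assumption about the Dataproc image, and the region is derived from the zone used in the excerpt above.

from airflow.contrib.operators.dataproc_operator import DataProcHadoopOperator

# Sketch: run DistCp as a Dataproc job instead of over gcloud compute ssh.
hdfs_to_gcs_distcp = DataProcHadoopOperator(
    task_id="hdfs_to_gcs_distcp",
    main_class="org.apache.hadoop.tools.DistCp",
    # Assumed location of the DistCp jar on the Dataproc image.
    dataproc_hadoop_jars=["file:///usr/lib/hadoop-mapreduce/hadoop-distcp.jar"],
    arguments=["/incremental_buckets/*.avro",
               "gs://bigdata-etl-2_flights/sqoop_output/"],
    cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
    region="asia-southeast2",  # derived from the zone asia-southeast2-a above
    dag=dag)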
Code Example #4
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    num_workers=2,
    worker_machine_type='n1-standard-1',
    properties={
        'spark:spark.executor.cores': '1',
        'spark:spark.executor.memory': '1g',
        # The maximum number of bytes to pack into a single partition when reading files. 256MB
        'spark:spark.files.maxPartitionBytes': '268435456'
    },
    zone=Variable.get('gcp_zone'),
    dag=SPARK_DAG)

logger.debug('Submitting spark job on cluster')
# Submit the Spark job.
submit_pyspark = DataProcPySparkOperator(
    task_id='run_dataproc_pyspark_job',
    main=PYSPARK_JOB,
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    arguments=[CONFIG_FILE_ARG],
    dag=SPARK_DAG)

logger.debug('Deleting cluster from GCP')
# Delete the Cloud Dataproc cluster.
delete_cluster = DataprocClusterDeleteOperator(
    task_id='delete_dataproc_cluster',
    cluster_name='Reporting-smoke-cluster-{{ ds_nodash }}',
    trigger_rule=TriggerRule.ALL_DONE,
    dag=SPARK_DAG)

start_cluster.set_downstream(submit_pyspark)
submit_pyspark.set_downstream(delete_cluster)
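
The zone above is read from an Airflow Variable. As a hedged side note, Variable.get accepts a default_var fallback, which keeps DAG parsing from failing while the Variable is unset; the default value below is an assumption, only the variable name comes from the example:

from airflow.models import Variable

# Fall back to an assumed zone if the 'gcp_zone' Variable has not been set.
zone = Variable.get('gcp_zone', default_var='us-central1')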
Code Example #5
        region="us-east1")

    bq_load_profeco_data = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_csv_profeco",
        bucket='gnp-storage',
        source_objects=["Profeco/resources/Sin-fecha/profeco.pdf"],
        destination_project_dataset_table=PROJECT_ID + ".GNP.Profeco_table",
        autodetect=True,
        source_format="CSV",
        field_delimiter=',',
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-east1",
        trigger_rule=TriggerRule.ALL_DONE)

    unzip_files.dag = dag

    unzip_files.set_downstream(create_cluster)

    # NOTE: these two calls pass the PythonOperator class itself rather than an
    # instantiated task; set_downstream expects task instances (see the sketch
    # after this example).
    create_cluster.set_downstream(PythonOperator)

    PythonOperator.set_downstream([submit_pyspark, bq_load_profeco_data])

    submit_pyspark.set_downstream(delete_cluster)
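
Following the note above, a hypothetical sketch of routing the dependency through an actual PythonOperator instance; the task id, callable, and variable name are illustrative, not from the original example:

from airflow.operators.python_operator import PythonOperator  # Airflow 1.x path


def _prepare_inputs(**context):
    # Placeholder for whatever the Python step is meant to do.
    pass


prepare_inputs = PythonOperator(
    task_id='prepare_inputs',        # hypothetical task id
    python_callable=_prepare_inputs,
    provide_context=True,
    dag=dag)

create_cluster.set_downstream(prepare_inputs)
prepare_inputs.set_downstream([submit_pyspark, bq_load_profeco_data])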
Code Example #6
        # ds_nodash renders the scheduled execution date
        # in YYYYMMDD format. See docs https://airflow.apache.org/code.html?highlight=macros#macros
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        image_version='1.5-debian10',
        num_workers=2,
        storage_bucket=CLUSTER_STORAGE_BUCKET,
        region=REGION,
        zone=ZONE)

    # Submit our Spark Job
    submit_scalaspark = DataProcSparkOperator(
        task_id=TASK_ID,
        region=REGION,
        main_class=MAIN_CLASS,
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        dataproc_spark_jars=MAIN_JAR)

    # Delete the Cloud Dataproc cluster.
    delete_cluster = DataprocClusterDeleteOperator(
        task_id='delete_dataproc_cluster',
        region=REGION,
        # Obviously needs to match the name of cluster created in the prior two Operators.
        cluster_name='ephemeral-spark-cluster-{{ ds_nodash }}',
        # This will tear down the cluster even if there are failures in upstream tasks.
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_scalaspark)

    submit_scalaspark.set_downstream(delete_cluster)
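
The four-space indentation of most of these excerpts suggests the operators are defined inside a DAG context manager. A minimal sketch of the scaffold they appear to assume, with a placeholder DAG id, start date, and schedule:

from datetime import datetime, timedelta

from airflow import models

default_args = {
    'owner': 'airflow',                    # placeholder
    'start_date': datetime(2020, 10, 27),  # assumed start date
    'retries': 1,
    'retry_delay': timedelta(minutes=5),
}

with models.DAG(
        'ephemeral_dataproc_etl',          # hypothetical DAG id
        default_args=default_args,
        schedule_interval=timedelta(days=1)) as dag:
    # Operators from the excerpts above would be defined here.
    pass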