Example #1
    submit_sqoop = BashOperator(
        task_id="sqoop_full_table_import",
        bash_command=
        'bash /home/airflow/gcs/plugins/sqoop_simple_table_imports_for_airflow.sh ephemeral-spark-cluster-{{ds_nodash}}'
    )
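    # Note: this excerpt assumes the Airflow 1.10 contrib import paths for the
    # operators it uses, e.g.:
    #   from airflow.operators.bash_operator import BashOperator
    #   from airflow.contrib.operators.dataproc_operator import DataprocClusterDeleteOperator
    #   from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator
    #   from airflow.utils.trigger_rule import TriggerRule
    # In Airflow 2.x the equivalent operators live in the Google provider package.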

    bq_load_flight_delays = GoogleCloudStorageToBigQueryOperator(
        task_id="bq_load_flight_delays",
        bucket="spark-etl-1",
        source_objects=["sqoop-output/flights/*.avro"],
        destination_project_dataset_table=
        "bigdata-etl-20201027.data_analysis.flight_delays",
        autodetect=True,
        source_format="AVRO",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0)

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="us-central1",
        trigger_rule=TriggerRule.ALL_DONE)

    create_cluster.dag = dag
    create_cluster.set_downstream(submit_sqoop)
    submit_sqoop.set_downstream(bq_load_flight_delays)
    bq_load_flight_delays.set_downstream(delete_cluster)
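    # Sketch only (not part of the original example): the create_cluster task
    # wired above is not shown in this excerpt; it is assumed to be a
    # DataprocClusterCreateOperator along these lines, with project_id,
    # num_workers and zone as placeholder values.
    create_cluster = DataprocClusterCreateOperator(
        task_id="create_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        project_id="bigdata-etl-20201027",  # placeholder, echoing the BQ table's project
        num_workers=2,
        zone="us-central1-a",
        region="us-central1",
    )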
Example #2
    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        # Must match the cluster created by the upstream tasks.
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        trigger_rule=TriggerRule.ALL_DONE)

    # Delete GCS files in the timestamped transformed folder.
    delete_transformed_files = BashOperator(
        task_id='delete_transformed_files',
        bash_command="gsutil -m rm -r gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['transformed_path'] }}/")

    # If the Spark job or the BigQuery load fails, rename the timestamped raw
    # path to a timestamped failed path.
    move_failed_files = BashOperator(
        task_id='move_failed_files',
        bash_command="gsutil mv gs://{{ var.value.gcs_bucket }}" +
        "/{{ dag_run.conf['raw_path'] }}/ " +
        "gs://{{ var.value.gcs_bucket}}" +
        "/{{ dag_run.conf['failed_path'] }}/",
        trigger_rule=TriggerRule.ONE_FAILED)
    # Set the dag property of the first operator; it will be inherited by
    # downstream operators.
    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream([delete_cluster, bq_load])

    bq_load.set_downstream(delete_transformed_files)

    move_failed_files.set_upstream([bq_load, submit_pyspark])
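    # For reference, the same dependency graph can be written with Airflow's
    # bitshift operators (equivalent to the set_downstream/set_upstream calls
    # above):
    #
    #   create_cluster >> submit_pyspark >> [delete_cluster, bq_load]
    #   bq_load >> delete_transformed_files
    #   [bq_load, submit_pyspark] >> move_failed_files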
Example #3
        },
        {
            'name': 'retention',
            'type': 'INTEGER',
            'mode': 'NULLABLE'
        },
    ],
    create_disposition='CREATE_IF_NEEDED',
    write_disposition='WRITE_APPEND',
    dag=dag)

# =================
# == tasks flow ===
# =================

# Upstream & downstream for both the Dataproc create and delete cluster tasks
create_cluster_1.set_upstream(push_unique_cluster_name)
create_cluster_1.set_upstream(sensor_task)
create_cluster_2.set_upstream(push_unique_cluster_name)
create_cluster_2.set_upstream(delete_cluster_1)
# Upstream & downstream for the Dataproc jobs
calc_unique_users.set_upstream(create_cluster_1)
calc_unique_users.set_downstream(bq_load_user)
calc_unique_users.set_downstream(calc_agg)
calc_agg.set_downstream(bq_load_agg)
calc_retention_day1.set_upstream(create_cluster_2)
calc_retention_day1.set_downstream(bq_load_retention)
# Upstream & downstream for the gcs_to_bq load tasks
bq_load_agg.set_downstream(delete_cluster_1)
bq_load_retention.set_downstream(delete_cluster_2)
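# Sketch only (not part of the original example): the calc_* tasks wired above
# are assumed to be DataProcPySparkOperator jobs whose cluster_name is pulled
# from the XCom pushed by push_unique_cluster_name. The main file path is a
# placeholder, the xcom_pull task id assumes it matches the variable name above,
# and cluster_name is assumed to accept a Jinja template in the Airflow version
# used.
calc_unique_users = DataProcPySparkOperator(
    task_id='calc_unique_users',
    main='gs://my-bucket/pyspark/calc_unique_users.py',  # placeholder path
    cluster_name="{{ task_instance.xcom_pull(task_ids='push_unique_cluster_name') }}",
    dag=dag)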
Example #4
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-southeast2",
        trigger_rule=TriggerRule.ALL_DONE
    )

    # Clean up the transformed output files in GCS.
    delete_transformed_files = BashOperator(
        task_id="delete_transformed_files",
        bash_command="gsutil -m rm -r " + BUCKET + "/series_data_output/*"
    )

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream(bq_load_series_1)

    bq_load_series_1.set_downstream(bq_load_series_2)

    bq_load_series_2.set_downstream(delete_cluster)
    
    delete_cluster.set_downstream(delete_transformed_files)
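    # Sketch only (not part of the original example): the dag object referenced
    # by create_cluster.dag = dag is not shown in this excerpt. A minimal
    # stand-in, assuming `from airflow import DAG` and
    # `from datetime import datetime`; the dag_id, schedule and start_date are
    # placeholders.
    dag = DAG(
        "series_etl",                     # placeholder dag_id
        schedule_interval="@daily",       # placeholder schedule
        start_date=datetime(2021, 1, 1),  # placeholder start date
        catchup=False,
    )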
Example #5
        autodetect=True,
        source_format="NEWLINE_DELIMITED_JSON",
        create_disposition="CREATE_IF_NEEDED",
        skip_leading_rows=0,
        write_disposition="WRITE_APPEND",
        max_bad_records=0,
    )

    delete_cluster = DataprocClusterDeleteOperator(
        task_id="delete_dataproc_cluster",
        cluster_name="ephemeral-spark-cluster-{{ds_nodash}}",
        region="asia-southeast2",
        trigger_rule=TriggerRule.ALL_DONE)

    # Clean up the transformed output files in GCS.
    delete_transformed_files = BashOperator(
        task_id="delete_transformed_files",
        bash_command="gsutil -m rm -r " + BUCKET + "/movies_data_output/*")

    create_cluster.dag = dag

    create_cluster.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream(bq_load_movies_1)

    bq_load_movies_1.set_downstream(bq_load_movies_2)

    bq_load_movies_2.set_downstream(bq_load_movies_3)

    bq_load_movies_3.set_downstream(delete_cluster)

    delete_cluster.set_downstream(delete_transformed_files)
            "type": "INTEGER"
        }, {
            "mode": "NULLABLE",
            "name": "upvote_ratio",
            "type": "FLOAT"
        }, {
            "mode": "NULLABLE",
            "name": "upvote_category",
            "type": "INTEGER"
        }, {
            "mode": "NULLABLE",
            "name": "num_comments",
            "type": "INTEGER"
        }, {
            "mode": "NULLABLE",
            "name": "comments_per_upvote",
            "type": "NUMERIC"
        }, {
            "mode": "NULLABLE",
            "name": "permalink",
            "type": "STRING"
        }])
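    # Sketch only (not part of the original example): the head of the load task
    # that owns the schema_fields list above is truncated here; it is assumed to
    # be a GoogleCloudStorageToBigQueryOperator (likely bq_load_analysis, one of
    # the load tasks wired below) that declares its schema explicitly instead of
    # relying on autodetect=True, roughly:
    #
    #   bq_load_analysis = GoogleCloudStorageToBigQueryOperator(
    #       task_id="bq_load_analysis",
    #       bucket="...",                             # not shown in the excerpt
    #       source_objects=["..."],                   # not shown in the excerpt
    #       destination_project_dataset_table="...",  # not shown in the excerpt
    #       source_format="NEWLINE_DELIMITED_JSON",
    #       write_disposition="WRITE_APPEND",
    #       schema_fields=[...])                      # the list ending above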

    submit_reddit.dag = dag

    submit_reddit.set_downstream(bq_load_submissions)

    bq_load_submissions.set_downstream(submit_pyspark)

    submit_pyspark.set_downstream(bq_load_analysis)