Example no. 1
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)


dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id="gdd-ea393e48abe0a85089b6b551da",
    trigger_rule=TriggerRule.ALL_DONE,
)


gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket="airflow-training-knab-jochem",
    source_objects=["average_prices/transfer_date={{ ds }}/*.parquet"],
    destination_project_dataset_table="gdd-ea393e48abe0a85089b6b551da:prices.land_registry_price${{ ds_nodash }}",
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)


pgsl_to_gcs >> dataproc_create_cluster
dataproc_create_cluster >> compute_aggregates
compute_aggregates >> dataproc_delete_cluster
dataproc_create_cluster >> df_to_bq
compute_aggregates >> gcs_to_bq
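
This excerpt starts partway through the compute_aggregates task and wires in pgsl_to_gcs, dataproc_create_cluster and df_to_bq without showing their definitions. A minimal sketch of the likely upstream setup, assuming the Airflow 1.10 contrib operators; the DAG arguments, connection id, SQL, zone, worker count and file paths are placeholders rather than the original values:

from airflow import DAG
from airflow.contrib.operators.dataproc_operator import DataprocClusterCreateOperator
from airflow.contrib.operators.postgres_to_gcs_operator import (
    PostgresToGoogleCloudStorageOperator,
)
from airflow.utils.dates import days_ago

# Hypothetical DAG definition; id, schedule and start date are placeholders.
dag = DAG(
    dag_id="land_registry_prices",
    schedule_interval="@daily",
    default_args={"owner": "airflow", "start_date": days_ago(3)},
)

# Export one day of rows from Postgres to GCS as JSON (connection, query and bucket are guesses).
pgsl_to_gcs = PostgresToGoogleCloudStorageOperator(
    task_id="postgres_to_gcs",
    postgres_conn_id="postgres_default",
    sql="SELECT * FROM land_registry_price_paid_uk WHERE transfer_date = '{{ ds }}'",
    bucket="airflow-training-knab-jochem",
    filename="land_registry_price_paid_uk/{{ ds }}/properties_{}.json",
    dag=dag,
)

# Ephemeral Dataproc cluster, named per execution date so daily runs do not collide.
dataproc_create_cluster = DataprocClusterCreateOperator(
    task_id="create_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-ea393e48abe0a85089b6b551da",
    num_workers=2,
    zone="europe-west4-a",
    dag=dag,
)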
Example no. 2
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
)

flow_to_bq = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "gdd-eb47dfd7557212651320890d28",
        "region": "europe-west1",
    },
    py_file="gs://airflow-training-arjan/dataflow_job.py",
    dag=dag,
)

gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket=BUCKET,
    source_objects=["average_prices/transfer_date={{ ds }}/*.parquet"],
    destination_project_dataset_table=(
        PROJECT_ID + ":prices.land_registry_price${{ ds_nodash }}"),
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
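
A note on the destination: the "$" suffix addresses a single ingestion-time partition of the BigQuery table, so together with WRITE_TRUNCATE each run replaces only its own day's data. A small, self-contained illustration of how the templated value renders for one run (PROJECT_ID here is a stand-in, not the training project):

from datetime import date

PROJECT_ID = "my-project"  # placeholder for the PROJECT_ID constant used above

# What Airflow substitutes for {{ ds_nodash }} on the run of 2018-06-01.
ds_nodash = date(2018, 6, 1).strftime("%Y%m%d")  # "20180601"

# The "$YYYYMMDD" decorator targets one daily partition; WRITE_TRUNCATE then
# overwrites only that partition rather than the whole table.
destination = PROJECT_ID + ":prices.land_registry_price$" + ds_nodash
print(destination)  # my-project:prices.land_registry_price$20180601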

pgsl_to_gcs >> dataproc_create_cluster
dataproc_create_cluster >> compute_aggregates
compute_aggregates >> dataproc_delete_cluster
dataproc_delete_cluster >> flow_to_bq
dataproc_delete_cluster >> gcs_to_bq
Example no. 3
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id=PROJECT_ID,
    trigger_rule=TriggerRule.ALL_DONE,
)

pgsl_to_gcs >> dataproc_create_cluster

dataproc_create_cluster >> compute_aggregates

dataproc_create_cluster >> land_registry_prices_to_bigquery

compute_aggregates >> dataproc_delete_cluster

write_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket=BUCKET,
    source_objects=["average_prices/transfer_date={{ ds }}/*"],
    destination_project_dataset_table=(
        "gdd-990fd90d0db6efbabdc6b70f1c:"
        "prices.land_registry_prices${{ ds_nodash }}"),
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

compute_aggregates >> write_to_bq
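
Example no. 3 also depends on a land_registry_prices_to_bigquery task that the excerpt does not show. Judging from the flow_to_bq task in Example no. 2, it is presumably a DataFlowPythonOperator along these lines; the project id and py_file location are placeholders, not the original values:

from airflow.contrib.operators.dataflow_operator import DataFlowPythonOperator

# Hypothetical reconstruction of the missing Dataflow task; dag is the DAG object
# defined at the top of the (omitted) file.
land_registry_prices_to_bigquery = DataFlowPythonOperator(
    task_id="land_registry_prices_to_bigquery",
    dataflow_default_options={
        "project": "my-gcp-project",                     # placeholder
        "region": "europe-west1",
    },
    py_file="gs://my-training-bucket/dataflow_job.py",   # placeholder path
    dag=dag,
)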
Example no. 4
    main="gs://airflow_training/build_statistics.py",
    cluster_name="analyse-pricing-{{ ds }}",
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id="gdd-05b583b94256b6965bb8c8119a",
    trigger_rule=TriggerRule.ALL_DONE,
)

gcs_to_bigquery = GoogleCloudStorageToBigQueryOperator(
    task_id="write_to_bq",
    bucket="airflow_training",
    source_objects=["average_prices/transfer_date={{ ds }}/*"],
    destination_project_dataset_table=(
        "gdd-05b583b94256b6965bb8c8119a:"
        "prices.land_registry_price${{ ds_nodash }}"),
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

pgsq_to_gcs >> dataproc_create_cluster
pgsq_to_gcs >> dataflow_job
dataproc_create_cluster >> compute_aggregates >> dataproc_delete_cluster
compute_aggregates >> gcs_to_bigquery
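
Every example submits a build_statistics.py PySpark job through compute_aggregates and then loads average_prices/transfer_date={{ ds }}/ into BigQuery. The script itself is not part of these excerpts; a rough, hypothetical sketch of what such a job could look like, with the input path, column names and aggregation being guesses inferred from the load step:

import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == "__main__":
    transfer_date = sys.argv[1]  # the "{{ ds }}" argument passed by the operator

    spark = SparkSession.builder.getOrCreate()

    # Read the day's raw export produced by the Postgres-to-GCS task (path is a guess).
    prices = spark.read.json(
        "gs://my-training-bucket/land_registry_price_paid_uk/{}/*.json".format(transfer_date)
    )

    # Aggregate to an average price per county and write Parquet partitioned by transfer_date,
    # matching the average_prices/transfer_date=YYYY-MM-DD/ layout loaded into BigQuery.
    (
        prices
        .groupBy("county")
        .agg(F.avg("price").alias("average_price"))
        .withColumn("transfer_date", F.lit(transfer_date))
        .write
        .partitionBy("transfer_date")
        .mode("overwrite")
        .parquet("gs://my-training-bucket/average_prices/")
    )

    spark.stop()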
Example no. 5
compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://airflow-training-knab-geert/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)
dataproc_create_cluster >> compute_aggregates


dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_cluster",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-25d677142443a8e2ace1927d48",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)
compute_aggregates >> dataproc_delete_cluster


gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_bq",
    bucket="airflow-training-knab-geert",
    source_objects=["average_prices/transfer_date={{ ds }}/*"],
    destination_project_dataset_table="gdd-25d677142443a8e2ace1927d48:prices.land_registry_price${{ ds_nodash }}",
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)
compute_aggregates >> gcs_to_bq
Example no. 6
compute_aggregates = DataProcPySparkOperator(
    task_id='compute_aggregates',
    main='gs://airflow_training_bp/build_statistics.py',
    cluster_name='analyse-pricing-{{ ds }}',
    arguments=["{{ ds }}"],
    dag=dag,
)

dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    dag=dag,
    project_id="gdd-32ba4f8b4a2ca57e5b201b0062",
)
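
Unlike the other examples, this delete task leaves trigger_rule at its default (ALL_SUCCESS), so the cluster is only torn down when its upstream compute_aggregates task succeeds, and a failed run leaves the cluster running. A variant matching the other examples, assuming the same imports and dag object as the surrounding file:

# Delete the cluster regardless of whether the upstream tasks succeeded or failed.
dataproc_delete_cluster = DataprocClusterDeleteOperator(
    task_id="delete_dataproc",
    cluster_name="analyse-pricing-{{ ds }}",
    project_id="gdd-32ba4f8b4a2ca57e5b201b0062",
    trigger_rule=TriggerRule.ALL_DONE,
    dag=dag,
)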

gcs_to_bq = GoogleCloudStorageToBigQueryOperator(
    task_id="gcs_to_BigQuery",
    bucket="airflow_training_bp",
    source_objects=["average_prices/transfer_date={{ ds }}/*"],
    destination_project_dataset_table=(
        "gdd-32ba4f8b4a2ca57e5b201b0062:"
        "prices.land_registry_price${{ ds_nodash }}"),
    source_format="PARQUET",
    write_disposition="WRITE_TRUNCATE",
    dag=dag,
)

pgsl_to_gcs >> dataproc_create_cluster
dataproc_create_cluster >> compute_aggregates
compute_aggregates >> dataproc_delete_cluster
compute_aggregates >> gcs_to_bq
pgsl_to_gcs >> df_to_bg