Example #1
# Imports assumed for this snippet (Airflow 1.x); `dag`, `airflow_utils`, and
# `build_revgeo_query` come from the surrounding project's DAG file.
import os

from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.bigquery_operator import BigQueryOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

# Load the day's Avro output from GCS into a raw, day-partitioned BigQuery table.
computronix_domi_permits_bq = GoogleCloudStorageToBigQueryOperator(
    task_id='computronix_domi_permits_bq',
    destination_project_dataset_table='{}:computronix.domi_permits_raw'.format(os.environ['GCLOUD_PROJECT']),
    bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
    source_objects=["domi_permits/avro_output/{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/*.avro"],
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    time_partitioning={'type': 'DAY'},
    source_format='AVRO',
    autodetect=True,
    dag=dag
)

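# Reverse-geocode the raw permits: build_revgeo_query presumably joins
# domi_permits_raw to geographic boundaries on the JOBID field, writing the
# result to the final day-partitioned domi_permits table.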
computronix_domi_permits_geojoin = BigQueryOperator(
    task_id='computronix_domi_permits_geojoin',
    sql=build_revgeo_query('computronix', 'domi_permits_raw', 'JOBID'),
    use_legacy_sql=False,
    destination_dataset_table='{}:computronix.domi_permits'.format(os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag
)

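# Remove the temporary Beam/Dataflow files left in the GCS bucket after the load.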
computronix_domi_permits_beam_cleanup = BashOperator(
    task_id='computronix_domi_permits_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement('{}_computronix'.format(os.environ['GCS_PREFIX'])),
    dag=dag
)

computronix_domi_permits_gcs >> computronix_domi_permits_dataflow >> computronix_domi_permits_bq >> \
    [computronix_domi_permits_geojoin, computronix_domi_permits_beam_cleanup]
Example #2
# Imports assumed for this snippet (Airflow 1.x); `dag`, `dt`, and
# `airflow_utils` are defined elsewhere in the project's DAG files.
import os

from airflow.operators.bash_operator import BashOperator
from airflow.contrib.operators.gcs_to_bq import GoogleCloudStorageToBigQueryOperator

# Original task, kept commented out; judging by the job_name/py_file
# parameters it was presumably a DataFlowPythonOperator:
# dataflow_task = DataFlowPythonOperator(
#     task_id='firearms_dataflow',
#     job_name='firearms-dataflow',
#     py_file=os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts/firearms_dataflow.py',
#     dag=dag
# )

dataflow_task = BashOperator(
    task_id='firearms_dataflow',
    bash_command='python {}'.format(
        os.getcwd() + '/airflow_scripts/dags/dependencies/dataflow_scripts'
        '/firearms_dataflow.py'),
    dag=dag)

beam_cleanup = BashOperator(
    task_id='firearms_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement(
        '{}_firearm_seizures'.format(os.environ['GCS_PREFIX'])),
    dag=dag)

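# Append the day's Avro output to a temporary BigQuery table; note this DAG
# builds its GCS path with strftime on a dt datetime instead of Jinja filters.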
bq_insert_temp = GoogleCloudStorageToBigQueryOperator(
    task_id='firearms_bq_insert',
    destination_project_dataset_table='{}:firearm_seizures.seizures_temp'.format(
        os.environ['GCP_PROJECT']),
    bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "avro_output/{}/{}/{}/*.avro".format(dt.strftime('%Y'),
                                             dt.strftime('%m').lower(),
                                             dt.strftime("%Y-%m-%d"))
    ],
    write_disposition='WRITE_APPEND',
    source_format='AVRO',
    dag=dag)

# Launch the community-center attendance Dataflow job. The opening of this
# task is reconstructed (its name comes from the dependency chain below; the
# script path is an assumption), with DAGS_PATH and GCS_PREFIX filling the
# command string's two placeholders:
comm_ctrs_dataflow = BashOperator(
    task_id='comm_ctrs_dataflow',
    bash_command="python {}/dependencies/dataflow_scripts/comm_ctrs_dataflow.py --input "
    "gs://{}_community_centers/attendance/".format(os.environ['DAGS_PATH'],
                                                   os.environ['GCS_PREFIX']) +
    "{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}_attendance.json --avro_output "
    + "gs://{}_community_centers/attendance/avro_output/".format(
        os.environ['GCS_PREFIX']) +
    "{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/",
    dag=dag)

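# Overwrite the attendance table with the day's Avro output, auto-detecting
# the schema and partitioning by day.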
comm_ctrs_bq = GoogleCloudStorageToBigQueryOperator(
    task_id='comm_ctrs_bq',
    destination_project_dataset_table='{}:community_centers.attendance'.format(
        os.environ['GCLOUD_PROJECT']),
    bucket='{}_community_centers'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "attendance/avro_output/{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/*.avro"
    ],
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    source_format='AVRO',
    autodetect=True,
    time_partitioning={'type': 'DAY'},
    dag=dag)

comm_ctrs_beam_cleanup = BashOperator(
    task_id='comm_ctrs_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement(
        '{}_community_centers'.format(os.environ['GCS_PREFIX'])),
    dag=dag)

comm_ctrs_gcs >> comm_ctrs_dataflow >> [comm_ctrs_bq, comm_ctrs_beam_cleanup]
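Both examples delegate their cleanup command to airflow_utils.beam_cleanup_statement, which is not shown. A hypothetical sketch, assuming the helper just emits a bash snippet that deletes the job's temporary Beam output from a conventional beam_output/ folder in the given bucket (the folder name is a guess, not confirmed by the snippets):

def beam_cleanup_statement(bucket):
    # Delete leftover Beam staging files if any exist; the beam_output/
    # location is an assumed convention.
    return ('if gsutil -q stat "gs://{}/beam_output/**"; then '
            'gsutil -m rm -r "gs://{}/beam_output"; '
            'else echo "no beam output to clean up"; fi').format(bucket, bucket)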