Ejemplo n.º 1
0
    destination_project_dataset_table='{}:firearm_seizures.seizures_temp'.
    format(os.environ['GCP_PROJECT']),
    bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "avro_output/{}/{}/{}/*.avro".format(dt.strftime('%Y'),
                                             dt.strftime('%m').lower(),
                                             dt.strftime("%Y-%m-%d"))
    ],
    write_disposition='WRITE_APPEND',
    source_format='AVRO',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Join the staged seizures against geo boundaries (query text comes from
# build_revgeo_query — presumably a reverse-geocode join; confirm in its
# definition) and append the result to the permanent `seizures` table,
# day-partitioned.
bq_geo_join = BigQueryOperator(
    task_id='firearms_bq_geojoin',
    sql=build_revgeo_query('firearm_seizures', 'seizures_temp'),
    use_legacy_sql=False,
    # Destination uses the legacy `project:dataset.table` colon form.
    destination_dataset_table='{}:firearm_seizures.seizures'.format(
        os.environ['GCP_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Drop the per-run staging table `seizures_temp` via a DDL statement
# (standard SQL, hence use_legacy_sql=False).
bq_drop_temp = BigQueryOperator(
    task_id='firearms_bq_drop_temp',
    sql='DROP TABLE `{}.firearm_seizures.seizures_temp`'.format(
        os.environ['GCP_PROJECT']),
    use_legacy_sql=False,
    dag=dag)

gcs_load >> dataflow_task >> (bq_insert_temp,
Ejemplo n.º 2
0
                                        dag=dag)

# Append every row of the Accela staging table into the permanent
# `permits_raw` table (SELECT * + WRITE_APPEND acts as the "merge").
accela_permits_bq_merge = BigQueryOperator(
    task_id='accela_permits_bq_merge',
    sql='SELECT * FROM {}.accela.permits_temp'.format(
        os.environ['GCLOUD_PROJECT']),
    use_legacy_sql=False,
    destination_dataset_table='{}:accela.permits_raw'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Geo-join `permits_raw` (keyed on 'id' — third argument to
# build_revgeo_query) and append the enriched rows to `permits`.
accela_permits_geojoin = BigQueryOperator(
    task_id='accela_permits_geojoin',
    sql=build_revgeo_query('accela', 'permits_raw', 'id'),
    use_legacy_sql=False,
    destination_dataset_table='{}:accela.permits'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Drop the Accela staging table once downstream tasks no longer need it.
accela_bq_drop_temp = BigQueryOperator(
    task_id='accela_bq_drop_temp',
    sql='DROP TABLE `{}.accela.permits_temp`'.format(
        os.environ['GCLOUD_PROJECT']),
    use_legacy_sql=False,
    dag=dag)

accela_permits_beam_cleanup = BashOperator(
Ejemplo n.º 3
0
# Load the Dataflow job's Avro output from GCS into the raw DOMI permits
# table, replacing its contents each run (WRITE_TRUNCATE) and letting
# BigQuery infer the schema from the Avro files (autodetect=True).
computronix_domi_permits_bq = GoogleCloudStorageToBigQueryOperator(
    task_id='computronix_domi_permits_bq',
    destination_project_dataset_table='{}:computronix.domi_permits_raw'.format(os.environ['GCLOUD_PROJECT']),
    bucket='{}_computronix'.format(os.environ['GCS_PREFIX']),
    # `get_ds_year` / `get_ds_month` are custom Jinja filters applied to the
    # execution date — presumably registered on the DAG via
    # user_defined_filters; confirm in the DAG definition.
    source_objects=["domi_permits/avro_output/{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/*.avro"],
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    time_partitioning={'type': 'DAY'},
    source_format='AVRO',
    autodetect=True,
    dag=dag
)

# Geo-join the raw DOMI permits (keyed on 'JOBID') and overwrite the
# public `domi_permits` table with the result.
computronix_domi_permits_geojoin = BigQueryOperator(
    task_id='computronix_domi_permits_geojoin',
    sql=build_revgeo_query('computronix', 'domi_permits_raw', 'JOBID'),
    use_legacy_sql=False,
    destination_dataset_table='{}:computronix.domi_permits'.format(os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag
)

# Remove leftover Beam/Dataflow artifacts from the computronix bucket;
# the shell command is built by the shared airflow_utils helper.
computronix_domi_permits_beam_cleanup = BashOperator(
    task_id='computronix_domi_permits_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement('{}_computronix'.format(os.environ['GCS_PREFIX'])),
    dag=dag
)

# Pipeline order: GCS extract -> Dataflow transform -> BQ load, then fan
# out so the geojoin and the Beam cleanup both run after the load.
computronix_domi_permits_gcs >> computronix_domi_permits_dataflow >> computronix_domi_permits_bq >> \
    (computronix_domi_permits_geojoin, computronix_domi_permits_beam_cleanup)
Ejemplo n.º 4
0
    dag=dag
)

# Append every row of the qalert staging table into the permanent
# `requests_raw` table.
qalert_requests_bq_merge = BigQueryOperator(
    task_id='qalert_requests_bq_merge',
    sql='SELECT * FROM {}.qalert.requests_temp'.format(os.environ['GCLOUD_PROJECT']),
    use_legacy_sql=False,
    destination_dataset_table='{}:qalert.requests_raw'.format(os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag
)

# Geo-join `requests_raw` (keyed on 'id') and overwrite the public
# `requests` table with the enriched rows.
# NOTE(review): task_id 'qalert_geojoin' breaks the 'qalert_requests_*'
# naming used by the sibling tasks — consider renaming (changes task
# identity/history in Airflow, so not done silently here).
qalert_requests_geojoin = BigQueryOperator(
    task_id='qalert_geojoin',
    sql=build_revgeo_query('qalert', 'requests_raw', 'id'),
    use_legacy_sql=False,
    destination_dataset_table='{}:qalert.requests'.format(os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag
)

# Drop the qalert staging table after the merge.
qalert_bq_drop_temp = BigQueryOperator(
    task_id='qalert_bq_drop_temp',
    sql='DROP TABLE `{}.qalert.requests_temp`'.format(os.environ['GCLOUD_PROJECT']),
    use_legacy_sql=False,
    dag=dag
)

qalert_beam_cleanup = BashOperator(
Ejemplo n.º 5
0
    task_id='firearms_bq_insert',
    destination_project_dataset_table='{}:firearm_seizures.seizures_raw'.
    format(os.environ['GCLOUD_PROJECT']),
    bucket='{}_firearm_seizures'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "avro_output/{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/*.avro"
    ],
    write_disposition='WRITE_APPEND',
    source_format='AVRO',
    autodetect=True,
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Geo-join the raw seizure records (keyed on 'address') and append the
# enriched rows to the permanent `seizures` table, day-partitioned.
# Fix: task_id was 'qalert_geojoin', copy-pasted from the 311/qalert DAG;
# renamed to follow this DAG's 'firearms_*' convention (cf. the sibling
# tasks 'firearms_bq_insert' and 'firearms_beam_cleanup') so the task is
# correctly identifiable in the Airflow UI and logs.
bq_geojoin = BigQueryOperator(
    task_id='firearms_bq_geojoin',
    sql=build_revgeo_query('firearm_seizures', 'seizures_raw', 'address'),
    use_legacy_sql=False,
    destination_dataset_table='{}:firearm_seizures.seizures'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Remove leftover Beam/Dataflow artifacts from the firearm-seizures bucket.
beam_cleanup = BashOperator(task_id='firearms_beam_cleanup',
                            bash_command=airflow_utils.beam_cleanup_statement(
                                '{}_firearm_seizures'.format(
                                    os.environ['GCS_PREFIX'])),
                            dag=dag)

gcs_load >> dataflow_task >> (bq_insert, beam_cleanup) >> bq_geojoin
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Deduplicate the raw fire-calls table in place: run the dedup query
# (built by dedup_table) and overwrite `fire_raw` with its result.
wprdc_dedup_fire = BigQueryOperator(
    task_id='wprdc_dedup_fire',
    sql=dedup_table('ems_fire_calls', 'fire_raw'),
    use_legacy_sql=False,
    destination_dataset_table='{}:ems_fire_calls.fire_raw'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Geo-join the raw EMS calls (keyed on 'call_id_hash') and overwrite the
# public `ems_calls` table with the result.
wprdc_ems_geojoin = BigQueryOperator(
    task_id='wprdc_ems_geojoin',
    sql=build_revgeo_query('ems_fire_calls', 'ems_raw', 'call_id_hash'),
    use_legacy_sql=False,
    destination_dataset_table='{}:ems_fire_calls.ems_calls'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag)

wprdc_fire_geojoin = BigQueryOperator(
    task_id='wprdc_fire_geojoin',
    sql=build_revgeo_query('ems_fire_calls', 'fire_raw', 'call_id_hash'),
    use_legacy_sql=False,
    destination_dataset_table='{}:ems_fire_calls.fire_calls'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
Ejemplo n.º 7
0
        os.environ['GCP_PROJECT']),
    bucket='{}_311'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "requests/avro_output/{}/{}/{}/*.avro".format(
            dt.strftime('%Y'),
            dt.strftime('%m').lower(), dt.strftime("%Y-%m-%d"))
    ],
    write_disposition='WRITE_APPEND',
    create_disposition='CREATE_IF_NEEDED',
    source_format='AVRO',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Geo-join the staged 311 requests and append the enriched rows to a
# second staging table (`requests_geo_temp`) for the downstream filter.
qalert_requests_geo_join = BigQueryOperator(
    task_id='qalert_requests_bq_geojoin',
    sql=build_revgeo_query('311', 'requests_temp'),
    use_legacy_sql=False,
    destination_dataset_table='{}:311.requests_geo_temp'.format(
        os.environ['GCP_PROJECT']),
    write_disposition='WRITE_APPEND',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Run the filter query built by filter_old_values — from its arguments it
# compares `requests_geo_temp` against `requests` on 'id'; presumably it
# removes rows already present downstream (confirm in its definition).
# No destination table: the statement's side effects are the result.
qalert_requests_bq_filter = BigQueryOperator(
    task_id='qalert_requests_bq_filter',
    sql=filter_old_values('311', 'requests_geo_temp', 'requests', 'id'),
    use_legacy_sql=False,
    dag=dag)

qalert_requests_bq_merge = BigQueryOperator(
    task_id='qalert_requests_bq_merge',
Ejemplo n.º 8
0
    destination_project_dataset_table='{}:parking_transactions.meters_raw'.
    format(os.environ['GCLOUD_PROJECT']),
    bucket='{}_parking'.format(os.environ['GCS_PREFIX']),
    source_objects=[
        "meters/avro_output/{{ ds|get_ds_year }}/{{ ds|get_ds_month }}/{{ ds }}/*.avro"
    ],
    write_disposition='WRITE_TRUNCATE',
    create_disposition='CREATE_IF_NEEDED',
    source_format='AVRO',
    autodetect=True,
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Geo-join the raw parking meters (keyed on 'id') and overwrite the
# public `meters` table with the result.
parking_meters_geojoin = BigQueryOperator(
    task_id='parking_meters_geojoin',
    sql=build_revgeo_query('parking_transactions', 'meters_raw', 'id'),
    use_legacy_sql=False,
    destination_dataset_table='{}:parking_transactions.meters'.format(
        os.environ['GCLOUD_PROJECT']),
    write_disposition='WRITE_TRUNCATE',
    time_partitioning={'type': 'DAY'},
    dag=dag)

# Remove leftover Beam/Dataflow artifacts from the parking bucket.
parking_beam_cleanup = BashOperator(
    task_id='parking_beam_cleanup',
    bash_command=airflow_utils.beam_cleanup_statement('{}_parking'.format(
        os.environ['GCS_PREFIX'])),
    dag=dag)

parking_transactions_dataflow = BashOperator(
    task_id='parking_transactions_dataflow',