    dest_s3_key='budget/'+f,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

pos = [i for i, e in enumerate(categories) if e == cat]

if cat == "budget_reference":
    ds_task = get_refs
    md = 'budget-reference-'+f.split('_')[2]

    #: Update portal modified date
    update_budget_md = get_seaboard_update_dag(md+'.md', dag)

    #: update md task must run after the upload task
    update_budget_md.set_upstream(upload_task)
else:
    if cat == "budget_capital":
        ds_task = make_capital
        if 'ptd' in f:
            md = 'capital-budget-ptd'
        else:
            md = 'capital-budget-fy'
    elif cat == "budget_operating":
        ds_task = make_operating
        md = 'operating-budget'
    elif cat == "actuals_capital":
def create_sde_tasks(dag,
                     folder,
                     layer,
                     datasd_name,
                     md,
                     path_to_file,
                     sde_to_shp):
    """Dynamically create SDE Airflow tasks.

    dag: DAG defined in _dags file.
    folder: subfolder in the sde folder on S3.
    layer: layer name.
    datasd_name: layer name + _datasd.
    md: name of md file on Seaboard.
    path_to_file: poseidon path + datasd_name.
    sde_to_shp: _jobs specific sde_to_shp function.
    """
    #: Latest Only Operator for sde layer
    sde_latest_only = LatestOnlyOperator(
        task_id='{layer}_latest_only'.format(layer=layer),
        dag=dag)

    #: Convert sde table to shapefile format
    to_shp = PythonOperator(
        task_id='{layer}_to_shp'.format(layer=layer),
        python_callable=sde_to_shp,
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to GeoJSON format
    to_geojson = BashOperator(
        task_id='{layer}_to_geojson'.format(layer=layer),
        bash_command=shp_to_geojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Convert shapefile to TopoJSON format
    to_topojson = BashOperator(
        task_id='{layer}_to_topojson'.format(layer=layer),
        bash_command=shp_to_topojson(path_to_file),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Compress shapefile components
    to_zip = PythonOperator(
        task_id='{layer}_shp_to_zip'.format(layer=layer),
        python_callable=shp_to_zip,
        op_kwargs={'datasd_name': datasd_name},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Upload shapefile to S3
    shp_to_S3 = S3FileTransferOperator(
        task_id='{layer}_shp_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.zip'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.zip'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload geojson to S3
    geojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_geojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.geojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.geojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Upload topojson to S3
    topojson_to_S3 = S3FileTransferOperator(
        task_id='{layer}_topojson_to_S3'.format(layer=layer),
        source_base_path=conf['prod_data_dir'],
        source_key='{datasd_name}.topojson'.format(datasd_name=datasd_name),
        dest_s3_conn_id=conf['default_s3_conn_id'],
        dest_s3_bucket=conf['dest_s3_bucket'],
        dest_s3_key='sde/{folder}/{datasd_name}.topojson'
                    .format(folder=folder, datasd_name=datasd_name),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        replace=True,
        dag=dag)

    #: Update portal modified date
    update_md = get_seaboard_update_dag('{md}.md'.format(md=md), dag)

    if layer not in no_pbf:
        #: Convert GeoJSON to Geobuf format
        to_geobuf = PythonOperator(
            task_id='{layer}_to_geobuf'.format(layer=layer),
            python_callable=geojson_to_geobuf,
            op_kwargs={'path_to_file': path_to_file},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Convert geobuf to gzipped geobuf
        to_gzip = PythonOperator(
            task_id='{layer}_geobuf_to_gzip'.format(layer=layer),
            python_callable=geobuf_to_gzip,
            op_kwargs={'datasd_name': datasd_name},
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            dag=dag)

        #: Upload geobuf to S3
        geobuf_to_S3 = S3FileTransferOperator(
            task_id='{layer}_geobuf_to_S3'.format(layer=layer),
            source_base_path=conf['prod_data_dir'],
            source_key='{datasd_name}.pbf'.format(datasd_name=datasd_name),
            dest_s3_conn_id=conf['default_s3_conn_id'],
            dest_s3_bucket=conf['dest_s3_bucket'],
            dest_s3_key='sde/{folder}/{datasd_name}.pbf'
                        .format(folder=folder, datasd_name=datasd_name),
            on_failure_callback=notify,
            on_retry_callback=notify,
            on_success_callback=notify,
            replace=True,
            use_gzip=True,
            dag=dag)

        #: Conversion to geobuf is triggered after conversion to geojson.
        to_geobuf.set_upstream(to_geojson)

        #: Compression to gzip is triggered after conversion to geobuf.
        to_gzip.set_upstream(to_geobuf)

        #: Geobuf upload to S3 is triggered after compression to gzipped geobuf.
        geobuf_to_S3.set_upstream(to_gzip)

        #: Github update depends on geobuf S3 upload success.
        update_md.set_upstream(geobuf_to_S3)

    #: Execution rules:

    #: sde_latest_only must run before shp conversion.
    to_shp.set_upstream(sde_latest_only)

    #: Conversion to geojson is triggered after conversion to shapefile.
    to_geojson.set_upstream(to_shp)

    #: Conversion to topojson is triggered after conversion to shapefile.
    to_topojson.set_upstream(to_shp)

    #: Compression to zip is triggered after conversion to geojson and topojson.
    to_zip.set_upstream(to_geojson)
    to_zip.set_upstream(to_topojson)

    #: Shapefile upload to S3 is triggered after compression to zip.
    shp_to_S3.set_upstream(to_zip)

    #: Geojson upload to S3 is triggered after conversion to geojson.
    geojson_to_S3.set_upstream(to_geojson)

    #: Topojson upload to S3 is triggered after conversion to topojson.
    topojson_to_S3.set_upstream(to_topojson)

    #: Github update depends on all S3 uploads' success.
    update_md.set_upstream(shp_to_S3)
    update_md.set_upstream(geojson_to_S3)
    update_md.set_upstream(topojson_to_S3)
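# A minimal usage sketch for create_sde_tasks; the folder, layer, md, and
# sde_to_shp values below are hypothetical stand-ins for what a real *_dags
# caller would pass along with its _jobs-specific export function:
#
#   create_sde_tasks(
#       dag=dag,
#       folder='parks',
#       layer='parks',
#       datasd_name='parks_datasd',
#       md='parks',
#       path_to_file=conf['prod_data_dir'] + '/parks_datasd',
#       sde_to_shp=sde_to_shp)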
#: Upload prod SE file to S3
upload_special_events = S3FileTransferOperator(
    task_id='upload_special_events',
    source_base_path=conf['prod_data_dir'],
    source_key='special_events_list_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='special_events/special_events_list_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
get_special_events.set_upstream(se_latest_only)

#: process_special_events dependent on get_special_events
process_special_events.set_upstream(get_special_events)

#: upload_special_events dependent on process_special_events
upload_special_events.set_upstream(process_special_events)

#: update github modified date after S3 upload
update_special_events_md.set_upstream(upload_special_events)
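# Every operator in these DAGs repeats the same three callback kwargs. A
# minimal refactoring sketch, assuming the module-level notify callable used
# above; NOTIFY_KWARGS is a hypothetical name, not defined elsewhere in the
# repo:
NOTIFY_KWARGS = {
    'on_failure_callback': notify,
    'on_retry_callback': notify,
    'on_success_callback': notify,
}
# Usage, e.g.:
#   upload_special_events = S3FileTransferOperator(
#       task_id='upload_special_events', ..., **NOTIFY_KWARGS,
#       replace=True, dag=dag)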
#: Upload prod file to S3
cfs_to_S3 = S3FileTransferOperator(
    task_id='cfs_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_calls_for_service_'+curr_year+'_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_calls_for_service_'+curr_year+'_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cfs_md = get_seaboard_update_dag('police-calls-for-service.md', dag)

#: Execution rules:

#: pd_cfs_latest_only must run before get_cfs_data
get_cfs_data.set_upstream(pd_cfs_latest_only)

#: Data processing is triggered after data retrieval.
process_cfs_data.set_upstream(get_cfs_data)

#: Data upload to S3 is triggered after data processing completion.
cfs_to_S3.set_upstream(process_cfs_data)

#: Github update depends on S3 upload success.
update_pd_cfs_md.set_upstream(cfs_to_S3)
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload Inventory CSV to S3
upload_inventory = S3FileTransferOperator(
    task_id='upload_inventory',
    source_base_path=conf['prod_data_dir'],
    source_key='inventory_datasd_v1.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='inventory/inventory_datasd_v1.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_inventory_md = get_seaboard_update_dag('data-inventory.md', dag)

#: Execution Rules

#: Latest only must run before inventory_to_csv
inventory_to_csv.set_upstream(inv_latest_only)

#: Inventory csv gets created before it is uploaded
upload_inventory.set_upstream(inventory_to_csv)

#: upload_inventory must succeed before updating github
update_inventory_md.set_upstream(upload_inventory)
#: Upload pbf GIS file to S3
upload_pbf_file = S3FileTransferOperator(
    task_id='sidewalks_pbf_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalks.pbf',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalks.pbf',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_gis_md = get_seaboard_update_dag('sidewalk-gis.md', dag)

#: Execution order

#: Latest only operator must run before getting sidewalk data
get_sidewalk_data.set_upstream(sidewalk_latest_only)

#: Getting sidewalk data must run before uploading the OCI file
upload_oci_file.set_upstream(get_sidewalk_data)

#: Latest only operator must run before getting shapefiles
get_sw_shapefiles.set_upstream(sidewalk_latest_only)

#: get_sw_shapefiles must run before converting to geojson
sidewalks_to_geojson.set_upstream(get_sw_shapefiles)
get_approvals_files = PythonOperator(
    task_id='get_approvals_files',
    python_callable=dfg.get_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    op_kwargs={'fname_list': fnames,
               'target_dir': dsd_temp_dir},
    dag=dag)

#: dsd_approvals_latest_only must run before get_approvals_files
get_approvals_files.set_upstream(dsd_approvals_latest_only)

#: update github modified date (solar permits)
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

for key in app.approval_dict:

    #: Consolidate weekly permitting data by scraping OpenDSD API
    scrape_dsd = PythonOperator(
        task_id='scrape_dsd_' + key,
        python_callable=app.scrape_dsd,
        op_kwargs={'key': key},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: Add consolidated weekly data to current prod data
    update_dsd = PythonOperator(
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'civic_art_collection'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_public_art_md = get_seaboard_update_dag('public-art.md', dag)

#: Execution rules

#: public_art_latest_only must run before get_public_art
get_public_art.set_upstream(public_art_latest_only)

#: get_public_art must run before process_public_art
process_public_art.set_upstream(get_public_art)

#: process_public_art must run before file upload
upload_public_art.set_upstream(process_public_art)

#: upload_public_art must succeed before updating github
update_public_art_md.set_upstream(upload_public_art)

#: upload data must succeed before updating json
update_json_date.set_upstream(upload_public_art)
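# Note: binding the operator to the same name as its python_callable (as
# update_json_date does above, here and in several DAGs below) works because
# the right-hand side is evaluated before the name is rebound, but afterwards
# the module-level name refers to the operator, not the function. A minimal
# sketch of a safer spelling, with a hypothetical variable name:
#
#   update_json_date_task = PythonOperator(
#       task_id='update_json_date',
#       python_callable=update_json_date,  # still resolves to the callable
#       provide_context=True,
#       op_kwargs={'ds_fname': 'civic_art_collection'},
#       dag=dag)
#   update_json_date_task.set_upstream(upload_public_art)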
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'special_events'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_special_events_md = get_seaboard_update_dag('special-events.md', dag)

#: Execution rules

#: se_latest_only must run before get_special_events
get_special_events.set_upstream(se_latest_only)

#: process_special_events dependent on get_special_events
process_special_events.set_upstream(get_special_events)

#: addresses_to_S3 dependent on process_special_events
addresses_to_S3.set_upstream(process_special_events)

#: upload_special_events dependent on process_special_events
upload_special_events.set_upstream(process_special_events)
#: Upload pbf GIS file to S3
upload_pbf_file = S3FileTransferOperator(
    task_id='tree_canopy_pbf_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='tree_canopy_datasd.pbf',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='sde/tree_canopy_datasd.pbf',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_gis_md = get_seaboard_update_dag('tree-canopy-2014.md', dag)

#: Execution order

#: Latest only operator must run before getting tree canopy data
get_shapefiles.set_upstream(treecan_latest_only)

#: get_shapefiles must run before converting to geojson
shp_to_geojson.set_upstream(get_shapefiles)

#: shp_to_geojson must run before converting to geobuf
geojson_to_geobuf.set_upstream(shp_to_geojson)

#: geojson_to_geobuf must run before zipping geobuf
geobuf_zip.set_upstream(geojson_to_geobuf)
for index, dataset in enumerate(datasets):

    update_date_mod_json = PythonOperator(
        task_id=f"update_json_date_{dataset}",
        python_callable=update_json_date,
        provide_context=True,
        op_kwargs={'ds_fname': dataset},
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)

    #: upload data must succeed before updating json
    update_date_mod_json.set_upstream(s3_uploaders[index])

#: Update leases portal modified date
update_leases_md = get_seaboard_update_dag('city-owned-properties-leases.md', dag)

#: Update details portal modified date
update_details_md = get_seaboard_update_dag('city-owned-properties-details.md', dag)

#: Update parcels portal modified date
update_parcels_md = get_seaboard_update_dag('city-owned-properties-parcels.md', dag)

#: Execution Rules

#: read_latest_only must run before get_billing
get_billing.set_upstream(read_latest_only)

#: read_latest_only must run before get_leases
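# The loop above relies on `datasets` and `s3_uploaders` being index-aligned
# lists built in the same order. An equivalent sketch with zip() makes that
# pairing explicit (same assumption about both lists):
#
#   for dataset, uploader in zip(datasets, s3_uploaders):
#       update_date_mod_json = PythonOperator(
#           task_id=f"update_json_date_{dataset}",
#           python_callable=update_json_date,
#           provide_context=True,
#           op_kwargs={'ds_fname': dataset},
#           dag=dag)
#       update_date_mod_json.set_upstream(uploader)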
#: Uploads the generated agg file
upload_by_day_agg = S3FileTransferOperator(
    task_id='upload_by_day_agg',
    source_base_path=conf['prod_data_dir'],
    source_key=flist['by_day'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/' + flist['by_day'],
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag)

#: Execution Rules

#: parking_meters_latest_only must run before get_parking_files
get_parking_files.set_upstream(parking_meters_latest_only)

#: Download files, build prod file.
#: build_prod_file depends on get_parking_files
build_prod_file.set_upstream(get_parking_files)

#: Upload prod file.
#: upload_prod_file depends on build_prod_file
upload_prod_file.set_upstream(build_prod_file)
    task_id='create_sdif_{}_miles_paved_sonar'.format(i),
    range_id='days_30',
    value_key='sdif_{}_miles'.format(i),
    value_desc='Miles Paved {}'.format(i),
    python_callable=build_sonar_miles_aggs,
    op_kwargs={'mode': 'sdif', 'pav_type': i},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Depends on successful run of get_streets_data
sonar_task.set_upstream(get_streets_data)

#: Update portal modified date
update_streets_md = get_seaboard_update_dag('streets-repair-projects.md', dag)

#: Execution order

#: streets_latest_only must run before get_streets_data
get_streets_data.set_upstream(streets_latest_only)

#: upload_streets_data is dependent on successful run of get_streets_data
upload_streets_data.set_upstream(get_streets_data)

#: update md file after upload to S3
update_streets_md.set_upstream(upload_streets_data)
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Create subsets
create_subsets = PythonOperator(
    task_id='create_subsets',
    python_callable=make_prod_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active_businesses
get_active_businesses.set_upstream(ttcs_latest_only)

#: Data cleaning occurs after active businesses retrieval
clean_data.set_upstream(get_active_businesses)

#: Geocoding occurs after data cleaning.
geocode_data.set_upstream(clean_data)

#: Spatial join occurs after geocoding.
join_bids.set_upstream(geocode_data)

#: Last 3mo subsetting occurs after spatial join
create_subsets.set_upstream(join_bids)

subset_names = [os.path.basename(x)
                for x in glob.glob(conf['prod_data_dir']+'/sd_businesses_*.csv')]
upload_latest_indicator_bac_tests = S3FileTransferOperator(
    task_id='upload_latest_indicator_bac_tests',
    source_base_path=conf['prod_data_dir'],
    source_key='latest_indicator_bac_tests_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='water_testing/latest_indicator_bac_tests_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md', dag)

#: Execution Rules

#: wtr_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)

#: Upload indicator bac tests after the task has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_latest_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)
upload_prod_file = S3FileTransferOperator(
    task_id='upload_meter_locs',
    source_base_path=conf['prod_data_dir'],
    source_key='treas_parking_meters_loc_datasd_v1.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='parking_meters/treas_parking_meters_loc_datasd_v1.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'parking_meters_locations'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking_meters_locations.md', dag)

#: Execution Rules
get_parking_files >> build_prod_file >> clean_daily_files >> upload_prod_file >> [
    update_parking_trans_md, update_json_date]
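# The chain above uses Airflow's bitshift shorthand: `a >> b` is equivalent
# to b.set_upstream(a), and a trailing list fans the dependency out to each
# downstream task. Written in the set_upstream style used elsewhere in this
# repo, the same wiring would read:
#
#   build_prod_file.set_upstream(get_parking_files)
#   clean_daily_files.set_upstream(build_prod_file)
#   upload_prod_file.set_upstream(clean_daily_files)
#   update_parking_trans_md.set_upstream(upload_prod_file)
#   update_json_date.set_upstream(upload_prod_file)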
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Upload prod art file to S3
upload_public_art = S3FileTransferOperator(
    task_id='upload_public_art',
    source_base_path=conf['prod_data_dir'],
    source_key='public_art_locations_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='public_art/public_art_locations_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_public_art_md = get_seaboard_update_dag('public-art.md', dag)

#: Execution rules

#: public_art_latest_only must run before get_public_art
get_public_art.set_upstream(public_art_latest_only)

#: get_public_art must run before file upload
upload_public_art.set_upstream(get_public_art)

#: upload_public_art must succeed before updating github
update_public_art_md.set_upstream(upload_public_art)
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'traffic_volumes'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules

#: traffic_counts_latest_only must run before get_traffic_counts
get_traffic_counts.set_upstream(tc_latest_only)

#: Cleaning task triggered after data retrieval.
clean_traffic_counts.set_upstream(get_traffic_counts)

#: Production build task triggered after cleaning task.
build_traffic_counts.set_upstream(clean_traffic_counts)

#: Data upload to S3 triggered after production build task.
upload_traffic_counts.set_upstream(build_traffic_counts)

#: Update .md file after S3 upload
update_traffic_md.set_upstream(upload_traffic_counts)

#: upload data must succeed before updating json
update_json_date.set_upstream(upload_traffic_counts)
    on_success_callback=notify,
    replace=True,
    dag=dag)

update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'indicator_bacteria_monitoring'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_water_md = get_seaboard_update_dag(
    'monitoring-of-indicator-bacteria-in-drinking-water.md', dag)

#: Execution Rules

#: wtr_latest_only must run before get_indicator_bac_tests
get_indicator_bac_tests.set_upstream(wtr_latest_only)

#: Upload indicator bac tests after the task has successfully run
upload_indicator_bac_tests.set_upstream(get_indicator_bac_tests)

#: get_latest_bac_tests is dependent on get_indicator_bac_tests
get_latest_bac_tests.set_upstream(get_indicator_bac_tests)

#: Upload latest indicator bac tests after the file has been generated
upload_latest_indicator_bac_tests.set_upstream(get_latest_bac_tests)

#: update .md file after S3 upload
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'traffic_collisions'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:

#: pd_col_latest_only must run before get_collisions_data
get_collisions_data.set_upstream(pd_col_latest_only)

#: Data processing is triggered after data retrieval.
process_collisions_data.set_upstream(get_collisions_data)

#: Data upload to S3 is triggered after data processing completion.
collisions_to_S3.set_upstream(process_collisions_data)

#: Github update depends on S3 upload success.
update_pd_cls_md.set_upstream(collisions_to_S3)
upload_dsd_permits = S3FileTransferOperator(
    task_id='upload_dsd_permits',
    source_base_path=conf['prod_data_dir'],
    source_key='dsd_permits_{}_datasd_v1.csv'.format(year),
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='dsd/dsd_permits_{}_datasd_v1.csv'.format(year),
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: update permits-dsd.md file
update_permits_md = get_seaboard_update_dag('permits-dsd.md', dag)

#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data is executed after get_permits_files
clean_data.set_upstream(get_permits_files)

#: join_bids is executed after clean_data
join_bids.set_upstream(clean_data)

#: upload_dsd_permits is executed after join_bids
upload_dsd_permits.set_upstream(join_bids)
    task_id='upload_solar_permits',
    source_base_path=conf['prod_data_dir'],
    source_key='solar_permits_{}_datasd.csv'.format(year),
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='dsd/solar_permits_{}_datasd.csv'.format(year),
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: update permits.md file
update_permits_md = get_seaboard_update_dag('permits.md', dag)

#: update solar-permits.md file
update_solar_md = get_seaboard_update_dag('solar-permits.md', dag)

#: Execution rules

#: dsd_permits_latest_only must run before get_permits_files
get_permits_files.set_upstream(dsd_permits_latest_only)

#: clean_data is executed after get_permits_files
clean_data.set_upstream(get_permits_files)

#: join_bids is executed after clean_data
join_bids.set_upstream(clean_data)
#: Upload prod file to S3
collisions_to_S3 = S3FileTransferOperator(
    task_id='collisions_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='pd_collisions_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/pd_collisions_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_cls_md = get_seaboard_update_dag('police-collisions.md', dag)

#: Execution rules:

#: pd_col_latest_only must run before get_collisions_data
get_collisions_data.set_upstream(pd_col_latest_only)

#: Data processing is triggered after data retrieval.
process_collisions_data.set_upstream(get_collisions_data)

#: Data upload to S3 is triggered after data processing completion.
collisions_to_S3.set_upstream(process_collisions_data)

#: Github update depends on S3 upload success.
update_pd_cls_md.set_upstream(collisions_to_S3)
    task_id='dsd_code_enf_latest_only',
    dag=dag)

#: Download code enforcement files and unzip them.
get_code_enf_files = PythonOperator(
    task_id='get_code_enf_files',
    python_callable=dfg.get_files,
    op_kwargs={'fname_list': fname_list,
               'target_dir': dsd_temp_dir},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_code_enf_md = get_seaboard_update_dag('code-enforcement-violations.md', dag)

#: Execution rules

#: dsd_code_enf_latest_only must run before get_code_enf_files
get_code_enf_files.set_upstream(dsd_ce_latest_only)

for i in fname_list:

    #: Create fme shell command
    build_csv_task = BashOperator(
        task_id='get_' + i,
        bash_command=get_bash_command(i),
        on_failure_callback=notify,
        on_retry_callback=notify,
        on_success_callback=notify,
        dag=dag)
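# Each BashOperator in the loop above needs a unique task_id within the DAG,
# which 'get_' + i provides. A minimal, self-contained sketch of the same
# dynamic-task pattern; the file names and shell command are hypothetical
# placeholders, not the repo's actual FME invocation:
#
#   for fname in ['violations', 'cases']:
#       BashOperator(
#           task_id='get_' + fname,
#           bash_command='echo "processing {}"'.format(fname),
#           dag=dag)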
    python_callable=join_bids,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Create subsets
create_subsets = PythonOperator(
    task_id='create_subsets',
    python_callable=make_prod_files,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_ttcs_md = get_seaboard_update_dag('business-listings.md', dag)

#: Execution Rules

#: ttcs_latest_only must run before get_active_businesses
get_active_businesses.set_upstream(ttcs_latest_only)

#: Data cleaning occurs after active businesses retrieval
clean_data.set_upstream(get_active_businesses)

#: Geocoding occurs after data cleaning.
geocode_data.set_upstream(clean_data)

#: Address book is uploaded after geocoding occurs
addresses_to_S3.set_upstream(geocode_data)

#: Spatial join occurs after geocoding.
join_bids.set_upstream(geocode_data)

#: Subsetting occurs after spatial join
create_subsets.set_upstream(join_bids)
#: Upload prod file to S3
hc_to_S3 = S3FileTransferOperator(
    task_id='prod_file_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='hate_crimes_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='pd/hate_crimes_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_hc_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'hate_crimes'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_pd_hc_md = get_seaboard_update_dag('police-hate-crimes.md', dag)

#: Execution rules:
pd_hc_latest_only >> get_hc_data >> process_hc_data >> hc_to_S3 >> [
    update_hc_date, update_pd_hc_md]
    on_success_callback=notify,
    dag=dag)

#: Update data inventory json
update_json_date = PythonOperator(
    task_id='update_json_date',
    python_callable=update_json_date,
    provide_context=True,
    op_kwargs={'ds_fname': 'parking_meters_transactions'},
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_parking_trans_md = get_seaboard_update_dag('parking-meters-transactions.md', dag)

#: Execution Rules

#: parking_meters_latest_only must run before get_parking_files
get_parking_files.set_upstream(parking_meters_latest_only)

#: Download files, build prod file.
#: build_prod_file depends on get_parking_files
build_prod_file.set_upstream(get_parking_files)

#: Upload prod file.
#: upload_prod_file depends on build_prod_file
upload_prod_file.set_upstream(build_prod_file)
#: Uploads the generated production file
upload_traffic_counts = S3FileTransferOperator(
    task_id='upload_traffic_counts',
    source_base_path=conf['prod_data_dir'],
    source_key='traffic_counts_datasd.csv',
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_key='traffic_counts/traffic_counts_datasd.csv',
    replace=True,
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    dag=dag)

#: Update portal modified date
update_traffic_md = get_seaboard_update_dag('traffic-volumes.md', dag)

#: Execution Rules

#: traffic_counts_latest_only must run before get_traffic_counts
get_traffic_counts.set_upstream(tc_latest_only)

#: Cleaning task triggered after data retrieval.
clean_traffic_counts.set_upstream(get_traffic_counts)

#: Production build task triggered after cleaning task.
build_traffic_counts.set_upstream(clean_traffic_counts)

#: Data upload to S3 triggered after production build task.
upload_traffic_counts.set_upstream(build_traffic_counts)

#: Update .md file after S3 upload
update_traffic_md.set_upstream(upload_traffic_counts)
#: Upload OCI file to S3
upload_oci_file = S3FileTransferOperator(
    task_id='upload_oci',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalk_cond_datasd.csv',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalk_cond_datasd.csv',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)

#: Update portal modified date
update_oci_md = get_seaboard_update_dag('sidewalk-oci.md', dag)

#: Upload shp GIS file to S3
upload_shp_file = S3FileTransferOperator(
    task_id='sidewalks_shp_to_S3',
    source_base_path=conf['prod_data_dir'],
    source_key='sidewalks.zip',
    dest_s3_conn_id=conf['default_s3_conn_id'],
    dest_s3_bucket=conf['dest_s3_bucket'],
    dest_s3_key='tsw/sidewalks.zip',
    on_failure_callback=notify,
    on_retry_callback=notify,
    on_success_callback=notify,
    replace=True,
    dag=dag)