""" autovacuum_cleanup A maintenance workflow that runs `VACUUM ANALYZE` nightly on all materialized views in the MOVE database, to mitigate service degradation and/or downtime due to autovacuum operations. This was put in place after [issue #482](https://github.com/CityofToronto/bdit_flashcrow/issues/482). The default autovacuum settings in AWS RDS allowed a large number of dead tuples to build up in our `collisions` schema, resulting in unplanned downtime once autovacuum ran. """ from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2020, 8, 19) SCHEDULE_INTERVAL = '0 23 * * *' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) AUTOVACUUM_CLEANUP = create_bash_task_nested(DAG, 'autovacuum_cleanup')
# - in there, look for the URL under `result.resources[].url`. # TASKS = { 'centreline': { 'resource_url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/6b5415cc-864f-4702-9296-46520f479b0f/download/centreline_wgs84_v2.zip', 'source_srid': 3857 }, 'centreline_intersection': { 'resource_url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/1d84f2f9-f551-477e-a7fa-f92caf2ae28d/download/intersection-file-wgs84.zip', 'source_srid': 4326 } } INDEX_OPENDATA = create_bash_task_nested(DAG, 'index_opendata') for task_id, params in TASKS.items(): task_id_extract = '{0}_extract'.format(task_id) EXTRACT_OPENDATA_SHAPEFILE = BashOperator( task_id=task_id_extract, bash_command='/copy_opendata_shapefiles/extract_opendata_shapefile.sh', params={ 'name': task_id, 'resource_url': params['resource_url'] }, dag=DAG) task_id_load = '{0}_load'.format(task_id) LOAD_SHAPEFILE = BashOperator( task_id=task_id_load,
""" centreline_vector_tiles Generates vector tiles from the MOVE conflation target, which is built by the `centreline_conflation_target` DAG. These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render interactive centreline features. This is intended to run after `centreline_conflation_target`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2019, 5, 5) SCHEDULE_INTERVAL = '10 5 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) LOAD_VOLUME = create_bash_task_nested(DAG, 'load_volume') BUILD_VECTOR_TILES = create_bash_task_nested(DAG, 'build_vector_tiles') EXTRACT_VECTOR_TILES = create_bash_task_nested(DAG, 'extract_vector_tiles') LOAD_VOLUME >> BUILD_VECTOR_TILES BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES
""" collisions_vector_tiles Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG. These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed out. This is intended to run after `crash_geocoding`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2019, 12, 8) SCHEDULE_INTERVAL = '0 3 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) BUILD_COLLISIONS_TILES = create_bash_task_nested(DAG, 'build_collisions_tiles') EXTRACT_COLLISIONS_TILES = create_bash_task_nested(DAG, 'extract_collisions_tiles') BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES
""" font_glyphs Generates signed-distance field font glyphs using the `openmaptiles/fonts` repo on GitHub. This allows us to serve our own fonts for use with MapLibre GL. The repo is cloned to `/data/fonts`, and the resulting font glyphs are stored in `/data/glyphs`. These are served at `/glyphs` on our web EC2 instances. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2020, 10, 29) SCHEDULE_INTERVAL = '0 0 1 * *' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) FONT_GLYPHS = create_bash_task_nested(DAG, 'font_glyphs')
Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/) dataset for the City of Toronto Open Data Portal. The dataset is exposed in two ways: via database, and via HTTP. We store the dataset as a series of views in the `open_data` schema. We also dump those views to CSV files at `/data/open_data`, which is served from `/open_data` on our ETL EC2 instances. This is intended to run after `group_multiday_counts`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2021, 3, 2) SCHEDULE_INTERVAL = '15 7 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) A1_TMCS_COUNT_DATA = create_bash_task_nested(DAG, 'A1_tmcs_count_data') A1_TMCS_COUNT_METADATA = create_bash_task_nested(DAG, 'A1_tmcs_count_metadata') A2_TMCS_LOCATIONS = create_bash_task_nested(DAG, 'A2_tmcs_locations') A3_TMCS_JOINED = create_bash_task_nested(DAG, 'A3_tmcs_joined') A4_TMCS_DECADES = create_bash_task_nested(DAG, 'A4_tmcs_decades') A4_TMCS_PREVIEW = create_bash_task_nested(DAG, 'A4_tmcs_preview') A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS A2_TMCS_LOCATIONS >> A3_TMCS_JOINED A3_TMCS_JOINED >> A4_TMCS_DECADES A3_TMCS_JOINED >> A4_TMCS_PREVIEW
not correspond to actual intersections. To fix this, the routing target fills in vertices for those intersection IDs. When routing corridors between centreline features, we use the routing target, then filter the result down to only those features in the conflation target. This is intended to run after `copy_opendata_shapefiles`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2020, 12, 21) SCHEDULE_INTERVAL = '30 5 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) A0_INTERSECTIONS_BASE = create_bash_task_nested(DAG, 'A0_intersections_base') A0_MIDBLOCKS_BASE = create_bash_task_nested(DAG, 'A0_midblocks_base') A1_INTERSECTION_IDS = create_bash_task_nested(DAG, 'A1_intersection_ids') A2_INTERSECTIONS = create_bash_task_nested(DAG, 'A2_intersections') A3_MIDBLOCK_NAMES = create_bash_task_nested(DAG, 'A3_midblock_names') A4_MIDBLOCKS = create_bash_task_nested(DAG, 'A4_midblocks') A5_ROUTING_VERTICES = create_bash_task_nested(DAG, 'A5_routing_vertices') A6_ROUTING_EDGES = create_bash_task_nested(DAG, 'A6_routing_edges') A0_INTERSECTIONS_BASE >> A1_INTERSECTION_IDS A0_MIDBLOCKS_BASE >> A1_INTERSECTION_IDS A1_INTERSECTION_IDS >> A2_INTERSECTIONS A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS A4_MIDBLOCKS >> A5_ROUTING_VERTICES A5_ROUTING_VERTICES >> A6_ROUTING_EDGES
This same heuristic was used by the legacy CRASH system to assign collisions to intersections and midblocks. (However, CRASH did not use the Toronto Centreline, but instead used a legacy map layer that has been deprecated and is no longer maintained by the City.) This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2019, 7, 17) SCHEDULE_INTERVAL = '20 19 * * 1-5' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) A1_EVENTS_FIELDS_RAW = create_bash_task_nested(DAG, 'A1_events_fields_raw') A2_EVENTS_FIELDS_NORM = create_bash_task_nested(DAG, 'A2_events_fields_norm') A2_INVOLVED_FIELDS_RAW = create_bash_task_nested(DAG, 'A2_involved_fields_raw') A3_INVOLVED_FIELDS_NORM = create_bash_task_nested(DAG, 'A3_involved_fields_norm') A4_INVOLVED = create_bash_task_nested(DAG, 'A4_involved') A5_EVENTS = create_bash_task_nested(DAG, 'A5_events') A6_EVENTS_INTERSECTIONS = create_bash_task_nested(DAG, 'A6_events_intersections') A6_EVENTS_SEGMENTS = create_bash_task_nested(DAG, 'A6_events_segments') A7_EVENTS_CENTRELINE = create_bash_task_nested(DAG, 'A7_events_centreline') A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM
""" docs_pg_autodoc A documentation workflow that uses `postgresql_autodoc` to generate an HTML page with info on tables and views in the `flashcrow` database. The resulting page is stored at `/data/docs`, and is served at `/docs` from our ETL EC2 instances. """ from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2021, 8, 20) SCHEDULE_INTERVAL = '0 0 * * *' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) DOCS_PG_AUTODOC = create_bash_task_nested(DAG, 'docs_pg_autodoc')
of the traffic study, so that we can get all six relevant counts in database. Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no reliable way to visualize that much data at once. This is intended to run after `arteries_geocoding`. """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2020, 5, 6) SCHEDULE_INTERVAL = '30 7 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested( DAG, 'A1_arteries_double_link_pairs') A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested( DAG, 'A1_arteries_midblock_solo') A2_ARTERIES_GROUPS_PRE = create_bash_task_nested(DAG, 'A2_arteries_groups_pre') A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested( DAG, 'A3_arteries_groups_ranked') A4_ARTERIES_GROUPS_POST = create_bash_task_nested(DAG, 'A4_arteries_groups_post') A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST
""" location_search_index Builds the views and indexes that support location search, and also builds an index of midblock names. This is intended to run after `centreline_conflation_target` and `copy_gis_layers` """ # pylint: disable=pointless-statement from datetime import datetime from airflow_utils import create_dag, create_bash_task_nested START_DATE = datetime(2020, 4, 5) SCHEDULE_INTERVAL = '30 6 * * 6' DAG = create_dag(__file__, __doc__, START_DATE, SCHEDULE_INTERVAL) TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested(DAG, 'transform_centreline_index') TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested(DAG, 'transform_intersections_index') TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested(DAG, 'transform_traffic_signal') TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX