def test_build_task_group_context_manager():
    """Build nested TaskGroups with ``with`` blocks and check the resulting graph.

    Layout: ``task1`` -> ``group234`` (``task2`` plus nested ``group34`` holding
    ``task3`` and ``task4``), and ``group34`` -> ``task5``.
    """
    start = pendulum.parse("20200101")

    with DAG("test_build_task_group_context_manager", start_date=start) as dag:
        first = DummyOperator(task_id="task1")

        with TaskGroup("group234") as outer_group:
            DummyOperator(task_id="task2")

            with TaskGroup("group34") as inner_group:
                DummyOperator(task_id="task3")
                DummyOperator(task_id="task4")

        last = DummyOperator(task_id="task5")

        first >> outer_group
        inner_group >> last

    inner_task_ids = {
        'group234.group34.task3',
        'group234.group34.task4',
    }

    # Depending on a group means depending on every task it contains,
    # including the ones in nested groups.
    assert first.get_direct_relative_ids(upstream=False) == inner_task_ids | {'group234.task2'}
    # task5 only follows the inner group, so task2 is not among its upstreams.
    assert last.get_direct_relative_ids(upstream=True) == inner_task_ids

    root_group = dag.task_group
    assert root_group.group_id is None
    assert root_group.is_root
    assert set(root_group.children) == {"task1", "group234", "task5"}
    # Nested group ids are prefixed with the parent group id.
    assert inner_group.group_id == "group234.group34"

    assert task_group_to_dict(root_group) == EXPECTED_JSON
def test_build_task_group_with_prefix():
    """Check that ``prefix_group_id`` switches id prefixing on and off.

    ``group234`` and ``group4`` are created with ``prefix_group_id=False``, so
    their direct children keep their bare ids; ``group34`` uses the default and
    prefixes its children with ``group34.``.
    """
    start = pendulum.parse("20200101")

    with DAG("test_build_task_group_with_prefix", start_date=start) as dag:
        task1 = DummyOperator(task_id="task1")

        with TaskGroup("group234", prefix_group_id=False) as group234:
            task2 = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                task3 = DummyOperator(task_id="task3")

                with TaskGroup("group4", prefix_group_id=False) as group4:
                    task4 = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")

        task1 >> group234
        group34 >> task5

    # Ids: only children of the prefixing group (group34) carry a prefix.
    assert task2.task_id == "task2"
    assert group34.group_id == "group34"
    assert task3.task_id == "group34.task3"
    assert group4.group_id == "group34.group4"
    assert task4.task_id == "task4"
    assert task5.task_id == "task5"

    # Lookup by label works regardless of whether the id was prefixed.
    assert group234.get_child_by_label("task2") == task2
    assert group234.get_child_by_label("group34") == group34
    assert group4.get_child_by_label("task4") == task4

    # Expected serialized tree, assembled from the innermost group outwards.
    group4_node = {
        'id': 'group34.group4',
        'label': 'group4',
        'children': [{'id': 'task4', 'label': 'task4'}],
    }
    group34_node = {
        'id': 'group34',
        'label': 'group34',
        'children': [
            group4_node,
            {'id': 'group34.task3', 'label': 'task3'},
            {'id': 'group34.downstream_join_id', 'label': ''},
        ],
    }
    group234_node = {
        'id': 'group234',
        'label': 'group234',
        'children': [
            group34_node,
            {'id': 'task2', 'label': 'task2'},
            {'id': 'group234.upstream_join_id', 'label': ''},
        ],
    }
    expected_tree = {
        'id': None,
        'label': None,
        'children': [
            group234_node,
            {'id': 'task1', 'label': 'task1'},
            {'id': 'task5', 'label': 'task5'},
        ],
    }

    actual_tree = extract_node_id(task_group_to_dict(dag.task_group), include_label=True)
    assert actual_tree == expected_tree
def test_task_group_serialization(self):
    """
    Test TaskGroup serialization/deserialization.

    Builds a DAG with a nested TaskGroup, validates its serialized form against
    the schema, round-trips it through JSON and through
    ``serialize_dag``/``deserialize_dag``, and then walks the deserialized
    TaskGroup tree comparing every leaf operator with a round-tripped copy of
    the original task.
    """
    # Imported locally, as in the original test.
    from airflow.operators.dummy_operator import DummyOperator
    from airflow.utils.task_group import TaskGroup

    execution_date = datetime(2020, 1, 1)
    with DAG("test_task_group_serialization", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234") as group234:
            _ = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                _ = DummyOperator(task_id="task3")
                _ = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")
        task1 >> group234
        group34 >> task5

    # The dict form must satisfy the serialization schema.
    dag_dict = SerializedDAG.to_dict(dag)
    SerializedDAG.validate_schema(dag_dict)
    # JSON round trip, checked with this test class's own validator.
    json_dag = SerializedDAG.from_json(SerializedDAG.to_json(dag))
    self.validate_deserialized_dag(json_dag, dag)

    serialized_dag = SerializedDAG.deserialize_dag(
        SerializedDAG.serialize_dag(dag))

    # The root group must survive the round trip with the same child keys.
    assert serialized_dag.task_group.children
    assert serialized_dag.task_group.children.keys(
    ) == dag.task_group.children.keys()

    def check_task_group(node):
        # Recursively visit the deserialized tree. A node without a
        # ``children`` attribute is treated as a leaf operator.
        try:
            children = node.children.values()
        except AttributeError:
            # Round-trip serialization and check the result
            expected_serialized = SerializedBaseOperator.serialize_operator(
                dag.get_task(node.task_id))
            expected_deserialized = SerializedBaseOperator.deserialize_operator(
                expected_serialized)
            expected_dict = SerializedBaseOperator.serialize_operator(
                expected_deserialized)
            assert node
            assert SerializedBaseOperator.serialize_operator(
                node) == expected_dict
            return

        for child in children:
            check_task_group(child)

    check_task_group(serialized_dag.task_group)
def build(self):
    """Wire the DAG: install -> benchmarks -> (index results, cleanup).

    The benchmark tasks run one after another inside the ``benchmarks`` group.
    Once that group finishes, the ``Index Results`` group and the cluster
    cleanup task both become runnable.
    """
    installer = self._get_openshift_installer()
    install_task = installer.get_install_task()
    cleanup_task = installer.get_cleanup_task()

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmark_group:
        # chain() links the benchmark tasks sequentially.
        chain(*self._get_e2e_benchmarks().get_benchmarks())

    # NOTE(review): this group id contains a space; confirm that the Airflow
    # version in use accepts it as a group id.
    with TaskGroup("Index Results", prefix_group_id=False, dag=self.dag) as index_group:
        # Creating the task inside the block is what places it in the group.
        self._get_status_indexer().get_index_task()

    after_benchmarks = [index_group, cleanup_task]
    install_task >> benchmark_group >> after_benchmarks
def build(self):
    """Wire the DAG: initialize cluster -> connect -> benchmarks -> utils.

    Tasks inside the ``utils`` and ``benchmarks`` groups are each linked
    sequentially with ``chain``.
    """
    installer = self._get_openshift_installer()
    init_task = installer.initialize_cluster_task()
    connect_task = self._get_platform_connector().get_task()

    # The utils group is created first but runs last (see the chain below).
    with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils_group:
        chain(*self._get_scale_ci_diagnosis().get_utils())

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmark_group:
        chain(*self._get_e2e_benchmarks().get_benchmarks())

    init_task >> connect_task >> benchmark_group >> utils_group
def create_section():
    """Create the tasks of the outer section and its two inner task groups.

    Five dummy tasks (``task-1`` .. ``task-5``) are created in the surrounding
    context, plus two groups of three dummy tasks each. The last outer task
    feeds ``inside_section_1`` and the second-to-last feeds ``inside_section_2``.

    Note from the original author: the link in the course was broken, so this
    example was copied from gridU.
    """
    outer_tasks = []
    for number in range(1, 6):
        outer_tasks.append(DummyOperator(task_id=f'task-{number}'))

    with TaskGroup("inside_section_1") as first_group:
        for number in range(1, 4):
            DummyOperator(task_id=f'task-{number}')

    with TaskGroup("inside_section_2") as second_group:
        for number in range(1, 4):
            DummyOperator(task_id=f'task-{number}')

    outer_tasks[-1] >> first_group
    outer_tasks[-2] >> second_group
def build(self):
    """Wire the DAG: install -> benchmarks -> utils -> cleanup.

    Both groups chain their tasks sequentially. The last benchmark task hands
    over to the ``utils`` group, and the last utils task hands over to the
    cluster cleanup task.
    """
    installer = self._get_openshift_installer()
    install_task = installer.get_install_task()
    cleanup_task = installer.get_cleanup_task()

    with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils_group:
        diagnosis_tasks = self._get_scale_ci_diagnosis().get_utils()
        chain(*diagnosis_tasks)
        # Indexing with [-1] requires at least one utils task.
        last_diagnosis = diagnosis_tasks[-1]
        last_diagnosis >> cleanup_task

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmark_group:
        workload_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*workload_tasks)
        # Indexing with [-1] requires at least one benchmark task.
        last_workload = workload_tasks[-1]
        last_workload >> utils_group

    install_task >> benchmark_group
def meetime_slowly_changing_dimensions():
    """Extract the Meetime dimensions that have no update tracking.

    Per the original author, these dimensions are uploaded once a day at 17:00
    (see the cron expression defined alongside this function).

    One extraction task is created per dimension inside the ``meetime`` group,
    between a ``start`` and an ``end`` marker task.
    """
    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end', trigger_rule='none_failed')

    with TaskGroup(group_id="meetime") as tg:
        for item in ('users', 'cadences', 'company'):
            # Falls back to a fixed timestamp when the Variable is not set.
            last_update = Variable.get(
                f"meetime_last_update_timestamp_{item}",
                default_var='2021-01-01 00:00:00',
            )
            MeetimeRecentsOperator(
                task_id=f'extract_{item}',
                item=item,
                since_timestamp=last_update,
                s3_connection_id='movilake',
                connection_id='meetime_api',
            )

    start >> tg >> end
def meetime_incremental():
    """Incrementally extract the entities listed below from the Meetime API.

    One extraction task is created per entity inside the ``meetime`` group,
    between a ``start`` and an ``end`` marker task.
    """
    entities = (
        'calls',
        'demos',
        'leads',
        'leads/custom-fields',
        'prospections',
        'prospections/activities',
        'prospections/lost-reasons',
    )

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end', trigger_rule='none_failed')

    with TaskGroup(group_id="meetime") as tg:
        for item in entities:
            # Airflow does not support slashes in a task id, so they are
            # replaced there; the Variable key and `item` keep the original name.
            safe_name = item.replace('/', '_')
            last_update = Variable.get(
                f"meetime_last_update_timestamp_{item}",
                default_var='2021-01-01 00:00:00',
            )
            MeetimeRecentsOperator(
                task_id=f'extract_{safe_name}',
                item=item,
                since_timestamp=last_update,
                s3_connection_id='movilake',
                connection_id='meetime_api',
            )

    start >> tg >> end
def _add_benchmarks(self, task_group):
    """Create a prefixed TaskGroup holding the benchmarks for ``task_group``.

    :param task_group: id of the group to create; the same value is passed to
        ``_get_e2e_benchmarks`` to select the benchmarks.
    :return: the populated TaskGroup, with its tasks linked sequentially.
    """
    group = TaskGroup(task_group, prefix_group_id=True, dag=self.dag)
    with group:
        chain(*self._get_e2e_benchmarks(task_group).get_benchmarks())
    return group
def home_scraper_section():
    """Create the ``home-ok`` gate task and the ``home_scraper`` task group.

    One ``ScrapEstateOperator`` is created per page range, named ``home-1``,
    ``home-2``, ... in the order the ranges are listed. The whole group runs
    after ``home-ok``.
    """
    # An ordered tuple rather than a set: set iteration order is arbitrary, so
    # with a set the mapping from task id ("home-1", ...) to page range was
    # not guaranteed to follow the order written here.
    page_ranges = ((1, 5), (5, 15), (15, 20))

    home_ok = DummyOperator(task_id="home-ok")  # TODO: replace with an API check

    with TaskGroup("home_scraper") as home_scraper:
        for number, (start_page, end_page) in enumerate(page_ranges, start=1):
            ScrapEstateOperator(
                task_id=f"home-{number}",
                url="https://home.dk/umbraco/backoffice/home-api/Search",
                api_name="home.dk",
                scraper_cls=Home,
                params={
                    "workers": 5,
                    "start_page": start_page,
                    "end_page": end_page,
                    "pagesize": 15,
                    "verbose": True,
                },
            )

    home_ok >> home_scraper
def _get_benchmarks(self, benchmarks):
    """Resolve a (possibly nested) benchmark config list into tasks, in place.

    Each entry of ``benchmarks`` is replaced as follows:

    * no ``'benchmarks'`` key: the result of ``self._get_benchmark(entry)``;
    * ``'benchmarks'`` and ``'group'`` keys: the nested list, resolved
      recursively inside a TaskGroup named after ``entry['group']``;
    * ``'benchmarks'`` key only: the nested list, resolved recursively.

    :param benchmarks: list of benchmark config mappings; it is mutated.
    :return: the same list object that was passed in.
    """
    for index, entry in enumerate(benchmarks):
        if 'benchmarks' not in entry:
            # Leaf entry: a single benchmark.
            benchmarks[index] = self._get_benchmark(entry)
            continue

        if 'group' not in entry:
            # Nested list without a group of its own.
            benchmarks[index] = self._get_benchmarks(entry['benchmarks'])
            continue

        # Nested list created inside its own (un-prefixed) task group.
        with TaskGroup(entry['group'], prefix_group_id=False, dag=self.dag):
            benchmarks[index] = self._get_benchmarks(entry['benchmarks'])

    return benchmarks
def test_build_task_group():
    """Build TaskGroups with explicit arguments instead of ``with`` blocks.

    This is the alternative syntax for TaskGroup: ``dag``, ``task_group`` and
    ``parent_group`` are passed explicitly. The result is compared against the
    same ``EXPECTED_JSON`` used for the context-manager form.
    """
    start = pendulum.parse("20200101")
    dag = DAG("test_build_task_group", start_date=start)

    first = DummyOperator(task_id="task1", dag=dag)

    outer_group = TaskGroup("group234", dag=dag)
    DummyOperator(task_id="task2", dag=dag, task_group=outer_group)

    inner_group = TaskGroup("group34", dag=dag, parent_group=outer_group)
    for task_id in ("task3", "task4"):
        DummyOperator(task_id=task_id, dag=dag, task_group=inner_group)

    last = DummyOperator(task_id="task5", dag=dag)

    first >> outer_group
    inner_group >> last

    assert task_group_to_dict(dag.task_group) == EXPECTED_JSON
def build(self):
    """Wire the DAG: install -> connect -> benchmarks -> utils [-> cleanup] -> final status.

    The cleanup task is only created and inserted when
    ``self.config.cleanup_on_success`` is truthy. The final-status task ends
    the chain in both cases.
    """
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    connect_to_platform = self._get_platform_connector().get_task()
    final_status = final_dag_status.get_task(self.dag)

    with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
        utils_tasks = self._get_scale_ci_diagnosis().get_utils()
        chain(*utils_tasks)

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)

    if self.config.cleanup_on_success:
        cleanup_cluster = installer.get_cleanup_task()
        install_cluster >> connect_to_platform >> benchmarks >> utils >> cleanup_cluster >> final_status
    else:
        # Fix: final_status is created above in both cases but was previously
        # left without any upstream in this branch. Presumably get_task()
        # registers it on self.dag (confirm), in which case it would have run
        # as an unconnected root task instead of after the pipeline.
        install_cluster >> connect_to_platform >> benchmarks >> utils >> final_status
def build(self):
    """Wire the DAG: install -> benchmarks -> cleanup.

    The benchmark tasks are linked sequentially; the last of them hands over
    to the cluster cleanup task.
    """
    installer = self._get_openshift_installer()
    install_task = installer.get_install_task()
    cleanup_task = installer.get_cleanup_task()

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmark_group:
        workload_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*workload_tasks)
        # Indexing with [-1] requires at least one benchmark task.
        last_workload = workload_tasks[-1]
        last_workload >> cleanup_task

    install_task >> benchmark_group
def factory(*args, **kwargs):
    # Runs the decorated callable ``f`` inside a new TaskGroup.
    #
    # ``f`` and ``task_group_bound_args`` are not defined here; they come from
    # the enclosing scope. ``*args`` / ``**kwargs`` are forwarded to ``f``
    # unchanged.
    #
    # NOTE(review): the comment previously at this spot described binding the
    # call arguments to the decorated function's signature and applying
    # defaults. No such binding happens in this block, so that description has
    # been removed.

    # Initialize TaskGroup with bound arguments
    # NOTE(review): add_suffix_on_collision=True presumably lets repeated calls
    # obtain distinct group ids — confirm against the TaskGroup documentation.
    with TaskGroup(*task_group_bound_args.args,
                   add_suffix_on_collision=True,
                   **task_group_bound_args.kwargs):
        # Invoke function to run Tasks inside the TaskGroup
        return f(*args, **kwargs)
def build(self):
    """Wire the DAG: install -> benchmarks, optionally followed by cleanup.

    The cleanup task is only created and appended when
    ``self.config.cleanup_on_success`` is truthy.
    """
    installer = self._get_openshift_installer()
    install_task = installer.get_install_task()

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmark_group:
        chain(*self._get_e2e_benchmarks().get_benchmarks())

    if not self.config.cleanup_on_success:
        install_task >> benchmark_group
        return

    cleanup_task = installer.get_cleanup_task()
    install_task >> benchmark_group >> cleanup_task
def test_taskgroup_dag():
    """Create a test DAG with four Python operators, two of them in a task group.

    ``test_op_1`` and ``test_op_4`` sit directly in the DAG; ``test_op_2`` and
    ``test_op_3`` are created inside the ``group_1`` TaskGroup.

    :return: ``(dag, group, (op1, op2, op3, op4))``
    """

    def f(task_id):
        return f"OP:{task_id}"

    dag = DAG(dag_id="test_xcom_dag", default_args=DEFAULT_ARGS)
    with dag:
        op1 = PythonOperator(python_callable=f, task_id="test_op_1")
        op4 = PythonOperator(python_callable=f, task_id="test_op_4")

        with TaskGroup("group_1") as group:
            op2 = PythonOperator(python_callable=f, task_id="test_op_2")
            op3 = PythonOperator(python_callable=f, task_id="test_op_3")

    operators = (op1, op2, op3, op4)
    return dag, group, operators
def test_duplicate_group_id():
    """Check that colliding task/group ids raise ``DuplicateTaskIdFound``.

    Each scenario builds a fresh DAG and expects the error message to name the
    colliding id.
    """
    from airflow.exceptions import DuplicateTaskIdFound

    start = pendulum.parse("20200101")
    dag_id = "test_duplicate_group_id"

    # A group reusing the id of an existing task.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'task1' .*"):
        with DAG(dag_id, start_date=start):
            DummyOperator(task_id="task1")
            with TaskGroup("task1"):
                pass

    # A group nested in an un-prefixed group of the same id.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG(dag_id, start_date=start):
            DummyOperator(task_id="task1")
            with TaskGroup("group1", prefix_group_id=False), TaskGroup("group1"):
                pass

    # A task inside an un-prefixed group reusing that group's id.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG(dag_id, start_date=start), TaskGroup("group1", prefix_group_id=False):
            DummyOperator(task_id="group1")

    # A task whose prefixed id becomes 'group1.downstream_join_id'.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.downstream_join_id' .*"):
        with DAG(dag_id, start_date=start):
            DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                DummyOperator(task_id="downstream_join_id")

    # A task whose prefixed id becomes 'group1.upstream_join_id'.
    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.upstream_join_id' .*"):
        with DAG(dag_id, start_date=start):
            DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                DummyOperator(task_id="upstream_join_id")
def dagGroup():
    """Build the ``example_task_group`` DAG: start -> section_1 -> section_2 -> end.

    Author's notes (translated from Korean):

    * When ``with DAG`` is used, it seems the operators created inside it do
      not need ``dag`` passed as a parameter.
    * When a group is created and tasks are connected to it within the overall
      DAG, the group also has a DAG, so a DAG is created inside the group.
    """
    with DAG(dag_id="example_task_group", start_date=days_ago(2), tags=["example"]):
        start = DummyOperator(task_id="start")

        # section_1: task_1 fans out to task_2 and task_3.
        with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1:
            fan_out_source = DummyOperator(task_id="task_1")
            branch_a = BashOperator(task_id="task_2", bash_command='echo 1')
            branch_b = DummyOperator(task_id="task_3")

            fan_out_source >> [branch_a, branch_b]

        # section_2: a standalone task_1 plus a nested group in which task_2
        # and task_3 both feed task_4.
        with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2:
            DummyOperator(task_id="task_1")

            with TaskGroup("inner_section_2", tooltip="Tasks for inner_section2"):
                inner_a = BashOperator(task_id="task_2", bash_command='echo 1')
                inner_b = DummyOperator(task_id="task_3")
                fan_in_target = DummyOperator(task_id="task_4")

                [inner_a, inner_b] >> fan_in_target

        end = DummyOperator(task_id='end')

        start >> section_1 >> section_2 >> end
def __init__(self,
             dag=None,
             dbt_global_cli_flags=None,
             dbt_project_dir=None,
             dbt_profiles_dir=None,
             dbt_target=None,
             dbt_tag=None,
             dbt_run_group_name='dbt_run',
             dbt_test_group_name='dbt_test'
             ):
    """Store the dbt settings, create the two task groups and populate them.

    :param dag: DAG the task groups belong to. When ``None``, TaskGroup falls
        back to the DAG of the enclosing ``with DAG(...)`` block.
    :param dbt_global_cli_flags: stored as ``self.dbt_global_cli_flags``.
    :param dbt_project_dir: stored as ``self.dbt_project_dir``.
    :param dbt_profiles_dir: stored as ``self.dbt_profiles_dir``.
    :param dbt_target: stored as ``self.dbt_target``.
    :param dbt_tag: stored as ``self.dbt_tag``.
    :param dbt_run_group_name: group id of the task group kept in
        ``self.dbt_run_group``.
    :param dbt_test_group_name: group id of the task group kept in
        ``self.dbt_test_group``.
    """
    self.dag = dag
    self.dbt_global_cli_flags = dbt_global_cli_flags
    self.dbt_project_dir = dbt_project_dir
    self.dbt_profiles_dir = dbt_profiles_dir
    self.dbt_target = dbt_target
    self.dbt_tag = dbt_tag

    # Pass the DAG through explicitly. Without it the groups could only be
    # created while a `with DAG(...)` block was active, even though a DAG had
    # been handed to this constructor.
    self.dbt_run_group = TaskGroup(dbt_run_group_name, dag=dag)
    self.dbt_test_group = TaskGroup(dbt_test_group_name, dag=dag)

    # Compile the manifest, then parse it and populate the two task groups
    self.compile_dbt()
    self.make_dbt_task_groups()
def build(self):
    """Wire the DAG: install -> ROSA post-install -> connect -> benchmarks [-> cleanup] -> final status.

    The cleanup task is only created and inserted when
    ``self.config.cleanup_on_success`` is truthy. The final-status task ends
    the chain in both cases.
    """
    installer = self._get_openshift_installer()
    install_cluster = installer.get_install_task()
    connect_to_platform = self._get_platform_connector().get_task()
    final_status = final_dag_status.get_task(self.dag)

    with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
        benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
        chain(*benchmark_tasks)

    # Created outside the benchmarks group: it runs upstream of
    # connect_to_platform, which itself precedes the group.
    rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation()

    if self.config.cleanup_on_success:
        cleanup_cluster = installer.get_cleanup_task()
        install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks >> cleanup_cluster >> final_status
    else:
        # Fix: final_status is created above in both cases but was previously
        # left without any upstream in this branch. Presumably get_task()
        # registers it on self.dag (confirm), in which case it would have run
        # as an unconnected root task instead of after the pipeline.
        install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks >> final_status
def test_multiple_calls_in_task_group(self):
    """Test calling a decorated task several times inside one TaskGroup.

    The first call yields ``<group>.do_run``; the second call gets the
    ``__1`` suffix instead of colliding with the first.
    """

    @task_decorator
    def do_run():
        return 4

    group_id = "KnightsOfNii"
    first_task_id = f"{group_id}.do_run"
    second_task_id = f"{group_id}.do_run__1"

    with self.dag, TaskGroup(group_id=group_id):
        do_run()
        assert self.dag.task_ids == [first_task_id]

        do_run()
        assert self.dag.task_ids == [first_task_id, second_task_id]

    assert len(self.dag.task_ids) == 2
def make_task_groups(task_groups: Dict[str, Any], dag: DAG) -> Dict[str, "TaskGroup"]:
    """Create TaskGroup instances for a DAG from task group configurations.

    On Airflow versions below 2.0.0 nothing is created and an empty dict is
    returned.

    Note: each configuration dict in ``task_groups`` is updated in place with
    ``"group_id"`` and ``"dag"`` entries.

    :param task_groups: Task group configuration from the YAML configuration
        file, keyed by task group name.
    :param dag: DAG instance the task groups are added to.
    :returns: mapping of ``group_id`` to the created TaskGroup.
    """
    created: Dict[str, "TaskGroup"] = {}

    if version.parse(AIRFLOW_VERSION) < version.parse("2.0.0"):
        return created

    for name, conf in task_groups.items():
        conf.update(group_id=name, dag=dag)

        # Keys listed in SYSTEM_PARAMS are not TaskGroup constructor arguments.
        init_kwargs = {}
        for key, value in conf.items():
            if key not in SYSTEM_PARAMS:
                init_kwargs[key] = value

        group = TaskGroup(**init_kwargs)
        created[group.group_id] = group

    return created
def training_groups():
    """Create the ``trainings`` group with one notebook task per model setting.

    The ``avocado_dag_model_settings`` Airflow Variable (JSON) supplies the
    ``max_features`` and ``n_estimators`` lists; one ``PapermillOperator`` is
    created for every (feature, estimator) combination.

    :return: the populated TaskGroup.
    """
    with TaskGroup("trainings") as group:
        settings = Variable.get('avocado_dag_model_settings', deserialize_json=True)

        combinations = [
            (feature, estimator)
            for feature in settings['max_features']
            for estimator in settings['n_estimators']
        ]

        for feature, estimator in combinations:
            ml_id = f"{feature}_{estimator}"
            notebook_parameters = {
                'filepath': '/tmp/avocado.csv',
                'n_estimators': estimator,
                'max_features': feature,
                'ml_id': ml_id,
            }
            PapermillOperator(
                task_id=f'training_model_{ml_id}',
                input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                pool='training_pool',
                parameters=notebook_parameters,
            )

    return group
def training_group():
    """Create the ``trainings`` group with one notebook task per model setting.

    One ``PapermillOperator`` is created for every combination of the
    hard-coded ``max_features`` and ``n_estimators`` values (four tasks).

    :return: the populated TaskGroup.
    """
    feature_options = ('auto', 'sqrt')
    estimator_options = (100, 150)

    with TaskGroup("trainings", tooltip="Training tasks") as group:
        combinations = [
            (feature, estimator)
            for feature in feature_options
            for estimator in estimator_options
        ]

        for feature, estimator in combinations:
            ml_id = f"{feature}_{estimator}"
            notebook_parameters = {
                'filepath': '/tmp/avocado.csv',
                'n_estimators': estimator,
                'max_features': feature,
                'ml_id': ml_id,
            }
            PapermillOperator(
                task_id=f'training_model_{ml_id}',
                input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                pool='training_pool',
                parameters=notebook_parameters,
            )

    return group
default_args = { 'owner': 'teste', 'depends_on_past': False, 'start_date': datetime(2019, 1, 1), 'retries': 0, } with DAG('dag-pipeline-iris-aula-v1', schedule_interval=timedelta(minutes=10), catchup=False, default_args=default_args) as dag: start = DummyOperator(task_id="start") with TaskGroup("etl", tooltip="etl") as etl: t1 = BashOperator(dag=dag, task_id='download_dataset', bash_command=""" cd {0}/featurestore curl -o iris.txt https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data """.format(pathScript)) [t1] with TaskGroup("preProcessing", tooltip="preProcessing") as preProcessing: t2 = BashOperator(dag=dag, task_id='encoder_dataset', bash_command=""" cd {0}
# under the License. """Example DAG demonstrating the usage of the TaskGroup.""" from airflow.models.dag import DAG from airflow.operators.bash import BashOperator from airflow.operators.dummy import DummyOperator from airflow.utils.dates import days_ago from airflow.utils.task_group import TaskGroup # [START howto_task_group] with DAG(dag_id="example_task_group", start_date=days_ago(2), tags=["example"]) as dag: start = DummyOperator(task_id="start") # [START howto_task_group_section_1] with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1: task_1 = DummyOperator(task_id="task_1") task_2 = BashOperator(task_id="task_2", bash_command='echo 1') task_3 = DummyOperator(task_id="task_3") task_1 >> [task_2, task_3] # [END howto_task_group_section_1] # [START howto_task_group_section_2] with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2: task_1 = DummyOperator(task_id="task_1") # [START howto_task_group_inner_section_2] with TaskGroup("inner_section_2", tooltip="Tasks for inner_section2") as inner_section_2: task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
def taskflow_dag(): # Update replicator tables # This task group will take the new tables created by the on-prem replicator under the schema TRAFFIC_NEW # and alter the schema to TRAFFIC_INTER and then create the materialized view TRAFFIC on top # but it will only do this is the TRAFFIC_NEW table exists (ie. a new dataset was written) with TaskGroup( group_id="replicator_update_schema") as replicator_update_schema: group_id = "replicator_update_schema" ACC = create_bash_task_nested(group_id, 'acc') ARC_LINK = create_bash_task_nested(group_id, 'arc_link') ARTERYDATA = create_bash_task_nested(group_id, 'arterydata') CATEGORY = create_bash_task_nested(group_id, "category") CNT_DET = create_bash_task_nested(group_id, 'cnt_det') CNT_SPD = create_bash_task_nested(group_id, 'cnt_spd') COUTNINFO = create_bash_task_nested(group_id, 'countinfo') COUNTINFOMICS = create_bash_task_nested(group_id, 'countinfomics') DET = create_bash_task_nested(group_id, 'det') NODE = create_bash_task_nested(group_id, 'node') ACC >> ARC_LINK ARC_LINK >> ARTERYDATA ARTERYDATA >> CATEGORY CATEGORY >> CNT_DET CNT_DET >> CNT_SPD CNT_SPD >> COUTNINFO COUTNINFO >> COUNTINFOMICS COUNTINFOMICS >> DET DET >> NODE NODE # GCC's ArcGIS REST API server exposes a series of "services", each with a name like # `cot_geospatial2`. Within those services, individual layers have an ID # (in parentheses, after the layer name). 
with TaskGroup(group_id="copy_gis_layers") as copy_gis_layers: TASKS = { 'bikeway': ('cot_geospatial2', 2), 'accessible_signal': ('cot_geospatial2', 4), 'pedestrian_crossover': ('cot_geospatial2', 7), 'traffic_signal': ('cot_geospatial2', 9), 'hospital': ('cot_geospatial10', 21), 'toinview_program_point': ('cot_geospatial12', 46), 'toinview_program_line': ('cot_geospatial12', 47), 'toinview_program_polygon': ('cot_geospatial12', 48), 'school': ('cot_geospatial28', 17) } for task_id, task_args in TASKS.items(): mapserver_name, layer_id = task_args params = {'mapserver_name': mapserver_name, 'layer_id': layer_id} bash_task = BashOperator(task_id=task_id, bash_command='/copy_gis_layer.sh', params=params) bash_task # The Open Data Portal (i.e. CKAN) stores resources at URLs of format # `${BASE_URL}/dataset/${DATASET_ID}/resource/${RESOURCE_ID}/download/${FILENAME}`. # # To find these resource URLs: # # - find the dataset in the Open Data Portal (for instance, the Toronto Centreline # is at https://open.toronto.ca/dataset/toronto-centreline-tcl/); # - open the "For Developers" tab in the carousel; # - find the dataset ID listed in `params`; # - use this to request `${BASE_URL}/action/package_show?id=${DATASET_ID}`; # - in there, look for the URL under `result.resources[].url`. 
with TaskGroup( group_id="copy_opendata_shapefiles") as copy_opendata_shapefiles: group_id = "copy_opendata_shapefiles" TASK_GROUP = { 'centreline': { 'resource_url': 'https://ckanadmin0.intra.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/7209841e-e59c-49e4-9205-3b0587f2eea9/download/centreline_wgs84_v2.zip', 'source_srid': 3857 }, 'centreline_intersection': { 'resource_url': 'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/c2fc0db0-7dcd-4c13-a54c-f39debc441bd/download/intersection-file-wgs84.zip', 'source_srid': 4326 } } for task_id, params in TASK_GROUP.items(): task_id_extract = '{0}_extract'.format(task_id) with TaskGroup(group_id=f'{task_id}'): INDEX_OPENDATA = BashOperator( task_id='index_opendata', bash_command='/copy_opendata_shapefiles/index_opendata.sh'. format(task_id=task_id)) EXTRACT_OPENDATA_SHAPEFILE = BashOperator( task_id=task_id_extract, bash_command= '/copy_opendata_shapefiles/extract_opendata_shapefile.sh', params={ 'name': task_id, 'resource_url': params['resource_url'] }) task_id_load = '{0}_load'.format(task_id) LOAD_SHAPEFILE = BashOperator( task_id=task_id_load, bash_command='/copy_opendata_shapefiles/load_shapefile.sh', params={ 'name': task_id, 'source_srid': params['source_srid'] }) EXTRACT_OPENDATA_SHAPEFILE >> LOAD_SHAPEFILE >> INDEX_OPENDATA # centreline_conflation_target # # Normalize the Toronto Centreline into common _conflation target_ and _routing target_ # views, for use by other pipelines. # # The conflation target consists of two views `centreline.midblocks`, `centreline.intersections`. # The midblocks and intersections in these views are shown on MOVE's map. When we conflate # collisions and traffic studies to the centreline, we only conflate those to centreline features # that are in this conflation target. 
# # The routing target consists of two views `centreline.routing_vertices`, `centreline.routing_edges` # and is a superset of the conflation target. This exists because the conflation target is not a # valid graph (in the graph theory sense); some midblock endpoints refer to intersection IDs that do # not correspond to actual intersections. To fix this, the routing target fills in vertices for # those intersection IDs. When routing corridors between centreline features, we use the routing # target, then filter the result down to only those features in the conflation target. # # This is intended to run after `copy_opendata_shapefiles`. with TaskGroup(group_id="centreline_conflation_target" ) as centreline_conflation_target: group_id = "centreline_conflation_target" A0_INTERSECTIONS_BASE = create_bash_task_nested( group_id, 'A0_intersections_base') A0_MIDBLOCKS_BASE = create_bash_task_nested(group_id, 'A0_midblocks_base') A1_INTERSECTION_IDS = create_bash_task_nested(group_id, 'A1_intersection_ids') A2_INTERSECTIONS = create_bash_task_nested(group_id, "A2_intersections") A3_MIDBLOCK_NAMES = create_bash_task_nested(group_id, 'A3_midblock_names') A4_MIDBLOCKS = create_bash_task_nested(group_id, 'A4_midblocks') A5_ROUTING_VERTICES = create_bash_task_nested(group_id, 'A5_routing_vertices') A6_ROUTING_EDGES = create_bash_task_nested(group_id, 'A6_routing_edges') [A0_INTERSECTIONS_BASE, A0_MIDBLOCKS_BASE] >> A1_INTERSECTION_IDS A1_INTERSECTION_IDS >> A2_INTERSECTIONS A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS A4_MIDBLOCKS >> A5_ROUTING_VERTICES A5_ROUTING_VERTICES >> A6_ROUTING_EDGES # """ # gis_layers_vector_tiles # # Generates vector tiles from GIS layers provided by GCC, which are loaded into our database by # the `copy_gis_layers` DAG. 
These are stored in `/data/tiles`, and are served from `/tiles` on # our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render information # about schools, hospitals, and other points of interest when zoomed in. # # This is intended to run after `copy_gis_layers`. # """ with TaskGroup( group_id="gis_layers_vector_tiles") as gis_layers_vector_tiles: BUILD_GIS_LAYERS_TILES = create_bash_task('build_gis_layers_tiles') EXTRACT_GIS_LAYERS_TILES = create_bash_task('extract_gis_layers_tiles') BUILD_GIS_LAYERS_TILES >> EXTRACT_GIS_LAYERS_TILES """ location_search_index Builds the views and indexes that support location search, and also builds an index of midblock names. This is intended to run after `centreline_conflation_target` and `copy_gis_layers` """ with TaskGroup(group_id="location_search_index") as location_search_index: group_id = "location_search_index" TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested( group_id, 'transform_centreline_index') TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested( group_id, 'transform_intersections_index') TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested( group_id, 'transform_traffic_signal') TRANSFORM_TRAFFIC_SIGNAL TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX """ centreline_vector_tiles Generates vector tiles from the MOVE conflation target, which is built by the `centreline_conflation_target` DAG. These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render interactive centreline features. This is intended to run after `centreline_conflation_target`. 
""" with TaskGroup( group_id='centreline_vector_tiles') as centreline_vector_tiles: group_id = 'centreline_vector_tiles' LOAD_VOLUME = create_bash_task_nested(group_id, 'load_volume') BUILD_VECTOR_TILES = create_bash_task_nested(group_id, 'build_vector_tiles') EXTRACT_VECTOR_TILES = create_bash_task_nested(group_id, 'extract_vector_tiles') LOAD_VOLUME >> BUILD_VECTOR_TILES BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES """ arteries_geocoding Uses arterycode matching information and processes as originally developed by Data + Analytics to link counts with the Toronto centreline. The legacy FLOW system was not based on the Toronto Centreline, but rather used a legacy map layer that is no longer supported. In FLOW, arterycodes identified locations in that legacy map layer. To use these with the Toronto Centreline, we apply a series of heuristics developed by Data + Analytics: ID matching on `LINKID`, spatial matches, etc. This is the first step in our FLOW geocoding cascade, which continues with the DAGs `group_multidirection_arteries` and `group_multiday_counts`. All three DAGs must run before MOVE is considered to have updated its copy of FLOW data. This is intended to run after `replicator_transfer_flow` and `centreline_conflation_target`. 
""" with TaskGroup(group_id="arteries_geocoding") as arteries_geocoding: group_id = "arteries_geocoding" A1_ARTERIES_MANUAL_CORR = create_bash_task_nested( group_id, 'A1_arteries_manual_corr') A1_NODES_CORRECTED = create_bash_task_nested(group_id, 'A1_nodes_corrected') A2_NODES_CENTRELINE = create_bash_task_nested(group_id, 'A2_nodes_centreline') B1_ARTERIES_PX_CENTRELINE = create_bash_task_nested( group_id, 'B1_arteries_px_centreline') B2_ARTERIES_MANUAL_CORR_NORMALIZED = create_bash_task_nested( group_id, 'B2_arteries_manual_corr_normalized') C1_ARTERIES_LINKS = create_bash_task_nested(group_id, 'C1_arteries_links') C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS = create_bash_task_nested( group_id, 'C2_arteries_double_link_midblocks') C2_ARTERIES_DOUBLE_NODE = create_bash_task_nested( group_id, 'C2_arteries_double_node') C2_ARTERIES_SINGLE_NODE = create_bash_task_nested( group_id, 'C2_arteries_single_node') C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS = create_bash_task_nested( group_id, 'C3_arteries_double_node_midblocks') C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST = create_bash_task_nested( group_id, 'C4_arteries_double_node_midblocks_multi_best') D1_ARTERIES_CENTRELINE_TABLE = create_bash_task_nested( group_id, 'D1_arteries_centreline_table') D2_ARTERY_GEOCODING = create_bash_task_nested(group_id, 'D2_artery_geocoding') D3_ARTERIES_CENTRELINE_VIEW = create_bash_task_nested( group_id, 'D3_arteries_centreline_view') A1_NODES_CORRECTED >> A2_NODES_CENTRELINE A1_ARTERIES_MANUAL_CORR >> B2_ARTERIES_MANUAL_CORR_NORMALIZED A2_NODES_CENTRELINE >> C2_ARTERIES_DOUBLE_NODE C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_NODE A2_NODES_CENTRELINE >> C2_ARTERIES_SINGLE_NODE C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS C1_ARTERIES_LINKS >> C2_ARTERIES_SINGLE_NODE C2_ARTERIES_DOUBLE_NODE >> C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST A2_NODES_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE B1_ARTERIES_PX_CENTRELINE >> 
D1_ARTERIES_CENTRELINE_TABLE B2_ARTERIES_MANUAL_CORR_NORMALIZED >> D1_ARTERIES_CENTRELINE_TABLE C1_ARTERIES_LINKS >> D1_ARTERIES_CENTRELINE_TABLE C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE C2_ARTERIES_SINGLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE C2_ARTERIES_DOUBLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST >> D1_ARTERIES_CENTRELINE_TABLE D1_ARTERIES_CENTRELINE_TABLE >> D2_ARTERY_GEOCODING D2_ARTERY_GEOCODING >> D3_ARTERIES_CENTRELINE_VIEW """ crash_geocoding Normalizes CRASH data into collision _events_ and collision _involved persons_, then matches collision events to the centreline conflation target that was created by `centreline_conflation_target`. Our legacy schema in Oracle stores both event-related and involved-person-related information in a single table, `TRAFFIC.ACC`. That table has one record per involved person, with event-level details copied across all persons involved in a collision. To make this easier to work with in MOVE, we transform `TRAFFIC.ACC` into a normalized representation. To match collisions to the centreline, we use the following heuristic: - if there are any intersections within 20m, match to the closest such intersection; - otherwise, if there are any midblocks within 20m, match to the closest such midblock; - otherwise, do not match. This same heuristic was used by the legacy CRASH system to assign collisions to intersections and midblocks. (However, CRASH did not use the Toronto Centreline, but instead used a legacy map layer that has been deprecated and is no longer maintained by the City.) This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`. 
""" with TaskGroup(group_id="crash_geocoding") as crash_geocoding: group_id = 'crash_geocoding' A1_EVENTS_FIELDS_RAW = create_bash_task_nested(group_id, 'A1_events_fields_raw') A2_EVENTS_FIELDS_NORM = create_bash_task_nested( group_id, 'A2_events_fields_norm') A2_INVOLVED_FIELDS_RAW = create_bash_task_nested( group_id, 'A2_involved_fields_raw') A3_INVOLVED_FIELDS_NORM = create_bash_task_nested( group_id, 'A3_involved_fields_norm') A4_INVOLVED = create_bash_task_nested(group_id, 'A4_involved') A5_EVENTS = create_bash_task_nested(group_id, 'A5_events') A6_EVENTS_INTERSECTIONS = create_bash_task_nested( group_id, 'A6_events_intersections') A6_EVENTS_SEGMENTS = create_bash_task_nested(group_id, 'A6_events_segments') A7_EVENTS_CENTRELINE = create_bash_task_nested(group_id, 'A7_events_centreline') A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM A3_INVOLVED_FIELDS_NORM >> A4_INVOLVED A4_INVOLVED >> A5_EVENTS A5_EVENTS >> A6_EVENTS_INTERSECTIONS A5_EVENTS >> A6_EVENTS_SEGMENTS A6_EVENTS_INTERSECTIONS >> A7_EVENTS_CENTRELINE A6_EVENTS_SEGMENTS >> A7_EVENTS_CENTRELINE """ collisions_vector_tiles Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG. These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed out. This is intended to run after `crash_geocoding`. 
""" with TaskGroup( group_id="collisions_vector_tiles") as collisions_vector_tiles: group_id = "collisions_vector_tiles" BUILD_COLLISIONS_TILES = create_bash_task_nested( group_id, 'build_collisions_tiles') EXTRACT_COLLISIONS_TILES = create_bash_task_nested( group_id, 'extract_collisions_tiles') BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES """ group_multidirection_arteries Continues the FLOW geocoding process started by `arteries_geocoding`, by identifying arterycodes that refer to different directions of travel in the same location and grouping them together. When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street; someone requesting this study would want to see all 3 days in both directions of travel. However, the legacy FLOW schema uses separate arterycodes for different directions of travel, and also uses separate `COUNT_INFO_ID`s for each day of a traffic study. As a first step towards delivering all data for this study at once, we need to identify the arterycodes that correspond to these two directions of travel, and group them together. Once that's done, the DAG `group_multiday_counts` then takes care of grouping together the 3 days of the traffic study, so that we can get all six relevant counts in database. Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no reliable way to visualize that much data at once. This is intended to run after `arteries_geocoding`. 
""" with TaskGroup(group_id='group_multidirection_arteries' ) as group_multidirection_arteries: group_id = 'group_multidirection_arteries' A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested( group_id, 'A1_arteries_double_link_pairs') A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested( group_id, 'A1_arteries_midblock_solo') A2_ARTERIES_GROUPS_PRE = create_bash_task_nested( group_id, 'A2_arteries_groups_pre') A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested( group_id, 'A3_arteries_groups_ranked') A4_ARTERIES_GROUPS_POST = create_bash_task_nested( group_id, 'A4_arteries_groups_post') A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST """ group_multiday_counts Finishes the FLOW geocoding process started by `arteries_geocoding` and continued by `group_multidirection_arteries`, by identifying consecutive days of data collection from the same arterycode group and grouping those together into a single study. When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street; someone requesting this study would want to see all 3 days in both directions of travel. However, the legacy FLOW schema uses separate arterycodes for different directions of travel, and also uses separate `COUNT_INFO_ID`s for each day of a traffic study. Once `group_multidirection_arteries` has completed, we've identified the arterycodes that correspond to these two directions of travel. To find all data for the study, we now need to group together the 3 days over which data was collected at these two arterycodes. However, not all studies are of the same duration. To detect studies, we use runs of consecutive days at the same arterycode group. Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no reliable way to visualize that much data at once. 
This is intended to run after `group_multidirection_arteries`. """ with TaskGroup(group_id='group_multiday_counts') as group_multiday_counts: group_id = 'group_multiday_counts' A1_COUNTS_MULTIDAY_RUNS = create_bash_task_nested( group_id, 'A1_counts_multiday_runs') A2_ARTERIES_COUNTS_GROUPS = create_bash_task_nested( group_id, 'A2_arteries_counts_groups') A3_STUDIES = create_bash_task_nested(group_id, 'A3_studies') A4_COUNTS2_STUDIES = create_bash_task_nested(group_id, 'A4_counts2_studies') A1_COUNTS_MULTIDAY_RUNS >> A2_ARTERIES_COUNTS_GROUPS A2_ARTERIES_COUNTS_GROUPS >> A3_STUDIES A3_STUDIES >> A4_COUNTS2_STUDIES """ open_data_tmcs Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/) dataset for the City of Toronto Open Data Portal. The dataset is exposed in two ways: via database, and via HTTP. We store the dataset as a series of views in the `open_data` schema. We also dump those views to CSV files at `/data/open_data`, which is served from `/open_data` on our ETL EC2 instances. This is intended to run after `group_multiday_counts`. 
""" with TaskGroup(group_id='open_data_tmcs') as open_data_tmcs: group_id = 'open_data_tmcs' A1_TMCS_COUNT_DATA = create_bash_task_nested(group_id, 'A1_tmcs_count_data') A1_TMCS_COUNT_METADATA = create_bash_task_nested( group_id, 'A1_tmcs_count_metadata') A2_TMCS_LOCATIONS = create_bash_task_nested(group_id, 'A2_tmcs_locations') A3_TMCS_JOINED = create_bash_task_nested(group_id, 'A3_tmcs_joined') A4_TMCS_DECADES = create_bash_task_nested(group_id, 'A4_tmcs_decades') A4_TMCS_PREVIEW = create_bash_task_nested(group_id, 'A4_tmcs_preview') A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS A2_TMCS_LOCATIONS >> A3_TMCS_JOINED A3_TMCS_JOINED >> A4_TMCS_DECADES A3_TMCS_JOINED >> A4_TMCS_PREVIEW replicator_update_schema >> copy_gis_layers replicator_update_schema >> copy_opendata_shapefiles [copy_gis_layers, copy_opendata_shapefiles] >> centreline_conflation_target [copy_gis_layers, copy_opendata_shapefiles] >> gis_layers_vector_tiles centreline_conflation_target >> location_search_index centreline_conflation_target >> centreline_vector_tiles centreline_conflation_target >> arteries_geocoding centreline_conflation_target >> crash_geocoding crash_geocoding >> collisions_vector_tiles arteries_geocoding >> group_multidirection_arteries group_multidirection_arteries >> group_multiday_counts group_multiday_counts >> open_data_tmcs
max_active_runs=3, schedule_interval="@daily", default_args={ "email_on_failure": False, "email_on_retry": False, "retries": 1, "retry_delay": timedelta(minutes=1), }, catchup=False, template_searchpath="/usr/local/airflow/include", ) as dag: t0 = DummyOperator(task_id="start") # Define Task Group with Postgres Queries with TaskGroup("covid_table_queries") as covid_table_queries: for state in states: generate_files = PostgresOperator( task_id="covid_query_{0}".format(state), postgres_conn_id="gpdb", sql="covid_state_query.sql", params={"state": "'" + state + "'"}, ) # Define task to send email send_email = EmailOperator( task_id="send_email", to=email_to, subject="Covid Greenplum Queries DAG", html_content= "<p>The Covid queries were run on Greenplum successfully. <p>",