Example 1
def test_build_task_group_context_manager():
    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_context_manager",
             start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234") as group234:
            _ = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                _ = DummyOperator(task_id="task3")
                _ = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")
        task1 >> group234
        group34 >> task5

    assert task1.get_direct_relative_ids(upstream=False) == {
        'group234.group34.task4',
        'group234.group34.task3',
        'group234.task2',
    }
    assert task5.get_direct_relative_ids(upstream=True) == {
        'group234.group34.task4',
        'group234.group34.task3',
    }

    assert dag.task_group.group_id is None
    assert dag.task_group.is_root
    assert set(
        dag.task_group.children.keys()) == {"task1", "group234", "task5"}
    assert group34.group_id == "group234.group34"

    assert task_group_to_dict(dag.task_group) == EXPECTED_JSON
Example 2
def test_build_task_group_with_prefix():
    """
    Tests that prefix_group_id turns on/off prefixing of task_id with group_id.
    """
    execution_date = pendulum.parse("20200101")
    with DAG("test_build_task_group_with_prefix", start_date=execution_date) as dag:
        task1 = DummyOperator(task_id="task1")
        with TaskGroup("group234", prefix_group_id=False) as group234:
            task2 = DummyOperator(task_id="task2")

            with TaskGroup("group34") as group34:
                task3 = DummyOperator(task_id="task3")

                with TaskGroup("group4", prefix_group_id=False) as group4:
                    task4 = DummyOperator(task_id="task4")

        task5 = DummyOperator(task_id="task5")
        task1 >> group234
        group34 >> task5

    assert task2.task_id == "task2"
    assert group34.group_id == "group34"
    assert task3.task_id == "group34.task3"
    assert group4.group_id == "group34.group4"
    assert task4.task_id == "task4"
    assert task5.task_id == "task5"
    assert group234.get_child_by_label("task2") == task2
    assert group234.get_child_by_label("group34") == group34
    assert group4.get_child_by_label("task4") == task4

    assert extract_node_id(task_group_to_dict(dag.task_group), include_label=True) == {
        'id': None,
        'label': None,
        'children': [
            {
                'id': 'group234',
                'label': 'group234',
                'children': [
                    {
                        'id': 'group34',
                        'label': 'group34',
                        'children': [
                            {
                                'id': 'group34.group4',
                                'label': 'group4',
                                'children': [{'id': 'task4', 'label': 'task4'}],
                            },
                            {'id': 'group34.task3', 'label': 'task3'},
                            {'id': 'group34.downstream_join_id', 'label': ''},
                        ],
                    },
                    {'id': 'task2', 'label': 'task2'},
                    {'id': 'group234.upstream_join_id', 'label': ''},
                ],
            },
            {'id': 'task1', 'label': 'task1'},
            {'id': 'task5', 'label': 'task5'},
        ],
    }
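The assertions above capture the prefixing rule: prefix_group_id=False suppresses prefixing only for the children of that particular group, while nested groups that keep the default still prefix their own children. A minimal sketch of the same rule, assuming Airflow 2.x imports (the DAG and task names are illustrative):

import pendulum
from airflow import DAG
from airflow.operators.dummy import DummyOperator
from airflow.utils.task_group import TaskGroup

with DAG("prefix_demo", start_date=pendulum.parse("20200101")):
    with TaskGroup("outer", prefix_group_id=False):
        a = DummyOperator(task_id="a")      # stays "a": the enclosing group does not prefix
        with TaskGroup("inner") as inner:   # group_id stays "inner" for the same reason
            b = DummyOperator(task_id="b")  # becomes "inner.b": "inner" keeps the default

assert a.task_id == "a"
assert inner.group_id == "inner"
assert b.task_id == "inner.b"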
Example 3
    def test_task_group_serialization(self):
        """
        Test TaskGroup serialization/deserialization.
        """
        from airflow.operators.dummy_operator import DummyOperator
        from airflow.utils.task_group import TaskGroup

        execution_date = datetime(2020, 1, 1)
        with DAG("test_task_group_serialization",
                 start_date=execution_date) as dag:
            task1 = DummyOperator(task_id="task1")
            with TaskGroup("group234") as group234:
                _ = DummyOperator(task_id="task2")

                with TaskGroup("group34") as group34:
                    _ = DummyOperator(task_id="task3")
                    _ = DummyOperator(task_id="task4")

            task5 = DummyOperator(task_id="task5")
            task1 >> group234
            group34 >> task5

        dag_dict = SerializedDAG.to_dict(dag)
        SerializedDAG.validate_schema(dag_dict)
        json_dag = SerializedDAG.from_json(SerializedDAG.to_json(dag))
        self.validate_deserialized_dag(json_dag, dag)

        serialized_dag = SerializedDAG.deserialize_dag(
            SerializedDAG.serialize_dag(dag))

        assert serialized_dag.task_group.children
        assert serialized_dag.task_group.children.keys(
        ) == dag.task_group.children.keys()

        def check_task_group(node):
            try:
                children = node.children.values()
            except AttributeError:
                # Round-trip serialization and check the result
                expected_serialized = SerializedBaseOperator.serialize_operator(
                    dag.get_task(node.task_id))
                expected_deserialized = SerializedBaseOperator.deserialize_operator(
                    expected_serialized)
                expected_dict = SerializedBaseOperator.serialize_operator(
                    expected_deserialized)
                assert node
                assert SerializedBaseOperator.serialize_operator(
                    node) == expected_dict
                return

            for child in children:
                check_task_group(child)

        check_task_group(serialized_dag.task_group)
Example 4
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)

        with TaskGroup("Index Results", prefix_group_id=False,
                       dag=self.dag) as post_steps:
            index_status_task = self._get_status_indexer().get_index_task()

        install_cluster >> benchmarks >> [post_steps, cleanup_cluster]
Example 5
    def build(self):
        installer = self._get_openshift_installer()
        initialize_cluster = installer.initialize_cluster_task()
        connect_to_platform = self._get_platform_connector().get_task()

        with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
            utils_tasks = self._get_scale_ci_diagnosis().get_utils()
            chain(*utils_tasks)

        with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)

        initialize_cluster >> connect_to_platform >> benchmarks >> utils
Example 6
        def create_section():
            """
            Create tasks in the outer section.
            There is a broken link in the course, so I copy-pasted the example from gridU.
            """
            dummies = [DummyOperator(task_id=f'task-{i + 1}') for i in range(5)]

            with TaskGroup("inside_section_1") as inside_section_1:
                _ = [DummyOperator(task_id=f'task-{i + 1}',) for i in range(3)]

            with TaskGroup("inside_section_2") as inside_section_2:
                _ = [DummyOperator(task_id=f'task-{i + 1}',) for i in range(3)]

            dummies[-1] >> inside_section_1
            dummies[-2] >> inside_section_2
Example 7
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
            utils_tasks = self._get_scale_ci_diagnosis().get_utils()
            chain(*utils_tasks)
            utils_tasks[-1] >> cleanup_cluster
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)
            benchmark_tasks[-1] >> utils

        install_cluster >> benchmarks
Example 8
def meetime_slowly_changing_dimensions():
    """
    Extraction of Meetime dimensions that have no update tracking; these dimensions
    are uploaded once a day at 17:00 (see the cron expression above).
    """

    items = ['users', 'cadences', 'company']

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end', trigger_rule='none_failed')

    with TaskGroup(group_id="meetime") as tg:
        for item in items:
            extract_recents = MeetimeRecentsOperator(
                task_id=str('extract_' + item),
                item=item,
                since_timestamp=Variable.get(
                    f"meetime_last_update_timestamp_{item}",
                    default_var='2021-01-01 00:00:00'),
                s3_connection_id='movilake',
                connection_id='meetime_api')

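            # Bare reference only: the operator was already registered with the enclosing
            # TaskGroup when it was instantiated inside the "with" block, so no explicit
            # dependency wiring is needed here.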
            extract_recents

    start >> tg >> end
Example 9
def meetime_incremental():
    """
    Incremental extraction of the entities listed below from the Meetime API.
   
    """

    items = [
        'calls', 'demos', 'leads', 'leads/custom-fields', 'prospections',
        'prospections/activities', 'prospections/lost-reasons'
    ]

    start = DummyOperator(task_id='start')
    end = DummyOperator(task_id='end', trigger_rule='none_failed')

    with TaskGroup(group_id="meetime") as tg:
        for item in items:
            # Airflow does not support slashes in task ids
            item_remove_slash = item.replace('/', '_')
            extract_recents = MeetimeRecentsOperator(
                task_id=str('extract_' + item_remove_slash),
                item=item,
                since_timestamp=Variable.get(
                    f"meetime_last_update_timestamp_{item}",
                    default_var='2021-01-01 00:00:00'),
                s3_connection_id='movilake',
                connection_id='meetime_api')

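            # As above, the bare reference is enough; the operator is already attached
            # to the TaskGroup via the context manager.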
            extract_recents

    start >> tg >> end
Example 10
    def _add_benchmarks(self, task_group):
        with TaskGroup(task_group, prefix_group_id=True,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks(
                task_group).get_benchmarks()
            chain(*benchmark_tasks)
        return benchmarks
Example 11
def home_scraper_section():
    """
    Create tasks in the outer section.
    """

    params = ({
        "workers": 5,
        "start_page": start_page,
        "end_page": end_page,
        "pagesize": 15,
        "verbose": True,
    } for start_page, end_page in {(1, 5), (5, 15), (15, 20)})

    home_ok = DummyOperator(task_id="home-ok")  # replace with apicheck

    with TaskGroup("home_scraper") as home_scraper:
        _ = [
            ScrapEstateOperator(
                task_id=f"home-{i + 1}",
                url="https://home.dk/umbraco/backoffice/home-api/Search",
                api_name="home.dk",
                scraper_cls=Home,
                params=param,
            ) for i, param in enumerate(params)
        ]

    home_ok >> home_scraper
Example 12
    def _get_benchmarks(self, benchmarks):
        for index, benchmark in enumerate(benchmarks):
            if 'benchmarks' not in benchmark:
                benchmarks[index] = self._get_benchmark(benchmark)
            elif 'group' in benchmark:
                with TaskGroup(benchmark['group'], prefix_group_id=False, dag=self.dag) as task_group:
                    benchmarks[index] = self._get_benchmarks(benchmark['benchmarks'])
            else:
                benchmarks[index] = self._get_benchmarks(benchmark['benchmarks'])
        return benchmarks
Example 13
def test_build_task_group():
    """
    This is an alternative syntax to use TaskGroup. It should result in the same TaskGroup
    as using context manager.
    """
    execution_date = pendulum.parse("20200101")
    dag = DAG("test_build_task_group", start_date=execution_date)
    task1 = DummyOperator(task_id="task1", dag=dag)
    group234 = TaskGroup("group234", dag=dag)
    _ = DummyOperator(task_id="task2", dag=dag, task_group=group234)
    group34 = TaskGroup("group34", dag=dag, parent_group=group234)
    _ = DummyOperator(task_id="task3", dag=dag, task_group=group34)
    _ = DummyOperator(task_id="task4", dag=dag, task_group=group34)
    task5 = DummyOperator(task_id="task5", dag=dag)

    task1 >> group234
    group34 >> task5

    assert task_group_to_dict(dag.task_group) == EXPECTED_JSON
Example 14
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        connect_to_platform = self._get_platform_connector().get_task()
        final_status = final_dag_status.get_task(self.dag)

        with TaskGroup("utils", prefix_group_id=False, dag=self.dag) as utils:
            utils_tasks = self._get_scale_ci_diagnosis().get_utils()
            chain(*utils_tasks)

        with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)

        if self.config.cleanup_on_success:
            cleanup_cluster = installer.get_cleanup_task()
            install_cluster >> connect_to_platform >> benchmarks >> utils >> cleanup_cluster >> final_status
        else:
            install_cluster >> connect_to_platform >> benchmarks >> utils
Example 15
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        cleanup_cluster = installer.get_cleanup_task()
        with TaskGroup("benchmarks", prefix_group_id=False,
                       dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)
            benchmark_tasks[-1] >> cleanup_cluster

        install_cluster >> benchmarks
Example 16
        def factory(*args, **kwargs):
            # Generate signature for decorated function and bind the arguments when called
            # we do this to extract parameters so we can annotate them on the DAG object.
            # In addition, this fails if we are missing any args/kwargs with TypeError as expected.
            # Apply defaults to capture default values if set.

            # Initialize TaskGroup with bound arguments
            with TaskGroup(*task_group_bound_args.args,
                           add_suffix_on_collision=True,
                           **task_group_bound_args.kwargs):
                # Invoke function to run Tasks inside the TaskGroup
                return f(*args, **kwargs)
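This factory appears to be the core of Airflow's task_group decorator: the wrapped callable runs inside a TaskGroup built from the decorator's bound arguments, and add_suffix_on_collision=True is what allows the same decorated function to be invoked more than once. A minimal usage sketch, assuming Airflow 2.1+ where the decorator is exposed as airflow.decorators.task_group (DAG and task names are illustrative):

from airflow import DAG
from airflow.decorators import task, task_group
from airflow.utils.dates import days_ago

@task
def extract():
    return {"rows": 42}

@task
def load(payload):
    print(payload)

@task_group(group_id="etl_group")
def etl():
    # Tasks created in here are collected into the "etl_group" TaskGroup.
    load(extract())

with DAG("taskflow_group_demo", start_date=days_ago(1), schedule_interval=None):
    etl()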
Example 17
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)

        if self.config.cleanup_on_success:
            cleanup_cluster = installer.get_cleanup_task()
            install_cluster >> benchmarks >> cleanup_cluster
        else:
            install_cluster >> benchmarks
Example 18
def test_taskgroup_dag():
    """Creates a test DAG with a few operators to test on, with some in a task group."""
    def f(task_id):
        return f"OP:{task_id}"

    with DAG(dag_id="test_xcom_dag", default_args=DEFAULT_ARGS) as dag:
        op1 = PythonOperator(python_callable=f, task_id="test_op_1")
        op4 = PythonOperator(python_callable=f, task_id="test_op_4")
        with TaskGroup("group_1") as group:
            op2 = PythonOperator(python_callable=f, task_id="test_op_2")
            op3 = PythonOperator(python_callable=f, task_id="test_op_3")
        return dag, group, (op1, op2, op3, op4)
Example 19
def test_duplicate_group_id():
    from airflow.exceptions import DuplicateTaskIdFound

    execution_date = pendulum.parse("20200101")

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'task1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("task1"):
                pass

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1", prefix_group_id=False):
                with TaskGroup("group1"):
                    pass

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            with TaskGroup("group1", prefix_group_id=False):
                _ = DummyOperator(task_id="group1")

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.downstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                _ = DummyOperator(task_id="downstream_join_id")

    with pytest.raises(DuplicateTaskIdFound, match=r".* 'group1.upstream_join_id' .*"):
        with DAG("test_duplicate_group_id", start_date=execution_date):
            _ = DummyOperator(task_id="task1")
            with TaskGroup("group1"):
                _ = DummyOperator(task_id="upstream_join_id")
Example 20
def dagGroup():
    """
    * When using with DAG, the Operators inside it do not seem to need the dag parameter.
    * When a Group is created and tasks are wired to it at the DAG level, the Group also
      carries the DAG, so the tasks inside the Group are created on that DAG.
    """
    with DAG(dag_id="example_task_group",
             start_date=days_ago(2),
             tags=["example"]) as dag:
        start = DummyOperator(task_id="start")

        # [START howto_task_group_section_1]
        with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1:
            task_1 = DummyOperator(task_id="task_1")
            task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
            task_3 = DummyOperator(task_id="task_3")

            task_1 >> [task_2, task_3]
        # [END howto_task_group_section_1]

        # [START howto_task_group_section_2]
        with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2:
            task_1 = DummyOperator(task_id="task_1")

            # [START howto_task_group_inner_section_2]
            with TaskGroup("inner_section_2",
                           tooltip="Tasks for inner_section2") as inner_section_2:
                task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
                task_3 = DummyOperator(task_id="task_3")
                task_4 = DummyOperator(task_id="task_4")

                [task_2, task_3] >> task_4
            # [END howto_task_group_inner_section_2]

        # [END howto_task_group_section_2]

        end = DummyOperator(task_id='end')

        start >> section_1 >> section_2 >> end
Example 21
    def __init__(self,
                 dag=None,
                 dbt_global_cli_flags=None,
                 dbt_project_dir=None,
                 dbt_profiles_dir=None,
                 dbt_target=None,
                 dbt_tag=None,
                 dbt_run_group_name='dbt_run',
                 dbt_test_group_name='dbt_test'
                 ):

        self.dag = dag
        self.dbt_global_cli_flags = dbt_global_cli_flags
        self.dbt_project_dir = dbt_project_dir
        self.dbt_profiles_dir = dbt_profiles_dir
        self.dbt_target = dbt_target
        self.dbt_tag = dbt_tag

        self.dbt_run_group = TaskGroup(dbt_run_group_name)
        self.dbt_test_group = TaskGroup(dbt_test_group_name)

        # Compile the manifest, then parse it and populate the two task groups
        self.compile_dbt()
        self.make_dbt_task_groups()
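A hedged sketch of how a builder like this might be wired into a DAG; the class name DbtTaskGroupBuilder is an assumption (only the constructor is shown above), and the argument values are illustrative:

from airflow import DAG
from airflow.utils.dates import days_ago

with DAG("dbt_example", start_date=days_ago(1), schedule_interval="@daily") as dag:
    builder = DbtTaskGroupBuilder(   # hypothetical name for the class whose __init__ is shown above
        dag=dag,
        dbt_global_cli_flags="--no-write-json",
        dbt_project_dir="/usr/local/airflow/dbt",
        dbt_profiles_dir="/usr/local/airflow/dbt",
        dbt_target="prod",
    )
    # The constructor compiles the manifest and fills both groups;
    # run the dbt models before their tests.
    builder.dbt_run_group >> builder.dbt_test_group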
Example 22
    def build(self):
        installer = self._get_openshift_installer()
        install_cluster = installer.get_install_task()
        connect_to_platform = self._get_platform_connector().get_task()
        final_status = final_dag_status.get_task(self.dag)
        with TaskGroup("benchmarks", prefix_group_id=False, dag=self.dag) as benchmarks:
            benchmark_tasks = self._get_e2e_benchmarks().get_benchmarks()
            chain(*benchmark_tasks)
        
        rosa_post_installation = self._get_rosa_postinstall_setup()._get_rosa_postinstallation()

        if self.config.cleanup_on_success:
            cleanup_cluster = installer.get_cleanup_task()
            install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks >> cleanup_cluster >> final_status
        else:
            install_cluster >> rosa_post_installation >> connect_to_platform >> benchmarks
Example 23
    def test_multiple_calls_in_task_group(self):
        """Test calling task multiple times in a TaskGroup"""
        @task_decorator
        def do_run():
            return 4

        group_id = "KnightsOfNii"
        with self.dag:
            with TaskGroup(group_id=group_id):
                do_run()
                assert [f"{group_id}.do_run"] == self.dag.task_ids
                do_run()
                assert [f"{group_id}.do_run",
                        f"{group_id}.do_run__1"] == self.dag.task_ids

        assert len(self.dag.task_ids) == 2
Example 24
    def make_task_groups(task_groups: Dict[str, Any],
                         dag: DAG) -> Dict[str, "TaskGroup"]:
        """Takes a DAG and task group configurations. Creates TaskGroup instances.

        :param task_groups: Task group configuration from the YAML configuration file.
        :param dag: DAG instance that task groups to be added.
        """
        task_groups_dict: Dict[str, "TaskGroup"] = {}
        if version.parse(AIRFLOW_VERSION) >= version.parse("2.0.0"):
            for task_group_name, task_group_conf in task_groups.items():
                task_group_conf["group_id"] = task_group_name
                task_group_conf["dag"] = dag
                task_group = TaskGroup(
                    **{
                        k: v
                        for k, v in task_group_conf.items()
                        if k not in SYSTEM_PARAMS
                    })
                task_groups_dict[task_group.group_id] = task_group
        return task_groups_dict
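The configuration this helper consumes is a mapping of group names to TaskGroup keyword arguments, typically parsed from YAML. A hedged sketch of calling it, where make_task_groups refers to the helper defined above and is assumed to be callable as shown; the group names and tooltips are illustrative, and any keys filtered by SYSTEM_PARAMS are omitted:

from airflow import DAG
from airflow.utils.dates import days_ago

task_groups_config = {
    "extract": {"tooltip": "Pull raw data"},
    "transform": {"tooltip": "Clean and join", "prefix_group_id": False},
}

with DAG("config_driven_dag", start_date=days_ago(1)) as dag:
    groups = make_task_groups(task_groups_config, dag)
    # groups maps each group_id to its TaskGroup, e.g. groups["extract"]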
Example 25
def training_groups():
    with TaskGroup("trainings") as group:

        model_settings = Variable.get('avocado_dag_model_settings', deserialize_json=True)

        for feature in model_settings['max_features']:
            for estimator in model_settings['n_estimators']:
                ml_id = f"{feature}_{estimator}"
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb='/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    }
                )
    return group
Example 26
def training_group():
    with TaskGroup("trainings", tooltip="Training tasks") as group:
        n_estimators = [100, 150]
        max_features = ['auto', 'sqrt']
        for feature in max_features:
            for estimator in n_estimators:
                ml_id = f"{feature}_{estimator}"
                PapermillOperator(
                    task_id=f'training_model_{ml_id}',
                    input_nb=
                    '/usr/local/airflow/include/notebooks/avocado_prediction.ipynb',
                    output_nb=
                    f'/tmp/out-model-avocado-prediction-{ml_id}.ipynb',
                    pool='training_pool',
                    parameters={
                        'filepath': '/tmp/avocado.csv',
                        'n_estimators': estimator,
                        'max_features': feature,
                        'ml_id': ml_id
                    })
    return group
Example 27
default_args = {
    'owner': 'teste',
    'depends_on_past': False,
    'start_date': datetime(2019, 1, 1),
    'retries': 0,
}

with DAG('dag-pipeline-iris-aula-v1',
         schedule_interval=timedelta(minutes=10),
         catchup=False,
         default_args=default_args) as dag:

    start = DummyOperator(task_id="start")

    with TaskGroup("etl", tooltip="etl") as etl:

        t1 = BashOperator(dag=dag,
                          task_id='download_dataset',
                          bash_command="""
            cd {0}/featurestore
            curl -o iris.txt  https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data
            """.format(pathScript))

        [t1]

    with TaskGroup("preProcessing", tooltip="preProcessing") as preProcessing:
        t2 = BashOperator(dag=dag,
                          task_id='encoder_dataset',
                          bash_command="""
            cd {0}
Example 28
"""Example DAG demonstrating the usage of the TaskGroup."""

from airflow.models.dag import DAG
from airflow.operators.bash import BashOperator
from airflow.operators.dummy import DummyOperator
from airflow.utils.dates import days_ago
from airflow.utils.task_group import TaskGroup

# [START howto_task_group]
with DAG(dag_id="example_task_group", start_date=days_ago(2),
         tags=["example"]) as dag:
    start = DummyOperator(task_id="start")

    # [START howto_task_group_section_1]
    with TaskGroup("section_1", tooltip="Tasks for section_1") as section_1:
        task_1 = DummyOperator(task_id="task_1")
        task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
        task_3 = DummyOperator(task_id="task_3")

        task_1 >> [task_2, task_3]
    # [END howto_task_group_section_1]

    # [START howto_task_group_section_2]
    with TaskGroup("section_2", tooltip="Tasks for section_2") as section_2:
        task_1 = DummyOperator(task_id="task_1")

        # [START howto_task_group_inner_section_2]
        with TaskGroup("inner_section_2",
                       tooltip="Tasks for inner_section2") as inner_section_2:
            task_2 = BashOperator(task_id="task_2", bash_command='echo 1')
Example 29
def taskflow_dag():
    # Update replicator tables
    # This task group will take the new tables created by the on-prem replicator under the schema TRAFFIC_NEW
    # and alter the schema to TRAFFIC_INTER and then create the materialized view TRAFFIC on top
    # but it will only do this if the TRAFFIC_NEW table exists (i.e. a new dataset was written)
    with TaskGroup(
            group_id="replicator_update_schema") as replicator_update_schema:
        group_id = "replicator_update_schema"

        ACC = create_bash_task_nested(group_id, 'acc')
        ARC_LINK = create_bash_task_nested(group_id, 'arc_link')
        ARTERYDATA = create_bash_task_nested(group_id, 'arterydata')
        CATEGORY = create_bash_task_nested(group_id, "category")
        CNT_DET = create_bash_task_nested(group_id, 'cnt_det')
        CNT_SPD = create_bash_task_nested(group_id, 'cnt_spd')
        COUNTINFO = create_bash_task_nested(group_id, 'countinfo')
        COUNTINFOMICS = create_bash_task_nested(group_id, 'countinfomics')
        DET = create_bash_task_nested(group_id, 'det')
        NODE = create_bash_task_nested(group_id, 'node')

        ACC >> ARC_LINK
        ARC_LINK >> ARTERYDATA
        ARTERYDATA >> CATEGORY
        CATEGORY >> CNT_DET
        CNT_DET >> CNT_SPD
        CNT_SPD >> COUNTINFO
        COUNTINFO >> COUNTINFOMICS
        COUNTINFOMICS >> DET
        DET >> NODE
        NODE

    # GCC's ArcGIS REST API server exposes a series of "services", each with a name like
    # `cot_geospatial2`.  Within those services, individual layers have an ID
    # (in parentheses, after the layer name).
    with TaskGroup(group_id="copy_gis_layers") as copy_gis_layers:
        TASKS = {
            'bikeway': ('cot_geospatial2', 2),
            'accessible_signal': ('cot_geospatial2', 4),
            'pedestrian_crossover': ('cot_geospatial2', 7),
            'traffic_signal': ('cot_geospatial2', 9),
            'hospital': ('cot_geospatial10', 21),
            'toinview_program_point': ('cot_geospatial12', 46),
            'toinview_program_line': ('cot_geospatial12', 47),
            'toinview_program_polygon': ('cot_geospatial12', 48),
            'school': ('cot_geospatial28', 17)
        }
        for task_id, task_args in TASKS.items():
            mapserver_name, layer_id = task_args
            params = {'mapserver_name': mapserver_name, 'layer_id': layer_id}
            bash_task = BashOperator(task_id=task_id,
                                     bash_command='/copy_gis_layer.sh',
                                     params=params)
            bash_task

    # The Open Data Portal (i.e. CKAN) stores resources at URLs of format
    # `${BASE_URL}/dataset/${DATASET_ID}/resource/${RESOURCE_ID}/download/${FILENAME}`.
    #
    # To find these resource URLs:
    #
    # - find the dataset in the Open Data Portal (for instance, the Toronto Centreline
    #   is at https://open.toronto.ca/dataset/toronto-centreline-tcl/);
    # - open the "For Developers" tab in the carousel;
    # - find the dataset ID listed in `params`;
    # - use this to request `${BASE_URL}/action/package_show?id=${DATASET_ID}`;
    # - in there, look for the URL under `result.resources[].url`.
    with TaskGroup(
            group_id="copy_opendata_shapefiles") as copy_opendata_shapefiles:
        group_id = "copy_opendata_shapefiles"

        TASK_GROUP = {
            'centreline': {
                'resource_url':
                'https://ckanadmin0.intra.prod-toronto.ca/dataset/1d079757-377b-4564-82df-eb5638583bfb/resource/7209841e-e59c-49e4-9205-3b0587f2eea9/download/centreline_wgs84_v2.zip',
                'source_srid': 3857
            },
            'centreline_intersection': {
                'resource_url':
                'https://ckan0.cf.opendata.inter.prod-toronto.ca/dataset/2c83f641-7808-49ba-b80f-7011851d4e27/resource/c2fc0db0-7dcd-4c13-a54c-f39debc441bd/download/intersection-file-wgs84.zip',
                'source_srid': 4326
            }
        }

        for task_id, params in TASK_GROUP.items():
            task_id_extract = '{0}_extract'.format(task_id)

            with TaskGroup(group_id=f'{task_id}'):

                INDEX_OPENDATA = BashOperator(
                    task_id='index_opendata',
                    bash_command='/copy_opendata_shapefiles/index_opendata.sh'.
                    format(task_id=task_id))

                EXTRACT_OPENDATA_SHAPEFILE = BashOperator(
                    task_id=task_id_extract,
                    bash_command=
                    '/copy_opendata_shapefiles/extract_opendata_shapefile.sh',
                    params={
                        'name': task_id,
                        'resource_url': params['resource_url']
                    })

                task_id_load = '{0}_load'.format(task_id)

                LOAD_SHAPEFILE = BashOperator(
                    task_id=task_id_load,
                    bash_command='/copy_opendata_shapefiles/load_shapefile.sh',
                    params={
                        'name': task_id,
                        'source_srid': params['source_srid']
                    })

                EXTRACT_OPENDATA_SHAPEFILE >> LOAD_SHAPEFILE >> INDEX_OPENDATA

    # centreline_conflation_target
    #
    # Normalize the Toronto Centreline into common _conflation target_ and _routing target_
    # views, for use by other pipelines.
    #
    # The conflation target consists of two views `centreline.midblocks`, `centreline.intersections`.
    # The midblocks and intersections in these views are shown on MOVE's map.  When we conflate
    # collisions and traffic studies to the centreline, we only conflate those to centreline features
    # that are in this conflation target.
    #
    # The routing target consists of two views `centreline.routing_vertices`, `centreline.routing_edges`
    # and is a superset of the conflation target.  This exists because the conflation target is not a
    # valid graph (in the graph theory sense); some midblock endpoints refer to intersection IDs that do
    # not correspond to actual intersections.  To fix this, the routing target fills in vertices for
    # those intersection IDs.  When routing corridors between centreline features, we use the routing
    # target, then filter the result down to only those features in the conflation target.
    #
    # This is intended to run after `copy_opendata_shapefiles`.
    with TaskGroup(group_id="centreline_conflation_target"
                   ) as centreline_conflation_target:
        group_id = "centreline_conflation_target"

        A0_INTERSECTIONS_BASE = create_bash_task_nested(
            group_id, 'A0_intersections_base')
        A0_MIDBLOCKS_BASE = create_bash_task_nested(group_id,
                                                    'A0_midblocks_base')
        A1_INTERSECTION_IDS = create_bash_task_nested(group_id,
                                                      'A1_intersection_ids')
        A2_INTERSECTIONS = create_bash_task_nested(group_id,
                                                   "A2_intersections")
        A3_MIDBLOCK_NAMES = create_bash_task_nested(group_id,
                                                    'A3_midblock_names')
        A4_MIDBLOCKS = create_bash_task_nested(group_id, 'A4_midblocks')
        A5_ROUTING_VERTICES = create_bash_task_nested(group_id,
                                                      'A5_routing_vertices')
        A6_ROUTING_EDGES = create_bash_task_nested(group_id,
                                                   'A6_routing_edges')

        [A0_INTERSECTIONS_BASE, A0_MIDBLOCKS_BASE] >> A1_INTERSECTION_IDS
        A1_INTERSECTION_IDS >> A2_INTERSECTIONS
        A2_INTERSECTIONS >> A3_MIDBLOCK_NAMES
        A3_MIDBLOCK_NAMES >> A4_MIDBLOCKS
        A4_MIDBLOCKS >> A5_ROUTING_VERTICES
        A5_ROUTING_VERTICES >> A6_ROUTING_EDGES

    # """
    # gis_layers_vector_tiles
    #
    # Generates vector tiles from GIS layers provided by GCC, which are loaded into our database by
    # the `copy_gis_layers` DAG.  These are stored in `/data/tiles`, and are served from `/tiles` on
    # our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render information
    # about schools, hospitals, and other points of interest when zoomed in.
    #
    # This is intended to run after `copy_gis_layers`.
    # """
    with TaskGroup(
            group_id="gis_layers_vector_tiles") as gis_layers_vector_tiles:
        BUILD_GIS_LAYERS_TILES = create_bash_task('build_gis_layers_tiles')
        EXTRACT_GIS_LAYERS_TILES = create_bash_task('extract_gis_layers_tiles')

        BUILD_GIS_LAYERS_TILES >> EXTRACT_GIS_LAYERS_TILES
    """
    location_search_index

    Builds the views and indexes that support location search, and also builds an index of midblock
    names.

    This is intended to run after `centreline_conflation_target` and `copy_gis_layers`
    """
    with TaskGroup(group_id="location_search_index") as location_search_index:
        group_id = "location_search_index"

        TRANSFORM_CENTRELINE_INDEX = create_bash_task_nested(
            group_id, 'transform_centreline_index')
        TRANSFORM_INTERSECTIONS_INDEX = create_bash_task_nested(
            group_id, 'transform_intersections_index')
        TRANSFORM_TRAFFIC_SIGNAL = create_bash_task_nested(
            group_id, 'transform_traffic_signal')

        TRANSFORM_TRAFFIC_SIGNAL
        TRANSFORM_CENTRELINE_INDEX >> TRANSFORM_INTERSECTIONS_INDEX
    """
    centreline_vector_tiles

    Generates vector tiles from the MOVE conflation target, which is built by the
    `centreline_conflation_target` DAG.  These are stored in `/data/tiles`, and are served from
    `/tiles` on our web EC2 instances; they are used by `FcPaneMap` in the web frontend to render
    interactive centreline features.

    This is intended to run after `centreline_conflation_target`.
    """
    with TaskGroup(
            group_id='centreline_vector_tiles') as centreline_vector_tiles:
        group_id = 'centreline_vector_tiles'

        LOAD_VOLUME = create_bash_task_nested(group_id, 'load_volume')
        BUILD_VECTOR_TILES = create_bash_task_nested(group_id,
                                                     'build_vector_tiles')
        EXTRACT_VECTOR_TILES = create_bash_task_nested(group_id,
                                                       'extract_vector_tiles')

        LOAD_VOLUME >> BUILD_VECTOR_TILES
        BUILD_VECTOR_TILES >> EXTRACT_VECTOR_TILES
    """
    arteries_geocoding

    Uses arterycode matching information and processes as originally developed by Data + Analytics to
    link counts with the Toronto centreline.

    The legacy FLOW system was not based on the Toronto Centreline, but rather used a legacy map
    layer that is no longer supported.  In FLOW, arterycodes identified locations in that legacy
    map layer.  To use these with the Toronto Centreline, we apply a series of heuristics developed
    by Data + Analytics: ID matching on `LINKID`, spatial matches, etc.

    This is the first step in our FLOW geocoding cascade, which continues with the DAGs
    `group_multidirection_arteries` and `group_multiday_counts`.  All three DAGs must run before
    MOVE is considered to have updated its copy of FLOW data.

    This is intended to run after `replicator_transfer_flow` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="arteries_geocoding") as arteries_geocoding:
        group_id = "arteries_geocoding"

        A1_ARTERIES_MANUAL_CORR = create_bash_task_nested(
            group_id, 'A1_arteries_manual_corr')
        A1_NODES_CORRECTED = create_bash_task_nested(group_id,
                                                     'A1_nodes_corrected')
        A2_NODES_CENTRELINE = create_bash_task_nested(group_id,
                                                      'A2_nodes_centreline')
        B1_ARTERIES_PX_CENTRELINE = create_bash_task_nested(
            group_id, 'B1_arteries_px_centreline')
        B2_ARTERIES_MANUAL_CORR_NORMALIZED = create_bash_task_nested(
            group_id, 'B2_arteries_manual_corr_normalized')
        C1_ARTERIES_LINKS = create_bash_task_nested(group_id,
                                                    'C1_arteries_links')
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS = create_bash_task_nested(
            group_id, 'C2_arteries_double_link_midblocks')
        C2_ARTERIES_DOUBLE_NODE = create_bash_task_nested(
            group_id, 'C2_arteries_double_node')
        C2_ARTERIES_SINGLE_NODE = create_bash_task_nested(
            group_id, 'C2_arteries_single_node')
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS = create_bash_task_nested(
            group_id, 'C3_arteries_double_node_midblocks')
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST = create_bash_task_nested(
            group_id, 'C4_arteries_double_node_midblocks_multi_best')
        D1_ARTERIES_CENTRELINE_TABLE = create_bash_task_nested(
            group_id, 'D1_arteries_centreline_table')
        D2_ARTERY_GEOCODING = create_bash_task_nested(group_id,
                                                      'D2_artery_geocoding')
        D3_ARTERIES_CENTRELINE_VIEW = create_bash_task_nested(
            group_id, 'D3_arteries_centreline_view')

        A1_NODES_CORRECTED >> A2_NODES_CENTRELINE
        A1_ARTERIES_MANUAL_CORR >> B2_ARTERIES_MANUAL_CORR_NORMALIZED
        A2_NODES_CENTRELINE >> C2_ARTERIES_DOUBLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_NODE
        A2_NODES_CENTRELINE >> C2_ARTERIES_SINGLE_NODE
        C1_ARTERIES_LINKS >> C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS
        C1_ARTERIES_LINKS >> C2_ARTERIES_SINGLE_NODE
        C2_ARTERIES_DOUBLE_NODE >> C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST
        A2_NODES_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B1_ARTERIES_PX_CENTRELINE >> D1_ARTERIES_CENTRELINE_TABLE
        B2_ARTERIES_MANUAL_CORR_NORMALIZED >> D1_ARTERIES_CENTRELINE_TABLE
        C1_ARTERIES_LINKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_LINK_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_SINGLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C2_ARTERIES_DOUBLE_NODE >> D1_ARTERIES_CENTRELINE_TABLE
        C3_ARTERIES_DOUBLE_NODE_MIDBLOCKS >> D1_ARTERIES_CENTRELINE_TABLE
        C4_ARTERIES_DOUBLE_NODE_MIDBLOCKS_MULTI_BEST >> D1_ARTERIES_CENTRELINE_TABLE
        D1_ARTERIES_CENTRELINE_TABLE >> D2_ARTERY_GEOCODING
        D2_ARTERY_GEOCODING >> D3_ARTERIES_CENTRELINE_VIEW
    """
    crash_geocoding

    Normalizes CRASH data into collision _events_ and collision _involved persons_, then matches
    collision events to the centreline conflation target that was created by
    `centreline_conflation_target`.

    Our legacy schema in Oracle stores both event-related and involved-person-related information
    in a single table, `TRAFFIC.ACC`.  That table has one record per involved person, with event-level
    details copied across all persons involved in a collision.  To make this easier to work with in
    MOVE, we transform `TRAFFIC.ACC` into a normalized representation.

    To match collisions to the centreline, we use the following heuristic:

    - if there are any intersections within 20m, match to the closest such intersection;
    - otherwise, if there are any midblocks within 20m, match to the closest such midblock;
    - otherwise, do not match.

    This same heuristic was used by the legacy CRASH system to assign collisions to intersections
    and midblocks.  (However, CRASH did not use the Toronto Centreline, but instead used a legacy
    map layer that has been deprecated and is no longer maintained by the City.)

    This is intended to run after `replicator_transfer_crash` and `centreline_conflation_target`.
    """
    with TaskGroup(group_id="crash_geocoding") as crash_geocoding:
        group_id = 'crash_geocoding'

        A1_EVENTS_FIELDS_RAW = create_bash_task_nested(group_id,
                                                       'A1_events_fields_raw')
        A2_EVENTS_FIELDS_NORM = create_bash_task_nested(
            group_id, 'A2_events_fields_norm')
        A2_INVOLVED_FIELDS_RAW = create_bash_task_nested(
            group_id, 'A2_involved_fields_raw')
        A3_INVOLVED_FIELDS_NORM = create_bash_task_nested(
            group_id, 'A3_involved_fields_norm')
        A4_INVOLVED = create_bash_task_nested(group_id, 'A4_involved')
        A5_EVENTS = create_bash_task_nested(group_id, 'A5_events')
        A6_EVENTS_INTERSECTIONS = create_bash_task_nested(
            group_id, 'A6_events_intersections')
        A6_EVENTS_SEGMENTS = create_bash_task_nested(group_id,
                                                     'A6_events_segments')
        A7_EVENTS_CENTRELINE = create_bash_task_nested(group_id,
                                                       'A7_events_centreline')

        A1_EVENTS_FIELDS_RAW >> A2_EVENTS_FIELDS_NORM
        A1_EVENTS_FIELDS_RAW >> A2_INVOLVED_FIELDS_RAW
        A2_EVENTS_FIELDS_NORM >> A3_INVOLVED_FIELDS_NORM
        A2_INVOLVED_FIELDS_RAW >> A3_INVOLVED_FIELDS_NORM
        A3_INVOLVED_FIELDS_NORM >> A4_INVOLVED
        A4_INVOLVED >> A5_EVENTS
        A5_EVENTS >> A6_EVENTS_INTERSECTIONS
        A5_EVENTS >> A6_EVENTS_SEGMENTS
        A6_EVENTS_INTERSECTIONS >> A7_EVENTS_CENTRELINE
        A6_EVENTS_SEGMENTS >> A7_EVENTS_CENTRELINE
    """
    collisions_vector_tiles

    Generates vector tiles from collisions data, which is built by the `crash_geocoding` DAG.
    These are stored in `/data/tiles`, and are served from `/tiles` on our web EC2 instances; they
    are used by `FcPaneMap` in the web frontend to render collisions heatmaps when zoomed out.

    This is intended to run after `crash_geocoding`.
    """
    with TaskGroup(
            group_id="collisions_vector_tiles") as collisions_vector_tiles:
        group_id = "collisions_vector_tiles"

        BUILD_COLLISIONS_TILES = create_bash_task_nested(
            group_id, 'build_collisions_tiles')
        EXTRACT_COLLISIONS_TILES = create_bash_task_nested(
            group_id, 'extract_collisions_tiles')

        BUILD_COLLISIONS_TILES >> EXTRACT_COLLISIONS_TILES
    """
    group_multidirection_arteries

    Continues the FLOW geocoding process started by `arteries_geocoding`, by identifying arterycodes
    that refer to different directions of travel in the same location and grouping them together.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street;
    someone requesting this study would want to see all 3 days in both directions of travel.
    However, the legacy FLOW schema uses separate arterycodes for different directions of travel,
    and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    As a first step towards delivering all data for this study at once, we need to identify the
    arterycodes that correspond to these two directions of travel, and group them together.  Once
    that's done, the DAG `group_multiday_counts` then takes care of grouping together the 3 days
    of the traffic study, so that we can get all six relevant counts in database.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no
    reliable way to visualize that much data at once.

    This is intended to run after `arteries_geocoding`.
    """
    with TaskGroup(group_id='group_multidirection_arteries'
                   ) as group_multidirection_arteries:
        group_id = 'group_multidirection_arteries'

        A1_ARTERIES_DOUBLE_LINK_PAIRS = create_bash_task_nested(
            group_id, 'A1_arteries_double_link_pairs')
        A1_ARTERIES_MIDBLOCK_SOLO = create_bash_task_nested(
            group_id, 'A1_arteries_midblock_solo')
        A2_ARTERIES_GROUPS_PRE = create_bash_task_nested(
            group_id, 'A2_arteries_groups_pre')
        A3_ARTERIES_GROUPS_RANKED = create_bash_task_nested(
            group_id, 'A3_arteries_groups_ranked')
        A4_ARTERIES_GROUPS_POST = create_bash_task_nested(
            group_id, 'A4_arteries_groups_post')

        A1_ARTERIES_DOUBLE_LINK_PAIRS >> A2_ARTERIES_GROUPS_PRE
        A1_ARTERIES_MIDBLOCK_SOLO >> A2_ARTERIES_GROUPS_PRE
        A2_ARTERIES_GROUPS_PRE >> A3_ARTERIES_GROUPS_RANKED
        A3_ARTERIES_GROUPS_RANKED >> A4_ARTERIES_GROUPS_POST
    """
    group_multiday_counts

    Finishes the FLOW geocoding process started by `arteries_geocoding` and continued by
    `group_multidirection_arteries`, by identifying consecutive days of data collection from
    the same arterycode group and grouping those together into a single study.

    When a traffic study is requested, it might ask for 3 days of data collection on a 2-way street;
    someone requesting this study would want to see all 3 days in both directions of travel.
    However, the legacy FLOW schema uses separate arterycodes for different directions of travel,
    and also uses separate `COUNT_INFO_ID`s for each day of a traffic study.

    Once `group_multidirection_arteries` has completed, we've identified the arterycodes that
    correspond to these two directions of travel.  To find all data for the study, we now need to
    group together the 3 days over which data was collected at these two arterycodes.

    However, not all studies are of the same duration.  To detect studies, we use runs of consecutive
    days at the same arterycode group.

    Note that we do not group *permanent* counts (i.e. "PERM STN" or "RESCU") for now, as we have no
    reliable way to visualize that much data at once.

    This is intended to run after `group_multidirection_arteries`.
    """
    with TaskGroup(group_id='group_multiday_counts') as group_multiday_counts:
        group_id = 'group_multiday_counts'

        A1_COUNTS_MULTIDAY_RUNS = create_bash_task_nested(
            group_id, 'A1_counts_multiday_runs')
        A2_ARTERIES_COUNTS_GROUPS = create_bash_task_nested(
            group_id, 'A2_arteries_counts_groups')
        A3_STUDIES = create_bash_task_nested(group_id, 'A3_studies')
        A4_COUNTS2_STUDIES = create_bash_task_nested(group_id,
                                                     'A4_counts2_studies')

        A1_COUNTS_MULTIDAY_RUNS >> A2_ARTERIES_COUNTS_GROUPS
        A2_ARTERIES_COUNTS_GROUPS >> A3_STUDIES
        A3_STUDIES >> A4_COUNTS2_STUDIES
    """
    open_data_tmcs

    Builds the [Traffic Volumes at Intersections for All Modes](https://open.toronto.ca/dataset/traffic-volumes-at-intersections-for-all-modes/)
    dataset for the City of Toronto Open Data Portal.

    The dataset is exposed in two ways: via database, and via HTTP.  We store the dataset as a series
    of views in the `open_data` schema.  We also dump those views to CSV files at `/data/open_data`,
    which is served from `/open_data` on our ETL EC2 instances.

    This is intended to run after `group_multiday_counts`.
    """
    with TaskGroup(group_id='open_data_tmcs') as open_data_tmcs:
        group_id = 'open_data_tmcs'

        A1_TMCS_COUNT_DATA = create_bash_task_nested(group_id,
                                                     'A1_tmcs_count_data')
        A1_TMCS_COUNT_METADATA = create_bash_task_nested(
            group_id, 'A1_tmcs_count_metadata')
        A2_TMCS_LOCATIONS = create_bash_task_nested(group_id,
                                                    'A2_tmcs_locations')
        A3_TMCS_JOINED = create_bash_task_nested(group_id, 'A3_tmcs_joined')
        A4_TMCS_DECADES = create_bash_task_nested(group_id, 'A4_tmcs_decades')
        A4_TMCS_PREVIEW = create_bash_task_nested(group_id, 'A4_tmcs_preview')

        A1_TMCS_COUNT_DATA >> A2_TMCS_LOCATIONS
        A1_TMCS_COUNT_METADATA >> A2_TMCS_LOCATIONS
        A2_TMCS_LOCATIONS >> A3_TMCS_JOINED
        A3_TMCS_JOINED >> A4_TMCS_DECADES
        A3_TMCS_JOINED >> A4_TMCS_PREVIEW

    replicator_update_schema >> copy_gis_layers
    replicator_update_schema >> copy_opendata_shapefiles
    [copy_gis_layers, copy_opendata_shapefiles] >> centreline_conflation_target
    [copy_gis_layers, copy_opendata_shapefiles] >> gis_layers_vector_tiles
    centreline_conflation_target >> location_search_index
    centreline_conflation_target >> centreline_vector_tiles
    centreline_conflation_target >> arteries_geocoding
    centreline_conflation_target >> crash_geocoding
    crash_geocoding >> collisions_vector_tiles
    arteries_geocoding >> group_multidirection_arteries
    group_multidirection_arteries >> group_multiday_counts
    group_multiday_counts >> open_data_tmcs
Example 30
        max_active_runs=3,
        schedule_interval="@daily",
        default_args={
            "email_on_failure": False,
            "email_on_retry": False,
            "retries": 1,
            "retry_delay": timedelta(minutes=1),
        },
        catchup=False,
        template_searchpath="/usr/local/airflow/include",
) as dag:

    t0 = DummyOperator(task_id="start")

    # Define Task Group with Postgres Queries
    with TaskGroup("covid_table_queries") as covid_table_queries:
        for state in states:
            generate_files = PostgresOperator(
                task_id="covid_query_{0}".format(state),
                postgres_conn_id="gpdb",
                sql="covid_state_query.sql",
                params={"state": "'" + state + "'"},
            )

    # Define task to send email
    send_email = EmailOperator(
        task_id="send_email",
        to=email_to,
        subject="Covid Greenplum Queries DAG",
        html_content="<p>The Covid queries were run on Greenplum successfully.</p>",