Example #1
    def bag_dag(self, dag, root_dag):
        """
        Adds the DAG into the bag, recursing into sub-DAGs.
        Throws AirflowDagCycleException if a cycle is detected in this DAG or its subdags.
        """

        test_cycle(dag)  # throws if a task cycle is found

        dag.resolve_template_files()
        dag.last_loaded = timezone.utcnow()

        for task in dag.tasks:
            settings.policy(task)

        subdags = dag.subdags

        try:
            for subdag in subdags:
                subdag.full_filepath = dag.full_filepath
                subdag.parent_dag = dag
                subdag.is_subdag = True
                self.bag_dag(dag=subdag, root_dag=root_dag)

            self.dags[dag.dag_id] = dag
            self.log.debug('Loaded DAG %s', dag)
        except AirflowDagCycleException as cycle_exception:
            # There was an error in bagging the dag. Remove it from the list of dags
            self.log.exception('Exception bagging dag: %s', dag.dag_id)
            # Only necessary at the root level since DAG.subdags automatically
            # performs DFS to search through all subdags
            if dag == root_dag:
                for subdag in subdags:
                    if subdag.dag_id in self.dags:
                        del self.dags[subdag.dag_id]
            raise cycle_exception
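For orientation, the following is a minimal sketch of the kind of check test_cycle performs: a depth-first walk over task dependencies that raises AirflowDagCycleException when a task is reached again while it is still on the current path. This is an illustration built on assumed DAG attributes (task_dict, downstream_task_ids), not Airflow's actual implementation.

from airflow.exceptions import AirflowDagCycleException

CYCLE_NEW, CYCLE_IN_PROGRESS, CYCLE_DONE = 0, 1, 2


def check_cycle_sketch(dag):
    """Raise AirflowDagCycleException if the DAG's task graph contains a cycle (sketch)."""
    # 'task_dict' maps task_id -> task; 'downstream_task_ids' lists direct successors.
    visited = {task_id: CYCLE_NEW for task_id in dag.task_dict}

    def _visit(task_id):
        if visited[task_id] == CYCLE_DONE:
            return
        if visited[task_id] == CYCLE_IN_PROGRESS:
            # The task is still on the current DFS path, so we have looped back to it.
            raise AirflowDagCycleException(f'Cycle detected in DAG. Faulty task: {task_id}')
        visited[task_id] = CYCLE_IN_PROGRESS
        for downstream_id in dag.task_dict[task_id].downstream_task_ids:
            _visit(downstream_id)
        visited[task_id] = CYCLE_DONE

    for task_id in dag.task_dict:
        _visit(task_id)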
Example #2
def assert_has_valid_dag(module):
    """Assert that a module contains a valid DAG."""

    no_dag_found = True

    for dag in vars(module).values():
        if isinstance(dag, models.DAG):
            no_dag_found = False
            test_cycle(dag)  # Throws if a task cycle is found.

    if no_dag_found:
        raise AssertionError('module does not contain a valid DAG')
Example #3
def test_dag_integrity(dag_path: Path) -> None:
    """Import a DAG file and check if it has a valid DAG instance."""
    dag_name = path.basename(dag_path)
    dag_module = import_module(dag_name, dag_path)

    # Look for all DAG instances.
    dags = [var for var in vars(dag_module).values() if isinstance(var, DAG)]
    # Assert that there is at least one DAG instance.
    assert dags

    # Test all DAG instances for cycles.
    for dag in dags:
        test_cycle(dag=dag)
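test_dag_integrity above takes a single DAG file path, so it is usually parametrized over every file in the DAG folder. A minimal pytest sketch, assuming a tests/ directory next to dags/ and a *.py glob (both the layout and the DAG_PATH constant are assumptions, not part of the source):

from pathlib import Path

import pytest

# Assumed layout: this test file lives one level below the project root, next to dags/.
DAG_PATH = Path(__file__).parent.parent / 'dags'
DAG_FILES = sorted(DAG_PATH.glob('*.py'))


@pytest.mark.parametrize('dag_path', DAG_FILES, ids=lambda p: p.name)
def test_every_dag_file(dag_path: Path) -> None:
    # Delegate to the integrity check defined above for each discovered DAG file.
    test_dag_integrity(dag_path)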
Example #4
    def _bag_dag(self, *, dag, root_dag, recursive):
        """Actual implementation of bagging a dag.

        The only purpose of this is to avoid exposing ``recursive`` in ``bag_dag()``;
        recursion is intended to be driven only from within ``_bag_dag()`` itself.
        """
        test_cycle(dag)  # throws if a task cycle is found

        dag.resolve_template_files()
        dag.last_loaded = timezone.utcnow()

        # Check policies
        settings.dag_policy(dag)

        for task in dag.tasks:
            settings.task_policy(task)

        subdags = dag.subdags

        try:
            # DAG.subdags automatically performs DFS search, so we don't recurse
            # into further _bag_dag() calls.
            if recursive:
                for subdag in subdags:
                    subdag.full_filepath = dag.full_filepath
                    subdag.parent_dag = dag
                    subdag.is_subdag = True
                    self._bag_dag(dag=subdag,
                                  root_dag=root_dag,
                                  recursive=False)

            prev_dag = self.dags.get(dag.dag_id)
            if prev_dag and prev_dag.full_filepath != dag.full_filepath:
                raise AirflowDagDuplicatedIdException(
                    dag_id=dag.dag_id,
                    incoming=dag.full_filepath,
                    existing=self.dags[dag.dag_id].full_filepath,
                )
            self.dags[dag.dag_id] = dag
            self.log.debug('Loaded DAG %s', dag)
        except (AirflowDagCycleException, AirflowDagDuplicatedIdException):
            # There was an error in bagging the dag. Remove it from the list of dags
            self.log.exception('Exception bagging dag: %s', dag.dag_id)
            # Only necessary at the root level since DAG.subdags automatically
            # performs DFS to search through all subdags
            if recursive:
                for subdag in subdags:
                    if subdag.dag_id in self.dags:
                        del self.dags[subdag.dag_id]
            raise
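For context, the public entry point that pairs with this helper can be sketched as a thin wrapper that enables recursion. This is inferred from the signature of ``_bag_dag`` above, not reproduced from the source:

    def bag_dag(self, dag, root_dag):
        """Add the DAG into the bag, recursing into sub-DAGs (sketch).

        Delegates with ``recursive=True`` so subdags are bagged as well and
        cleaned up again if bagging fails.
        """
        self._bag_dag(dag=dag, root_dag=root_dag, recursive=True)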
    def test_cycle_empty(self):
        # test empty
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        self.assertFalse(test_cycle(dag))
    def test_dag_cycles(self, dag_file):
        """Test for dag cycles on each of dag in dags directory."""
        module_name, _ = os.path.splitext(dag_file)
        module_path = os.path.join(DAG_PATH, dag_file)
        mod_spec = importlib.util.spec_from_file_location(
            module_name, module_path)
        module = importlib.util.module_from_spec(mod_spec)
        mod_spec.loader.exec_module(module)

        dag_objects = [
            var for var in vars(module).values() if isinstance(var, DAG)
        ]
        assert dag_objects

        for dag in dag_objects:
            # Test cycles
            test_cycle(dag)
    def test_cycle_single_task(self):
        # test single task
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        with dag:
            DummyOperator(task_id='A')

        self.assertFalse(test_cycle(dag))
    def test_cycle_loop(self):
        # test self loop
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> A
        with dag:
            op1 = DummyOperator(task_id='A')
            op1.set_downstream(op1)

        with self.assertRaises(AirflowDagCycleException):
            self.assertFalse(test_cycle(dag))
    def test_cycle_downstream_loop(self):
        # test downstream self loop
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> B -> C -> D -> E -> E
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='D')
            op5 = DummyOperator(task_id='E')
            op1.set_downstream(op2)
            op2.set_downstream(op3)
            op3.set_downstream(op4)
            op4.set_downstream(op5)
            op5.set_downstream(op5)

        with self.assertRaises(AirflowDagCycleException):
            self.assertFalse(test_cycle(dag))
    def test_cycle_no_cycle(self):
        # test no cycle
        dag = DAG(
            'dag',
            start_date=DEFAULT_DATE,
            default_args={'owner': 'owner1'})

        # A -> B -> C
        #      B -> D
        # E -> F
        with dag:
            op1 = DummyOperator(task_id='A')
            op2 = DummyOperator(task_id='B')
            op3 = DummyOperator(task_id='C')
            op4 = DummyOperator(task_id='D')
            op5 = DummyOperator(task_id='E')
            op6 = DummyOperator(task_id='F')
            op1.set_downstream(op2)
            op2.set_downstream(op3)
            op2.set_downstream(op4)
            op5.set_downstream(op6)

        self.assertFalse(test_cycle(dag))
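The same behaviour can be exercised in pytest style with the bit-shift dependency syntax. A self-contained sketch: the import paths assume Airflow 2.x, and DEFAULT_DATE is defined locally here rather than taken from the source test module.

from datetime import datetime

import pytest
from airflow import DAG
from airflow.exceptions import AirflowDagCycleException
from airflow.operators.dummy import DummyOperator
from airflow.utils.dag_cycle_tester import test_cycle

DEFAULT_DATE = datetime(2021, 1, 1)


def test_cycle_detected_with_bitshift_syntax():
    dag = DAG('dag', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    # A -> B -> A forms a cycle.
    with dag:
        op1 = DummyOperator(task_id='A')
        op2 = DummyOperator(task_id='B')
        op1 >> op2 >> op1

    with pytest.raises(AirflowDagCycleException):
        test_cycle(dag)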