def bag_dag(self, dag, root_dag):
    """
    Adds the DAG into the bag, recurses into sub dags.
    Throws AirflowDagCycleException if a cycle is detected in this dag
    or its subdags.

    :param dag: the DAG to register in this bag
    :param root_dag: the top-level DAG of the recursion; rollback of
        subdags on failure only happens when ``dag == root_dag``
    :raises AirflowDagCycleException: if this dag or any subdag has a task cycle
    """
    test_cycle(dag)  # throws if a task cycle is found
    dag.resolve_template_files()
    dag.last_loaded = timezone.utcnow()

    # Apply the cluster-wide task policy to every task before bagging.
    for task in dag.tasks:
        settings.policy(task)

    subdags = dag.subdags
    try:
        for subdag in subdags:
            # Subdags inherit file provenance from their parent DAG.
            subdag.full_filepath = dag.full_filepath
            subdag.parent_dag = dag
            subdag.is_subdag = True
            self.bag_dag(dag=subdag, root_dag=root_dag)

        # Only register the parent DAG once all subdags bagged cleanly.
        self.dags[dag.dag_id] = dag
        self.log.debug('Loaded DAG %s', dag)
    except AirflowDagCycleException as cycle_exception:
        # There was an error in bagging the dag. Remove it from the list of dags
        self.log.exception('Exception bagging dag: %s', dag.dag_id)
        # Only necessary at the root level since DAG.subdags automatically
        # performs DFS to search through all subdags
        if dag == root_dag:
            for subdag in subdags:
                if subdag.dag_id in self.dags:
                    del self.dags[subdag.dag_id]
        raise cycle_exception
def assert_has_valid_dag(module):
    """Assert that a module contains a valid DAG."""
    # Gather every DAG object defined at module level.
    dags = [obj for obj in vars(module).values() if isinstance(obj, models.DAG)]

    for candidate in dags:
        test_cycle(candidate)  # Throws if a task cycle is found.

    if not dags:
        raise AssertionError('module does not contain a valid DAG')
def test_dag_integrity(dag_path: Path) -> None:
    """Import a DAG file and check if it has a valid DAG instance."""
    module = import_module(path.basename(dag_path), dag_path)
    # Collect every DAG instance the module defines.
    found = [obj for obj in vars(module).values() if isinstance(obj, DAG)]
    # The file must define at least one DAG.
    assert found
    # Every DAG found must be acyclic.
    for candidate in found:
        test_cycle(dag=candidate)
def _bag_dag(self, *, dag, root_dag, recursive):
    """Actual implementation of bagging a dag.

    The only purpose of this is to avoid exposing ``recursive``
    in ``bag_dag()``, intended to only be used by
    the ``_bag_dag()`` implementation.

    :param dag: the DAG to register in this bag
    :param root_dag: the top-level DAG of the current bagging operation
    :param recursive: when True, also bag all subdags (and roll them back on error)
    :raises AirflowDagCycleException: if a task cycle is found
    :raises AirflowDagDuplicatedIdException: if a different file already
        registered a DAG with the same ``dag_id``
    """
    test_cycle(dag)  # throws if a task cycle is found
    dag.resolve_template_files()
    dag.last_loaded = timezone.utcnow()

    # Check policies
    settings.dag_policy(dag)

    for task in dag.tasks:
        settings.task_policy(task)

    subdags = dag.subdags
    try:
        # DAG.subdags automatically performs DFS search, so we don't recurse
        # into further _bag_dag() calls.
        if recursive:
            for subdag in subdags:
                # Subdags inherit file provenance from their parent DAG.
                subdag.full_filepath = dag.full_filepath
                subdag.parent_dag = dag
                subdag.is_subdag = True
                self._bag_dag(dag=subdag, root_dag=root_dag, recursive=False)

        # Reject a DAG whose id is already registered from a different file;
        # re-bagging the same file is allowed (it just overwrites).
        prev_dag = self.dags.get(dag.dag_id)
        if prev_dag and prev_dag.full_filepath != dag.full_filepath:
            raise AirflowDagDuplicatedIdException(
                dag_id=dag.dag_id,
                incoming=dag.full_filepath,
                existing=self.dags[dag.dag_id].full_filepath,
            )
        self.dags[dag.dag_id] = dag
        self.log.debug('Loaded DAG %s', dag)
    except (AirflowDagCycleException, AirflowDagDuplicatedIdException):
        # There was an error in bagging the dag. Remove it from the list of dags
        self.log.exception('Exception bagging dag: %s', dag.dag_id)
        # Only necessary at the root level since DAG.subdags automatically
        # performs DFS to search through all subdags
        if recursive:
            for subdag in subdags:
                if subdag.dag_id in self.dags:
                    del self.dags[subdag.dag_id]
        raise
def test_cycle_empty(self):
    # A DAG with no tasks trivially contains no cycle.
    empty_dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    self.assertFalse(test_cycle(empty_dag))
def test_dag_cycles(self, dag_file):
    """Test for dag cycles on each of dag in dags directory."""
    name = os.path.splitext(dag_file)[0]
    # Load the DAG file as a module from its location on disk.
    spec = importlib.util.spec_from_file_location(
        name, os.path.join(DAG_PATH, dag_file))
    loaded = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(loaded)

    found_dags = [
        obj for obj in vars(loaded).values() if isinstance(obj, DAG)
    ]
    # The file must define at least one DAG, and each must be acyclic.
    assert found_dags
    for candidate in found_dags:
        test_cycle(candidate)
def test_cycle_single_task(self):
    # A single task with no dependencies cannot form a cycle.
    single_task_dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with single_task_dag:
        DummyOperator(task_id='A')

    self.assertFalse(test_cycle(single_task_dag))
def test_cycle_loop(self):
    # A -> A: a task depending on itself is the smallest possible cycle.
    looped_dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with looped_dag:
        task_a = DummyOperator(task_id='A')
        task_a.set_downstream(task_a)

    with self.assertRaises(AirflowDagCycleException):
        self.assertFalse(test_cycle(looped_dag))
def test_cycle_downstream_loop(self):
    # A -> B -> C -> D -> E -> E: the last task loops back onto itself.
    chain_dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with chain_dag:
        tasks = [DummyOperator(task_id=tid) for tid in ('A', 'B', 'C', 'D', 'E')]
        # Wire the linear chain A..E, then the self-loop on E.
        for upstream, downstream in zip(tasks, tasks[1:]):
            upstream.set_downstream(downstream)
        tasks[-1].set_downstream(tasks[-1])

    with self.assertRaises(AirflowDagCycleException):
        self.assertFalse(test_cycle(chain_dag))
def test_cycle_no_cycle(self):
    # A -> B -> C
    # B -> D
    # E -> F
    # Two disjoint components, neither containing a cycle.
    acyclic_dag = DAG(
        'dag',
        start_date=DEFAULT_DATE,
        default_args={'owner': 'owner1'})

    with acyclic_dag:
        ops = {tid: DummyOperator(task_id=tid)
               for tid in ('A', 'B', 'C', 'D', 'E', 'F')}
        ops['A'].set_downstream(ops['B'])
        ops['B'].set_downstream(ops['C'])
        ops['B'].set_downstream(ops['D'])
        ops['E'].set_downstream(ops['F'])

    self.assertFalse(test_cycle(acyclic_dag))