def start(self):
    self.log.info("Running dag trigger loop for %s seconds", self.run_duration)
    self.log.info("Processing each file at most %s times", self.num_runs)

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self.subdir)
    known_file_paths = list_py_file_paths(self.subdir)
    self.log.info("There are %s files in %s", len(known_file_paths), self.subdir)
    self.log.info("known files are %s.", str(known_file_paths))

    def processor_factory(file_path, zombies):
        return EventDagFileProcessor(file_path, self.pickle_dags, [], zombies)

    # When using sqlite, we do not use async_mode so the scheduler job and
    # the DAG parser don't access the DB at the same time.
    async_mode = not self.using_sqlite

    processor_timeout_seconds = conf.getint('core', 'dag_file_processor_timeout')
    processor_timeout = timedelta(seconds=processor_timeout_seconds)
    self.processor_agent = DagFileProcessorAgent(
        self.subdir,
        known_file_paths,
        self.num_runs,
        processor_factory,
        processor_timeout,
        async_mode)
    self.processor_agent.start()

    self.execute_start_time = timezone.utcnow()

    self.dag_process_thread = threading.Thread(target=self.run_parse_dags, args=())
    # daemon thread so it does not block interpreter shutdown
    # (attribute assignment replaces the deprecated setDaemon())
    self.dag_process_thread.daemon = True
    self.dag_process_thread.start()

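# Hedged sketch (not part of the excerpt above): the `self.using_sqlite` flag
# consulted by `start()` can be derived from the configured SQLAlchemy URI.
# The helper name and the startswith check are assumptions of this sketch.
from airflow.configuration import conf

def _detect_sqlite_backend():
    # 'core.sql_alchemy_conn' holds the metadata DB URI in standard Airflow
    # config; a "sqlite" scheme means a serverless file-backed DB that does
    # not tolerate the scheduler and the DAG parser writing concurrently.
    sql_conn = conf.get('core', 'sql_alchemy_conn')
    return sql_conn.lower().startswith('sqlite')
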
def upgrade_dags(args):
    """
    Converts old-style DAG python files into the new format.

    Reads configuration from "args.config". Uses the standard "conf.get"
    instead of "conf_get", because the fields we use are always set.
    Copies all deprecated dags into the "deprecated_dags" folder and adds
    the deprecated DAGs to the ".airflowignore" file created within that
    folder. Original DAG files are replaced with the new ones (with base64
    encoded gzip compressed workflow content); original workflow files
    remain unchanged.
    """
    conf.read(args.config)  # this will read the already patched airflow.cfg
    dags_folder = conf.get("core", "dags_folder")
    for dag_location in list_py_file_paths(  # will skip all DAGs from ".airflowignore"
        directory=dags_folder,
        safe_mode=conf.getboolean("core", "dag_discovery_safe_mode"),  # use what the user set in the config
        include_examples=False
    ):
        overwrite_deprecated_dag(  # upgrades only deprecated DAGs, skips others
            dag_location=dag_location,
            deprecated_dags_folder=os.path.join(dags_folder, "deprecated_dags")
        )

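# Hedged usage sketch for upgrade_dags above: it only needs an object with a
# "config" attribute pointing at the patched airflow.cfg. The argparse wiring
# and the example path are assumptions; the original CLI entry point is not shown.
import argparse

parser = argparse.ArgumentParser(description="Upgrade old-style DAG files")
parser.add_argument("--config", required=True, help="path to the patched airflow.cfg")
args = parser.parse_args(["--config", "/path/to/airflow.cfg"])
upgrade_dags(args)
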
def test_dag_is_deactivated_upon_dagfile_deletion(self):
    dag_id = 'old_existing_dag'
    dag_fileloc = "/usr/local/airflow/dags/non_existing_path.py"
    dag = DAG(
        dag_id,
        is_paused_upon_creation=True,
    )
    dag.fileloc = dag_fileloc
    session = settings.Session()
    dag.sync_to_db(session=session)

    orm_dag = session.query(DagModel).filter(
        DagModel.dag_id == dag_id).one()
    self.assertTrue(orm_dag.is_active)
    self.assertEqual(orm_dag.fileloc, dag_fileloc)

    DagModel.deactivate_deleted_dags(
        list_py_file_paths(settings.DAGS_FOLDER))

    orm_dag = session.query(DagModel).filter(
        DagModel.dag_id == dag_id).one()
    self.assertFalse(orm_dag.is_active)

    # Clean up
    session.execute(
        DagModel.__table__.delete().where(DagModel.dag_id == dag_id))

def remove_outdated_dags(cwl_id):
    logging.info(f"Searching for dags based on cwl_id: {cwl_id}")
    dags = {}
    for location in list_py_file_paths(DAGS_FOLDER, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}")
    # keep the most recently modified DAG; clean up all older ones
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")

def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them, and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in ``.airflowignore`` are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    start_dttm = timezone.utcnow()
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple('FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated, safe_mode=safe_mode)

            td = timezone.utcnow() - ts
            # total_seconds() already includes the fractional part, so no
            # separate microseconds term is needed
            td = td.total_seconds()
            stats.append(FileLoadStat(
                filepath.replace(dag_folder, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                str([dag.dag_id for dag in found_dags]),
            ))
        except Exception as e:
            self.log.exception(e)

    Stats.gauge('collect_dags', (timezone.utcnow() - start_dttm).total_seconds(), 1)
    Stats.gauge('dagbag_size', len(self.dags), 1)
    Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)

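# Hedged illustration of the un-anchored regex semantics described in the
# docstring above: .airflowignore patterns are matched with re.search anywhere
# in the path, not as shell globs. The patterns and file names are made up.
import re

ignore_patterns = ["legacy", r"scratch_.*\.py"]
for file_path in ["dags/legacy_etl.py", "dags/my_legacy_utils.py", "dags/prod_etl.py"]:
    skipped = any(re.search(p, file_path) for p in ignore_patterns)
    # "legacy" also skips "my_legacy_utils.py" because the match is un-anchored
    print(file_path, "-> skipped" if skipped else "-> kept")
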
def collect_dags(self,
                 dag_folder=None,
                 only_if_updated=True,
                 include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                 safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them, and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in ``.airflowignore`` are treated as
    un-anchored regexes, not shell-like glob patterns.
    """
    dag_folder = dag_folder or self.dag_folder

    # Used to store stats around DagBag processing
    stats = []
    FileLoadStat = namedtuple('FileLoadStat', "file duration dag_num task_num dags")

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            ts = timezone.utcnow()
            found_dags = self.process_file(
                filepath, only_if_updated=only_if_updated, safe_mode=safe_mode)
            dag_ids = [dag.dag_id for dag in found_dags]
            dag_id_names = str(dag_ids)

            td = timezone.utcnow() - ts
            # total_seconds() already includes the fractional part, so no
            # separate microseconds term is needed
            td = td.total_seconds()
            stats.append(FileLoadStat(
                filepath.replace(settings.DAGS_FOLDER, ''),
                td,
                len(found_dags),
                sum([len(dag.tasks) for dag in found_dags]),
                dag_id_names,
            ))
        except Exception as e:
            self.log.exception(e)

    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        # file_stat.file has a format like: /subdir/dag_name.py
        # TODO: Remove for Airflow 2.0
        filename = file_stat.file.split('/')[-1].replace('.py', '')
        Stats.timing('dag.loading-duration.{}'.format(filename), file_stat.duration)

def load_templates(self, only_if_updated=True):
    self.log.info(f'Loading DAG Templates from "{self.templates_dir}"...')
    for filepath in list_py_file_paths(self.templates_dir,
                                       safe_mode=False,
                                       include_examples=False):
        self.process_file(filepath, only_if_updated=only_if_updated)
    self.templates = {
        key: cls() for key, cls in self.template_classes.items()
    }
    return self.templates

def remove_outdated_dags(cwl_id, dags_folder):
    """
    Iterates over DAG files from dags_folder (excluding Airflow examples).
    Assuming that the dag_id written inside a Python file is equal to its
    rootname and follows the naming rule "cwlid-commitsha", we check if
    there are any files that have the target cwl_id in the rootname (i.e.
    in the dag_id). For all collected DAGs (based on cwl_id) we save the
    modified timestamp and location, then sort them by timestamp excluding
    the newest one, thus forming a list of outdated DAGs for the same
    cwl_id (the same workflow). Then we iterate over the list of outdated
    DAGs and check whether each can safely be removed (both from the DB
    and from disk). The only condition under which we don't delete an
    outdated DAG is when there is at least one running DagRun for it.
    """
    logging.info(
        f"Searching for dags based on cwl_id: {cwl_id} in order to remove the old ones"
    )
    dags = {}
    for location in list_py_file_paths(dags_folder, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(
            f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}")
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")

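# Hedged sketch of the "keep only the newest" selection used above: sort the
# collected DAGs by their modified timestamp ascending and drop the last
# entry, leaving only the outdated duplicates. The ids and dates are made up.
from datetime import datetime

dags = {
    "wf1-aaa": {"modified": datetime(2020, 1, 1)},
    "wf1-bbb": {"modified": datetime(2020, 3, 1)},
    "wf1-ccc": {"modified": datetime(2020, 2, 1)},
}
outdated = sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]
print([dag_id for dag_id, _ in outdated])  # ['wf1-aaa', 'wf1-ccc']; newest kept
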
"""Tests the basic integrity of DAGs by loading the DAG files and determining if they contain a valid DAG object. Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8 """ from os import path import pytest from airflow import models as airflow_models from airflow.utils.dag_processing import list_py_file_paths DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags") DAG_PATHS = list_py_file_paths(DAG_BASE_DIR, safe_mode=True, include_examples=False) @pytest.mark.parametrize("dag_path", DAG_PATHS) def test_dag_integrity(dag_path): """Import DAG file and check for a valid DAG instance.""" dag_name = path.basename(dag_path) module = _import_file(dag_name, dag_path) assert any( isinstance(var, airflow_models.DAG) for var in vars(module).values()) def _import_file(module_name, module_path): import importlib.util
"""Tests the basic integrity of DAGs by loading the DAG files and determining if they contain a valid DAG object. Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8 """ from os import path import pytest from airflow import models as airflow_models from airflow.utils.dag_processing import list_py_file_paths DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags") DAG_PATHS = list_py_file_paths(DAG_BASE_DIR) @pytest.mark.parametrize("dag_path", DAG_PATHS) def test_dag_integrity(dag_path): """Import DAG file and check for a valid DAG instance.""" dag_name = path.basename(dag_path) module = _import_file(dag_name, dag_path) assert any(isinstance(var, airflow_models.DAG) for var in vars(module).values()) def _import_file(module_name, module_path): import importlib.util spec = importlib.util.spec_from_file_location(module_name, str(module_path)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module)
"""Tests the basic integrity of DAGs by loading the DAG files and determining if they contain a valid DAG object. Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8 """ from os import path import pytest from airflow import models as airflow_models from airflow.utils.dag_processing import list_py_file_paths DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags") DAG_PATHS = [ path for path in list_py_file_paths(DAG_BASE_DIR, include_examples=False) if path.endswith(".py") ] @pytest.mark.parametrize("dag_path", DAG_PATHS) def test_dag_integrity(dag_path): """Import DAG file and check for a valid DAG instance.""" dag_name = path.basename(dag_path) module = _import_file(dag_name, dag_path) assert any( isinstance(var, airflow_models.DAG) for var in vars(module).values()) def _import_file(module_name, module_path): import importlib.util
"""Tests the basic integrity of DAGs by loading the DAG files and determining if they contain a valid DAG object. Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8 """ from os import path import pytest from airflow import models as airflow_models from airflow.utils.dag_processing import list_py_file_paths DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags") DAG_PATHS = list_py_file_paths(DAG_BASE_DIR, include_examples=False) @pytest.mark.parametrize("dag_path", DAG_PATHS) def test_dag_integrity(dag_path): """Import DAG file and check for a valid DAG instance.""" dag_name = path.basename(dag_path) module = _import_file(dag_name, dag_path) assert any(isinstance(var, airflow_models.DAG) for var in vars(module).values()) def _import_file(module_name, module_path): import importlib.util spec = importlib.util.spec_from_file_location(module_name, str(module_path)) module = importlib.util.module_from_spec(spec) spec.loader.exec_module(module)
"""Tests the basic integrity of DAGs by loading the DAG files and determining if they contain a valid DAG object. Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8 """ from os import path import pytest from airflow import models as airflow_models from airflow.utils.dag_processing import list_py_file_paths DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags") DAG_PATHS = [ path for path in list_py_file_paths( DAG_BASE_DIR, include_examples=False, safe_mode=True) if path.endswith('.py') ] @pytest.mark.parametrize("dag_path", DAG_PATHS) def test_dag_integrity(dag_path): """Import DAG file and check for a valid DAG instance.""" dag_name = path.basename(dag_path) module = _import_file(dag_name, dag_path) assert any( isinstance(var, airflow_models.DAG) for var in vars(module).values()) def _import_file(module_name, module_path):