    def start(self):
        self.log.info("Running dag trigger loop for %s seconds",
                      self.run_duration)
        self.log.info("Processing each file at most %s times", self.num_runs)

        # Build up a list of Python files that could contain DAGs
        self.log.info("Searching for files in %s", self.subdir)
        known_file_paths = list_py_file_paths(self.subdir)
        self.log.info("There are %s files in %s", len(known_file_paths),
                      self.subdir)
        self.log.info("known files are %s.", str(known_file_paths))

        def processor_factory(file_path, zombies):
            return EventDagFileProcessor(file_path, self.pickle_dags, [],
                                         zombies)

        # When using SQLite, run in sync mode so the scheduler job and the
        # DAG parser don't access the DB at the same time.
        async_mode = not self.using_sqlite

        processor_timeout_seconds = conf.getint('core',
                                                'dag_file_processor_timeout')
        processor_timeout = timedelta(seconds=processor_timeout_seconds)
        self.processor_agent = DagFileProcessorAgent(
            self.subdir, known_file_paths, self.num_runs, processor_factory,
            processor_timeout, async_mode)

        self.processor_agent.start()

        self.execute_start_time = timezone.utcnow()
        self.dag_process_thread = threading.Thread(target=self.run_parse_dags)
        self.dag_process_thread.daemon = True  # don't block interpreter exit
        self.dag_process_thread.start()
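
Since every example on this page revolves around list_py_file_paths, here is a
minimal standalone sketch of a typical call, under the assumption that the
import path matches the one used in the test examples further down; the
directory is a placeholder:

from airflow.utils.dag_processing import list_py_file_paths

# Recursively collect candidate DAG files under a folder. With safe_mode=True,
# files whose content does not look DAG-related are heuristically skipped, and
# .airflowignore patterns found in the directory are honored.
for file_path in list_py_file_paths("/usr/local/airflow/dags",
                                    safe_mode=True,
                                    include_examples=False):
    print(file_path)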
Example #2
def upgrade_dags(args):
    """
    Corrects old style DAG python files into the new format.
    Reads configuration from "args.config". Uses standard
    "conf.get" instead of "conf_get", because the fields we
    use are always set. Copies all deprecated dags into the 
    "deprecated_dags" folder, adds deprecated DAGs to the
    ".airflowignore" file created within that folder. Original
    DAG files are replaced with the new ones (with base64
    encoded gzip compressed workflow content), original workflow
    files remain unchanged.
    """

    conf.read(args.config)                                      # reads the already-patched airflow.cfg
    dags_folder = conf.get("core", "dags_folder")
    for dag_location in list_py_file_paths(                     # will skip all DAGs from ".airflowignore"
        directory=dags_folder,
        safe_mode=conf.getboolean("core", "dag_discovery_safe_mode"),  # honor whatever the user set in their config
        include_examples=False
    ):
        overwrite_deprecated_dag(                               # upgrades only deprecated DAGs, skips others
            dag_location=dag_location,
            deprecated_dags_folder=os.path.join(
                dags_folder,
                "deprecated_dags"
            )
        )
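
overwrite_deprecated_dag itself is not shown on this page. A hypothetical
sketch of the encoding step the docstring mentions, purely to illustrate what
"base64-encoded gzip-compressed workflow content" means; the helper name is an
assumption, not the real implementation:

import base64
import gzip

def compress_workflow_content(workflow_path):
    # Hypothetical helper: gzip-compress the workflow file and base64-encode
    # it so the result can be embedded as a string in the new DAG file.
    with open(workflow_path, "rb") as stream:
        return base64.b64encode(gzip.compress(stream.read())).decode("ascii")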
Example #3
    def test_dag_is_deactivated_upon_dagfile_deletion(self):
        dag_id = 'old_existing_dag'
        dag_fileloc = "/usr/local/airflow/dags/non_existing_path.py"
        dag = DAG(
            dag_id,
            is_paused_upon_creation=True,
        )
        dag.fileloc = dag_fileloc
        session = settings.Session()
        dag.sync_to_db(session=session)

        orm_dag = session.query(DagModel).filter(
            DagModel.dag_id == dag_id).one()

        self.assertTrue(orm_dag.is_active)
        self.assertEqual(orm_dag.fileloc, dag_fileloc)

        DagModel.deactivate_deleted_dags(
            list_py_file_paths(settings.DAGS_FOLDER))

        orm_dag = session.query(DagModel).filter(
            DagModel.dag_id == dag_id).one()
        self.assertFalse(orm_dag.is_active)

        # Clean up
        session.execute(
            DagModel.__table__.delete().where(DagModel.dag_id == dag_id))
        session.commit()
Example #4
def remove_outdated_dags(cwl_id):
    logging.info(f"""Searching for dags based on cwl_id: {cwl_id}""")
    dags = {}
    for location in list_py_file_paths(DAGS_FOLDER, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(f"""Found dag_id: {dag_id}, modified: {dags[dag_id]["modified"]}""")
    for dag_id, dag_metadata in sorted(dags.items(), key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"""Cleaning dag_id: {dag_id}""")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"""Failed to delete DAG\n {ex}""")
            for f in [
                dag_metadata["location"],
                os.path.splitext(dag_metadata["location"])[0]+".cwl"
            ]:
                try:
                    logging.info(f"""Deleting DAG file: {f}""")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"""Failed to delete file {f}\n {ex}""")
        else:
            logging.info("Skipping, DAG has running DagRuns")
Example #5
    def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=configuration.conf.getboolean('core',
                                                       'LOAD_EXAMPLES'),
        safe_mode=configuration.conf.getboolean('core',
                                                'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        start_dttm = timezone.utcnow()
        dag_folder = dag_folder or self.dag_folder

        # Used to store stats around DagBag processing
        stats = []
        FileLoadStat = namedtuple('FileLoadStat',
                                  "file duration dag_num task_num dags")

        dag_folder = correct_maybe_zipped(dag_folder)

        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                ts = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)

                # total_seconds() already includes the sub-second component
                td = (timezone.utcnow() - ts).total_seconds()
                stats.append(
                    FileLoadStat(
                        filepath.replace(dag_folder, ''),
                        td,
                        len(found_dags),
                        sum([len(dag.tasks) for dag in found_dags]),
                        str([dag.dag_id for dag in found_dags]),
                    ))
            except Exception as e:
                self.log.exception(e)
        Stats.gauge('collect_dags',
                    (timezone.utcnow() - start_dttm).total_seconds(), 1)
        Stats.gauge('dagbag_size', len(self.dags), 1)
        Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
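
A hedged usage sketch for the method above: constructing a DagBag runs
collect_dags() for the given folder, and import failures are collected rather
than raised. The folder path is a placeholder:

from airflow.models import DagBag

dagbag = DagBag(dag_folder="/usr/local/airflow/dags", include_examples=False)
print(f"{len(dagbag.dags)} DAGs loaded")
print(dagbag.import_errors)  # mapping of file path -> import error message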
Example #6
    def collect_dags(self,
                     dag_folder=None,
                     only_if_updated=True,
                     include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
                     safe_mode=conf.getboolean('core',
                                               'DAG_DISCOVERY_SAFE_MODE')):
        """
        Given a file path or a folder, this method looks for python modules,
        imports them and adds them to the dagbag collection.

        Note that if a ``.airflowignore`` file is found while processing
        the directory, it will behave much like a ``.gitignore``,
        ignoring files that match any of the regex patterns specified
        in the file.

        **Note**: The patterns in .airflowignore are treated as
        un-anchored regexes, not shell-like glob patterns.
        """
        dag_folder = dag_folder or self.dag_folder
        # Used to store stats around DagBag processing
        stats = []
        FileLoadStat = namedtuple('FileLoadStat',
                                  "file duration dag_num task_num dags")

        dag_folder = correct_maybe_zipped(dag_folder)

        for filepath in list_py_file_paths(dag_folder,
                                           safe_mode=safe_mode,
                                           include_examples=include_examples):
            try:
                ts = timezone.utcnow()
                found_dags = self.process_file(filepath,
                                               only_if_updated=only_if_updated,
                                               safe_mode=safe_mode)
                dag_ids = [dag.dag_id for dag in found_dags]
                dag_id_names = str(dag_ids)

                # total_seconds() already includes the sub-second component
                td = (timezone.utcnow() - ts).total_seconds()
                stats.append(
                    FileLoadStat(
                        filepath.replace(settings.DAGS_FOLDER, ''),
                        td,
                        len(found_dags),
                        sum([len(dag.tasks) for dag in found_dags]),
                        dag_id_names,
                    ))
            except Exception as e:
                self.log.exception(e)
        self.dagbag_stats = sorted(stats,
                                   key=lambda x: x.duration,
                                   reverse=True)
        for file_stat in self.dagbag_stats:
            # file_stat.file similar format: /subdir/dag_name.py
            # TODO: Remove for Airflow 2.0
            filename = file_stat.file.split('/')[-1].replace('.py', '')
            Stats.timing('dag.loading-duration.{}'.format(filename),
                         file_stat.duration)
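
The metric name built in the final loop can be sanity-checked in isolation; a
tiny sketch:

# file_stat.file looks like "/subdir/dag_name.py"
stat_file = "/subdir/dag_name.py"
filename = stat_file.split('/')[-1].replace('.py', '')
assert 'dag.loading-duration.{}'.format(filename) == 'dag.loading-duration.dag_name'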
Example #7
    def load_templates(self, only_if_updated=True):
        self.log.info(f'Loading DAG Templates from "{self.templates_dir}"...')
        for filepath in list_py_file_paths(self.templates_dir,
                                           safe_mode=False,
                                           include_examples=False):
            self.process_file(filepath, only_if_updated=only_if_updated)
        self.templates = {
            key: cls()
            for key, cls in self.template_classes.items()
        }
        return self.templates
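
load_templates clearly belongs to a custom template-registry class rather than
to Airflow itself. A hypothetical skeleton of the attributes it relies on; the
class and attribute semantics are assumptions inferred from the method body:

import logging

class TemplateRegistry:
    # Hypothetical host class for the load_templates method above.
    def __init__(self, templates_dir):
        self.log = logging.getLogger(__name__)
        self.templates_dir = templates_dir
        self.template_classes = {}  # name -> class, filled by process_file()
        self.templates = {}

    def process_file(self, filepath, only_if_updated=True):
        # Would import the file and register any template classes found;
        # implementation omitted here.
        raise NotImplementedError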
Example #8
def remove_outdated_dags(cwl_id, dags_folder):
    """
    Iterates over DAG files from the dags_folder (excluding Airflow examples). Assuming
    that dag_id written inside Python file is equal to its rootname and follows the naming
    rule "cwldid-commitsha", we check if there are any files that have target cwl_id in the
    rootname (aka in the dag_id). For all collected DAGs (based on cwl_id) we save modified
    timestamp and location, then sort them by timestamp excluding the newest one, thus
    forming a list of outdated DAGs for the same cwl_id (the same workflow). Then we iterate
    over the list of outdated DAGs and check whether we can safely remove it (both from DB
    and disk). The only condition when we don't delete outdated DAG is when there is at list
    one DagRun for it.
    """

    logging.info(
        f"Searching for dags based on cwl_id: {cwl_id} in order to remove the old ones"
    )
    dags = {}
    for location in list_py_file_paths(dags_folder, include_examples=False):
        dag_id = get_rootname(location)
        if cwl_id not in dag_id:
            continue
        dags[dag_id] = {
            "location": location,
            "modified": datetime.fromtimestamp(os.path.getmtime(location))
        }
        logging.info(
            f"Found dag_id: {dag_id}, modified: {dags[dag_id]['modified']}")
    for dag_id, dag_metadata in sorted(dags.items(),
                                       key=lambda i: i[1]["modified"])[:-1]:
        logging.info(f"Cleaning dag_id: {dag_id}")
        if len(DagRun.find(dag_id=dag_id, state=State.RUNNING)) == 0:
            try:
                delete_dag.delete_dag(dag_id)
            except Exception as ex:
                logging.error(f"Failed to delete DAG\n {ex}")
            for f in [
                    dag_metadata["location"],
                    os.path.splitext(dag_metadata["location"])[0] + ".cwl"
            ]:
                try:
                    logging.info(f"Deleting DAG file: {f}")
                    os.remove(f)
                except Exception as ex:
                    logging.error(f"Failed to delete file {f}\n {ex}")
        else:
            logging.info("Skipping, DAG has running DagRuns")
"""Tests the basic integrity of DAGs by loading the DAG files and determining
   if they contain a valid DAG object.

Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py
Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8
"""
from os import path

import pytest
from airflow import models as airflow_models
from airflow.utils.dag_processing import list_py_file_paths

DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags")
DAG_PATHS = list_py_file_paths(DAG_BASE_DIR,
                               safe_mode=True,
                               include_examples=False)


@pytest.mark.parametrize("dag_path", DAG_PATHS)
def test_dag_integrity(dag_path):
    """Import DAG file and check for a valid DAG instance."""

    dag_name = path.basename(dag_path)
    module = _import_file(dag_name, dag_path)
    assert any(
        isinstance(var, airflow_models.DAG) for var in vars(module).values())


def _import_file(module_name, module_path):
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, str(module_path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module  # callers inspect vars(module) for DAG objects
Example #10
"""Tests the basic integrity of DAGs by loading the DAG files and determining
   if they contain a valid DAG object.

Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py
Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8
"""
from os import path

import pytest
from airflow import models as airflow_models
from airflow.utils.dag_processing import list_py_file_paths

DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags")
DAG_PATHS = list_py_file_paths(DAG_BASE_DIR)


@pytest.mark.parametrize("dag_path", DAG_PATHS)
def test_dag_integrity(dag_path):
    """Import DAG file and check for a valid DAG instance."""

    dag_name = path.basename(dag_path)
    module = _import_file(dag_name, dag_path)
    assert any(isinstance(var, airflow_models.DAG) for var in vars(module).values())


def _import_file(module_name, module_path):
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, str(module_path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module  # the test imports the module and inspects its attributes
Example #11
"""Tests the basic integrity of DAGs by loading the DAG files and determining
   if they contain a valid DAG object.

Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py
Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8
"""
from os import path

import pytest
from airflow import models as airflow_models
from airflow.utils.dag_processing import list_py_file_paths

DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags")
DAG_PATHS = [
    dag_path for dag_path in list_py_file_paths(DAG_BASE_DIR, include_examples=False)
    if dag_path.endswith(".py")
]


@pytest.mark.parametrize("dag_path", DAG_PATHS)
def test_dag_integrity(dag_path):
    """Import DAG file and check for a valid DAG instance."""

    dag_name = path.basename(dag_path)
    module = _import_file(dag_name, dag_path)
    assert any(
        isinstance(var, airflow_models.DAG) for var in vars(module).values())


def _import_file(module_name, module_path):
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, str(module_path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module  # callers inspect vars(module) for DAG objects
Example #12
"""Tests the basic integrity of DAGs by loading the DAG files and determining
   if they contain a valid DAG object.

Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py
Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8
"""
from os import path

import pytest
from airflow import models as airflow_models
from airflow.utils.dag_processing import list_py_file_paths

DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags")
DAG_PATHS = list_py_file_paths(DAG_BASE_DIR, include_examples=False)


@pytest.mark.parametrize("dag_path", DAG_PATHS)
def test_dag_integrity(dag_path):
    """Import DAG file and check for a valid DAG instance."""

    dag_name = path.basename(dag_path)
    module = _import_file(dag_name, dag_path)
    assert any(isinstance(var, airflow_models.DAG) for var in vars(module).values())


def _import_file(module_name, module_path):
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, str(module_path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module  # the test imports the module and inspects its attributes
Example #13
"""Tests the basic integrity of DAGs by loading the DAG files and determining
   if they contain a valid DAG object.

Based on example from https://github.com/danielvdende/data-testing-with-airflow/blob/master/integrity_tests/test_dag_integrity.py
Accompanying blog post: https://medium.com/wbaa/datas-inferno-7-circles-of-data-testing-hell-with-airflow-cef4adff58d8
"""
from os import path

import pytest
from airflow import models as airflow_models
from airflow.utils.dag_processing import list_py_file_paths

DAG_BASE_DIR = path.join(path.dirname(__file__), "..", "..", "dags")
DAG_PATHS = [
    dag_path for dag_path in list_py_file_paths(
        DAG_BASE_DIR, include_examples=False, safe_mode=True)
    if dag_path.endswith('.py')
]


@pytest.mark.parametrize("dag_path", DAG_PATHS)
def test_dag_integrity(dag_path):
    """Import DAG file and check for a valid DAG instance."""

    dag_name = path.basename(dag_path)
    module = _import_file(dag_name, dag_path)
    assert any(
        isinstance(var, airflow_models.DAG) for var in vars(module).values())


def _import_file(module_name, module_path):
    import importlib.util

    spec = importlib.util.spec_from_file_location(module_name, str(module_path))
    module = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(module)
    return module  # callers inspect vars(module) for DAG objects