def collect_dags(
    self,
    dag_folder=None,
    only_if_updated=True,
    include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
    include_smart_sensor=conf.getboolean('smart_sensor', 'USE_SMART_SENSOR'),
    safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.

    Nothing is collected when ``self.read_dags_from_db`` is set. Otherwise
    one ``FileLoadStat`` is recorded per successfully processed file and the
    list, sorted by parse duration (slowest first), is stored in
    ``self.dagbag_stats``.

    :param dag_folder: file or folder to scan; ``self.dag_folder`` is used
        when this is falsy
    :param only_if_updated: forwarded to ``process_file``
    :param include_examples: forwarded to ``list_py_file_paths``
    :param include_smart_sensor: forwarded to ``list_py_file_paths``
    :param safe_mode: forwarded to both ``list_py_file_paths`` and
        ``process_file``
    """
    if self.read_dags_from_db:
        return

    # Resolve the fallback before logging so the message names the folder
    # that is actually scanned instead of "None".
    dag_folder = dag_folder or self.dag_folder
    self.log.info("Filling up the DagBag from %s", dag_folder)

    # Used to store stats around DagBag processing
    stats = []

    dag_folder = correct_maybe_zipped(dag_folder)
    for filepath in list_py_file_paths(
        dag_folder,
        safe_mode=safe_mode,
        include_examples=include_examples,
        include_smart_sensor=include_smart_sensor,
    ):
        try:
            file_parse_start_dttm = timezone.utcnow()
            found_dags = self.process_file(filepath, only_if_updated=only_if_updated, safe_mode=safe_mode)
            file_parse_end_dttm = timezone.utcnow()
            stats.append(
                FileLoadStat(
                    file=filepath.replace(settings.DAGS_FOLDER, ''),
                    duration=file_parse_end_dttm - file_parse_start_dttm,
                    dag_num=len(found_dags),
                    task_num=sum(len(dag.tasks) for dag in found_dags),
                    dags=str([dag.dag_id for dag in found_dags]),
                )
            )
        except Exception as e:  # pylint: disable=broad-except
            # A failure in one file is logged and must not abort the scan.
            self.log.exception(e)

    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)
def collect_dags(
    self,
    dag_folder: Union[str, "pathlib.Path", None] = None,
    only_if_updated: bool = True,
    include_examples: bool = conf.getboolean('core', 'LOAD_EXAMPLES'),
    safe_mode: bool = conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE'),
):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the patterns specified
    in the file.

    **Note**: The patterns in ``.airflowignore`` are interpreted as either
    un-anchored regexes or gitignore-like glob expressions, depending on
    the ``DAG_IGNORE_FILE_SYNTAX`` configuration parameter.

    Nothing is collected when ``self.read_dags_from_db`` is set. Otherwise
    one ``FileLoadStat`` is recorded per successfully processed file and the
    list, sorted by parse duration (slowest first), is stored in
    ``self.dagbag_stats``.

    :param dag_folder: file or folder to scan; ``self.dag_folder`` is used
        when this is falsy
    :param only_if_updated: forwarded to ``process_file``
    :param include_examples: forwarded to ``list_py_file_paths``
    :param safe_mode: forwarded to both ``list_py_file_paths`` and
        ``process_file``
    """
    if self.read_dags_from_db:
        return

    # Resolve the fallback before logging so the message names the folder
    # that is actually scanned instead of "None".
    dag_folder = dag_folder or self.dag_folder
    self.log.info("Filling up the DagBag from %s", dag_folder)

    # Used to store stats around DagBag processing
    stats = []

    # Ensure dag_folder is a str -- it may have been a pathlib.Path
    dag_folder = correct_maybe_zipped(str(dag_folder))
    for filepath in list_py_file_paths(
        dag_folder,
        safe_mode=safe_mode,
        include_examples=include_examples,
    ):
        try:
            file_parse_start_dttm = timezone.utcnow()
            found_dags = self.process_file(filepath, only_if_updated=only_if_updated, safe_mode=safe_mode)
            file_parse_end_dttm = timezone.utcnow()
            stats.append(
                FileLoadStat(
                    file=filepath.replace(settings.DAGS_FOLDER, ''),
                    duration=file_parse_end_dttm - file_parse_start_dttm,
                    dag_num=len(found_dags),
                    task_num=sum(len(dag.tasks) for dag in found_dags),
                    dags=str([dag.dag_id for dag in found_dags]),
                )
            )
        except Exception as e:
            # A failure in one file is logged and must not abort the scan.
            self.log.exception(e)

    self.dagbag_stats = sorted(stats, key=lambda x: x.duration, reverse=True)
def _refresh_dag_dir(self):
    """
    Re-list the DAG directory once the configured listing interval has passed.

    When more than ``self.dag_dir_list_interval`` seconds have elapsed since
    ``self.last_dag_dir_refresh_time``, the file list is rebuilt, handed to
    ``set_file_paths`` and stale import errors are cleared (best effort).
    Otherwise the call is a no-op.
    """
    current_time = timezone.utcnow()
    seconds_since_refresh = (current_time - self.last_dag_dir_refresh_time).total_seconds()
    if seconds_since_refresh <= self.dag_dir_list_interval:
        # The current listing is still considered fresh.
        return

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self._dag_directory)
    discovered_paths = list_py_file_paths(self._dag_directory)
    self._file_paths = discovered_paths
    self.last_dag_dir_refresh_time = current_time
    self.log.info("There are %s files in %s", len(discovered_paths), self._dag_directory)
    self.set_file_paths(discovered_paths)

    # Clearing import errors is best effort: a failure is logged, not raised.
    # noinspection PyBroadException
    try:
        self.log.debug("Removing old import errors")
        self.clear_nonexistent_import_errors()  # pylint: disable=no-value-for-parameter
    except Exception:  # pylint: disable=broad-except
        self.log.exception("Error removing old import errors")
def _refresh_dag_dir(self):
    """
    Re-list the DAG directory once the configured listing interval has passed.

    On refresh the file list is rebuilt and handed to ``set_file_paths``,
    stale import errors are cleared (best effort), and serialized DAGs,
    DAG models and DAG code belonging to files that no longer exist are
    removed or deactivated. Otherwise the call is a no-op.
    """
    current_time = timezone.utcnow()
    seconds_since_refresh = (current_time - self.last_dag_dir_refresh_time).total_seconds()
    if seconds_since_refresh <= self.dag_dir_list_interval:
        # The current listing is still considered fresh.
        return

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self._dag_directory)
    discovered_paths = list_py_file_paths(self._dag_directory)
    self._file_paths = discovered_paths
    self.last_dag_dir_refresh_time = current_time
    self.log.info("There are %s files in %s", len(discovered_paths), self._dag_directory)
    self.set_file_paths(discovered_paths)

    # Clearing import errors is best effort: a failure is logged, not raised.
    try:
        self.log.debug("Removing old import errors")
        self.clear_nonexistent_import_errors()
    except Exception:
        self.log.exception("Error removing old import errors")

    # Zip archives are expanded into the paths of the python files they
    # contain. Without this, SerializedDagModel.remove_deleted_dags and
    # DagCode.remove_deleted_code would delete zipped dags.
    dag_filelocs = []
    for path in self._file_paths:
        is_zip_archive = not path.endswith(".py") and zipfile.is_zipfile(path)
        if not is_zip_archive:
            dag_filelocs.append(path)
            continue
        with zipfile.ZipFile(path) as archive:
            for member in archive.infolist():
                if might_contain_dag(member.filename, True, archive):
                    dag_filelocs.append(os.path.join(path, member.filename))

    SerializedDagModel.remove_deleted_dags(dag_filelocs)
    # Deactivation works on the raw listing (archive paths, not members).
    DagModel.deactivate_deleted_dags(self._file_paths)

    from airflow.models.dagcode import DagCode

    DagCode.remove_deleted_code(dag_filelocs)
def test_dag_is_deactivated_upon_dagfile_deletion(self):
    """
    A DAG synced to the DB whose ``fileloc`` is not among the files listed
    from ``settings.DAGS_FOLDER`` must be marked inactive by
    ``DagModel.deactivate_deleted_dags``.
    """
    dag_id = 'old_existing_dag'
    dag_fileloc = "/usr/local/airflow/dags/non_existing_path.py"
    dag = DAG(
        dag_id,
        is_paused_upon_creation=True,
    )
    dag.fileloc = dag_fileloc
    session = settings.Session()
    try:
        dag.sync_to_db(session=session)

        orm_dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).one()

        self.assertTrue(orm_dag.is_active)
        self.assertEqual(orm_dag.fileloc, dag_fileloc)

        DagModel.deactivate_deleted_dags(list_py_file_paths(settings.DAGS_FOLDER))

        orm_dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).one()
        self.assertFalse(orm_dag.is_active)
    finally:
        # CleanUp -- runs even when an assertion above fails, so a failing
        # test does not leak the session or leave the row behind.
        try:
            session.execute(DagModel.__table__.delete().where(DagModel.dag_id == dag_id))
        finally:
            session.close()
def _refresh_dag_dir(self):
    """
    Re-list the DAG directory once the configured listing interval has passed.

    On refresh the file list is rebuilt and handed to ``set_file_paths``,
    stale import errors are cleared (best effort), and serialized DAGs and
    DAG models for files that no longer exist are removed or deactivated.
    DAG code is pruned as well when ``self.store_dag_code`` is set.
    Otherwise the call is a no-op.
    """
    current_time = timezone.utcnow()
    seconds_since_refresh = (current_time - self.last_dag_dir_refresh_time).total_seconds()
    if seconds_since_refresh <= self.dag_dir_list_interval:
        # The current listing is still considered fresh.
        return

    # Build up a list of Python files that could contain DAGs
    self.log.info("Searching for files in %s", self._dag_directory)
    discovered_paths = list_py_file_paths(self._dag_directory)
    self._file_paths = discovered_paths
    self.last_dag_dir_refresh_time = current_time
    self.log.info("There are %s files in %s", len(discovered_paths), self._dag_directory)
    self.set_file_paths(discovered_paths)

    # Clearing import errors is best effort: a failure is logged, not raised.
    try:
        self.log.debug("Removing old import errors")
        self.clear_nonexistent_import_errors()  # pylint: disable=no-value-for-parameter
    except Exception:  # noqa pylint: disable=broad-except
        self.log.exception("Error removing old import errors")

    SerializedDagModel.remove_deleted_dags(self._file_paths)
    DagModel.deactivate_deleted_dags(self._file_paths)

    if not self.store_dag_code:
        return

    from airflow.models.dagcode import DagCode

    DagCode.remove_deleted_code(self._file_paths)
def collect_dags(
        self,
        dag_folder=None,
        only_if_updated=True,
        include_examples=conf.getboolean('core', 'LOAD_EXAMPLES'),
        safe_mode=conf.getboolean('core', 'DAG_DISCOVERY_SAFE_MODE')):
    """
    Given a file path or a folder, this method looks for python modules,
    imports them and adds them to the dagbag collection.

    Note that if a ``.airflowignore`` file is found while processing
    the directory, it will behave much like a ``.gitignore``,
    ignoring files that match any of the regex patterns specified
    in the file.

    **Note**: The patterns in .airflowignore are treated as
    un-anchored regexes, not shell-like glob patterns.

    Nothing is collected when ``self.store_serialized_dags`` is set.
    Otherwise one ``FileLoadStat`` is recorded per successfully processed
    file, the list (slowest first) is stored in ``self.dagbag_stats``, and
    the ``collect_dags``, ``dagbag_size`` and ``dagbag_import_errors``
    gauges plus one ``dag.loading-duration.<filename>`` timing per file
    are emitted through ``Stats``.

    :param dag_folder: file or folder to scan; ``self.dag_folder`` is used
        when this is falsy
    :param only_if_updated: forwarded to ``process_file``
    :param include_examples: forwarded to ``list_py_file_paths``
    :param safe_mode: forwarded to both ``list_py_file_paths`` and
        ``process_file``
    """
    if self.store_serialized_dags:
        return

    # Resolve the fallback before logging so the message names the folder
    # that is actually scanned instead of "None".
    dag_folder = dag_folder or self.dag_folder
    self.log.info("Filling up the DagBag from %s", dag_folder)
    start_dttm = timezone.utcnow()

    # Used to store stats around DagBag processing
    stats = []

    dag_folder = correct_maybe_zipped(dag_folder)

    for filepath in list_py_file_paths(dag_folder, safe_mode=safe_mode,
                                       include_examples=include_examples):
        try:
            file_parse_start_dttm = timezone.utcnow()
            found_dags = self.process_file(
                filepath,
                only_if_updated=only_if_updated,
                safe_mode=safe_mode)
            file_parse_end_dttm = timezone.utcnow()
            stats.append(
                FileLoadStat(
                    file=filepath.replace(settings.DAGS_FOLDER, ''),
                    duration=file_parse_end_dttm - file_parse_start_dttm,
                    dag_num=len(found_dags),
                    task_num=sum(len(dag.tasks) for dag in found_dags),
                    dags=str([dag.dag_id for dag in found_dags]),
                ))
        except Exception as e:  # pylint: disable=broad-except
            # A failure in one file is logged and must not abort the scan.
            self.log.exception(e)

    end_dttm = timezone.utcnow()
    durations = (end_dttm - start_dttm).total_seconds()
    Stats.gauge('collect_dags', durations, 1)
    Stats.gauge('dagbag_size', len(self.dags), 1)
    Stats.gauge('dagbag_import_errors', len(self.import_errors), 1)
    self.dagbag_stats = sorted(
        stats, key=lambda x: x.duration, reverse=True)
    for file_stat in self.dagbag_stats:
        # file_stat.file similar format: /subdir/dag_name.py
        # TODO: Remove for Airflow 2.0
        filename = file_stat.file.split('/')[-1].replace('.py', '')
        Stats.timing('dag.loading-duration.{}'.format(filename), file_stat.duration)