def delete_dag(self, keep_records_in_log: bool = True, session=None):
    dag = session.query(DagModel).filter(DagModel.dag_id == self.dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {self.dag_id} not found")

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(dag_id=self.dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=self.dag_id, session=session)

    # noinspection PyUnresolvedReferences,PyProtectedMember
    for model in models.base.Base._decl_class_registry.values():
        if hasattr(model, "dag_id"):
            if keep_records_in_log and model.__name__ == "Log":
                continue
            # Match the DAG itself and any of its subdags (dag_id.subdag_id)
            cond = or_(model.dag_id == self.dag_id,
                       model.dag_id.like(self.dag_id + ".%"))
            session.query(model).filter(cond).delete(synchronize_session="fetch")

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc
    ).delete(synchronize_session="fetch")
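
# A hedged sketch for SQLAlchemy 1.4+, where the private Base._decl_class_registry
# attribute used above was removed: the public replacement walks the registry's
# mappers. The helper name _iter_dag_bound_models is hypothetical, added only
# for illustration.
def _iter_dag_bound_models():
    from airflow import models  # declarative Base lives at airflow.models.base

    for mapper in models.base.Base.registry.mappers:  # SQLAlchemy 1.4+ API
        model = mapper.class_
        if hasattr(model, "dag_id"):  # same filter as the loop above
            yield model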
def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int:
    """
    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether to keep records of the given dag_id in the
        Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return: count of deleted dags
    """
    log.info("Deleting DAG: %s", dag_id)
    running_tis = (
        session.query(models.TaskInstance.state)
        .filter(models.TaskInstance.dag_id == dag_id)
        .filter(models.TaskInstance.state == State.RUNNING)
        .first()
    )
    if running_tis:
        raise AirflowException("TaskInstances still running")
    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {dag_id} not found")

    # Deleting a DAG should also delete all of its subdags
    dags_to_delete_query = session.query(DagModel.dag_id).filter(
        or_(
            DagModel.dag_id == dag_id,
            and_(DagModel.dag_id.like(f"{dag_id}.%"), DagModel.is_subdag),
        )
    )
    dags_to_delete = [dag_id for dag_id, in dags_to_delete_query]

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if SerializedDagModel.has_dag(dag_id=dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    for model in get_sqla_model_classes():
        if hasattr(model, "dag_id"):
            if keep_records_in_log and model.__name__ == "Log":
                continue
            count += (
                session.query(model)
                .filter(model.dag_id.in_(dags_to_delete))
                .delete(synchronize_session="fetch")
            )
    if dag.is_subdag:
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        for model in TaskFail, models.TaskInstance:
            count += (
                session.query(model)
                .filter(model.dag_id == parent_dag_id, model.task_id == task_id)
                .delete()
            )

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc
    ).delete(synchronize_session="fetch")

    return count
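
# A minimal usage sketch for the delete_dag above (an assumption-laden example,
# not part of the original module): it presumes an initialized Airflow 2.x
# metadata database and uses create_session from airflow.utils.session. In stock
# Airflow the function is decorated with @provide_session, so the session
# argument can usually be omitted altogether.
def _example_delete_dag_usage() -> None:
    from airflow.utils.session import create_session

    with create_session() as session:  # commits on success, rolls back on error
        deleted = delete_dag("example_bash_operator", session=session)
        print(f"Deleted {deleted} rows across DAG-related tables")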
def test_remove_dags_by_id(self):
    """DAGs can be removed from the database."""
    example_dags_list = list(self._write_example_dags().values())
    # Remove SubDags from the list as they are not stored in the DB in a separate row
    # and are directly added in the JSON blob of the main DAG
    filtered_example_dags_list = [dag for dag in example_dags_list if not dag.is_subdag]
    # Tests removing by dag_id.
    dag_removed_by_id = filtered_example_dags_list[0]
    SDM.remove_dag(dag_removed_by_id.dag_id)
    self.assertFalse(SDM.has_dag(dag_removed_by_id.dag_id))
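
# A short sketch of the SerializedDagModel round trip the test exercises.
# Hedged: `dag` stands in for any already-parsed DAG object, and SDM is assumed
# to alias airflow.models.serialized_dag.SerializedDagModel as in the test module.
def _example_sdm_round_trip(dag) -> None:
    SDM.write_dag(dag)              # serialize the DAG and upsert its DB row
    assert SDM.has_dag(dag.dag_id)  # row is now present
    SDM.remove_dag(dag.dag_id)      # delete the row by dag_id
    assert not SDM.has_dag(dag.dag_id)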
def delete_dag(dag_id: str, keep_records_in_log: bool = True, session=None) -> int:
    """
    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether to keep records of the given dag_id in the
        Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return: count of deleted dags
    """
    logger = LoggingMixin()
    logger.log.info("Deleting DAG: %s", dag_id)

    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound("Dag id {} not found".format(dag_id))

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(dag_id=dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    # noinspection PyUnresolvedReferences,PyProtectedMember
    for model in models.base.Base._decl_class_registry.values():  # pylint: disable=protected-access
        if hasattr(model, "dag_id"):
            if keep_records_in_log and model.__name__ == 'Log':
                continue
            cond = or_(model.dag_id == dag_id, model.dag_id.like(dag_id + ".%"))
            count += session.query(model).filter(cond).delete(
                synchronize_session='fetch')

    if dag.is_subdag:
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        # DagRun rows have no task_id column, so only task-level models can be
        # cleaned up by task_id here.
        for model in TaskFail, models.TaskInstance:
            count += session.query(model).filter(
                model.dag_id == parent_dag_id, model.task_id == task_id).delete()

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
        synchronize_session='fetch')

    return count
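
# Note on STORE_SERIALIZED_DAGS above: in Airflow 1.10.x it mirrors the
# [core] store_serialized_dags option in airflow.cfg, e.g.
#
#   [core]
#   store_serialized_dags = True
#
# DAG serialization became mandatory in Airflow 2.0 and the flag was removed,
# which is why the newer delete_dag variant checks SerializedDagModel directly.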
def _deactivate_stale_dags(self, session=None):
    """
    Detect DAGs which are no longer present in files, deactivate them,
    and remove them from the serialized_dag table.
    """
    now = timezone.utcnow()
    elapsed_time_since_refresh = (now - self.last_deactivate_stale_dags_time).total_seconds()
    if elapsed_time_since_refresh > self.deactivate_stale_dags_interval:
        last_parsed = {
            fp: self.get_last_finish_time(fp)
            for fp in self.file_paths
            if self.get_last_finish_time(fp)
        }
        to_deactivate = set()
        dags_parsed = (
            session.query(DagModel.dag_id, DagModel.fileloc, DagModel.last_parsed_time)
            .filter(DagModel.is_active)
            .all()
        )
        for dag in dags_parsed:
            # The largest valid difference between a DagFileStat's last_finished_time and a DAG's
            # last_parsed_time is _processor_timeout. Longer than that indicates that the DAG is
            # no longer present in the file.
            if (
                dag.fileloc in last_parsed
                and (dag.last_parsed_time + self._processor_timeout) < last_parsed[dag.fileloc]
            ):
                self.log.info("DAG %s is missing and will be deactivated.", dag.dag_id)
                to_deactivate.add(dag.dag_id)

        if to_deactivate:
            deactivated = (
                session.query(DagModel)
                .filter(DagModel.dag_id.in_(to_deactivate))
                .update({DagModel.is_active: False}, synchronize_session="fetch")
            )
            if deactivated:
                self.log.info(
                    "Deactivated %i DAGs which are no longer present in file.", deactivated)

            for dag_id in to_deactivate:
                SerializedDagModel.remove_dag(dag_id)
                self.log.info("Deleted DAG %s in serialized_dag table", dag_id)

        self.last_deactivate_stale_dags_time = timezone.utcnow()
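
# The staleness predicate used above, pulled out as a standalone sketch for
# clarity (a hedged illustration; the helper below is not part of the original class):
from datetime import datetime, timedelta


def _is_stale(last_parsed_time: datetime, last_finish_time: datetime,
              processor_timeout: timedelta) -> bool:
    # A DAG is presumed gone from its file once the file finished parsing
    # strictly later than last_parsed_time + processor_timeout.
    return (last_parsed_time + processor_timeout) < last_finish_time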