Example #1
    def delete_dag(self, keep_records_in_log: bool = True, session=None):
        dag = session.query(DagModel).filter(
            DagModel.dag_id == self.dag_id).first()
        if dag is None:
            raise DagNotFound(f"Dag id {self.dag_id} not found")

        # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
        # There may be a lag, so explicitly removes serialized DAG here.
        if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(
                dag_id=self.dag_id, session=session):
            SerializedDagModel.remove_dag(dag_id=self.dag_id, session=session)

        # noinspection PyUnresolvedReferences,PyProtectedMember
        for model in models.base.Base._decl_class_registry.values():
            if hasattr(model, "dag_id"):
                if keep_records_in_log and model.__name__ == "Log":
                    continue
                cond = or_(model.dag_id == self.dag_id,
                           model.dag_id.like(self.dag_id + ".%"))
                session.query(model).filter(cond).delete(
                    synchronize_session="fetch")

        # Delete entries in Import Errors table for a deleted DAG
        # This handles the case when the dag_id is changed in the file
        session.query(models.ImportError).filter(
            models.ImportError.filename == dag.fileloc).delete(
                synchronize_session="fetch")
Example #2
import logging

from sqlalchemy import and_, or_

from airflow import models
from airflow.exceptions import AirflowException, DagNotFound
from airflow.models import DagModel, TaskFail
from airflow.models.serialized_dag import SerializedDagModel
from airflow.utils.db import get_sqla_model_classes
from airflow.utils.session import provide_session
from airflow.utils.state import State

log = logging.getLogger(__name__)


@provide_session
def delete_dag(dag_id: str,
               keep_records_in_log: bool = True,
               session=None) -> int:
    """
    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether keep records of the given dag_id
        in the Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return: count of deleted dags
    """
    log.info("Deleting DAG: %s", dag_id)
    running_tis = (session.query(models.TaskInstance.state).filter(
        models.TaskInstance.dag_id == dag_id).filter(
            models.TaskInstance.state == State.RUNNING).first())
    if running_tis:
        raise AirflowException("TaskInstances still running")
    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound(f"Dag id {dag_id} not found")

    # deleting a DAG should also delete all of its subdags
    dags_to_delete_query = session.query(DagModel.dag_id).filter(
        or_(
            DagModel.dag_id == dag_id,
            and_(DagModel.dag_id.like(f"{dag_id}.%"), DagModel.is_subdag),
        ))
    dags_to_delete = [dag_id for dag_id, in dags_to_delete_query]

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if SerializedDagModel.has_dag(dag_id=dag_id, session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    for model in get_sqla_model_classes():
        if hasattr(model, "dag_id"):
            if keep_records_in_log and model.__name__ == 'Log':
                continue
            count += (session.query(model).filter(
                model.dag_id.in_(dags_to_delete)).delete(
                    synchronize_session='fetch'))
    if dag.is_subdag:
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        for model in TaskFail, models.TaskInstance:
            count += (session.query(model).filter(
                model.dag_id == parent_dag_id,
                model.task_id == task_id).delete())

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
            synchronize_session='fetch')

    return count
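A usage sketch for the function above, assuming it is importable as in recent Airflow releases (the module path moved between versions, and the dag_id is illustrative):

from airflow.api.common.delete_dag import delete_dag
from airflow.exceptions import DagNotFound

try:
    removed = delete_dag("example_bash_operator", keep_records_in_log=True)
    print(f"Deleted {removed} rows from the metadata database")
except DagNotFound:
    print("DAG is not registered in the metadata database")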
Example #3
    def test_remove_dags_by_id(self):
        """DAGs can be removed from the database."""
        example_dags_list = list(self._write_example_dags().values())
        # Remove SubDags from the list: they are not stored as separate rows
        # in the DB but embedded in the JSON blob of the main DAG.
        filtered_example_dags_list = [dag for dag in example_dags_list if not dag.is_subdag]
        # Test removing by dag_id.
        dag_removed_by_id = filtered_example_dags_list[0]
        SDM.remove_dag(dag_removed_by_id.dag_id)
        self.assertFalse(SDM.has_dag(dag_removed_by_id.dag_id))
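The test relies on an SDM alias for SerializedDagModel and a _write_example_dags helper defined elsewhere on the test class; a hypothetical sketch of that scaffolding (the helper body is an assumption, not the project's actual code):

from airflow.models import DagBag
from airflow.models.serialized_dag import SerializedDagModel as SDM

def _write_example_dags(self):
    # Serialize Airflow's bundled example DAGs into the serialized_dag
    # table and return them keyed by dag_id.
    example_dags = DagBag(include_examples=True).dags
    for dag in example_dags.values():
        SDM.write_dag(dag)
    return example_dags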
Example #4
from sqlalchemy import or_

from airflow import models
from airflow.exceptions import DagNotFound
from airflow.models import DagModel, TaskFail
from airflow.models.serialized_dag import SerializedDagModel
from airflow.settings import STORE_SERIALIZED_DAGS
from airflow.utils.db import provide_session
from airflow.utils.log.logging_mixin import LoggingMixin


@provide_session
def delete_dag(dag_id: str,
               keep_records_in_log: bool = True,
               session=None) -> int:
    """
    :param dag_id: the dag_id of the DAG to delete
    :param keep_records_in_log: whether keep records of the given dag_id
        in the Log table in the backend database (for reasons like auditing).
        The default value is True.
    :param session: session used
    :return: count of deleted dags
    """
    logger = LoggingMixin()
    logger.log.info("Deleting DAG: %s", dag_id)
    dag = session.query(DagModel).filter(DagModel.dag_id == dag_id).first()
    if dag is None:
        raise DagNotFound("Dag id {} not found".format(dag_id))

    # Scheduler removes DAGs without files from serialized_dag table every dag_dir_list_interval.
    # There may be a lag, so explicitly removes serialized DAG here.
    if STORE_SERIALIZED_DAGS and SerializedDagModel.has_dag(dag_id=dag_id,
                                                            session=session):
        SerializedDagModel.remove_dag(dag_id=dag_id, session=session)

    count = 0

    # noinspection PyUnresolvedReferences,PyProtectedMember
    for model in models.base.Base._decl_class_registry.values():  # pylint: disable=protected-access
        if hasattr(model, "dag_id"):
            if keep_records_in_log and model.__name__ == 'Log':
                continue
            cond = or_(model.dag_id == dag_id,
                       model.dag_id.like(dag_id + ".%"))
            count += session.query(model).filter(cond).delete(
                synchronize_session='fetch')
    if dag.is_subdag:
        parent_dag_id, task_id = dag_id.rsplit(".", 1)
        # DagRun has no task_id column, so filtering it on model.task_id
        # would raise AttributeError; only task-level tables are cleaned here.
        for model in TaskFail, models.TaskInstance:
            count += session.query(model).filter(
                model.dag_id == parent_dag_id,
                model.task_id == task_id).delete()

    # Delete entries in Import Errors table for a deleted DAG
    # This handles the case when the dag_id is changed in the file
    session.query(models.ImportError).filter(
        models.ImportError.filename == dag.fileloc).delete(
            synchronize_session='fetch')

    return count
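The rsplit call above leans on Airflow's SubDAG naming convention: a subdag's dag_id is the parent's dag_id and the subdag task's task_id joined by a dot. A standalone illustration (the ids are made up):

dag_id = "parent_dag.section_1"
parent_dag_id, task_id = dag_id.rsplit(".", 1)
assert (parent_dag_id, task_id) == ("parent_dag", "section_1")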
Example #5
    def _deactivate_stale_dags(self, session=None):
        """
        Detects DAGs which are no longer present in files

        Deactivate them and remove them in the serialized_dag table
        """
        now = timezone.utcnow()
        elapsed_time_since_refresh = (
            now - self.last_deactivate_stale_dags_time).total_seconds()
        if elapsed_time_since_refresh > self.deactivate_stale_dags_interval:
            last_parsed = {
                fp: self.get_last_finish_time(fp)
                for fp in self.file_paths if self.get_last_finish_time(fp)
            }
            to_deactivate = set()
            dags_parsed = (session.query(DagModel.dag_id, DagModel.fileloc,
                                         DagModel.last_parsed_time).filter(
                                             DagModel.is_active).all())
            for dag in dags_parsed:
                # The largest valid difference between a DagFileStat's last_finished_time and a DAG's
                # last_parsed_time is _processor_timeout. Longer than that indicates that the DAG is
                # no longer present in the file.
                if (dag.fileloc in last_parsed
                        and (dag.last_parsed_time + self._processor_timeout) <
                        last_parsed[dag.fileloc]):
                    self.log.info("DAG %s is missing and will be deactivated.",
                                  dag.dag_id)
                    to_deactivate.add(dag.dag_id)

            if to_deactivate:
                deactivated = (session.query(DagModel).filter(
                    DagModel.dag_id.in_(to_deactivate)).update(
                        {DagModel.is_active: False},
                        synchronize_session="fetch"))
                if deactivated:
                    self.log.info(
                        "Deactivated %i DAGs which are no longer present in file.",
                        deactivated)

                for dag_id in to_deactivate:
                    SerializedDagModel.remove_dag(dag_id)
                    self.log.info("Deleted DAG %s in serialized_dag table",
                                  dag_id)

            self.last_deactivate_stale_dags_time = timezone.utcnow()
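The staleness test in Example #5 can be read in isolation: a DAG row is considered stale when its file finished parsing more recently than the row's last_parsed_time plus the processor timeout. A standalone sketch with illustrative timestamps:

from datetime import datetime, timedelta

processor_timeout = timedelta(seconds=50)
last_parsed_time = datetime(2021, 1, 1, 12, 0, 0)   # DagModel.last_parsed_time in the DB
last_finish_time = datetime(2021, 1, 1, 12, 2, 0)   # file's last finish time from the processor

# The file was re-parsed well after the DAG row was last refreshed,
# so the DAG is presumed to have been removed from the file.
is_stale = (last_parsed_time + processor_timeout) < last_finish_time
assert is_stale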