def sync_to_db(self, session: Session = None): """Save attributes about list of DAG to the DB.""" # To avoid circular import - airflow.models.dagbag -> airflow.models.dag -> airflow.models.dagbag from airflow.models.dag import DAG from airflow.models.serialized_dag import SerializedDagModel def _serialize_dag_capturing_errors(dag, session): """ Try to serialize the dag to the DB, but make a note of any errors. We can't place them directly in import_errors, as this may be retried, and work the next time """ if dag.is_subdag: return [] try: # We can't use bulk_write_to_db as we want to capture each error individually dag_was_updated = SerializedDagModel.write_dag( dag, min_update_interval=settings. MIN_SERIALIZED_DAG_UPDATE_INTERVAL, session=session, ) if dag_was_updated: self._sync_perm_for_dag(dag, session=session) return [] except OperationalError: raise except Exception: self.log.exception("Failed to write serialized DAG: %s", dag.full_filepath) return [(dag.fileloc, traceback.format_exc( limit=-self.dagbag_import_error_traceback_depth))] # Retry 'DAG.bulk_write_to_db' & 'SerializedDagModel.bulk_sync_to_db' in case # of any Operational Errors # In case of failures, provide_session handles rollback for attempt in run_with_db_retries(logger=self.log): with attempt: serialize_errors = [] self.log.debug( "Running dagbag.sync_to_db with retries. Try %d of %d", attempt.retry_state.attempt_number, MAX_DB_RETRIES, ) self.log.debug("Calling the DAG.bulk_sync_to_db method") try: # Write Serialized DAGs to DB, capturing errors for dag in self.dags.values(): serialize_errors.extend( _serialize_dag_capturing_errors(dag, session)) DAG.bulk_write_to_db(self.dags.values(), session=session) except OperationalError: session.rollback() raise # Only now we are "complete" do we update import_errors - don't want to record errors from # previous failed attempts self.import_errors.update(dict(serialize_errors))
def clean_unused(cls, session=None): """ Deletes all triggers that have no tasks/DAGs dependent on them (triggers have a one-to-many relationship to both) """ # Update all task instances with trigger IDs that are not DEFERRED to remove them for attempt in run_with_db_retries(): with attempt: session.query(TaskInstance).filter( TaskInstance.state != State.DEFERRED, TaskInstance.trigger_id.isnot(None)).update( {TaskInstance.trigger_id: None}) # Get all triggers that have no task instances depending on them... ids = [ trigger_id for (trigger_id, ) in (session.query(cls.id).join( TaskInstance, cls.id == TaskInstance.trigger_id, isouter=True).group_by(cls.id).having( func.count(TaskInstance.trigger_id) == 0)) ] # ...and delete them (we can't do this in one query due to MySQL) session.query(Trigger).filter( Trigger.id.in_(ids)).delete(synchronize_session=False)
def adopt_or_reset_orphaned_tasks(self, session: Session = None): """ Reset any TaskInstance still in QUEUED or SCHEDULED states that were enqueued by a SchedulerJob that is no longer running. :return: the number of TIs reset :rtype: int """ self.log.info("Resetting orphaned tasks for active dag runs") timeout = conf.getint('scheduler', 'scheduler_health_check_threshold') for attempt in run_with_db_retries(logger=self.log): with attempt: self.log.debug( "Running SchedulerJob.adopt_or_reset_orphaned_tasks with retries. Try %d of %d", attempt.retry_state.attempt_number, MAX_DB_RETRIES, ) self.log.debug("Calling SchedulerJob.adopt_or_reset_orphaned_tasks method") try: num_failed = ( session.query(SchedulerJob) .filter( SchedulerJob.state == State.RUNNING, SchedulerJob.latest_heartbeat < (timezone.utcnow() - timedelta(seconds=timeout)), ) .update({"state": State.FAILED}) ) if num_failed: self.log.info("Marked %d SchedulerJob instances as failed", num_failed) Stats.incr(self.__class__.__name__.lower() + '_end', num_failed) resettable_states = [State.QUEUED, State.RUNNING] query = ( session.query(TI) .filter(TI.state.in_(resettable_states)) # outerjoin is because we didn't use to have queued_by_job # set, so we need to pick up anything pre upgrade. This (and the # "or queued_by_job_id IS NONE") can go as soon as scheduler HA is # released. .outerjoin(TI.queued_by_job) .filter(or_(TI.queued_by_job_id.is_(None), SchedulerJob.state != State.RUNNING)) .join(TI.dag_run) .filter( DagRun.run_type != DagRunType.BACKFILL_JOB, DagRun.state == State.RUNNING, ) .options(load_only(TI.dag_id, TI.task_id, TI.run_id)) ) # Lock these rows, so that another scheduler can't try and adopt these too tis_to_reset_or_adopt = with_row_locks( query, of=TI, session=session, **skip_locked(session=session) ).all() to_reset = self.executor.try_adopt_task_instances(tis_to_reset_or_adopt) reset_tis_message = [] for ti in to_reset: reset_tis_message.append(repr(ti)) ti.state = State.NONE ti.queued_by_job_id = None for ti in set(tis_to_reset_or_adopt) - set(to_reset): ti.queued_by_job_id = self.id Stats.incr('scheduler.orphaned_tasks.cleared', len(to_reset)) Stats.incr('scheduler.orphaned_tasks.adopted', len(tis_to_reset_or_adopt) - len(to_reset)) if to_reset: task_instance_str = '\n\t'.join(reset_tis_message) self.log.info( "Reset the following %s orphaned TaskInstances:\n\t%s", len(to_reset), task_instance_str, ) # Issue SQL/finish "Unit of Work", but let @provide_session # commit (or if passed a session, let caller decide when to commit session.flush() except OperationalError: session.rollback() raise return len(to_reset)