def _find_schedulable_tasks(
    self,
    dag_run: DagRun,
    session: Session,
    check_execution_date=False,
) -> Optional[List[TI]]:
    """
    Make scheduling decisions about an individual dag run and collect the
    task instances that may be queued.

    The run is skipped (``None`` is returned) when it is missing or already
    finished, when its DAG cannot be loaded, when its execution date is in
    the future (only if ``check_execution_date`` is set and the DAG does not
    allow future execution dates), or when the DAG already has
    ``max_active_runs`` other active runs.

    Otherwise the run's state is updated, its schedulable task instances are
    handed to ``dag_run.schedule_tis``, and the SCHEDULED task instances are
    loaded with row locks. Task instances whose ``task_id`` is reported by
    the DAG's event dependencies are left out of the result.

    :param dag_run: The DagRun to schedule
    :param session: ORM session used for every query issued here
    :param check_execution_date: when true, refuse to schedule a run whose
        execution date lies in the future unless the DAG allows it
    :return: scheduled tasks, or ``None`` when the run must not be scheduled
    """
    if not dag_run or dag_run.get_state() in State.finished:
        return None

    try:
        dag = dag_run.dag = self.dagbag.get_dag(dag_run.dag_id, session=session)
    except SerializedDagNotFound:
        self.log.exception("DAG '%s' not found in serialized_dag table", dag_run.dag_id)
        return None
    if not dag:
        self.log.error("Couldn't find dag %s in DagBag/DB!", dag_run.dag_id)
        return None

    if (
        check_execution_date
        and dag_run.execution_date > timezone.utcnow()
        and not dag.allow_future_exec_dates
    ):
        self.log.warning("Execution date is in future: %s", dag_run.execution_date)
        return None

    if dag.max_active_runs:
        # The query yields one-element rows, one per unfinished task
        # instance. Collapse them into a set of distinct execution dates so
        # that the size is a count of runs (not of task instances) and the
        # membership test compares datetimes with datetimes.
        active_rows = (
            session.query(TI.execution_date)
            .filter(
                TI.dag_id == dag_run.dag_id,
                TI.state.notin_(list(State.finished)),
            )
            .distinct()
            .all()
        )
        currently_active_runs = {execution_date for (execution_date,) in active_rows}

        if (
            len(currently_active_runs) >= dag.max_active_runs
            and dag_run.execution_date not in currently_active_runs
        ):
            self.log.info(
                "DAG %s already has %d active runs, not queuing any tasks for run %s",
                dag.dag_id,
                len(currently_active_runs),
                dag_run.execution_date,
            )
            return None

    self._verify_integrity_if_dag_changed(dag_run=dag_run, session=session)

    # Only the schedulable task instances are needed here; the callback
    # returned alongside them is not used by this method.
    schedulable_tis, _ = dag_run.update_state(session=session, execute_callbacks=False)
    dag_run.schedule_tis(schedulable_tis, session)

    # NOTE(review): this query is not restricted to ``dag_run`` (no dag_id or
    # run_id filter), so it returns SCHEDULED task instances of every
    # unpaused DAG that are not part of a backfill run - confirm intended.
    query = (
        session.query(TI)
        .outerjoin(TI.dag_run)
        .filter(or_(DR.run_id.is_(None), DR.run_type != DagRunType.BACKFILL_JOB))
        .join(TI.dag_model)
        .filter(not_(DM.is_paused))
        .filter(TI.state == State.SCHEDULED)
        .options(selectinload('dag_model'))
    )
    scheduled_tis: List[TI] = with_row_locks(
        query,
        of=TI,
        **skip_locked(session=session),
    ).all()

    # Drop the tasks that have event dependencies.
    # NOTE(review): assumes a SerializedDagModel row exists for this dag_id;
    # ``first()`` returning None would raise AttributeError below.
    serialized_dag = (
        session.query(SerializedDagModel)
        .filter(SerializedDagModel.dag_id == dag_run.dag_id)
        .first()
    )
    dep: DagEventDependencies = DagEventDependencies.from_json(serialized_dag.event_relationships)
    event_task_set = dep.find_event_dependencies_tasks()

    return [ti for ti in scheduled_tis if ti.task_id not in event_task_set]
def _schedule_dag_run(
    self,
    dag_run: DagRun,
    session: Session,
) -> Optional[DagCallbackRequest]:
    """
    Make scheduling decisions about an individual dag run

    A run that has exceeded its DAG's ``dagrun_timeout`` is set to FAILED,
    its unfinished task instances are set to SKIPPED, and a failure callback
    request is sent to the processor from here. In that case, and whenever
    the run cannot be scheduled (DAG not found, execution date in the future
    while the DAG does not allow it), ``None`` is returned.

    :param dag_run: The DagRun to schedule
    :param session: ORM session used for the queries and updates issued here
    :return: Callback that needs to be executed, or ``None`` when there is
        nothing for the caller to execute
    """
    dag = dag_run.dag = self.dagbag.get_dag(dag_run.dag_id, session=session)

    if not dag:
        self.log.error("Couldn't find dag %s in DagBag/DB!", dag_run.dag_id)
        return None

    if (
        dag_run.start_date
        and dag.dagrun_timeout
        and dag_run.start_date < timezone.utcnow() - dag.dagrun_timeout
    ):
        dag_run.set_state(State.FAILED)

        unfinished_task_instances = (
            session.query(TI)
            .filter(TI.dag_id == dag_run.dag_id)
            .filter(TI.run_id == dag_run.run_id)
            .filter(TI.state.in_(State.unfinished))
        )
        for task_instance in unfinished_task_instances:
            task_instance.state = State.SKIPPED
            session.merge(task_instance)
        session.flush()
        self.log.info("Run %s of %s has timed-out", dag_run.run_id, dag_run.dag_id)

        callback_to_execute = DagCallbackRequest(
            full_filepath=dag.fileloc,
            dag_id=dag.dag_id,
            run_id=dag_run.run_id,
            is_failure_callback=True,
            msg='timed_out',
        )

        # Send SLA & DAG Success/Failure Callbacks to be executed
        self._send_dag_callbacks_to_processor(dag_run, callback_to_execute)

        # The callback was already sent above, so there is nothing left for
        # the caller to execute.
        return None

    if dag_run.execution_date > timezone.utcnow() and not dag.allow_future_exec_dates:
        self.log.error("Execution date is in future: %s", dag_run.execution_date)
        return None

    self._verify_integrity_if_dag_changed(dag_run=dag_run, session=session)

    # TODO[HA]: Rename update_state -> schedule_dag_run, ?? something else?
    schedulable_tis, callback_to_run = dag_run.update_state(session=session, execute_callbacks=False)

    # This will do one query per dag run. We "could" build up a complex
    # query to update all the TIs across all the execution dates and dag
    # IDs in a single query, but it turns out that can be _very very slow_
    # see #11147/commit ee90807ac for more details
    dag_run.schedule_tis(schedulable_tis, session)

    return callback_to_run