def next_dagrun_info(
    self,
    *,
    last_automated_data_interval: Optional[DataInterval],
    restriction: TimeRestriction,
) -> Optional[DagRunInfo]:
    """Compute the next automated run that satisfies ``restriction``.

    :param last_automated_data_interval: Data interval of the previous
        automated run, or ``None`` when there has been no such run yet.
    :param restriction: Supplies ``earliest``, ``latest`` and ``catchup``.
    :return: A ``DagRunInfo`` covering ``start`` to ``self._get_next(start)``,
        or ``None`` when there is no lower bound for a first run or the
        computed start lies after ``restriction.latest``.
    """
    lower_bound = restriction.earliest
    if not restriction.catchup:
        # Without catchup the lower bound is whatever _skip_to_latest yields.
        lower_bound = self._skip_to_latest(lower_bound)
    elif lower_bound is not None:
        lower_bound = self._align(lower_bound)

    if last_automated_data_interval is None:
        # First run: it begins at the lower bound, and without one there is
        # nothing to schedule.
        if lower_bound is None:
            return None
        start = lower_bound
    elif lower_bound is None:
        # Continue straight from the end of the previous interval.
        start = last_automated_data_interval.end
    else:
        # Catchup is off or the DAG got a later start date: use whichever of
        # the previous interval's end and the lower bound is later.
        start = max(last_automated_data_interval.end, lower_bound)

    if restriction.latest is not None and start > restriction.latest:
        return None
    return DagRunInfo.interval(start=start, end=self._get_next(start))
def next_dagrun_info(
    self,
    *,
    last_automated_data_interval: Optional[DataInterval],
    restriction: TimeRestriction,
) -> Optional[DagRunInfo]:
    """Return the next one-day run that starts at midnight UTC on a weekday.

    :param last_automated_data_interval: Data interval of the previous
        automated run, or ``None`` for the very first run.
    :param restriction: Supplies ``earliest``, ``latest`` and ``catchup``.
    :return: A ``DagRunInfo`` spanning one day from the computed start, or
        ``None`` when there is no ``earliest`` for a first run or the start
        falls after ``restriction.latest``.
    """
    if last_automated_data_interval is None:
        # First ever run on the regular schedule.
        candidate = restriction.earliest
        if candidate is None:
            # No start_date, so nothing to schedule.
            return None
        if not restriction.catchup:
            # With catchup disabled, today is the earliest day considered.
            today_midnight = DateTime.combine(Date.today(), Time.min).replace(tzinfo=UTC)
            candidate = max(candidate, today_midnight)
        elif candidate.time() != Time.min:
            # Not on midnight: move to the start of the following day.
            following_day = candidate.date() + timedelta(days=1)
            candidate = DateTime.combine(following_day, Time.min).replace(tzinfo=UTC)
        weekday = candidate.weekday()
        if weekday in (5, 6):
            # Saturday or Sunday: push forward to the next Monday.
            candidate = candidate + timedelta(days=(7 - weekday))
    else:
        # A previous scheduled run exists; step forward from its start.
        previous_start = last_automated_data_interval.start
        previous_weekday = previous_start.weekday()
        if 0 <= previous_weekday < 4:
            # Monday through Thursday: the next run is the following day.
            step = timedelta(days=1)
        else:
            # Friday or later in the week: jump to the next Monday.
            step = timedelta(days=(7 - previous_weekday))
        candidate = DateTime.combine((previous_start + step).date(), Time.min).replace(tzinfo=UTC)

    if restriction.latest is not None and candidate > restriction.latest:
        # Past the DAG's scheduled end; do not schedule.
        return None
    return DagRunInfo.interval(start=candidate, end=(candidate + timedelta(days=1)))
def next_dagrun_info(
    self,
    last_automated_dagrun: Optional[DateTime],
    restriction: TimeRestriction,
) -> Optional[DagRunInfo]:
    """Compute the next automated run that satisfies ``restriction``.

    :param last_automated_dagrun: The value passed to
        ``self._schedule.get_next`` to find where the next interval starts,
        or ``None`` when there has been no automated run yet.
    :param restriction: Supplies ``earliest``, ``latest`` and ``catchup``.
    :return: A ``DagRunInfo`` covering ``start`` to the schedule's next time
        after ``start``, or ``None`` when there is no lower bound for a first
        run or the computed start lies after ``restriction.latest``.
    """
    earliest = restriction.earliest
    if not restriction.catchup:
        earliest = self._schedule.skip_to_latest(earliest)

    if last_automated_dagrun is None:
        # First run; schedule the run at the first available time matching
        # the schedule, and retrospectively create a data interval for it.
        if earliest is None:
            return None
        start = self._schedule.align(earliest)
    else:
        # There's a previous run: the new interval normally starts where the
        # previous one ended.
        start = self._schedule.get_next(last_automated_dagrun)
        if earliest is not None:
            # Catchup is False or the DAG has a new start date in the future.
            # The lower bound must still be honoured, so take the later of
            # the two; otherwise every skipped interval would be scheduled.
            start = max(start, self._schedule.align(earliest))

    if restriction.latest is not None and start > restriction.latest:
        return None
    end = self._schedule.get_next(start)
    return DagRunInfo.interval(start=start, end=end)
def next_dagrun_info(
    self,
    *,
    last_automated_data_interval: DataInterval | None,
    restriction: TimeRestriction,
) -> DagRunInfo | None:
    """Compute the next trigger time and the interval that ends on it.

    :param last_automated_data_interval: Data interval of the previous
        automated run, or ``None`` when there has been none.
    :param restriction: Supplies ``earliest``, ``latest`` and ``catchup``.
    :return: A ``DagRunInfo`` whose interval runs from the trigger time minus
        ``self._interval`` up to the trigger time, or ``None`` when a first
        catchup run has no ``earliest`` or the trigger time is after
        ``restriction.latest``.
    """
    if not restriction.catchup:
        # No catchup: align from now, unless the earliest bound is still in
        # the future, in which case align from that bound.
        now = DateTime.utcnow()
        if restriction.earliest is not None and now < restriction.earliest:
            trigger_time = self._align_to_next(restriction.earliest)
        else:
            trigger_time = self._align_to_next(now)
    elif last_automated_data_interval is not None:
        # Catchup with a previous run: continue from the end of its interval.
        trigger_time = self._get_next(last_automated_data_interval.end)
    elif restriction.earliest is None:
        # Catchup, first run, no lower bound: nothing to schedule.
        return None
    else:
        trigger_time = self._align_to_next(restriction.earliest)

    if restriction.latest is not None and restriction.latest < trigger_time:
        return None
    return DagRunInfo.interval(trigger_time - self._interval, trigger_time)
def _execute(self, session=None):
    """
    Build the list of dag runs for the backfill date range, start the
    executor and keep calling ``_execute_dagruns`` until every run has been
    executed or errors are collected.

    :param session: Database session used to store the DAG pickle and
        committed once the backfill loop exits.
    :raises AirflowException: if a backwards backfill is requested while
        some tasks have ``depends_on_past`` set.
    :raises BackfillUnfinished: if ``_collect_errors`` reports errors.
    """
    ti_status = BackfillJob._DagRunTaskStatus()
    start_date = self.bf_start_date

    # Determine the dag runs scheduled between the start and end dates.
    range_start = timezone.coerce_datetime(start_date)
    range_end = (
        pendulum.now(timezone.utc)
        if self.bf_end_date is None
        else pendulum.instance(self.bf_end_date)
    )
    dagrun_infos = list(self.dag.iter_dagrun_infos_between(range_start, range_end))

    if self.run_backwards:
        past_dependent_ids = [
            task.task_id for task in self.dag.task_dict.values() if task.depends_on_past
        ]
        if past_dependent_ids:
            raise AirflowException(
                f'You cannot backfill backwards because one or more '
                f'tasks depend_on_past: {",".join(past_dependent_ids)}'
            )
        dagrun_infos.reverse()

    if not dagrun_infos:
        if not self.run_at_least_once:
            self.log.info("No run dates were found for the given dates and dag interval.")
            return
        # Nothing matched, but one run was requested: cover the whole range.
        dagrun_infos = [DagRunInfo.interval(range_start, range_end)]

    # The DAG is pickled unless pickling is disabled or the executor is the
    # local, sequential or Dask executor.
    if self.donot_pickle or self.executor_class in (
        executor_constants.LOCAL_EXECUTOR,
        executor_constants.SEQUENTIAL_EXECUTOR,
        executor_constants.DASK_EXECUTOR,
    ):
        pickle_id = None
    else:
        dag_pickle = DagPickle(self.dag)
        session.add(dag_pickle)
        session.commit()
        pickle_id = dag_pickle.id

    executor = self.executor
    executor.job_id = "backfill"
    executor.start()

    ti_status.total_runs = len(dagrun_infos)  # total dag runs in backfill

    try:
        runs_left = ti_status.total_runs
        while runs_left > 0:
            # Only hand over the runs whose logical date is not yet executed.
            pending_infos = [
                info
                for info in dagrun_infos
                if info.logical_date not in ti_status.executed_dag_run_dates
            ]
            self._execute_dagruns(
                dagrun_infos=pending_infos,
                ti_status=ti_status,
                executor=executor,
                pickle_id=pickle_id,
                start_date=start_date,
                session=session,
            )

            runs_left = ti_status.total_runs - len(ti_status.executed_dag_run_dates)
            err = self._collect_errors(ti_status=ti_status, session=session)
            if err:
                raise BackfillUnfinished(err, ti_status)

            if runs_left > 0:
                self.log.info(
                    "max_active_runs limit for dag %s has been reached "
                    " - waiting for other dag runs to finish",
                    self.dag_id,
                )
                time.sleep(self.delay_on_limit_secs)
    except (KeyboardInterrupt, SystemExit):
        self.log.warning("Backfill terminated by user.")

        # TODO: we will need to terminate running task instances and set the
        # state to failed.
        self._set_unfinished_dag_runs_to_failed(ti_status.active_runs)
    finally:
        session.commit()
        executor.end()

    self.log.info("Backfill done. Exiting.")
def _execute(self, session=None):
    """
    Build the list of dag runs for the backfill date range, refuse to
    continue when non-backfill runs are already RUNNING in that range, then
    start the executor and keep calling ``_execute_dagruns`` until every run
    has been executed or a blocking error is collected.

    :param session: Database session used to store the DAG pickle and
        committed once the backfill loop exits.
    :raises AirflowException: if a backwards backfill is requested while
        some tasks have ``depends_on_past`` set.
    :raises BackfillUnfinished: if errors are collected and either
        ``continue_on_failures`` is falsy or the status is deadlocked.
    """
    ti_status = BackfillJob._DagRunTaskStatus()
    start_date = self.bf_start_date

    # Determine the dag runs scheduled between the start and end dates.
    range_start = timezone.coerce_datetime(start_date)
    range_end = (
        pendulum.now(timezone.utc)
        if self.bf_end_date is None
        else pendulum.instance(self.bf_end_date)
    )
    dagrun_infos = list(self.dag.iter_dagrun_infos_between(range_start, range_end))

    if self.run_backwards:
        past_dependent_ids = [
            task.task_id for task in self.dag.task_dict.values() if task.depends_on_past
        ]
        if past_dependent_ids:
            raise AirflowException(
                f'You cannot backfill backwards because one or more '
                f'tasks depend_on_past: {",".join(past_dependent_ids)}'
            )
        dagrun_infos.reverse()

    if not dagrun_infos:
        if not self.run_at_least_once:
            self.log.info("No run dates were found for the given dates and dag interval.")
            return
        # Nothing matched, but one run was requested: cover the whole range.
        dagrun_infos = [DagRunInfo.interval(range_start, range_end)]

    # Look for non-backfill runs of this DAG (or its subdags) that are
    # already RUNNING inside the requested range.
    dag_ids = [member.dag_id for member in self._get_dag_with_subdags()]
    running_dagruns = DagRun.find(
        dag_id=dag_ids,
        execution_start_date=self.bf_start_date,
        execution_end_date=self.bf_end_date,
        no_backfills=True,
        state=DagRunState.RUNNING,
    )
    if running_dagruns:
        for active_run in running_dagruns:
            self.log.error(
                "Backfill cannot be created for DagRun %s in %s, as there's already %s in a RUNNING "
                "state.",
                active_run.run_id,
                active_run.execution_date.strftime("%Y-%m-%dT%H:%M:%S"),
                active_run.run_type,
            )
        self.log.error(
            "Changing DagRun into BACKFILL would cause scheduler to lose track of executing "
            "tasks. Not changing DagRun type into BACKFILL, and trying insert another DagRun into "
            "database would cause database constraint violation for dag_id + execution_date "
            "combination. Please adjust backfill dates or wait for this DagRun to finish.",
        )
        return

    # The DAG is pickled unless pickling is disabled or the executor is the
    # local, sequential or Dask executor.
    if self.donot_pickle or self.executor_class in (
        executor_constants.LOCAL_EXECUTOR,
        executor_constants.SEQUENTIAL_EXECUTOR,
        executor_constants.DASK_EXECUTOR,
    ):
        pickle_id = None
    else:
        dag_pickle = DagPickle(self.dag)
        session.add(dag_pickle)
        session.commit()
        pickle_id = dag_pickle.id

    executor = self.executor
    executor.job_id = "backfill"
    executor.start()

    ti_status.total_runs = len(dagrun_infos)  # total dag runs in backfill

    try:
        runs_left = ti_status.total_runs
        while runs_left > 0:
            # Only hand over the runs whose logical date is not yet executed.
            pending_infos = [
                info
                for info in dagrun_infos
                if info.logical_date not in ti_status.executed_dag_run_dates
            ]
            self._execute_dagruns(
                dagrun_infos=pending_infos,
                ti_status=ti_status,
                executor=executor,
                pickle_id=pickle_id,
                start_date=start_date,
                session=session,
            )

            runs_left = ti_status.total_runs - len(ti_status.executed_dag_run_dates)
            err = self._collect_errors(ti_status=ti_status, session=session)
            # Errors abort the backfill unless failures are tolerated and the
            # run is not deadlocked.
            if err and (not self.continue_on_failures or ti_status.deadlocked):
                raise BackfillUnfinished(err, ti_status)

            if runs_left > 0:
                self.log.info(
                    "max_active_runs limit for dag %s has been reached "
                    " - waiting for other dag runs to finish",
                    self.dag_id,
                )
                time.sleep(self.delay_on_limit_secs)
    except (KeyboardInterrupt, SystemExit):
        self.log.warning("Backfill terminated by user.")

        # TODO: we will need to terminate running task instances and set the
        # state to failed.
        self._set_unfinished_dag_runs_to_failed(ti_status.active_runs)
    finally:
        session.commit()
        executor.end()

    self.log.info("Backfill done for DAG %s. Exiting.", self.dag)