Example #1
0
 def next_dagrun_info(
     self,
     *,
     last_automated_data_interval: Optional[DataInterval],
     restriction: TimeRestriction,
 ) -> Optional[DagRunInfo]:
     """Work out the data interval of the next automated run.

     :param last_automated_data_interval: interval of the previous automated
         run, or ``None`` if there has not been one yet.
     :param restriction: scheduling bounds; ``earliest``, ``latest`` and
         ``catchup`` are read here.
     :return: a ``DagRunInfo`` spanning ``start`` to ``self._get_next(start)``,
         or ``None`` when there is no usable start or the start lies after
         ``restriction.latest``.
     """
     # Lower bound for the next interval start. Without catchup it is moved
     # forward by ``_skip_to_latest``; with catchup it is only aligned.
     lower_bound = restriction.earliest
     if not restriction.catchup:
         lower_bound = self._skip_to_latest(lower_bound)
     elif lower_bound is not None:
         lower_bound = self._align(lower_bound)

     if last_automated_data_interval is None:
         # First run: without a lower bound there is nothing to anchor on.
         if lower_bound is None:
             return None
         interval_start = lower_bound
     elif lower_bound is None:
         # Continue right where the previous interval stopped.
         interval_start = last_automated_data_interval.end
     else:
         # Catchup is off or the start date moved forward: take the later of
         # the previous interval's end and the lower bound.
         interval_start = max(last_automated_data_interval.end, lower_bound)

     upper_bound = restriction.latest
     if upper_bound is not None and interval_start > upper_bound:
         return None
     interval_end = self._get_next(interval_start)
     return DagRunInfo.interval(start=interval_start, end=interval_end)
Example #2
0
 def next_dagrun_info(
     self,
     *,
     last_automated_data_interval: Optional[DataInterval],
     restriction: TimeRestriction,
 ) -> Optional[DagRunInfo]:
     """Return the next one-day data interval that starts on a workday.

     Computed interval starts are placed at ``Time.min`` of a Monday-Friday
     date (``weekday()`` 0-4) and stamped with the ``UTC`` tzinfo.

     :param last_automated_data_interval: interval of the previous automated
         run, or ``None`` for the very first run.
     :param restriction: scheduling bounds; ``earliest``, ``latest`` and
         ``catchup`` are read here.
     :return: a ``DagRunInfo`` covering ``next_start`` to ``next_start`` plus
         one day, or ``None`` if there is no start date or the next start is
         after ``restriction.latest``.
     """
     if last_automated_data_interval is not None:  # There was a previous run on the regular schedule.
         last_start = last_automated_data_interval.start
         last_start_weekday = last_start.weekday()
         if 0 <= last_start_weekday < 4:  # Last run on Monday through Thursday -- next is tomorrow.
             delta = timedelta(days=1)
         else:  # Last run on Friday (or later in the week) -- skip to next Monday.
             delta = timedelta(days=(7 - last_start_weekday))
         next_start = DateTime.combine((last_start + delta).date(), Time.min).replace(tzinfo=UTC)
     else:  # This is the first ever run on the regular schedule.
         next_start = restriction.earliest
         if next_start is None:  # No start_date. Don't schedule.
             return None
         if not restriction.catchup:
             # If the DAG has catchup=False, today is the earliest to consider.
             next_start = max(next_start, DateTime.combine(Date.today(), Time.min).replace(tzinfo=UTC))
         # Align to midnight in every case. Previously this only happened when
         # catchup was enabled, so with catchup=False and a future, non-midnight
         # start date the first interval began mid-day and overlapped the
         # following (midnight-aligned) one.
         if next_start.time() != Time.min:
             # If the start does not fall on midnight, skip to the next day.
             next_day = next_start.date() + timedelta(days=1)
             next_start = DateTime.combine(next_day, Time.min).replace(tzinfo=UTC)
         next_start_weekday = next_start.weekday()
         if next_start_weekday in (5, 6):  # If next start is in the weekend, go to next Monday.
             delta = timedelta(days=(7 - next_start_weekday))
             next_start = next_start + delta
     if restriction.latest is not None and next_start > restriction.latest:
         return None  # Over the DAG's scheduled end; don't schedule.
     return DagRunInfo.interval(start=next_start, end=(next_start + timedelta(days=1)))
Example #3
0
 def next_dagrun_info(
     self,
     last_automated_dagrun: Optional[DateTime],
     restriction: TimeRestriction,
 ) -> Optional[DagRunInfo]:
     """Schedule a single run at ``restriction.earliest``, at most once.

     :param last_automated_dagrun: when not ``None`` a run already exists and
         nothing further is scheduled.
     :param restriction: scheduling bounds; ``earliest`` and ``latest`` are
         read. ``catchup`` is deliberately not consulted.
     :return: ``DagRunInfo.exact(restriction.earliest)``, or ``None`` if a run
         already happened, there is no start date, or the start date is after
         ``restriction.latest``.
     """
     # A previous automated run means the one-shot schedule is exhausted.
     if last_automated_dagrun is not None:
         return None

     # The run is always pinned to the start date, catchup or not. This has
     # been the behaviour since 1.10 and is kept on purpose (AIRFLOW-1928).
     run_after = restriction.earliest
     if run_after is None:
         return None

     deadline = restriction.latest
     past_deadline = deadline is not None and run_after > deadline
     return None if past_deadline else DagRunInfo.exact(run_after)
Example #4
0
    def next_dagrun_info(
        self,
        *,
        last_automated_data_interval: Optional[DataInterval],
        restriction: TimeRestriction,
    ) -> Optional[DagRunInfo]:
        """Return the next entry of ``self.event_dates`` that may be scheduled.

        The first event (in iteration order) is picked that is strictly after
        the end of the previous automated interval, if there was one, and not
        before ``restriction.earliest``, if set. ``restriction.catchup`` is not
        consulted here.

        :param last_automated_data_interval: interval of the previous automated
            run, or ``None`` if there has not been one yet.
        :param restriction: scheduling bounds; ``earliest`` and ``latest`` are
            honored.
        :return: ``DagRunInfo.exact(event)`` for the selected event, or ``None``
            when ``self.event_dates`` is empty, no event qualifies, or the
            selected event is after ``restriction.latest``.
        """
        earliest = restriction.earliest
        previous_end = None
        if last_automated_data_interval is not None:
            previous_end = last_automated_data_interval.end

        for next_event in self.event_dates:
            if earliest is not None and next_event < earliest:
                continue  # Before the allowed window.
            if previous_end is not None and next_event <= previous_end:
                continue  # Already covered by the previous run.
            break
        else:
            # Reached when event_dates is empty (previously an IndexError on
            # the first run) or when no remaining event qualifies.
            return None

        if restriction.latest is not None and next_event > restriction.latest:
            return None  # Past the end of the allowed window.
        return DagRunInfo.exact(next_event)
Example #5
0
 def next_dagrun_info(
     self,
     last_automated_dagrun: Optional[DateTime],
     restriction: TimeRestriction,
 ) -> Optional[DagRunInfo]:
     """Compute the data interval of the next automated run.

     :param last_automated_dagrun: the value passed to
         ``self._schedule.get_next`` to obtain the next interval start, or
         ``None`` if there has been no automated run yet.
     :param restriction: scheduling bounds; ``earliest``, ``latest`` and
         ``catchup`` are read here.
     :return: a ``DagRunInfo`` spanning ``start`` to
         ``self._schedule.get_next(start)``, or ``None`` when there is no
         usable start on the first run or the start is after
         ``restriction.latest``.
     """
     earliest = restriction.earliest
     if not restriction.catchup:
         earliest = self._schedule.skip_to_latest(earliest)
     if last_automated_dagrun is None:
         # First run; schedule the run at the first available time matching
         # the schedule, and retrospectively create a data interval for it.
         if earliest is None:
             return None
         start = self._schedule.align(earliest)
     else:
         # There's a previous run. The next data interval normally starts at
         # the schedule point following it.
         start = self._schedule.get_next(last_automated_dagrun)
         if earliest is not None:
             # Catchup is False or the DAG has a new start date in the future:
             # never schedule before the aligned lower bound. Previously the
             # bound was dropped here, so missed intervals were replayed even
             # with catchup disabled.
             # NOTE(review): relies on ``align`` behaving here as it does for
             # the first run above -- confirm against the schedule helper.
             start = max(start, self._schedule.align(earliest))
     if restriction.latest is not None and start > restriction.latest:
         return None
     end = self._schedule.get_next(start)
     return DagRunInfo.interval(start=start, end=end)
Example #6
0
 def next_dagrun_info(
     self,
     *,
     last_automated_data_interval: DataInterval | None,
     restriction: TimeRestriction,
 ) -> DagRunInfo | None:
     """Compute the next data interval, which ends at the next aligned time.

     :param last_automated_data_interval: interval of the previous automated
         run, or ``None``; only consulted when catchup is enabled.
     :param restriction: scheduling bounds; ``earliest``, ``latest`` and
         ``catchup`` are read here.
     :return: a ``DagRunInfo`` from ``end - self._interval`` to ``end``, or
         ``None`` when catchup is on with neither a previous run nor a start
         date, or when ``end`` is after ``restriction.latest``.
     """
     earliest = restriction.earliest

     if not restriction.catchup:
         # No catchup: anchor on "now", unless the start date is still ahead.
         now = DateTime.utcnow()
         anchor = earliest if earliest is not None and now < earliest else now
         interval_end = self._align_to_next(anchor)
     elif last_automated_data_interval is not None:
         # Catchup with history: step on from the previous interval's end.
         interval_end = self._get_next(last_automated_data_interval.end)
     elif earliest is None:
         # Catchup, first run, and no start date: nothing to schedule.
         return None
     else:
         # Catchup, first run: begin from the start date.
         interval_end = self._align_to_next(earliest)

     latest = restriction.latest
     if latest is not None and latest < interval_end:
         return None
     return DagRunInfo.interval(interval_end - self._interval, interval_end)
Example #7
0
    def _execute(self, session=None):
        """
        Initializes all components required to run a dag for a specified date range and
        calls helper method to execute the tasks.

        Flow: collect the dag run infos between ``bf_start_date`` and
        ``bf_end_date`` (or now, when no end date is set), optionally reverse
        them, optionally pickle the DAG, start the executor, then call
        ``_execute_dagruns`` repeatedly until every run date is recorded in
        ``ti_status.executed_dag_run_dates``.

        :param session: database session used to store the DAG pickle and
            committed on exit. NOTE(review): the default is ``None`` but the
            session is used unconditionally, so it is presumably injected by
            a decorator outside this view -- confirm.
        :raises AirflowException: if ``run_backwards`` is set and at least one
            task has ``depends_on_past``.
        :raises BackfillUnfinished: if ``_collect_errors`` returns a non-empty
            result after a pass.
        """
        # Mutable bookkeeping object shared with the helper methods below.
        ti_status = BackfillJob._DagRunTaskStatus()

        start_date = self.bf_start_date

        # Get DagRun schedule between the start/end dates, which will turn into dag runs.
        dagrun_start_date = timezone.coerce_datetime(start_date)
        if self.bf_end_date is None:
            # No explicit end date: backfill up to the current time.
            dagrun_end_date = pendulum.now(timezone.utc)
        else:
            dagrun_end_date = pendulum.instance(self.bf_end_date)
        dagrun_infos = list(
            self.dag.iter_dagrun_infos_between(dagrun_start_date,
                                               dagrun_end_date))
        if self.run_backwards:
            # Reverse order is refused when any task depends on its previous run.
            tasks_that_depend_on_past = [
                t.task_id for t in self.dag.task_dict.values()
                if t.depends_on_past
            ]
            if tasks_that_depend_on_past:
                raise AirflowException(
                    f'You cannot backfill backwards because one or more '
                    f'tasks depend_on_past: {",".join(tasks_that_depend_on_past)}'
                )
            dagrun_infos = dagrun_infos[::-1]

        if not dagrun_infos:
            if not self.run_at_least_once:
                self.log.info(
                    "No run dates were found for the given dates and dag interval."
                )
                return
            # Nothing matched the schedule but one run was requested anyway:
            # use a single interval covering the whole requested range.
            dagrun_infos = [
                DagRunInfo.interval(dagrun_start_date, dagrun_end_date)
            ]

        # Pickle the DAG unless pickling is disabled or the executor class is
        # one of the local/sequential/dask executors listed below.
        pickle_id = None

        if not self.donot_pickle and self.executor_class not in (
                executor_constants.LOCAL_EXECUTOR,
                executor_constants.SEQUENTIAL_EXECUTOR,
                executor_constants.DASK_EXECUTOR,
        ):
            pickle = DagPickle(self.dag)
            session.add(pickle)
            # Commit so the pickle row receives its id before it is passed on.
            session.commit()
            pickle_id = pickle.id

        executor = self.executor
        executor.job_id = "backfill"
        executor.start()

        ti_status.total_runs = len(dagrun_infos)  # total dag runs in backfill

        try:
            remaining_dates = ti_status.total_runs
            while remaining_dates > 0:
                # Only hand over the runs whose logical date has not been
                # recorded as executed yet.
                dagrun_infos_to_process = [
                    dagrun_info for dagrun_info in dagrun_infos
                    if dagrun_info.logical_date not in
                    ti_status.executed_dag_run_dates
                ]
                self._execute_dagruns(
                    dagrun_infos=dagrun_infos_to_process,
                    ti_status=ti_status,
                    executor=executor,
                    pickle_id=pickle_id,
                    start_date=start_date,
                    session=session,
                )

                remaining_dates = ti_status.total_runs - len(
                    ti_status.executed_dag_run_dates)
                err = self._collect_errors(ti_status=ti_status,
                                           session=session)
                if err:
                    # Any reported error aborts the backfill.
                    raise BackfillUnfinished(err, ti_status)

                if remaining_dates > 0:
                    # Runs are still outstanding; the message attributes this
                    # to the max_active_runs limit. Sleep before the next pass.
                    self.log.info(
                        "max_active_runs limit for dag %s has been reached "
                        " - waiting for other dag runs to finish",
                        self.dag_id,
                    )
                    time.sleep(self.delay_on_limit_secs)
        except (KeyboardInterrupt, SystemExit):
            # The interrupt is handled here and not re-raised, so execution
            # continues to the final log line after the ``finally`` block.
            self.log.warning("Backfill terminated by user.")

            # TODO: we will need to terminate running task instances and set the
            # state to failed.
            self._set_unfinished_dag_runs_to_failed(ti_status.active_runs)
        finally:
            # Runs on success, on interrupt and on error alike.
            session.commit()
            executor.end()

        self.log.info("Backfill done. Exiting.")
Example #8
0
    def _execute(self, session=None):
        """
        Initializes all components required to run a dag for a specified date range and
        calls helper method to execute the tasks.

        Flow: collect the dag run infos between ``bf_start_date`` and
        ``bf_end_date`` (or now, when no end date is set), optionally reverse
        them, refuse to continue if a non-backfill DagRun is already RUNNING
        in the range, optionally pickle the DAG, start the executor, then call
        ``_execute_dagruns`` repeatedly until every run date is recorded in
        ``ti_status.executed_dag_run_dates``.

        :param session: database session used to store the DAG pickle and
            committed on exit. NOTE(review): the default is ``None`` but the
            session is used unconditionally, so it is presumably injected by
            a decorator outside this view -- confirm.
        :raises AirflowException: if ``run_backwards`` is set and at least one
            task has ``depends_on_past``.
        :raises BackfillUnfinished: if ``_collect_errors`` returns a non-empty
            result and either ``continue_on_failures`` is false or
            ``ti_status.deadlocked`` is set.
        """
        # Mutable bookkeeping object shared with the helper methods below.
        ti_status = BackfillJob._DagRunTaskStatus()

        start_date = self.bf_start_date

        # Get DagRun schedule between the start/end dates, which will turn into dag runs.
        dagrun_start_date = timezone.coerce_datetime(start_date)
        if self.bf_end_date is None:
            # No explicit end date: backfill up to the current time.
            dagrun_end_date = pendulum.now(timezone.utc)
        else:
            dagrun_end_date = pendulum.instance(self.bf_end_date)
        dagrun_infos = list(
            self.dag.iter_dagrun_infos_between(dagrun_start_date,
                                               dagrun_end_date))
        if self.run_backwards:
            # Reverse order is refused when any task depends on its previous run.
            tasks_that_depend_on_past = [
                t.task_id for t in self.dag.task_dict.values()
                if t.depends_on_past
            ]
            if tasks_that_depend_on_past:
                raise AirflowException(
                    f'You cannot backfill backwards because one or more '
                    f'tasks depend_on_past: {",".join(tasks_that_depend_on_past)}'
                )
            dagrun_infos = dagrun_infos[::-1]

        if not dagrun_infos:
            if not self.run_at_least_once:
                self.log.info(
                    "No run dates were found for the given dates and dag interval."
                )
                return
            # Nothing matched the schedule but one run was requested anyway:
            # use a single interval covering the whole requested range.
            dagrun_infos = [
                DagRunInfo.interval(dagrun_start_date, dagrun_end_date)
            ]

        # Look for RUNNING, non-backfill DagRuns of this DAG (and its subdags)
        # inside the requested date range.
        dag_with_subdags_ids = [d.dag_id for d in self._get_dag_with_subdags()]
        running_dagruns = DagRun.find(
            dag_id=dag_with_subdags_ids,
            execution_start_date=self.bf_start_date,
            execution_end_date=self.bf_end_date,
            no_backfills=True,
            state=DagRunState.RUNNING,
        )

        if running_dagruns:
            # Report every conflicting run, explain why, and give up. This
            # returns before the executor is started, so nothing needs cleanup.
            for run in running_dagruns:
                self.log.error(
                    "Backfill cannot be created for DagRun %s in %s, as there's already %s in a RUNNING "
                    "state.",
                    run.run_id,
                    run.execution_date.strftime("%Y-%m-%dT%H:%M:%S"),
                    run.run_type,
                )
            self.log.error(
                "Changing DagRun into BACKFILL would cause scheduler to lose track of executing "
                "tasks. Not changing DagRun type into BACKFILL, and trying insert another DagRun into "
                "database would cause database constraint violation for dag_id + execution_date "
                "combination. Please adjust backfill dates or wait for this DagRun to finish.",
            )
            return
        # Pickle the DAG unless pickling is disabled or the executor class is
        # one of the local/sequential/dask executors listed below.
        pickle_id = None

        if not self.donot_pickle and self.executor_class not in (
                executor_constants.LOCAL_EXECUTOR,
                executor_constants.SEQUENTIAL_EXECUTOR,
                executor_constants.DASK_EXECUTOR,
        ):
            pickle = DagPickle(self.dag)
            session.add(pickle)
            # Commit so the pickle row receives its id before it is passed on.
            session.commit()
            pickle_id = pickle.id

        executor = self.executor
        executor.job_id = "backfill"
        executor.start()

        ti_status.total_runs = len(dagrun_infos)  # total dag runs in backfill

        try:
            remaining_dates = ti_status.total_runs
            while remaining_dates > 0:
                # Only hand over the runs whose logical date has not been
                # recorded as executed yet.
                dagrun_infos_to_process = [
                    dagrun_info for dagrun_info in dagrun_infos
                    if dagrun_info.logical_date not in
                    ti_status.executed_dag_run_dates
                ]
                self._execute_dagruns(
                    dagrun_infos=dagrun_infos_to_process,
                    ti_status=ti_status,
                    executor=executor,
                    pickle_id=pickle_id,
                    start_date=start_date,
                    session=session,
                )

                remaining_dates = ti_status.total_runs - len(
                    ti_status.executed_dag_run_dates)
                err = self._collect_errors(ti_status=ti_status,
                                           session=session)
                if err:
                    # Errors abort the backfill unless continue_on_failures is
                    # set; a deadlock always aborts.
                    if not self.continue_on_failures or ti_status.deadlocked:
                        raise BackfillUnfinished(err, ti_status)

                if remaining_dates > 0:
                    # Runs are still outstanding; the message attributes this
                    # to the max_active_runs limit. Sleep before the next pass.
                    self.log.info(
                        "max_active_runs limit for dag %s has been reached "
                        " - waiting for other dag runs to finish",
                        self.dag_id,
                    )
                    time.sleep(self.delay_on_limit_secs)
        except (KeyboardInterrupt, SystemExit):
            # The interrupt is handled here and not re-raised, so execution
            # continues to the final log line after the ``finally`` block.
            self.log.warning("Backfill terminated by user.")

            # TODO: we will need to terminate running task instances and set the
            # state to failed.
            self._set_unfinished_dag_runs_to_failed(ti_status.active_runs)
        finally:
            # Runs on success, on interrupt and on error alike.
            session.commit()
            executor.end()

        self.log.info("Backfill done for DAG %s. Exiting.", self.dag)