class AstronomerAvailableVersion(Base):
    __tablename__ = "astro_available_version"

    version = Column(Text, nullable=False, primary_key=True)
    level = Column(Text, nullable=False)
    date_released = Column(UtcDateTime(timezone=True), nullable=False)
    description = Column(Text)
    url = Column(Text)
    hidden_from_ui = Column(Boolean, default=False, nullable=False)

    __table_args__ = (
        Index('idx_astro_available_version_hidden', hidden_from_ui),
    )
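
# Illustrative only: a minimal sketch (not part of the model above) of how a
# UI layer might query this table. ``session`` is assumed to be an open
# SQLAlchemy session; the helper name is hypothetical.
def visible_versions(session):
    """Return available versions that are not hidden, newest first."""
    return (
        session.query(AstronomerAvailableVersion)
        .filter(AstronomerAvailableVersion.hidden_from_ui.is_(False))
        .order_by(AstronomerAvailableVersion.date_released.desc())
        .all()
    )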
class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs.

    Jobs are processing items with state and duration that aren't task instances.
    For instance a BackfillJob is a collection of task instance runs,
    but should have its own state, start and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN))
    state = Column(String(20))
    job_type = Column(String(30))
    start_date = Column(UtcDateTime())
    end_date = Column(UtcDateTime())
    latest_heartbeat = Column(UtcDateTime())
    executor_class = Column(String(500))
    hostname = Column(String(500))
    unixname = Column(String(1000))

    __mapper_args__ = {'polymorphic_on': job_type, 'polymorphic_identity': 'BaseJob'}

    __table_args__ = (
        Index('job_type_heart', job_type, latest_heartbeat),
        Index('idx_job_state_heartbeat', state, latest_heartbeat),
        Index('idx_job_dag_id', dag_id),
    )

    task_instances_enqueued = relationship(
        TaskInstance,
        primaryjoin=id == foreign(TaskInstance.queued_by_job_id),
        backref=backref('queued_by_job', uselist=False),
    )
    """
    TaskInstances which have been enqueued by this Job.

    Only makes sense for SchedulerJob and BackfillJob instances.
    """

    dag_runs = relationship(
        DagRun,
        primaryjoin=id == foreign(DagRun.creating_job_id),
        backref=backref('creating_job'),
    )

    heartrate = conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC')

    def __init__(self, executor=None, heartrate=None, *args, **kwargs):
        self.hostname = get_hostname()
        if executor:
            self.executor = executor
            self.executor_class = executor.__class__.__name__
        else:
            self.executor_class = conf.get('core', 'EXECUTOR')
        self.start_date = timezone.utcnow()
        self.latest_heartbeat = timezone.utcnow()
        if heartrate is not None:
            self.heartrate = heartrate
        self.unixname = getuser()
        self.max_tis_per_query: int = conf.getint('scheduler', 'max_tis_per_query')
        super().__init__(*args, **kwargs)

    @cached_property
    def executor(self):
        return ExecutorLoader.get_default_executor()

    @classmethod
    @provide_session
    def most_recent_job(cls, session=None) -> Optional['BaseJob']:
        """
        Return the most recent job of this type, if any, based on last heartbeat received.

        This method should be called on a subclass (e.g. on SchedulerJob) to return jobs of that type.

        :param session: Database session
        :rtype: BaseJob or None
        """
        return session.query(cls).order_by(cls.latest_heartbeat.desc()).limit(1).first()

    def is_alive(self, grace_multiplier=2.1):
        """
        Is this job currently alive?

        A job is alive when it is in the RUNNING state and has sent a heartbeat
        within a multiple of the heartrate (2.1 by default).

        :param grace_multiplier: multiplier of heartrate to require a heartbeat within
        :type grace_multiplier: number
        :rtype: boolean
        """
        return (
            self.state == State.RUNNING
            and (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            < self.heartrate * grace_multiplier
        )

    @provide_session
    def kill(self, session=None):
        """Handles on_kill callback and updates state in database."""
        job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
        job.end_date = timezone.utcnow()
        try:
            self.on_kill()
        except Exception as e:
            self.log.error('on_kill() method failed: %s', str(e))
        session.merge(job)
        session.commit()
        raise AirflowException("Job shut down externally.")

    def on_kill(self):
        """Will be called when an external kill command is received."""

    def heartbeat_callback(self, session=None):
        """Callback that is called during heartbeat. This method should be overwritten."""

    def heartbeat(self, only_if_necessary: bool = False):
        """
        Heartbeats update the job's entry in the database with a timestamp for
        latest_heartbeat, which makes it possible to monitor at the system level
        what is actually active. For instance, an old heartbeat for SchedulerJob
        would mean something is wrong. It also allows any job to be killed
        externally, regardless of who is running it or on which machine it is
        running.

        Note that if your heart rate is set to 60 seconds and you call this
        method after 10 seconds of processing since the last heartbeat, it will
        sleep 50 seconds to complete the 60 seconds and keep a steady heart rate.
        If you go over 60 seconds before calling it, it won't sleep at all.

        :param only_if_necessary: If the heartbeat is not yet due then do nothing
            (don't update column, don't call ``heartbeat_callback``)
        :type only_if_necessary: boolean
        """
        seconds_remaining = 0
        if self.latest_heartbeat:
            seconds_remaining = self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds()

        if seconds_remaining > 0 and only_if_necessary:
            return

        previous_heartbeat = self.latest_heartbeat

        try:
            with create_session() as session:
                # This will cause it to load from the db
                session.merge(self)
                previous_heartbeat = self.latest_heartbeat

            if self.state in State.terminating_states:
                self.kill()

            # Figure out how long to sleep for
            sleep_for = 0
            if self.latest_heartbeat:
                seconds_remaining = (
                    self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds()
                )
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

            # Update last heartbeat time
            with create_session() as session:
                # Make the session aware of this object
                session.merge(self)
                self.latest_heartbeat = timezone.utcnow()
                session.commit()
                # At this point, the DB has updated.
                previous_heartbeat = self.latest_heartbeat

                self.heartbeat_callback(session=session)
                self.log.debug('[heartbeat]')
        except OperationalError:
            Stats.incr(convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
            self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
            # We didn't manage to heartbeat, so make sure that the timestamp isn't updated
            self.latest_heartbeat = previous_heartbeat

    def run(self):
        """Starts the job."""
        Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
        # Adding an entry in the DB
        with create_session() as session:
            self.state = State.RUNNING
            session.add(self)
            session.commit()
            make_transient(self)

            try:
                self._execute()
                # In case of max runs or max duration
                self.state = State.SUCCESS
            except SystemExit:
                # In case of ^C or SIGTERM
                self.state = State.SUCCESS
            except Exception:
                self.state = State.FAILED
                raise
            finally:
                self.end_date = timezone.utcnow()
                session.merge(self)
                session.commit()

        Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

    def _execute(self):
        raise NotImplementedError("This method needs to be overridden")
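
# Illustrative only: a minimal sketch of deriving a concrete job from the
# abstract BaseJob above. The class name and body are hypothetical, not part
# of Airflow; real subclasses (SchedulerJob, BackfillJob) follow this shape.
class HelloJob(BaseJob):
    """Toy job that performs one unit of work and exits."""

    __mapper_args__ = {'polymorphic_identity': 'HelloJob'}

    def _execute(self):
        # Long-running jobs would loop here and call self.heartbeat()
        # periodically so that is_alive() keeps returning True.
        self.log.info("doing one unit of work")
        self.heartbeat(only_if_necessary=True)

# HelloJob().run() would insert a `job` row, mark it RUNNING, invoke
# _execute(), and record SUCCESS or FAILED plus end_date when it finishes.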
class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs.

    Jobs are processing items with state and duration that aren't task instances.
    For instance a BackfillJob is a collection of task instance runs,
    but should have its own state, start and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN))
    state = Column(String(20))
    job_type = Column(String(30))
    start_date = Column(UtcDateTime())
    end_date = Column(UtcDateTime())
    latest_heartbeat = Column(UtcDateTime())
    executor_class = Column(String(500))
    hostname = Column(String(500))
    unixname = Column(String(1000))

    __mapper_args__ = {
        'polymorphic_on': job_type,
        'polymorphic_identity': 'BaseJob'
    }

    __table_args__ = (
        Index('job_type_heart', job_type, latest_heartbeat),
        Index('idx_job_state_heartbeat', state, latest_heartbeat),
    )

    heartrate = conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC')

    def __init__(self, executor=None, heartrate=None, *args, **kwargs):
        self.hostname = get_hostname()
        self.executor = executor or ExecutorLoader.get_default_executor()
        self.executor_class = self.executor.__class__.__name__
        self.start_date = timezone.utcnow()
        self.latest_heartbeat = timezone.utcnow()
        if heartrate is not None:
            self.heartrate = heartrate
        self.unixname = getpass.getuser()
        self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
        super().__init__(*args, **kwargs)

    @classmethod
    @provide_session
    def most_recent_job(cls, session=None) -> Optional['BaseJob']:
        """
        Return the most recent job of this type, if any, based on last heartbeat received.

        This method should be called on a subclass (e.g. on SchedulerJob) to return jobs of that type.

        :param session: Database session
        :rtype: BaseJob or None
        """
        return session.query(cls).order_by(cls.latest_heartbeat.desc()).limit(1).first()

    def is_alive(self, grace_multiplier=2.1):
        """
        Is this job currently alive?

        A job is alive when it is in the RUNNING state and has sent a heartbeat
        within a multiple of the heartrate (2.1 by default).

        :param grace_multiplier: multiplier of heartrate to require a heartbeat within
        :type grace_multiplier: number
        :rtype: boolean
        """
        return (
            self.state == State.RUNNING
            and (timezone.utcnow() - self.latest_heartbeat).total_seconds()
            < self.heartrate * grace_multiplier
        )

    @provide_session
    def kill(self, session=None):
        """Handles on_kill callback and updates state in database."""
        job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
        job.end_date = timezone.utcnow()
        try:
            self.on_kill()
        except Exception as e:  # pylint: disable=broad-except
            self.log.error('on_kill() method failed: %s', str(e))
        session.merge(job)
        session.commit()
        raise AirflowException("Job shut down externally.")

    def on_kill(self):
        """Will be called when an external kill command is received."""

    def heartbeat_callback(self, session=None):
        """Callback that is called during heartbeat. This method should be overwritten."""

    def heartbeat(self, only_if_necessary: bool = False):
        """
        Heartbeats update the job's entry in the database with a timestamp for
        latest_heartbeat, which makes it possible to monitor at the system level
        what is actually active. For instance, an old heartbeat for SchedulerJob
        would mean something is wrong. It also allows any job to be killed
        externally, regardless of who is running it or on which machine it is
        running.

        Note that if your heart rate is set to 60 seconds and you call this
        method after 10 seconds of processing since the last heartbeat, it will
        sleep 50 seconds to complete the 60 seconds and keep a steady heart rate.
        If you go over 60 seconds before calling it, it won't sleep at all.

        :param only_if_necessary: If the heartbeat is not yet due then do nothing
            (don't update column, don't call ``heartbeat_callback``)
        :type only_if_necessary: boolean
        """
        seconds_remaining = 0
        if self.latest_heartbeat:
            seconds_remaining = self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds()

        if seconds_remaining > 0 and only_if_necessary:
            return

        previous_heartbeat = self.latest_heartbeat

        try:
            with create_session() as session:
                # This will cause it to load from the db
                session.merge(self)
                previous_heartbeat = self.latest_heartbeat

            if self.state == State.SHUTDOWN:
                self.kill()

            # Figure out how long to sleep for
            sleep_for = 0
            if self.latest_heartbeat:
                seconds_remaining = (
                    self.heartrate - (timezone.utcnow() - self.latest_heartbeat).total_seconds()
                )
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

            # Update last heartbeat time
            with create_session() as session:
                # Make the session aware of this object
                session.merge(self)
                self.latest_heartbeat = timezone.utcnow()
                session.commit()
                # At this point, the DB has updated.
                previous_heartbeat = self.latest_heartbeat

                self.heartbeat_callback(session=session)
                self.log.debug('[heartbeat]')
        except OperationalError:
            Stats.incr(convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
            self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
            # We didn't manage to heartbeat, so make sure that the timestamp isn't updated
            self.latest_heartbeat = previous_heartbeat

    def run(self):
        """Starts the job."""
        Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
        # Adding an entry in the DB
        with create_session() as session:
            self.state = State.RUNNING
            session.add(self)
            session.commit()
            make_transient(self)

            try:
                self._execute()
                # In case of max runs or max duration
                self.state = State.SUCCESS
            except SystemExit:
                # In case of ^C or SIGTERM
                self.state = State.SUCCESS
            except Exception:
                self.state = State.FAILED
                raise
            finally:
                self.end_date = timezone.utcnow()
                session.merge(self)
                session.commit()

        Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

    def _execute(self):
        raise NotImplementedError("This method needs to be overridden")

    @provide_session
    def reset_state_for_orphaned_tasks(self, filter_by_dag_run=None, session=None):
        """
        Check whether any task instances in the given dag_run (or, if None, in
        all dag runs) are in a schedulable state but unknown to the executor,
        and reset such tasks to the None state so they get picked up again.
        Work is batched for performance, since the queries run in sequence.

        :param filter_by_dag_run: the dag_run we want to process, None if all
        :type filter_by_dag_run: airflow.models.DagRun
        :return: the TIs reset (in expired SQLAlchemy state)
        :rtype: list[airflow.models.TaskInstance]
        """
        queued_tis = self.executor.queued_tasks
        # Also consider running, as the state might not have changed in the db yet
        running_tis = self.executor.running

        resettable_states = [State.SCHEDULED, State.QUEUED]
        TI = models.TaskInstance
        DR = models.DagRun
        if filter_by_dag_run is None:
            resettable_tis = (
                session.query(TI)
                .join(DR, and_(TI.dag_id == DR.dag_id, TI.execution_date == DR.execution_date))
                .filter(
                    # pylint: disable=comparison-with-callable
                    DR.state == State.RUNNING,
                    DR.run_id.notlike(f"{DagRunType.BACKFILL_JOB.value}__%"),
                    TI.state.in_(resettable_states),
                )
                .all()
            )
        else:
            resettable_tis = filter_by_dag_run.get_task_instances(state=resettable_states, session=session)

        tis_to_reset = []
        # Can't use an update here since it doesn't support joins.
        for ti in resettable_tis:
            if ti.key not in queued_tis and ti.key not in running_tis:
                tis_to_reset.append(ti)

        if not tis_to_reset:
            return []

        def query(result, items):
            if not items:
                return result

            filter_for_tis = TI.filter_for_tis(items)
            reset_tis = (
                session.query(TI)
                .filter(filter_for_tis, TI.state.in_(resettable_states))
                .with_for_update()
                .all()
            )
            for ti in reset_tis:
                ti.state = State.NONE
                session.merge(ti)

            return result + reset_tis

        reset_tis = helpers.reduce_in_chunks(query, tis_to_reset, [], self.max_tis_per_query)

        task_instance_str = '\n\t'.join([repr(x) for x in reset_tis])
        session.commit()

        self.log.info("Reset the following %s TaskInstances:\n\t%s", len(reset_tis), task_instance_str)
        return reset_tis
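
# Illustrative only: a liveness probe built on most_recent_job() and
# is_alive(), similar in spirit to `airflow jobs check`. SchedulerJob is
# assumed to be the usual BaseJob subclass defined elsewhere in this package.
def scheduler_is_healthy() -> bool:
    """Return True if the latest scheduler job heartbeated recently enough."""
    job = SchedulerJob.most_recent_job()
    return job is not None and job.is_alive()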
class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs.

    Jobs are processing items with state and duration that aren't task instances.
    For instance a BackfillJob is a collection of task instance runs,
    but should have its own state, start and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN))
    state = Column(String(20))
    job_type = Column(String(30))
    start_date = Column(UtcDateTime())
    end_date = Column(UtcDateTime())
    latest_heartbeat = Column(UtcDateTime())
    executor_class = Column(String(500))
    hostname = Column(String(500))
    unixname = Column(String(1000))

    __mapper_args__ = {
        'polymorphic_on': job_type,
        'polymorphic_identity': 'BaseJob'
    }

    __table_args__ = (
        Index('job_type_heart', job_type, latest_heartbeat),
        Index('idx_job_state_heartbeat', state, latest_heartbeat),
    )

    def __init__(
            self,
            # Note: these defaults are evaluated once, when the class is defined.
            executor=executors.get_default_executor(),
            heartrate=conf.getfloat('scheduler', 'JOB_HEARTBEAT_SEC'),
            *args, **kwargs):
        self.hostname = get_hostname()
        self.executor = executor
        self.executor_class = executor.__class__.__name__
        self.start_date = timezone.utcnow()
        self.latest_heartbeat = timezone.utcnow()
        self.heartrate = heartrate
        self.unixname = getpass.getuser()
        self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
        super().__init__(*args, **kwargs)

    def is_alive(self):
        # total_seconds() (rather than .seconds) stays correct for gaps over a day.
        return ((timezone.utcnow() - self.latest_heartbeat).total_seconds() <
                (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1))

    @provide_session
    def kill(self, session=None):
        job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
        job.end_date = timezone.utcnow()
        try:
            self.on_kill()
        except Exception as e:
            self.log.error('on_kill() method failed: %s', str(e))
        session.merge(job)
        session.commit()
        raise AirflowException("Job shut down externally.")

    def on_kill(self):
        """Will be called when an external kill command is received."""
        pass

    def heartbeat_callback(self, session=None):
        pass

    def heartbeat(self):
        """
        Heartbeats update the job's entry in the database with a timestamp for
        latest_heartbeat, which makes it possible to monitor at the system level
        what is actually active. For instance, an old heartbeat for SchedulerJob
        would mean something is wrong. It also allows any job to be killed
        externally, regardless of who is running it or on which machine it is
        running.

        Note that if your heartbeat is set to 60 seconds and you call this
        method after 10 seconds of processing since the last heartbeat, it will
        sleep 50 seconds to complete the 60 seconds and keep a steady heart rate.
        If you go over 60 seconds before calling it, it won't sleep at all.
        """
        try:
            with create_session() as session:
                job = session.query(BaseJob).filter_by(id=self.id).one()
                make_transient(job)
                session.commit()

            if job.state == State.SHUTDOWN:
                self.kill()

            is_unit_test = conf.getboolean('core', 'unit_test_mode')
            if not is_unit_test:
                # Figure out how long to sleep for
                sleep_for = 0
                if job.latest_heartbeat:
                    seconds_remaining = self.heartrate - (
                        timezone.utcnow() - job.latest_heartbeat
                    ).total_seconds()
                    sleep_for = max(0, seconds_remaining)
                sleep(sleep_for)

            # Update last heartbeat time
            with create_session() as session:
                job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
                job.latest_heartbeat = timezone.utcnow()
                session.merge(job)
                session.commit()

                self.heartbeat_callback(session=session)
                self.log.debug('[heartbeat]')
        except OperationalError as e:
            self.log.error("Scheduler heartbeat got an exception: %s", str(e))

    def run(self):
        Stats.incr(self.__class__.__name__.lower() + '_start', 1, 1)
        # Adding an entry in the DB
        with create_session() as session:
            self.state = State.RUNNING
            session.add(self)
            session.commit()
            id_ = self.id
            make_transient(self)
            self.id = id_

            try:
                self._execute()
                # In case of max runs or max duration
                self.state = State.SUCCESS
            except SystemExit:
                # In case of ^C or SIGTERM
                self.state = State.SUCCESS
            except Exception:
                self.state = State.FAILED
                raise
            finally:
                self.end_date = timezone.utcnow()
                session.merge(self)
                session.commit()

        Stats.incr(self.__class__.__name__.lower() + '_end', 1, 1)

    def _execute(self):
        raise NotImplementedError("This method needs to be overridden")

    @provide_session
    def reset_state_for_orphaned_tasks(self, filter_by_dag_run=None, session=None):
        """
        Check whether any task instances in the given dag_run (or, if None, in
        all dag runs) are in a schedulable state but unknown to the executor,
        and reset such tasks to the None state so they get picked up again.
        Work is batched for performance, since the queries run in sequence.

        :param filter_by_dag_run: the dag_run we want to process, None if all
        :type filter_by_dag_run: airflow.models.DagRun
        :return: the TIs reset (in expired SQLAlchemy state)
        :rtype: list[airflow.models.TaskInstance]
        """
        # Local import to avoid a circular dependency.
        from airflow.jobs.backfill_job import BackfillJob

        queued_tis = self.executor.queued_tasks
        # Also consider running, as the state might not have changed in the db yet
        running_tis = self.executor.running

        resettable_states = [State.SCHEDULED, State.QUEUED]
        TI = models.TaskInstance
        DR = models.DagRun
        if filter_by_dag_run is None:
            resettable_tis = (
                session.query(TI)
                .join(DR, and_(TI.dag_id == DR.dag_id, TI.execution_date == DR.execution_date))
                .filter(
                    DR.state == State.RUNNING,
                    DR.run_id.notlike(BackfillJob.ID_PREFIX + '%'),
                    TI.state.in_(resettable_states),
                )
                .all()
            )
        else:
            resettable_tis = filter_by_dag_run.get_task_instances(state=resettable_states, session=session)

        tis_to_reset = []
        # Can't use an update here since it doesn't support joins.
        for ti in resettable_tis:
            if ti.key not in queued_tis and ti.key not in running_tis:
                tis_to_reset.append(ti)

        if not tis_to_reset:
            return []

        def query(result, items):
            filter_for_tis = [
                and_(TI.dag_id == ti.dag_id,
                     TI.task_id == ti.task_id,
                     TI.execution_date == ti.execution_date)
                for ti in items
            ]
            reset_tis = (
                session.query(TI)
                .filter(or_(*filter_for_tis), TI.state.in_(resettable_states))
                .with_for_update()
                .all()
            )
            for ti in reset_tis:
                ti.state = State.NONE
                session.merge(ti)
            return result + reset_tis

        reset_tis = helpers.reduce_in_chunks(query, tis_to_reset, [], self.max_tis_per_query)

        task_instance_str = '\n\t'.join([repr(x) for x in reset_tis])
        session.commit()

        self.log.info("Reset the following %s TaskInstances:\n\t%s", len(reset_tis), task_instance_str)
        return reset_tis
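
# Illustrative only: the sleep pacing that heartbeat() applies, restated as a
# pure function so the arithmetic in its docstring is explicit. The function
# name is hypothetical, not part of the class above.
def seconds_to_sleep(heartrate, seconds_since_last_beat):
    """With heartrate=60 and 10s of processing, sleep 50s; past 60s, sleep 0."""
    return max(0, heartrate - seconds_since_last_beat)

assert seconds_to_sleep(60, 10) == 50
assert seconds_to_sleep(60, 75) == 0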
class AstronomerVersionCheck(Base):
    __tablename__ = "astro_version_check"

    singleton = Column(Boolean, default=True, nullable=False, primary_key=True)
    # For information only
    last_checked = Column(UtcDateTime(timezone=True))
    last_checked_by = Column(Text)

    @classmethod
    def ensure_singleton(cls):
        """Ensure that the singleton row exists in this table."""
        with create_session() as session:
            # To keep PG logs quieter (it shows an ERROR for the PK violation),
            # we try and select first
            if session.query(cls).get({"singleton": True}) is not None:
                return
            try:
                session.bulk_save_objects([cls(singleton=True)])
            except sqlalchemy.exc.IntegrityError:
                # Already exists, we're good
                session.rollback()

    @classmethod
    def acquire_lock(cls, check_interval, session):
        # type: (datetime.timedelta, sqlalchemy.orm.Session) -> Optional[AstronomerVersionCheck]
        """
        Acquire an exclusive lock to perform an update check, if the check is
        due and no other check is already in progress.

        We use the database to hold the lock for as long as this transaction is
        open, via `SELECT ... FOR UPDATE NOWAIT`. This method either returns a
        row, meaning the check is due and we have acquired the lock, or returns
        None, meaning no check is currently due. It will throw an error if the
        lock is held by another transaction.

        The lock is held for the duration of the database transaction -- be
        careful not to close it before you are done!
        """
        now = utcnow()
        return session.query(cls).filter(
            cls.singleton.is_(True),
            or_(cls.last_checked.is_(None), cls.last_checked <= now - check_interval),
        ).with_for_update(nowait=True).one_or_none()

    @classmethod
    def get(cls, session):
        """Return the update tracking row."""
        return session.query(cls).filter(cls.singleton.is_(True)).one()

    @staticmethod
    def host_identifier():
        return "{hostname}-{pid}#{tid}".format(
            hostname=get_hostname(), pid=os.getpid(), tid=threading.get_ident()
        )
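
# Illustrative only: a sketch of driving acquire_lock() from a periodic task,
# assuming this module's existing imports (datetime, sqlalchemy, utcnow). The
# 24-hour interval is an assumption; OperationalError is what the NOWAIT row
# lock raises when another transaction already holds it.
def maybe_run_version_check(session):
    try:
        lock = AstronomerVersionCheck.acquire_lock(
            check_interval=datetime.timedelta(hours=24), session=session
        )
    except sqlalchemy.exc.OperationalError:
        return  # another host/process holds the lock right now
    if lock is None:
        return  # the check is not due yet
    # ... perform the check while the transaction (and therefore the lock) is open ...
    lock.last_checked = utcnow()
    lock.last_checked_by = AstronomerVersionCheck.host_identifier()
    session.commit()  # closing the transaction releases the lock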