def __init__(
        self,
        dag_id=None,
        dag_ids=None,
        subdir=None,
        test_mode=False,
        refresh_dags_every=10,
        num_runs=None,
        do_pickle=False,
        *args, **kwargs):
    """
    Store the scheduler options on the instance and read scheduler settings
    from the configuration.

    :param dag_id: single DAG id; kept for BaseJob compatibility
    :param dag_ids: extra DAG ids, appended after ``dag_id`` in ``self.dag_ids``
    :param subdir: stored as ``self.subdir``
    :param test_mode: when truthy, ``self.num_runs`` is forced to 1
    :param refresh_dags_every: stored as ``self.refresh_dags_every``
    :param num_runs: stored as ``self.num_runs`` unless ``test_mode`` is set
    :param do_pickle: stored as ``self.do_pickle``
    """
    # for BaseJob compatibility
    self.dag_id = dag_id
    self.dag_ids = ([dag_id] if dag_id else []) + list(dag_ids or [])

    self.subdir = subdir
    self.num_runs = 1 if test_mode else num_runs
    self.refresh_dags_every = refresh_dags_every
    self.do_pickle = do_pickle

    super(SchedulerJob, self).__init__(*args, **kwargs)

    self.heartrate = conf.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC')

    # Never use more threads than there are CPUs.
    configured_threads = conf.getint('scheduler', 'max_threads')
    self.max_threads = min(configured_threads, multiprocessing.cpu_count())

    using_sqlite = 'sqlite' in conf.get('core', 'sql_alchemy_conn')
    if using_sqlite:
        if self.max_threads > 1:
            self.logger.error("Cannot use more than 1 thread when using sqlite. Setting max_threads to 1")
        self.max_threads = 1
def test_some_resources_specified(self):
    """Explicit cpus/disk values are kept; ram and gpus fall back to the [operators] defaults."""
    resources = Resources(cpus=0, disk=1)

    def operator_default(option):
        return configuration.getint('operators', option)

    self.assertEqual(resources.cpus.qty, 0)
    self.assertEqual(resources.ram.qty, operator_default('default_ram'))
    self.assertEqual(resources.disk.qty, 1)
    self.assertEqual(resources.gpus.qty, operator_default('default_gpus'))
def test_no_resources_specified(self):
    """With no arguments, every resource quantity equals its [operators] default."""
    resources = Resources()

    for kind in ('cpus', 'ram', 'disk', 'gpus'):
        self.assertEqual(
            getattr(resources, kind).qty,
            configuration.getint('operators', 'default_' + kind),
        )
def configure_orm():
    """(Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``."""
    global engine
    global Session

    if "sqlite" in SQL_ALCHEMY_CONN:
        engine_args = {}
    else:
        # Engine args not supported by sqlite
        engine_args = {
            "pool_size": conf.getint("core", "SQL_ALCHEMY_POOL_SIZE"),
            "pool_recycle": conf.getint("core", "SQL_ALCHEMY_POOL_RECYCLE"),
        }

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    session_factory = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    Session = scoped_session(session_factory)
def configure_orm(disable_connection_pool=False):
    """
    (Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``.

    :param disable_connection_pool: when truthy, use ``NullPool`` regardless
        of the ``SQL_ALCHEMY_POOL_ENABLED`` setting
    """
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    pooling_disabled = disable_connection_pool or not pool_connections
    if pooling_disabled:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Pool size engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info("settings.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}, pid={}".format(pool_size, pool_recycle, os.getpid()))
        engine_args.update(pool_size=pool_size, pool_recycle=pool_recycle)

    # Allow the user to specify an encoding for their DB otherwise default
    # to utf-8 so jobs & users with non-latin1 characters can still use
    # us.
    try:
        encoding = conf.get('core', 'SQL_ENGINE_ENCODING')
    except conf.AirflowConfigException:
        encoding = 'utf-8'
    # For Python2 we get back a newstr and need a str
    engine_args['encoding'] = encoding.__str__()

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    session_factory = sessionmaker(
        autocommit=False,
        autoflush=False,
        bind=engine,
        expire_on_commit=False,
    )
    Session = scoped_session(session_factory)
def __init__(self, cpus=None, ram=None, disk=None, gpus=None):
    """
    Build the four resource requirements, using the ``[operators]`` config
    defaults for any quantity left as ``None``.

    :param cpus: quantity passed to ``CpuResource``
    :param ram: quantity passed to ``RamResource``
    :param disk: quantity passed to ``DiskResource``
    :param gpus: quantity passed to ``GpuResource``
    """
    def resolve(value, option):
        # Only ``None`` triggers the fallback; 0 is a legitimate explicit value.
        if value is None:
            return configuration.getint('operators', option)
        return value

    # Resolve every quantity first, then construct the resource objects.
    cpus = resolve(cpus, 'default_cpus')
    ram = resolve(ram, 'default_ram')
    disk = resolve(disk, 'default_disk')
    gpus = resolve(gpus, 'default_gpus')

    self.cpus = CpuResource(cpus)
    self.ram = RamResource(ram)
    self.disk = DiskResource(disk)
    self.gpus = GpuResource(gpus)
def configure_orm():
    """(Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``."""
    global engine
    global Session

    engine_args = {}
    uses_sqlite = 'sqlite' in SQL_ALCHEMY_CONN
    if not uses_sqlite:
        # Engine args not supported by sqlite
        engine_args.update(
            pool_size=conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE'),
            pool_recycle=conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE'),
        )

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
def configure_orm(disable_connection_pool=False):
    """
    (Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``.

    :param disable_connection_pool: when truthy, the engine uses ``NullPool``
    """
    global engine
    global Session

    if disable_connection_pool:
        engine_args = {'poolclass': NullPool}
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args = {
            'pool_size': conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE'),
            'pool_recycle': conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE'),
        }
    else:
        engine_args = {}

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    session_factory = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    Session = scoped_session(session_factory)
def health():
    """
    An endpoint helping check the health status of the Airflow instance,
    including metadatabase and scheduler.
    """
    session = settings.Session()
    job_model = jobs.BaseJob
    threshold = timedelta(
        seconds=conf.getint('scheduler', 'scheduler_health_check_threshold'))

    metadatabase_status = 'healthy'
    latest_scheduler_heartbeat = None
    try:
        heartbeat_query = session.query(func.max(job_model.latest_heartbeat)).filter(
            job_model.state == 'running',
            job_model.job_type == 'SchedulerJob',
        )
        latest_scheduler_heartbeat = heartbeat_query.scalar()
    except Exception:
        # Any failure to query is reported as an unhealthy metadatabase.
        metadatabase_status = 'unhealthy'

    # No heartbeat at all, or one older than the threshold, means unhealthy.
    scheduler_alive = bool(latest_scheduler_heartbeat) and (
        timezone.utcnow() - latest_scheduler_heartbeat <= threshold)

    payload = {
        'metadatabase': {'status': metadatabase_status},
        'scheduler': {
            'status': 'healthy' if scheduler_alive else 'unhealthy',
            'latest_scheduler_heartbeat': str(latest_scheduler_heartbeat),
        },
    }
    return wwwutils.json_response(payload)
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    """
    Send an already-built MIME message through the server configured in the
    ``[smtp]`` section.

    :param e_from: envelope sender passed to ``sendmail``
    :param e_to: envelope recipient(s) passed to ``sendmail``
    :param mime_msg: message object; its ``as_string()`` output is sent
    :param dryrun: when truthy, configuration is still read but no
        connection is opened
    """
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if dryrun:
        return

    s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
    try:
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        # Report success only after the server has accepted the message.
        log.info("Sent an alert email to %s", e_to)
    except Exception:
        # Release the socket without another SMTP round trip (the server may
        # already be gone), then let the original error propagate.
        s.close()
        raise
    s.quit()
def __init__(
    self,
    dag_id=None,
    subdir=None,
    test_mode=False,
    refresh_dags_every=10,
    num_runs=None,
    do_pickle=False,
    *args,
    **kwargs
):
    """
    Store the scheduler options and read the heartbeat interval from config.

    :param dag_id: stored as ``self.dag_id``
    :param subdir: stored as ``self.subdir``
    :param test_mode: when truthy, ``self.num_runs`` is forced to 1
    :param refresh_dags_every: stored as ``self.refresh_dags_every``
    :param num_runs: stored as ``self.num_runs`` unless ``test_mode`` is set
    :param do_pickle: stored as ``self.do_pickle``
    """
    self.dag_id = dag_id
    self.subdir = subdir
    self.num_runs = 1 if test_mode else num_runs
    self.refresh_dags_every = refresh_dags_every
    self.do_pickle = do_pickle

    super(SchedulerJob, self).__init__(*args, **kwargs)

    self.heartrate = configuration.getint("scheduler", "SCHEDULER_HEARTBEAT_SEC")
def renew_from_kt():
    """
    Re-initialise the Kerberos ticket from the configured keytab by running
    ``kinit``.

    Exits the process with kinit's return code when kinit fails. Afterwards,
    runs the KRB 1.8.1 workaround when ``detect_conf_var()`` says it is
    needed (the answer is cached in ``NEED_KRB181_WORKAROUND``).
    """
    global NEED_KRB181_WORKAROUND

    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", socket.getfqdn())
    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-r", renewal_lifetime,
            "-k",  # host ticket
            "-t", configuration.get('kerberos', 'keytab'),  # specify keytab
            "-c", configuration.get('kerberos', 'ccache'),  # specify credentials cache
            principal]
    LOG.info("Reinitting kerberos from keytab: " + " ".join(cmdv))

    # universal_newlines gives text (not bytes) output on Python 3 so it can
    # be interpolated into the log message below.
    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting; a bare wait() can block
    # forever once the child fills a pipe buffer.
    stdout, stderr = subp.communicate()
    if subp.returncode != 0:
        LOG.error("Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" % (
            subp.returncode, stdout, stderr))
        sys.exit(subp.returncode)

    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clock have seconds level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
def run():
    """
    Renew the Kerberos ticket in an endless loop, sleeping
    ``[kerberos] reinit_frequency`` seconds between renewals.

    Exits with status 0 straight away when no keytab is configured.
    """
    keytab = configuration.get('kerberos', 'keytab')
    if keytab is None:
        log.debug("Keytab renewer not starting, no keytab configured")
        sys.exit(0)

    while True:
        renew_from_kt()
        # Re-read on every pass so the value in effect is always current.
        pause_seconds = configuration.getint('kerberos', 'reinit_frequency')
        time.sleep(pause_seconds)
def configure_orm(disable_connection_pool=False):
    """
    (Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``.

    :param disable_connection_pool: when truthy, use ``NullPool`` regardless
        of the ``SQL_ALCHEMY_POOL_ENABLED`` setting
    """
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args = {'poolclass': NullPool}
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args = {
            'pool_size': conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE'),
            'pool_recycle': conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE'),
        }
    else:
        engine_args = {}

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    session_factory = sessionmaker(autocommit=False, autoflush=False, bind=engine)
    Session = scoped_session(session_factory)
def _read(self, ti, try_number, metadata=None):
    """
    Template method that contains custom logic of reading
    logs given the try_number.

    :param ti: task instance record
    :param try_number: current try_number to read log from
    :param metadata: log metadata,
                     can be used for streaming log reading and auto-tailing.
    :return: log message as a string and metadata.
    """
    # Task instance here might be different from task instance when
    # initializing the handler. Thus explicitly getting log location
    # is needed to get correct log path.
    log_relative_path = self._render_filename(ti, try_number)
    location = os.path.join(self.local_base, log_relative_path)

    if os.path.exists(location):
        log = ""
        try:
            with open(location) as f:
                log += "*** Reading local file: {}\n".format(location)
                log += f.read()
        except Exception as e:
            # Discard anything read so far and report the failure instead.
            log = "*** Failed to load local log file: {}\n".format(location)
            log += "*** {}\n".format(str(e))
        return log, {'end_of_log': True}

    # Not on this host: ask the worker's log server for the file.
    url_template = os.path.join(
        "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path
    )
    url = url_template.format(
        ti=ti,
        worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT')
    )
    log = "*** Log file does not exist: {}\n".format(location)
    log += "*** Fetching from: {}\n".format(url)
    try:
        timeout = None  # No timeout
        try:
            timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
        except (AirflowConfigException, ValueError):
            pass

        response = requests.get(url, timeout=timeout)

        # Check if the resource was properly fetched
        response.raise_for_status()

        log += '\n' + response.text
    except Exception as e:
        log += "*** Failed to fetch log file from worker. {}\n".format(str(e))

    return log, {'end_of_log': True}
def configure_orm(disable_connection_pool=False):
    """
    (Re)build the module-level SQLAlchemy ``engine`` and scoped ``Session``.

    When pooling is enabled and the connection string is not sqlite, the
    pool size and recycle time come from the ``[core]`` config, falling back
    to 5 connections and 1800 seconds when the options are not defined.

    :param disable_connection_pool: when truthy, use ``NullPool`` regardless
        of the ``SQL_ALCHEMY_POOL_ENABLED`` setting
    """
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        # Same "settings.configure_orm()" prefix as the NullPool message
        # above, so both lines can be found with one search.
        log.info("settings.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}".format(pool_size, pool_recycle))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
def start_refresh(gunicorn_master_proc):
    """
    Ask the gunicorn master for ``worker_refresh_batch_size`` extra workers,
    one SIGTTIN at a time, waiting after each signal until the running
    worker count reflects it.

    :param gunicorn_master_proc: process object exposing ``send_signal``
    """
    batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
    logging.debug('%s doing a refresh of %s workers', state, batch_size)
    sys.stdout.flush()
    sys.stderr.flush()

    # ``excess`` is the number of additional workers requested so far.
    for excess in range(1, batch_size + 1):
        gunicorn_master_proc.send_signal(signal.SIGTTIN)
        wait_until_true(
            lambda: num_workers_expected + excess ==
            get_num_workers_running(gunicorn_master_proc))
def __init__(self):
    """Initialise executor state and decide how many state-sync processes to use."""
    super(CeleryExecutor, self).__init__()

    # Celery doesn't support querying the state of multiple tasks in parallel
    # (which can become a bottleneck on bigger clusters) so we use
    # a multiprocessing pool to speed this up.
    # How many worker processes are created for checking celery task state.
    configured_parallelism = configuration.getint('celery', 'SYNC_PARALLELISM')
    if configured_parallelism == 0:
        # 0 means "derive from the machine": all CPUs but one, at least 1.
        self._sync_parallelism = max(1, cpu_count() - 1)
    else:
        self._sync_parallelism = configured_parallelism

    self._sync_pool = None
    self.tasks = {}
    self.last_state = {}
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    """
    Send an already-built MIME message through the server configured in the
    ``[smtp]`` section.

    :param e_from: envelope sender passed to ``sendmail``
    :param e_to: envelope recipient(s) passed to ``sendmail``
    :param mime_msg: message object; its ``as_string()`` output is sent
    :param dryrun: when truthy, configuration is still read but no
        connection is opened
    """
    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_USER = configuration.get('smtp', 'SMTP_USER')
    SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')

    if dryrun:
        return

    s = smtplib.SMTP(SMTP_HOST, SMTP_PORT)
    try:
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        # Report success only after the server has accepted the message.
        logging.info("Sent an alert email to " + str(e_to))
    except Exception:
        # Release the socket without another SMTP round trip (the server may
        # already be gone), then let the original error propagate.
        s.close()
        raise
    s.quit()
def get_date_time_num_runs_dag_runs_form_data(request, session, dag):
    """
    Work out the execution date, base date, run count and dag-run choices
    from the request's query arguments.

    :param request: object whose ``args`` mapping may hold ``execution_date``,
        ``base_date`` and ``num_runs``
    :param session: DB session used to query dag runs
    :param dag: DAG whose runs are listed
    :return: dict with keys ``dttm``, ``base_date``, ``num_runs``,
        ``execution_date``, ``dr_choices`` and ``dr_state``
    """
    execution_date_arg = request.args.get('execution_date')
    if execution_date_arg:
        dttm = pendulum.parse(execution_date_arg)
    else:
        dttm = dag.latest_execution_date or timezone.utcnow()

    base_date_arg = request.args.get('base_date')
    if base_date_arg:
        base_date = timezone.parse(base_date_arg)
    else:
        # The DateTimeField widget truncates milliseconds and would loose
        # the first dag run. Round to next second.
        base_date = (dttm + timedelta(seconds=1)).replace(microsecond=0)

    default_dag_run = conf.getint('webserver', 'default_dag_run_display_number')
    num_runs_arg = request.args.get('num_runs')
    num_runs = int(num_runs_arg) if num_runs_arg else default_dag_run

    DR = models.DagRun
    drs = (
        session.query(DR)
        .filter(
            DR.dag_id == dag.dag_id,
            DR.execution_date <= base_date)
        .order_by(desc(DR.execution_date))
        .limit(num_runs)
        .all()
    )

    dr_choices = [(run.execution_date.isoformat(), run.run_id) for run in drs]

    dr_state = None
    for run in drs:
        if dttm == run.execution_date:
            dr_state = run.state

    # Happens if base_date was changed and the selected dag run is not in result
    if not dr_state and drs:
        newest_run = drs[0]
        dttm = newest_run.execution_date
        dr_state = newest_run.state

    return {
        'dttm': dttm,
        'base_date': base_date,
        'num_runs': num_runs,
        'execution_date': dttm.isoformat(),
        'dr_choices': dr_choices,
        'dr_state': dr_state,
    }
def renew_from_kt():
    """
    Re-initialise the Kerberos ticket from the configured keytab by running
    ``kinit``.

    Exits the process with kinit's return code when kinit fails. Afterwards,
    runs the KRB 1.8.1 workaround when ``detect_conf_var()`` says it is
    needed (the answer is cached in ``NEED_KRB181_WORKAROUND``).
    """
    global NEED_KRB181_WORKAROUND

    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", socket.getfqdn())

    cmdv = [
        configuration.get('kerberos', 'kinit_path'),
        "-r", renewal_lifetime,
        "-k",  # host ticket
        "-t", configuration.get('kerberos', 'keytab'),  # specify keytab
        "-c", configuration.get('kerberos', 'ccache'),  # specify credentials cache
        principal
    ]
    log.info("Reinitting kerberos from keytab: " + " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting; a bare wait() can block
    # forever once the child fills a pipe buffer.
    stdout, stderr = subp.communicate()
    if subp.returncode != 0:
        log.error(
            "Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" %
            (subp.returncode, stdout, stderr))
        sys.exit(subp.returncode)

    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clock have seconds level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
def _read(self, ti, try_number):
    """
    Template method that contains custom logic of reading
    logs given the try_number.

    :param ti: task instance record
    :param try_number: current try_number to read log from
    :return: log message as a string
    """
    # Task instance here might be different from task instance when
    # initializing the handler. Thus explicitly getting log location
    # is needed to get correct log path.
    log_relative_path = self._render_filename(ti, try_number + 1)
    loc = os.path.join(self.local_base, log_relative_path)

    if os.path.exists(loc):
        try:
            with open(loc) as f:
                return "*** Reading local log.\n" + f.read()
        except Exception as e:
            return "*** Failed to load local log file: {}. {}\n".format(loc, str(e))

    # Not on this host: ask the worker's log server for the file.
    url_template = os.path.join(
        "http://{ti.hostname}:{worker_log_server_port}/log", log_relative_path)
    url = url_template.format(
        ti=ti,
        worker_log_server_port=conf.get('celery', 'WORKER_LOG_SERVER_PORT'))
    log = "*** Log file isn't local.\n"
    log += "*** Fetching here: {url}\n".format(url=url)
    try:
        import requests
        timeout = None  # No timeout
        try:
            timeout = conf.getint('webserver', 'log_fetch_timeout_sec')
        except (AirflowConfigException, ValueError):
            pass

        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        log += '\n' + response.text
    except Exception as e:
        log += "*** Failed to fetch log file from worker. {}\n".format(str(e))

    return log
def test_with_execution_date_parameter_only(self):
    """
    Tests graph view with execution_date URL parameter.
    Scenario: click link from dag runs view.
    Should only show dag runs older than execution_date in the drop down.
    Should select the particular dag run.
    Should set base date to execution date.
    """
    query_string = '&execution_date={}'.format(
        self.runs[1].execution_date.isoformat())
    response = self.app.get(self.endpoint + query_string)
    self.test.assertEqual(response.status_code, 200)

    data = response.data.decode('utf-8')
    expected_num_runs = configuration.getint(
        'webserver', 'default_dag_run_display_number')
    self.assertBaseDateAndNumRuns(
        self.runs[1].execution_date, expected_num_runs, data)

    self.assertRunIsNotInDropdown(self.runs[0], data)
    self.assertRunIsSelected(self.runs[1], data)
    for index in (2, 3):
        self.assertRunIsInDropdownNotSelected(self.runs[index], data)
def __init__(
        self,
        dag_id=None,
        subdir=None,
        test_mode=False,
        refresh_dags_every=10,
        num_runs=None,
        do_pickle=False,
        *args, **kwargs):
    """
    Store the scheduler options and read the heartbeat interval from config.

    :param dag_id: stored as ``self.dag_id``
    :param subdir: stored as ``self.subdir``
    :param test_mode: when truthy, ``self.num_runs`` is forced to 1
    :param refresh_dags_every: stored as ``self.refresh_dags_every``
    :param num_runs: stored as ``self.num_runs`` unless ``test_mode`` is set
    :param do_pickle: stored as ``self.do_pickle``
    """
    self.dag_id = dag_id
    self.subdir = subdir

    # A test run executes exactly once, whatever num_runs was passed.
    self.num_runs = 1 if test_mode else num_runs

    self.refresh_dags_every = refresh_dags_every
    self.do_pickle = do_pickle
    super(SchedulerJob, self).__init__(*args, **kwargs)

    self.heartrate = configuration.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC')
def test_with_execution_date_parameter_only(self):
    """
    Tests graph view with execution_date URL parameter.
    Scenario: click link from dag runs view.
    Should only show dag runs older than execution_date in the drop down.
    Should select the particular dag run.
    Should set base date to execution date.
    """
    url = self.endpoint + '&execution_date={}'.format(
        self.runs[1].execution_date.isoformat())
    response = self.app.get(url)
    self.test.assertEqual(response.status_code, 200)

    data = response.data.decode('utf-8')
    self.assertBaseDateAndNumRuns(
        self.runs[1].execution_date,
        configuration.getint('webserver', 'default_dag_run_display_number'),
        data,
    )

    # Run 0 must be hidden; run 1 is the selection; runs 2 and 3 are listed
    # but not selected.
    self.assertRunIsNotInDropdown(self.runs[0], data)
    self.assertRunIsSelected(self.runs[1], data)
    self.assertRunIsInDropdownNotSelected(self.runs[2], data)
    self.assertRunIsInDropdownNotSelected(self.runs[3], data)
class CeleryConfig(object):
    # Celery settings, evaluated once when the class body runs. Values that
    # are not literals are read from the [celery] config section.
    # NOTE(review): every name bound in this class body (including ``log``
    # and ``celery_ssl_active``) becomes a class attribute.
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    # SSL is off unless CELERY_SSL_ACTIVE can be read and is true; a config
    # error while reading it only produces a warning.
    celery_ssl_active = False
    try:
        celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
    except AirflowConfigException as e:
        log = LoggingMixin().logger
        log.warning("Celery Executor will run without SSL")

    # With SSL active, all three of key, cert and CA cert must be readable;
    # any failure is re-raised as AirflowException.
    try:
        if celery_ssl_active:
            BROKER_USE_SSL = {
                'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                'cert_reqs': ssl.CERT_REQUIRED
            }
    except AirflowConfigException as e:
        raise AirflowException(
            'AirflowConfigException: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
            'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
    except Exception as e:
        raise AirflowException(
            'Exception: There was an unknown Celery SSL Error. Please ensure you want to use '
            'SSL and/or have all necessary certs and key.')
class CeleryConfig(object):
    # Celery settings, evaluated once when the class body runs. Values that
    # are not literals are read from the [celery] config section.
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    # BROKER_USE_SSL is only defined when CELERY_SSL_ACTIVE is true. Here
    # the flag itself is read without a fallback.
    if configuration.getboolean('celery', 'CELERY_SSL_ACTIVE'):
        try:
            BROKER_USE_SSL = {'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                              'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                              'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                              'cert_reqs': ssl.CERT_REQUIRED}
        # NOTE(review): only ValueError gets the "please ensure ... are set"
        # message; confirm that is what configuration.get raises for a
        # missing option, otherwise the generic branch below is taken.
        except ValueError:
            raise AirflowException('ValueError: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
                                   'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
        except Exception as e:
            raise AirflowException('Exception: There was an unknown Celery SSL Error.  Please ensure you want to use '
                                   'SSL and/or have all necessary certs and key.')
def kill_zombies(self, session=None):
    """
    Fail zombie tasks, which are tasks that haven't
    had a heartbeat for too long, in the current DagBag.

    :param session: DB session.
    :type session: sqlalchemy.orm.session.Session
    """
    # Avoid circular import
    from airflow.models.taskinstance import TaskInstance as TI
    from airflow.jobs import LocalTaskJob as LJ

    # How many seconds do we wait for tasks to heartbeat before mark them as zombies.
    zombie_threshold_secs = configuration.getint(
        'scheduler', 'scheduler_zombie_task_threshold')
    limit_dttm = timezone.utcnow() - timedelta(seconds=zombie_threshold_secs)
    self.log.debug("Failing jobs without heartbeat after %s", limit_dttm)

    # A running task instance is a zombie when its job is no longer running
    # or the job's last heartbeat is older than the cut-off.
    job_is_dead = or_(
        LJ.state != State.RUNNING,
        LJ.latest_heartbeat < limit_dttm,
    )
    zombie_query = (
        session.query(TI)
        .join(LJ, TI.job_id == LJ.id)
        .filter(TI.state == State.RUNNING)
        .filter(TI.dag_id.in_(self.dags))
        .filter(job_is_dead)
    )

    for ti in zombie_query.all():
        self.log.info(
            "Detected zombie job with dag_id %s, task_id %s, and execution date %s",
            ti.dag_id, ti.task_id, ti.execution_date.isoformat())
        ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
        ti.task = self.dags[ti.dag_id].get_task(ti.task_id)
        failure_reason = "{} detected as zombie".format(ti)
        ti.handle_failure(failure_reason, ti.test_mode, ti.get_template_context())
        self.log.info('Marked zombie job %s as %s', ti, ti.state)
        Stats.incr('zombies_killed')

    session.commit()
def _execute(self):
    # Run the task through a task runner and heartbeat until it exits.
    # Raises AirflowException on SIGTERM or when no heartbeat has succeeded
    # for longer than [scheduler] scheduler_zombie_task_threshold seconds.
    self.task_runner = get_task_runner(self)

    def signal_handler(signum, frame):
        """Setting kill signal handler"""
        self.log.error("Received SIGTERM. Terminating subprocesses")
        self.on_kill()
        raise AirflowException("LocalTaskJob received SIGTERM signal")
    signal.signal(signal.SIGTERM, signal_handler)

    # Bail out before starting the runner if the task instance refuses to
    # move into an executable state.
    if not self.task_instance._check_and_change_state_before_execution(
            mark_success=self.mark_success,
            ignore_all_deps=self.ignore_all_deps,
            ignore_depends_on_past=self.ignore_depends_on_past,
            ignore_task_deps=self.ignore_task_deps,
            ignore_ti_state=self.ignore_ti_state,
            job_id=self.id,
            pool=self.pool):
        self.log.info("Task is not able to be run")
        return

    try:
        self.task_runner.start()

        last_heartbeat_time = time.time()
        heartbeat_time_limit = conf.getint('scheduler',
                                           'scheduler_zombie_task_threshold')
        while True:
            # Monitor the task to see if it's done
            return_code = self.task_runner.return_code()
            if return_code is not None:
                self.log.info("Task exited with return code %s", return_code)
                return

            # Periodically heartbeat so that the scheduler doesn't think this
            # is a zombie
            try:
                self.heartbeat()
                last_heartbeat_time = time.time()
            except OperationalError:
                # A failed heartbeat leaves last_heartbeat_time untouched, so
                # repeated failures eventually trip the limit check below.
                Stats.incr('local_task_job_heartbeat_failure', 1, 1)
                self.log.exception(
                    "Exception while trying to heartbeat! Sleeping for %s seconds",
                    self.heartrate
                )
                time.sleep(self.heartrate)

            # If it's been too long since we've heartbeat, then it's possible that
            # the scheduler rescheduled this task, so kill launched processes.
            time_since_last_heartbeat = time.time() - last_heartbeat_time
            if time_since_last_heartbeat > heartbeat_time_limit:
                Stats.incr('local_task_job_prolonged_heartbeat_failure', 1, 1)
                self.log.error("Heartbeat time limited exceeded!")
                raise AirflowException("Time since last heartbeat({:.2f}s) "
                                       "exceeded limit ({}s)."
                                       .format(time_since_last_heartbeat,
                                               heartbeat_time_limit))
    finally:
        # Runs on normal exit, on the exceptions above and on SIGTERM alike.
        self.on_kill()
def __init__(self):
    """
    Load every Kubernetes-executor setting from the Airflow configuration
    onto this instance, then call ``self._validate()``.

    JSON-valued options (``affinity``, ``tolerations``,
    ``kube_client_request_args``) are decoded with ``json.loads``; an empty
    option yields ``None`` (``{}`` for ``kube_client_request_args``).
    """
    configuration_dict = configuration.as_dict(display_sensitive=True)
    self.core_configuration = configuration_dict['core']
    self.kube_secrets = configuration_dict.get('kubernetes_secrets', {})
    self.kube_env_vars = configuration_dict.get('kubernetes_environment_variables', {})
    self.env_from_configmap_ref = configuration.get(self.kubernetes_section,
                                                    'env_from_configmap_ref')
    self.env_from_secret_ref = configuration.get(self.kubernetes_section,
                                                 'env_from_secret_ref')
    self.airflow_home = settings.AIRFLOW_HOME
    self.dags_folder = configuration.get(self.core_section, 'dags_folder')
    self.parallelism = configuration.getint(self.core_section, 'parallelism')
    self.worker_container_repository = configuration.get(
        self.kubernetes_section, 'worker_container_repository')
    self.worker_container_tag = configuration.get(
        self.kubernetes_section, 'worker_container_tag')
    self.kube_image = '{}:{}'.format(
        self.worker_container_repository, self.worker_container_tag)
    self.kube_image_pull_policy = configuration.get(
        self.kubernetes_section, "worker_container_image_pull_policy"
    )
    self.kube_node_selectors = configuration_dict.get('kubernetes_node_selectors', {})
    self.kube_annotations = configuration_dict.get('kubernetes_annotations', {})
    self.kube_labels = configuration_dict.get('kubernetes_labels', {})
    self.delete_worker_pods = conf.getboolean(
        self.kubernetes_section, 'delete_worker_pods')
    self.worker_pods_creation_batch_size = conf.getint(
        self.kubernetes_section, 'worker_pods_creation_batch_size')
    self.worker_service_account_name = conf.get(
        self.kubernetes_section, 'worker_service_account_name')
    self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets')

    # NOTE: user can build the dags into the docker image directly,
    # this will set to True if so
    self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image')

    # Run as user for pod security context
    self.worker_run_as_user = self._get_security_context_val('run_as_user')
    self.worker_fs_group = self._get_security_context_val('fs_group')

    # NOTE: `git_repo` and `git_branch` must be specified together as a pair
    # The http URL of the git repository to clone from
    self.git_repo = conf.get(self.kubernetes_section, 'git_repo')
    # The branch of the repository to be checked out
    self.git_branch = conf.get(self.kubernetes_section, 'git_branch')
    # Optionally, the directory in the git repository containing the dags
    self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath')
    # Optionally, the root directory for git operations
    self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root')
    # Optionally, the name at which to publish the checked-out files under --root
    self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest')
    # Optionally, if git_dags_folder_mount_point is set the worker will use
    # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder
    self.git_dags_folder_mount_point = conf.get(self.kubernetes_section,
                                                'git_dags_folder_mount_point')

    # Optionally a user may supply a (`git_user` AND `git_password`) OR
    # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`) for private repositories
    self.git_user = conf.get(self.kubernetes_section, 'git_user')
    self.git_password = conf.get(self.kubernetes_section, 'git_password')
    self.git_ssh_key_secret_name = conf.get(self.kubernetes_section, 'git_ssh_key_secret_name')
    self.git_ssh_known_hosts_configmap_name = conf.get(self.kubernetes_section,
                                                       'git_ssh_known_hosts_configmap_name')

    # NOTE: The user may optionally use a volume claim to mount a PV containing
    # DAGs directly
    self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim')

    # This prop may optionally be set for PV Claims and is used to write logs
    self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim')

    # This prop may optionally be set for PV Claims and is used to locate DAGs
    # on a SubPath
    self.dags_volume_subpath = conf.get(
        self.kubernetes_section, 'dags_volume_subpath')

    # This prop may optionally be set for PV Claims and is used to locate logs
    # on a SubPath
    self.logs_volume_subpath = conf.get(
        self.kubernetes_section, 'logs_volume_subpath')

    # Optionally, hostPath volume containing DAGs
    self.dags_volume_host = conf.get(self.kubernetes_section, 'dags_volume_host')

    # Optionally, write logs to a hostPath Volume
    self.logs_volume_host = conf.get(self.kubernetes_section, 'logs_volume_host')

    # Base log folder, read from the core section
    self.base_log_folder = configuration.get(self.core_section, 'base_log_folder')

    # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note
    # that if your
    # cluster has RBAC enabled, your scheduler may need service account permissions to
    # create, watch, get, and delete pods in this namespace.
    self.kube_namespace = conf.get(self.kubernetes_section, 'namespace')
    # The Kubernetes Namespace in which pods will be created by the executor. Note
    # that if your
    # cluster has RBAC enabled, your workers may need service account permissions to
    # interact with cluster components.
    self.executor_namespace = conf.get(self.kubernetes_section, 'namespace')
    # Task secrets managed by KubernetesExecutor.
    self.gcp_service_account_keys = conf.get(self.kubernetes_section,
                                             'gcp_service_account_keys')

    # If the user is using the git-sync container to clone their repository via git,
    # allow them to specify repository, tag, and pod name for the init container.
    self.git_sync_container_repository = conf.get(
        self.kubernetes_section, 'git_sync_container_repository')

    self.git_sync_container_tag = conf.get(
        self.kubernetes_section, 'git_sync_container_tag')
    self.git_sync_container = '{}:{}'.format(
        self.git_sync_container_repository, self.git_sync_container_tag)

    self.git_sync_init_container_name = conf.get(
        self.kubernetes_section, 'git_sync_init_container_name')

    self.git_sync_run_as_user = self._get_security_context_val('git_sync_run_as_user')

    # The worker pod may optionally have a valid Airflow config loaded via a
    # configmap
    self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap')

    affinity_json = conf.get(self.kubernetes_section, 'affinity')
    if affinity_json:
        self.kube_affinity = json.loads(affinity_json)
    else:
        self.kube_affinity = None

    tolerations_json = conf.get(self.kubernetes_section, 'tolerations')
    if tolerations_json:
        self.kube_tolerations = json.loads(tolerations_json)
    else:
        self.kube_tolerations = None

    kube_client_request_args = conf.get(self.kubernetes_section, 'kube_client_request_args')
    if kube_client_request_args:
        self.kube_client_request_args = json.loads(kube_client_request_args)
        # '_request_timeout' is optional: look it up with .get() so request
        # args that omit it do not raise KeyError. JSON has no tuple type,
        # so a non-empty list is converted to a tuple here.
        request_timeout = self.kube_client_request_args.get('_request_timeout')
        if request_timeout and isinstance(request_timeout, list):
            self.kube_client_request_args['_request_timeout'] = tuple(request_timeout)
    else:
        self.kube_client_request_args = {}
    self._validate()
def start(self):
    """
    Build the Mesos framework description from the ``[mesos]`` configuration
    section and start the scheduler driver.

    Creates the task and result queues, resolves the master URL and the
    per-task CPU / memory settings, optionally re-attaches to a previously
    registered framework (checkpointing plus failover timeout), and finally
    starts a ``MesosSchedulerDriver`` that is kept on ``self.mesos_driver``.

    :raises AirflowException: if no master URL is configured, or if
        authentication is enabled without a principal or a secret.
    """
    self.task_queue = Queue()
    self.result_queue = Queue()
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''

    if not configuration.get('mesos', 'MASTER'):
        self.log.error("Expecting mesos master URL for mesos executor")
        raise AirflowException(
            "mesos.master not provided for mesos executor")

    master = configuration.get('mesos', 'MASTER')

    framework.name = get_framework_name()

    # Fall back to 1 CPU / 256 units of memory per task when unset.
    if not configuration.get('mesos', 'TASK_CPU'):
        task_cpu = 1
    else:
        task_cpu = configuration.getint('mesos', 'TASK_CPU')

    if not configuration.get('mesos', 'TASK_MEMORY'):
        task_memory = 256
    else:
        task_memory = configuration.getint('mesos', 'TASK_MEMORY')

    if configuration.getboolean('mesos', 'CHECKPOINT'):
        framework.checkpoint = True

        if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
            # Import here to work around a circular import error
            from airflow.models import Connection

            # Query the database to get the ID of the Mesos Framework,
            # if available.
            conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
            session = Session()
            try:
                connection = session.query(Connection).filter_by(
                    conn_id=conn_id).first()
                if connection is not None:
                    # Set the Framework ID to let the scheduler reconnect
                    # with running tasks. Read while the session is open.
                    framework.id.value = connection.extra
            finally:
                # The session is only needed for this lookup; release it
                # even when the query fails.
                session.close()

            framework.failover_timeout = configuration.getint(
                'mesos', 'FAILOVER_TIMEOUT')
    else:
        framework.checkpoint = False

    self.log.info(
        'MesosFramework master : %s, name : %s, cpu : %s, mem : %s, checkpoint : %s',
        master, framework.name,
        str(task_cpu), str(task_memory), str(framework.checkpoint))

    implicit_acknowledgements = 1

    if configuration.getboolean('mesos', 'AUTHENTICATE'):
        if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
            self.log.error(
                "Expecting authentication principal in the environment")
            raise AirflowException(
                "mesos.default_principal not provided in authenticated mode"
            )
        if not configuration.get('mesos', 'DEFAULT_SECRET'):
            self.log.error(
                "Expecting authentication secret in the environment")
            raise AirflowException(
                "mesos.default_secret not provided in authenticated mode")

        credential = mesos_pb2.Credential()
        credential.principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
        credential.secret = configuration.get('mesos', 'DEFAULT_SECRET')

        framework.principal = credential.principal

        driver = mesos.native.MesosSchedulerDriver(
            AirflowMesosScheduler(self.task_queue, self.result_queue,
                                  task_cpu, task_memory),
            framework,
            master,
            implicit_acknowledgements,
            credential)
    else:
        framework.principal = 'Airflow'
        driver = mesos.native.MesosSchedulerDriver(
            AirflowMesosScheduler(self.task_queue, self.result_queue,
                                  task_cpu, task_memory),
            framework,
            master,
            implicit_acknowledgements)

    self.mesos_driver = driver
    self.mesos_driver.start()
def is_alive(self):
    """
    Tell whether the job's last heartbeat is recent enough to call it alive.

    :return: True when the time elapsed since ``self.latest_heartbeat`` is
        below 2.1 times the configured ``scheduler.JOB_HEARTBEAT_SEC``.
    """
    # total_seconds() covers the whole interval. The ``seconds`` attribute
    # only holds the sub-day remainder, so a heartbeat that is a day (or
    # several) old used to look fresh again.
    elapsed = (datetime.now() - self.latest_heartbeat).total_seconds()
    return elapsed < conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1
def is_alive(self):
    """
    Report whether this job heartbeated recently enough to be considered alive.

    :return: True when fewer than 2.1 heartbeat periods
        (``scheduler.JOB_HEARTBEAT_SEC``) have passed since
        ``self.latest_heartbeat``.
    """
    # Use the full duration: ``timedelta.seconds`` drops the ``days`` part,
    # which made jobs silent for more than a day appear alive.
    since_heartbeat = datetime.now() - self.latest_heartbeat
    return (
        since_heartbeat.total_seconds() <
        (conf.getint('scheduler', 'JOB_HEARTBEAT_SEC') * 2.1)
    )
def auto_conn():
    """
    Create the default set of connections in the metadata database.

    Adds one connection each for S3, MySQL and PostgreSQL (values read from
    the matching configuration sections) plus two hard-coded entries,
    ``airflow_connection`` and ``mongo_connection``. Every connection is
    committed as soon as it is added. The session is always closed, even
    when a configuration lookup or an insert fails.
    """
    logging.info('Creating connections, pool and sql path')

    session = Session()

    def create_new_conn(session, attributes):
        """Build a Connection from ``attributes`` (missing keys become None) and commit it."""
        new_conn = models.Connection()
        new_conn.conn_id = attributes.get("conn_id")
        new_conn.conn_type = attributes.get('conn_type')
        new_conn.host = attributes.get('host')
        new_conn.port = attributes.get('port')
        new_conn.schema = attributes.get('schema')
        new_conn.login = attributes.get('login')
        new_conn.extra = attributes.get('extra')
        # Go through set_password rather than assigning the attribute.
        new_conn.set_password(attributes.get('password'))

        session.add(new_conn)
        session.commit()

    try:
        create_new_conn(session, {
            "conn_id": configuration.get('s3', 's3_conn_id'),
            "conn_type": configuration.get('s3', 's3_conn_type'),
            "extra": configuration.get('s3', 's3_extra'),
        })
        create_new_conn(session, {
            "conn_id": configuration.get('mysql', 'mysql_conn_id'),
            "conn_type": configuration.get('mysql', 'mysql_conn_type'),
            "schema": configuration.get('mysql', 'mysql_schema'),
            "host": configuration.get('mysql', 'mysql_host'),
            "port": configuration.getint('mysql', 'mysql_port'),
            "login": configuration.get('mysql', 'mysql_login'),
            "password": configuration.get('mysql', 'mysql_password'),
        })
        create_new_conn(session, {
            "conn_id": configuration.get('postgresql', 'postgresql_conn_id'),
            "conn_type": configuration.get('postgresql', 'postgresql_conn_type'),
            "host": configuration.get('postgresql', 'postgresql_host'),
            "port": configuration.getint('postgresql', 'postgresql_port'),
            "schema": configuration.get('postgresql', 'postgresql_schema'),
            "login": configuration.get('postgresql', 'postgresql_login'),
            "password": configuration.get('postgresql', 'postgresql_password'),
        })
        create_new_conn(session, {
            "conn_id": "airflow_connection",
            "conn_type": configuration.get('mysql', 'mysql_conn_type'),
            "schema": "airflow",
            "host": "localhost",
            "login": "******",
            "password": "******",
        })
        # NOTE(review): host, port and credentials are hard-coded here, and
        # the port is a string while the other connections pass an int --
        # consider moving these values to configuration.
        create_new_conn(session, {
            "conn_id": "mongo_connection",
            "conn_type": "mongo",
            "host": "13.126.117.239",
            "port": "27017",
            "login": "******",
            "password": "******",
        })
    finally:
        session.close()
@classmethod def decr(cls, stat, count=1, rate=1): pass @classmethod def gauge(cls, stat, value, rate=1, delta=False): pass Stats = DummyStatsLogger if conf.getboolean('scheduler', 'statsd_on'): from statsd import StatsClient statsd = StatsClient(host=conf.get('scheduler', 'statsd_host'), port=conf.getint('scheduler', 'statsd_port'), prefix=conf.get('scheduler', 'statsd_prefix')) Stats = statsd else: Stats = DummyStatsLogger HEADER = """\ ____________ _____________ ____ |__( )_________ __/__ /________ __ ____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / / ___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ / _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/ """ BASE_LOG_URL = '/admin/airflow/log' AIRFLOW_HOME = os.path.expanduser(conf.get('core', 'AIRFLOW_HOME'))
def incr(cls, stat, count=1, rate=1): pass @classmethod def decr(cls, stat, count=1, rate=1): pass @classmethod def gauge(cls, stat, value, rate=1, delta=False): pass Stats = DummyStatsLogger if conf.getboolean('scheduler', 'statsd_on'): from statsd import StatsClient statsd = StatsClient( host=conf.get('scheduler', 'statsd_host'), port=conf.getint('scheduler', 'statsd_port'), prefix=conf.get('scheduler', 'statsd_prefix')) Stats = statsd else: Stats = DummyStatsLogger HEADER = """\ ____________ _____________ ____ |__( )_________ __/__ /________ __ ____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / / ___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ / _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/ """
def __init__(self):
    """
    Load every setting the Kubernetes executor needs from the Airflow
    configuration, then run ``self._validate()`` on the result.
    """
    kube = self.kubernetes_section
    core = self.core_section

    def kube_option(key):
        # Plain string option from the kubernetes section.
        return conf.get(kube, key)

    # Whole-section snapshots (sensitive values included).
    sections = configuration.as_dict(display_sensitive=True)
    self.core_configuration = sections['core']
    self.kube_secrets = sections.get('kubernetes_secrets', {})
    self.kube_env_vars = sections.get(
        'kubernetes_environment_variables', {})

    # Core settings.
    self.airflow_home = configuration.get(core, 'airflow_home')
    self.dags_folder = configuration.get(core, 'dags_folder')
    self.parallelism = configuration.getint(core, 'PARALLELISM')

    # Worker image: "<repository>:<tag>".
    self.worker_container_repository = configuration.get(
        kube, 'worker_container_repository')
    self.worker_container_tag = configuration.get(
        kube, 'worker_container_tag')
    self.kube_image = '{}:{}'.format(
        self.worker_container_repository, self.worker_container_tag)
    self.kube_image_pull_policy = configuration.get(
        kube, "worker_container_image_pull_policy")

    self.kube_node_selectors = sections.get(
        'kubernetes_node_selectors', {})
    self.kube_annotations = sections.get('kubernetes_annotations', {})

    # Worker pod handling.
    self.delete_worker_pods = conf.getboolean(kube, 'delete_worker_pods')
    self.worker_pods_creation_batch_size = conf.getint(
        kube, 'worker_pods_creation_batch_size')
    self.worker_service_account_name = kube_option(
        'worker_service_account_name')
    self.image_pull_secrets = kube_option('image_pull_secrets')

    # NOTE: the user can build the dags into the docker image directly;
    # this is True if so.
    self.dags_in_image = conf.getboolean(kube, 'dags_in_image')

    # NOTE: `git_repo` and `git_branch` must be specified together.
    # The http URL of the git repository to clone from.
    self.git_repo = kube_option('git_repo')
    # The branch of the repository to be checked out.
    self.git_branch = kube_option('git_branch')
    # Optional directory inside the repository that contains the dags.
    self.git_subpath = kube_option('git_subpath')
    # Optional root directory for git operations.
    self.git_sync_root = kube_option('git_sync_root')
    # Optional name under --root at which the checkout is published.
    self.git_sync_dest = kube_option('git_sync_dest')
    # Optional: when set, the worker uses
    # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath}
    # as its dags_folder.
    self.git_dags_folder_mount_point = kube_option(
        'git_dags_folder_mount_point')

    # Private repositories: either (`git_user` AND `git_password`) or
    # (`git_ssh_key_secret_name` AND `git_ssh_key_secret_key`).
    self.git_user = kube_option('git_user')
    self.git_password = kube_option('git_password')
    self.git_ssh_key_secret_name = kube_option('git_ssh_key_secret_name')
    self.git_ssh_known_hosts_configmap_name = kube_option(
        'git_ssh_known_hosts_configmap_name')

    # Optional volume claim that mounts a PV holding the DAGs directly.
    self.dags_volume_claim = kube_option('dags_volume_claim')
    # Optional volume claim used to write logs.
    self.logs_volume_claim = kube_option('logs_volume_claim')
    # Optional SubPath locating the DAGs on the claim.
    self.dags_volume_subpath = kube_option('dags_volume_subpath')
    # Optional SubPath locating the logs on the claim.
    self.logs_volume_subpath = kube_option('logs_volume_subpath')
    # Optional hostPath volume containing the DAGs.
    self.dags_volume_host = kube_option('dags_volume_host')
    # Optional hostPath volume that receives the logs.
    self.logs_volume_host = kube_option('logs_volume_host')

    # Log folder, read from the core section.
    self.base_log_folder = configuration.get(core, 'base_log_folder')

    # Namespace in which the Scheduler and Webserver reside. With RBAC
    # enabled, the scheduler may need service account permissions to
    # create, watch, get, and delete pods in this namespace.
    self.kube_namespace = kube_option('namespace')
    # Namespace in which the executor creates pods; read from the same
    # `namespace` option. With RBAC enabled, workers may need service
    # account permissions to interact with cluster components.
    self.executor_namespace = kube_option('namespace')

    # Task secrets managed by KubernetesExecutor.
    self.gcp_service_account_keys = kube_option('gcp_service_account_keys')

    # git-sync init container: repository, tag and container name.
    self.git_sync_container_repository = kube_option(
        'git_sync_container_repository')
    self.git_sync_container_tag = kube_option('git_sync_container_tag')
    self.git_sync_container = '{}:{}'.format(
        self.git_sync_container_repository, self.git_sync_container_tag)
    self.git_sync_init_container_name = kube_option(
        'git_sync_init_container_name')

    # Optional configmap carrying a valid Airflow config for worker pods.
    self.airflow_configmap = kube_option('airflow_configmap')

    # Affinity and tolerations are JSON documents; empty means "unset".
    affinity_json = kube_option('affinity')
    self.kube_affinity = json.loads(affinity_json) if affinity_json else None

    tolerations_json = kube_option('tolerations')
    self.kube_tolerations = (
        json.loads(tolerations_json) if tolerations_json else None)

    self._validate()
def process_file(self, filepath, only_if_updated=True):
    """
    Given a path to a python module or zip file, import the module(s) and
    collect the DAG objects found in them.

    :param filepath: path to a ``.py`` file or a zip archive; ``None`` or a
        path that is not a file yields an empty list.
    :param only_if_updated: when true, skip files whose modification time
        equals the one recorded in ``self.dagbag.file_last_changed``.
    :return: list of the DAGs (and their subdags) that were bagged.
    """
    found_dags = []

    # If the source file no longer exists in the DB or in the filesystem,
    # return an empty list.
    # todo: raise exception?
    if filepath is None or not os.path.isfile(filepath):
        return found_dags

    try:
        # This failed before in what may have been a git sync
        # race condition
        file_last_changed = datetime.fromtimestamp(
            os.path.getmtime(filepath))
        if only_if_updated \
                and filepath in self.dagbag.file_last_changed \
                and file_last_changed == self.dagbag.file_last_changed[filepath]:
            return found_dags

    except Exception as e:
        self.log.exception(e)
        return found_dags

    mods = []
    if not zipfile.is_zipfile(filepath):
        if self.safe_mode and os.path.isfile(filepath):
            # Safe mode: only import files that mention both markers.
            with open(filepath, 'rb') as f:
                content = f.read()
                if not all([s in content for s in (b'DAG', b'airflow')]):
                    self.dagbag.file_last_changed[
                        filepath] = file_last_changed
                    return found_dags

        self.log.debug("Importing %s", filepath)
        org_mod_name, _ = os.path.splitext(os.path.split(filepath)[-1])
        # Prefix with a hash of the path so equally named files in
        # different folders get distinct module names.
        mod_name = ('unusual_prefix_' +
                    hashlib.sha1(filepath.encode('utf-8')).hexdigest() +
                    '_' + org_mod_name)

        if mod_name in sys.modules:
            del sys.modules[mod_name]

        with timeout(configuration.getint('core', "DAGBAG_IMPORT_TIMEOUT")):
            try:
                m = imp.load_source(mod_name, filepath)
                mods.append(m)
            except Exception as e:
                self.log.exception("Failed to import: %s", filepath)
                self.dagbag.import_errors[filepath] = str(e)
                self.dagbag.file_last_changed[filepath] = file_last_changed

    else:
        # Context manager so the archive handle is released on every exit
        # path, including the early return below.
        with zipfile.ZipFile(filepath) as zip_file:
            for mod in zip_file.infolist():
                head, _ = os.path.split(mod.filename)
                mod_name, ext = os.path.splitext(mod.filename)
                # Only root-level python files are considered.
                if not head and (ext == '.py' or ext == '.pyc'):
                    if mod_name == '__init__':
                        # ``ext`` already starts with a dot.
                        self.log.warning("Found __init__%s at root of %s",
                                         ext, filepath)
                    if self.safe_mode:
                        with zip_file.open(mod.filename) as zf:
                            self.log.debug("Reading %s from %s",
                                           mod.filename, filepath)
                            content = zf.read()
                            if not all(
                                    [s in content for s in (b'DAG', b'airflow')]):
                                self.dagbag.file_last_changed[filepath] = (
                                    file_last_changed)
                                # todo: create ignore list
                                return found_dags

                    if mod_name in sys.modules:
                        del sys.modules[mod_name]

                    try:
                        # Put the archive on sys.path so the module can be
                        # imported out of it.
                        sys.path.insert(0, filepath)
                        m = importlib.import_module(mod_name)
                        mods.append(m)
                    except Exception as e:
                        self.log.exception("Failed to import: %s", filepath)
                        self.dagbag.import_errors[filepath] = str(e)
                        self.dagbag.file_last_changed[
                            filepath] = file_last_changed

    for m in mods:
        for dag in list(m.__dict__.values()):
            if isinstance(dag, airflow.models.DAG):
                if not dag.full_filepath:
                    dag.full_filepath = filepath
                    if dag.fileloc != filepath:
                        dag.fileloc = filepath
                try:
                    dag.is_subdag = False
                    self.dagbag.bag_dag(dag, parent_dag=dag, root_dag=dag)
                    found_dags.append(dag)
                    found_dags += dag.subdags
                except AirflowDagCycleException as cycle_exception:
                    self.log.exception("Failed to bag_dag: %s",
                                       dag.full_filepath)
                    self.dagbag.import_errors[dag.full_filepath] = \
                        str(cycle_exception)
                    self.dagbag.file_last_changed[dag.full_filepath] = \
                        file_last_changed

    self.dagbag.file_last_changed[filepath] = file_last_changed
    return found_dags
from sqlalchemy import Column, Integer, String, DateTime, func, Index, or_ from sqlalchemy.orm.session import make_transient from airflow import executors, models, settings, utils from airflow import configuration from airflow.utils import AirflowException, State, LoggingMixin Base = models.Base ID_LEN = models.ID_LEN # Setting up a statsd client if needed statsd = None if configuration.getboolean('scheduler', 'statsd_on'): from statsd import StatsClient statsd = StatsClient(host=configuration.get('scheduler', 'statsd_host'), port=configuration.getint('scheduler', 'statsd_port'), prefix=configuration.get('scheduler', 'statsd_prefix')) class BaseJob(Base, LoggingMixin): """ Abstract class to be derived for jobs. Jobs are processing items with state and duration that aren't task instances. For instance a BackfillJob is a collection of task instance runs, but should have it's own state, start and end time. """ __tablename__ = "job" id = Column(Integer, primary_key=True)
from datetime import datetime
import getpass
import imp
import os
import re
import signal
import subprocess
import sys
import warnings

from airflow import configuration
from airflow.exceptions import AirflowException

# When killing processes, time to wait after issuing a SIGTERM before issuing a
# SIGKILL.
DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM = configuration.getint(
    'core', 'KILLED_TASK_CLEANUP_TIME')


def validate_key(k, max_length=250):
    """
    Check that ``k`` is usable as a key.

    :param k: candidate key; must be a string.
    :param max_length: keys longer than this are rejected.
    :return: True when the key is valid.
    :raises TypeError: if ``k`` is not a string.
    :raises AirflowException: if ``k`` is longer than ``max_length`` or
        contains anything but alphanumerics, dashes, dots and underscores.
    """
    if not isinstance(k, basestring):
        raise TypeError("The key has to be a string")

    if len(k) > max_length:
        raise AirflowException(
            "The key has to be less than {0} characters".format(max_length))

    if re.match(r'^[A-Za-z0-9_\-\.]+$', k) is None:
        raise AirflowException(
            "The key ({k}) has to be made of alphanumeric characters, dashes, "
            "dots and underscores exclusively".format(k=k))

    return True
import imp
import os
import re
import signal
import subprocess
import sys
import warnings

from jinja2 import Template

from airflow import configuration
from airflow.exceptions import AirflowException

# When killing processes, time to wait after issuing a SIGTERM before issuing a
# SIGKILL.
DEFAULT_TIME_TO_WAIT_AFTER_SIGTERM = configuration.getint(
    'core', 'KILLED_TASK_CLEANUP_TIME')


def validate_key(k, max_length=250):
    """
    Validate a key, raising on the first rule it breaks.

    :param k: the key to check; it has to be a string.
    :param max_length: longest accepted key length.
    :return: True if every check passes.
    :raises TypeError: when ``k`` is not a string.
    :raises AirflowException: when ``k`` exceeds ``max_length`` or holds
        characters other than alphanumerics, dashes, dots and underscores.
    """
    if not isinstance(k, basestring):
        raise TypeError("The key has to be a string")

    too_long = len(k) > max_length
    if too_long:
        raise AirflowException(
            "The key has to be less than {0} characters".format(max_length))

    well_formed = re.match(r'^[A-Za-z0-9_\-\.]+$', k)
    if not well_formed:
        raise AirflowException(
            "The key ({k}) has to be made of alphanumeric characters, dashes, "
            "dots and underscores exclusively".format(k=k))

    return True
def __init__(self):
    """
    Read the Kubernetes executor settings from the Airflow configuration
    and finish by calling ``self._validate()``.
    """
    kube = self.kubernetes_section
    core = self.core_section

    def kube_option(key):
        # Plain string option from the kubernetes section.
        return conf.get(kube, key)

    # Whole-section snapshots (sensitive values included).
    sections = configuration.as_dict(display_sensitive=True)
    self.core_configuration = sections['core']
    self.kube_secrets = sections.get('kubernetes_secrets', {})

    # Core settings.
    self.airflow_home = configuration.get(core, 'airflow_home')
    self.dags_folder = configuration.get(core, 'dags_folder')
    self.parallelism = configuration.getint(core, 'PARALLELISM')

    # Worker image ("<repository>:<tag>") and its dags folder.
    self.worker_container_repository = configuration.get(
        kube, 'worker_container_repository')
    self.worker_container_tag = configuration.get(
        kube, 'worker_container_tag')
    self.worker_dags_folder = configuration.get(kube, 'worker_dags_folder')
    self.kube_image = '{}:{}'.format(
        self.worker_container_repository, self.worker_container_tag)
    self.kube_image_pull_policy = configuration.get(
        kube, "worker_container_image_pull_policy")
    self.kube_node_selectors = sections.get(
        'kubernetes_node_selectors', {})

    # Worker pod handling.
    self.delete_worker_pods = conf.getboolean(kube, 'delete_worker_pods')
    self.worker_service_account_name = kube_option(
        'worker_service_account_name')
    self.image_pull_secrets = kube_option('image_pull_secrets')

    # NOTE: `git_repo` and `git_branch` must be specified together.
    # The http URL of the git repository to clone from.
    self.git_repo = kube_option('git_repo')
    # The branch of the repository to be checked out.
    self.git_branch = kube_option('git_branch')
    # Optional directory inside the repository that contains the dags.
    self.git_subpath = kube_option('git_subpath')

    # Optional `git_user` / `git_password` pair for private repositories.
    self.git_user = kube_option('git_user')
    self.git_password = kube_option('git_password')

    # Optional volume claim that mounts a PV holding the DAGs directly.
    self.dags_volume_claim = kube_option('dags_volume_claim')
    # Optional volume claim used to write logs.
    self.logs_volume_claim = kube_option('logs_volume_claim')
    # Optional SubPath locating the DAGs on the claim.
    self.dags_volume_subpath = kube_option('dags_volume_subpath')
    # Optional SubPath locating the logs on the claim.
    self.logs_volume_subpath = kube_option('logs_volume_subpath')

    # Log folder, read from the core section.
    self.base_log_folder = configuration.get(core, 'base_log_folder')

    # Namespace in which the Scheduler and Webserver reside. With RBAC
    # enabled, the scheduler may need service account permissions to
    # create, watch, get, and delete pods in this namespace.
    self.kube_namespace = kube_option('namespace')
    # Namespace in which the executor creates pods; read from the same
    # `namespace` option. With RBAC enabled, workers may need service
    # account permissions to interact with cluster components.
    self.executor_namespace = kube_option('namespace')

    # Task secrets managed by KubernetesExecutor.
    self.gcp_service_account_keys = kube_option('gcp_service_account_keys')

    # git-sync init container: repository, tag and container name.
    self.git_sync_container_repository = kube_option(
        'git_sync_container_repository')
    self.git_sync_container_tag = kube_option('git_sync_container_tag')
    self.git_sync_container = '{}:{}'.format(
        self.git_sync_container_repository, self.git_sync_container_tag)
    self.git_sync_init_container_name = kube_option(
        'git_sync_init_container_name')

    # Optional configmap carrying a valid Airflow config for worker pods.
    self.airflow_configmap = kube_option('airflow_configmap')

    self._validate()
from airflow.www.forms import (DateTimeForm, DateTimeWithNumRunsForm, DateTimeWithNumRunsWithDagRunsForm) from airflow.www.validators import GreaterEqualThan QUERY_LIMIT = 100000 CHART_LIMIT = 200000 UTF8_READER = codecs.getreader('utf-8') dagbag = models.DagBag(settings.DAGS_FOLDER) # logout_user = airflow.login.logout_user FILTER_BY_OWNER = False PAGE_SIZE = conf.getint('webserver', 'page_size') if conf.getboolean('webserver', 'FILTER_BY_OWNER'): # filter_by_owner if authentication is enabled and filter_by_owner is true FILTER_BY_OWNER = not current_app.config['LOGIN_DISABLED'] def dag_link(v, c, m, p): if m.dag_id is None: return Markup() kwargs = {'dag_id': m.dag_id} # This is called with various objects, TIs, (ORM) DAG - some have this, # some don't if hasattr(m, 'execution_date'):
def prioritize_queued(self, session, executor, dagbag):
    """
    Hand queued task instances to the executor, pool by pool, in priority
    order and within the number of open slots.

    Queued task instances whose DAG or task no longer exists in ``dagbag``
    are deleted, as are those whose dependencies are not met. Paused DAGs
    and DAGs that reached their concurrency limit are skipped.

    :param session: database session used for queries, deletes and commits.
    :param executor: executor that receives the runnable task instances.
    :param dagbag: bag used to resolve DAGs and tasks.
    """
    # Prioritizing queued task instances

    pools = {p.pool: p for p in session.query(models.Pool).all()}
    TI = models.TaskInstance
    queued_tis = (
        session.query(TI)
        .filter(TI.state == State.QUEUED)
        .all()
    )
    self.logger.info(
        "Prioritizing {} queued jobs".format(len(queued_tis)))
    session.expunge_all()

    # Group the still-valid task instances by pool.
    d = defaultdict(list)
    for ti in queued_tis:
        if ti.dag_id not in dagbag.dags:
            self.logger.info(
                "DAG no longer in dagbag, deleting {}".format(ti))
            session.delete(ti)
            session.commit()
        elif not dagbag.dags[ti.dag_id].has_task(ti.task_id):
            self.logger.info(
                "Task no longer exists, deleting {}".format(ti))
            session.delete(ti)
            session.commit()
        else:
            d[ti.pool].append(ti)

    dag_blacklist = set(dagbag.paused_dags())
    for pool, tis in list(d.items()):
        if not pool:
            # Arbitrary:
            # If queued outside of a pool, trigger no more than
            # non_pooled_task_slot_count per run
            open_slots = conf.getint('core', 'non_pooled_task_slot_count')
        else:
            open_slots = pools[pool].open_slots(session=session)

        queue_size = len(tis)
        self.logger.info(
            "Pool {pool} has {open_slots} slots, {queue_size} "
            "task instances in queue".format(
                pool=pool, open_slots=open_slots, queue_size=queue_size))
        if open_slots <= 0:
            continue

        # Highest priority first, then oldest start date.
        tis = sorted(
            tis, key=lambda ti: (-ti.priority_weight, ti.start_date))
        for ti in tis:
            if open_slots <= 0:
                # Pool exhausted: nothing further can be queued from it.
                break
            task = None
            try:
                task = dagbag.dags[ti.dag_id].get_task(ti.task_id)
            except Exception:
                # Narrower than a bare ``except`` so KeyboardInterrupt and
                # SystemExit still propagate.
                self.logger.error("Queued task {} seems gone".format(ti))
                session.delete(ti)
                session.commit()
                continue

            if not task:
                continue

            ti.task = task

            # picklin'
            dag = dagbag.dags[ti.dag_id]
            pickle_id = None
            if self.do_pickle and self.executor.__class__ not in (
                    executors.LocalExecutor,
                    executors.SequentialExecutor):
                self.logger.info("Pickling DAG {}".format(dag))
                pickle_id = dag.pickle(session).id

            if dag.dag_id in dag_blacklist:
                continue
            if dag.concurrency_reached:
                dag_blacklist.add(dag.dag_id)
                continue
            if ti.are_dependencies_met():
                executor.queue_task_instance(ti, pickle_id=pickle_id)
                open_slots -= 1
            else:
                session.delete(ti)
                session.commit()
                continue
            ti.task = task

            session.commit()
def restart_workers(gunicorn_master_proc, num_workers_expected):
    """
    Runs forever, monitoring the child processes of @gunicorn_master_proc and
    restarting workers occasionally.

    Each iteration of the loop traverses one edge of this state transition
    diagram, where each state (node) represents
    [ num_ready_workers_running / num_workers_running ]. We expect most time to
    be spent in [n / n]. `bs` is the setting webserver.worker_refresh_batch_size.

    The horizontal transition at ? happens after the new worker parses all the
    dags (so it could take a while!)

       V ────────────────────────────────────────────────────────────────────────┐
    [n / n] ──TTIN──> [ [n, n+bs) / n + bs ]  ────?───> [n + bs / n + bs] ──TTOU─┘
       ^                          ^───────────────┘
       │
       │      ┌────────────────v
       └──────┴────── [ [0, n) / n ] <─── start

    We change the number of workers by sending TTIN and TTOU to the gunicorn
    master process, which increases and decreases the number of child workers
    respectively. Gunicorn guarantees that on TTOU workers are terminated
    gracefully and that the oldest worker is terminated.

    :param gunicorn_master_proc: the gunicorn master process; it must offer
        ``pid`` and ``send_signal``.
    :param num_workers_expected: steady-state number of workers; an int or
        a string holding one.
    """
    # The count is compared with, and added to, integers below; a value
    # taken straight from configuration would be a string.
    num_workers_expected = int(num_workers_expected)

    def wait_until_true(fn):
        """
        Sleeps until fn is true
        """
        while not fn():
            time.sleep(0.1)

    def get_num_workers_running(gunicorn_master_proc):
        """Number of child processes of the gunicorn master."""
        workers = psutil.Process(gunicorn_master_proc.pid).children()
        return len(workers)

    def get_num_ready_workers_running(gunicorn_master_proc):
        """Number of children whose command line carries the ready prefix."""
        workers = psutil.Process(gunicorn_master_proc.pid).children()
        ready_workers = [
            proc for proc in workers
            if settings.GUNICORN_WORKER_READY_PREFIX in proc.cmdline()[0]
        ]
        return len(ready_workers)

    def start_refresh(gunicorn_master_proc):
        """Add one batch of workers, one TTIN at a time."""
        batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
        # ``state`` is the label computed by the monitoring loop below.
        logging.debug('%s doing a refresh of %s workers', state, batch_size)
        sys.stdout.flush()
        sys.stderr.flush()

        excess = 0
        for _ in range(batch_size):
            gunicorn_master_proc.send_signal(signal.SIGTTIN)
            excess += 1
            wait_until_true(lambda: num_workers_expected + excess ==
                            get_num_workers_running(gunicorn_master_proc))

    # Wait for gunicorn to bring up the initial set of workers.
    wait_until_true(lambda: num_workers_expected ==
                    get_num_workers_running(gunicorn_master_proc))

    while True:
        num_workers_running = get_num_workers_running(gunicorn_master_proc)
        num_ready_workers_running = get_num_ready_workers_running(
            gunicorn_master_proc)

        state = '[{0} / {1}]'.format(num_ready_workers_running,
                                     num_workers_running)

        # Whenever some workers are not ready, wait until all workers are ready
        if num_ready_workers_running < num_workers_running:
            logging.debug('%s some workers are starting up, waiting...', state)
            sys.stdout.flush()
            time.sleep(1)

        # Kill a worker gracefully by asking gunicorn to reduce number of workers
        elif num_workers_running > num_workers_expected:
            excess = num_workers_running - num_workers_expected
            logging.debug('%s killing %s workers', state, excess)

            for _ in range(excess):
                gunicorn_master_proc.send_signal(signal.SIGTTOU)
                excess -= 1
                wait_until_true(lambda: num_workers_expected + excess ==
                                get_num_workers_running(gunicorn_master_proc))

        # Start a new worker by asking gunicorn to increase number of workers
        elif num_workers_running == num_workers_expected:
            refresh_interval = conf.getint('webserver', 'worker_refresh_interval')
            logging.debug('%s sleeping for %ss starting doing a refresh...',
                          state, refresh_interval)
            time.sleep(refresh_interval)
            start_refresh(gunicorn_master_proc)

        else:
            # num_ready_workers_running == num_workers_running < num_workers_expected
            logging.error(("%s some workers seem to have died and gunicorn "
                           "did not restart them as expected"), state)
            time.sleep(10)
            if len(psutil.Process(gunicorn_master_proc.pid).children()
                   ) < num_workers_expected:
                start_refresh(gunicorn_master_proc)
def prioritize_queued(self, session, executor, dagbag):
    """
    Hand the task instances collected in ``self.queued_tis`` to the
    executor, pool by pool, in priority order and within the open slots.

    Task instances whose DAG or task no longer exists in ``dagbag`` are
    deleted, as are those whose dependencies are not met. Paused DAGs and
    DAGs that reached their concurrency limit are skipped.
    ``self.queued_tis`` is emptied once its content has been grouped.

    :param session: database session used for queries, deletes and commits.
    :param executor: executor that receives the runnable task instances.
    :param dagbag: bag used to resolve DAGs and tasks.
    """
    # Prioritizing queued task instances

    pools = {p.pool: p for p in session.query(models.Pool).all()}

    self.logger.info("Prioritizing {} queued jobs".format(
        len(self.queued_tis)))
    session.expunge_all()

    # Group the still-valid task instances by pool.
    d = defaultdict(list)
    for ti in self.queued_tis:
        if ti.dag_id not in dagbag.dags:
            self.logger.info(
                "DAG no longer in dagbag, deleting {}".format(ti))
            session.delete(ti)
            session.commit()
        elif not dagbag.dags[ti.dag_id].has_task(ti.task_id):
            self.logger.info(
                "Task no longer exists, deleting {}".format(ti))
            session.delete(ti)
            session.commit()
        else:
            d[ti.pool].append(ti)

    self.queued_tis.clear()

    dag_blacklist = set(dagbag.paused_dags())
    for pool, tis in list(d.items()):
        if not pool:
            # Arbitrary:
            # If queued outside of a pool, trigger no more than
            # non_pooled_task_slot_count per run
            open_slots = conf.getint('core', 'non_pooled_task_slot_count')
        else:
            open_slots = pools[pool].open_slots(session=session)

        queue_size = len(tis)
        self.logger.info(
            "Pool {pool} has {open_slots} slots, {queue_size} "
            "task instances in queue".format(
                pool=pool, open_slots=open_slots, queue_size=queue_size))
        if open_slots <= 0:
            continue

        # Highest priority first, then oldest start date.
        tis = sorted(tis,
                     key=lambda ti: (-ti.priority_weight, ti.start_date))
        for ti in tis:
            if open_slots <= 0:
                # Pool exhausted: nothing further can be queued from it.
                break
            task = None
            try:
                task = dagbag.dags[ti.dag_id].get_task(ti.task_id)
            except Exception:
                # Narrower than a bare ``except`` so KeyboardInterrupt and
                # SystemExit still propagate.
                self.logger.error("Queued task {} seems gone".format(ti))
                session.delete(ti)
                session.commit()
                continue

            if not task:
                continue

            ti.task = task

            # picklin'
            dag = dagbag.dags[ti.dag_id]
            pickle_id = None
            if self.do_pickle and self.executor.__class__ not in (
                    executors.LocalExecutor,
                    executors.SequentialExecutor):
                self.logger.info("Pickling DAG {}".format(dag))
                pickle_id = dag.pickle(session).id

            if dag.dag_id in dag_blacklist:
                continue
            if dag.concurrency_reached:
                dag_blacklist.add(dag.dag_id)
                continue
            if ti.are_dependencies_met():
                executor.queue_task_instance(ti, pickle_id=pickle_id)
                open_slots -= 1
            else:
                # Deleted here; the commit happens with a later commit on
                # this session.
                session.delete(ti)
                continue
            ti.task = task

            session.commit()
def webserver(args):
    """
    Start the Airflow web server.

    With ``args.debug`` set, the application is served directly through
    ``app.run``. Otherwise a gunicorn master process is spawned and this
    function keeps running until a signal arrives; SIGINT and SIGTERM
    terminate the gunicorn master, wait for it and exit with status 0.

    For the log files, worker count, worker timeout and the SSL
    certificate/key, a value given on the command line takes precedence
    over the ``[webserver]`` configuration section.

    :param args: parsed command line arguments
    :raises AirflowException: if only one of SSL certificate and SSL key
        is provided
    """
    print(settings.HEADER)

    app = cached_app(conf)
    # access_logfile / error_logfile are read by the banner template below
    # through ``locals()``.
    access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile')
    num_workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'webserver_worker_timeout'))
    ssl_cert = args.ssl_cert or conf.get('webserver', 'web_server_ssl_cert')
    ssl_key = args.ssl_key or conf.get('webserver', 'web_server_ssl_key')

    # Test truthiness, as the gunicorn branch below does, so that an empty
    # value counts as "not provided" just like None.
    if not ssl_cert and ssl_key:
        raise AirflowException(
            'An SSL certificate must also be provided for use with ' + ssl_key)
    if ssl_cert and not ssl_key:
        raise AirflowException(
            'An SSL key must also be provided for use with ' + ssl_cert)

    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(
            args.port, args.hostname))
        # Only hand over an SSL context when both files are configured.
        app.run(debug=True, port=args.port, host=args.hostname,
                ssl_context=(ssl_cert, ssl_key) if ssl_cert and ssl_key else None)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {num_workers} {args.workerclass}
                Host: {args.hostname}:{args.port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(**locals())))

        run_args = [
            'gunicorn',
            '-w', str(num_workers),
            '-k', str(args.workerclass),
            '-t', str(worker_timeout),
            '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver',
            '-p', str(pid),
            '-c', 'airflow.www.gunicorn_config'
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ["-D"]

        if ssl_cert:
            run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key]

        run_args += ["airflow.www.app:cached_app()"]

        gunicorn_master_proc = subprocess.Popen(run_args)

        def kill_proc(dummy_signum, dummy_frame):
            # Stop the gunicorn master before this process exits.
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
            sys.exit(0)

        signal.signal(signal.SIGINT, kill_proc)
        signal.signal(signal.SIGTERM, kill_proc)

        # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
        if conf.getint('webserver', 'worker_refresh_interval') > 0:
            restart_workers(gunicorn_master_proc, num_workers)
        else:
            while True:
                time.sleep(1)
def _process_backfill_task_instances(self,
                                     ti_status,
                                     executor,
                                     pickle_id,
                                     start_date=None, session=None):
    """
    Process a set of task instances from a set of dag runs. Special handling is done
    to account for different task instance states that could be present when running
    them in a backfill process.

    The loop runs while there are task instances to run or running and no
    deadlock has been recorded. Each pass walks the tasks in topological
    order, stops scheduling for that pass as soon as a pool, DAG or task
    concurrency limit is hit, heartbeats the job and the executor, and then
    updates the counters and dag run states.

    :param ti_status: the internal status of the job
    :type ti_status: BackfillJob._DagRunTaskStatus
    :param executor: the executor to run the task instances
    :type executor: BaseExecutor
    :param pickle_id: the pickle_id if dag is pickled, None otherwise
    :type pickle_id: int
    :param start_date: the start date of the backfill job
    :type start_date: datetime.datetime
    :param session: the current session object
    :type session: sqlalchemy.orm.session.Session
    :return: the list of execution_dates for the finished dag runs
    :rtype: list
    :raises PoolNotFound: if a task references a pool that does not exist
    """

    executed_run_dates = []

    while ((len(ti_status.to_run) > 0 or len(ti_status.running) > 0) and
            len(ti_status.deadlocked) == 0):
        self.log.debug("*** Clearing out not_ready list ***")
        ti_status.not_ready.clear()

        # we need to execute the tasks bottom to top
        # or leaf to root, as otherwise tasks might be
        # determined deadlocked while they are actually
        # waiting for their upstream to finish
        @provide_session
        def _per_task_process(task, key, ti, session=None):
            ti.refresh_from_db()

            task = self.dag.get_task(ti.task_id)
            ti.task = task

            ignore_depends_on_past = (
                self.ignore_first_depends_on_past and
                ti.execution_date == (start_date or ti.start_date))
            self.log.debug(
                "Task instance to run %s state %s", ti, ti.state)

            # The task was already marked successful or skipped by a
            # different Job. Don't rerun it.
            if ti.state == State.SUCCESS:
                ti_status.succeeded.add(key)
                self.log.debug("Task instance %s succeeded. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return
            elif ti.state == State.SKIPPED:
                ti_status.skipped.add(key)
                self.log.debug("Task instance %s skipped. Don't rerun.", ti)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # guard against externally modified tasks instances or
            # in case max concurrency has been reached at task runtime
            elif ti.state == State.NONE:
                # The task instance is passed as a logging argument so the
                # placeholder is actually filled in.
                self.log.warning(
                    "FIXME: task instance %s state was set to None "
                    "externally. This should not happen",
                    ti
                )
                ti.set_state(State.SCHEDULED, session=session)
            if self.rerun_failed_tasks:
                # Rerun failed tasks or upstreamed failed tasks
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance {ti} "
                                   "with state {state}".format(ti=ti,
                                                               state=ti.state))
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    # Reset the failed task in backfill to scheduled state
                    ti.set_state(State.SCHEDULED, session=session)
            else:
                # Default behaviour which works for subdag.
                if ti.state in (State.FAILED, State.UPSTREAM_FAILED):
                    self.log.error("Task instance {ti} "
                                   "with {state} state".format(ti=ti,
                                                               state=ti.state))
                    ti_status.failed.add(key)
                    ti_status.to_run.pop(key)
                    if key in ti_status.running:
                        ti_status.running.pop(key)
                    return

            backfill_context = DepContext(
                deps=RUN_DEPS,
                ignore_depends_on_past=ignore_depends_on_past,
                ignore_task_deps=self.ignore_task_deps,
                flag_upstream_failed=True)

            # Is the task runnable? -- then run it
            # the dependency checker can change states of tis
            if ti.are_dependencies_met(
                    dep_context=backfill_context,
                    session=session,
                    verbose=self.verbose):
                ti.refresh_from_db(lock_for_update=True, session=session)
                if ti.state in (State.SCHEDULED, State.UP_FOR_RETRY, State.UP_FOR_RESCHEDULE):
                    if executor.has_task(ti):
                        self.log.debug(
                            "Task Instance %s already in executor "
                            "waiting for queue to clear",
                            ti
                        )
                    else:
                        self.log.debug('Sending %s to executor', ti)
                        # Skip scheduled state, we are executing immediately
                        ti.state = State.QUEUED
                        ti.queued_dttm = timezone.utcnow() if not ti.queued_dttm else ti.queued_dttm
                        session.merge(ti)

                        cfg_path = None
                        if executor.__class__ in (executors.LocalExecutor,
                                                  executors.SequentialExecutor):
                            cfg_path = tmp_configuration_copy()

                        executor.queue_task_instance(
                            ti,
                            mark_success=self.mark_success,
                            pickle_id=pickle_id,
                            ignore_task_deps=self.ignore_task_deps,
                            ignore_depends_on_past=ignore_depends_on_past,
                            pool=self.pool,
                            cfg_path=cfg_path)
                        ti_status.running[key] = ti
                        ti_status.to_run.pop(key)
                session.commit()
                return

            if ti.state == State.UPSTREAM_FAILED:
                self.log.error("Task instance %s upstream failed", ti)
                ti_status.failed.add(key)
                ti_status.to_run.pop(key)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                return

            # special case
            if ti.state == State.UP_FOR_RETRY:
                self.log.debug(
                    "Task instance %s retry period not "
                    "expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # special case
            if ti.state == State.UP_FOR_RESCHEDULE:
                self.log.debug(
                    "Task instance %s reschedule period not "
                    "expired yet", ti)
                if key in ti_status.running:
                    ti_status.running.pop(key)
                ti_status.to_run[key] = ti
                return

            # all remaining tasks
            self.log.debug('Adding %s to not_ready', ti)
            ti_status.not_ready.add(key)

        non_pool_slots = conf.getint('core', 'non_pooled_backfill_task_slot_count')

        try:
            for task in self.dag.topological_sort():
                for key, ti in list(ti_status.to_run.items()):
                    if task.task_id != ti.task_id:
                        continue
                    if task.pool:
                        pool = session.query(models.Pool) \
                            .filter(models.Pool.pool == task.pool) \
                            .first()
                        if not pool:
                            raise PoolNotFound('Unknown pool: {}'.format(task.pool))

                        open_slots = pool.open_slots(session=session)
                        if open_slots <= 0:
                            # str.format placeholders, so the slot count
                            # and pool name really end up in the message.
                            raise NoAvailablePoolSlot(
                                "Not scheduling since there are "
                                "{0} open slots in pool {1}".format(
                                    open_slots, task.pool))
                    else:
                        if non_pool_slots <= 0:
                            raise NoAvailablePoolSlot(
                                "Not scheduling since there are no "
                                "non_pooled_backfill_task_slot_count.")
                        non_pool_slots -= 1

                    num_running_task_instances_in_dag = DAG.get_num_task_instances(
                        self.dag_id,
                        states=self.STATES_COUNT_AS_RUNNING,
                    )

                    if num_running_task_instances_in_dag >= self.dag.concurrency:
                        raise DagConcurrencyLimitReached(
                            "Not scheduling since DAG concurrency limit "
                            "is reached."
                        )

                    if task.task_concurrency:
                        num_running_task_instances_in_task = DAG.get_num_task_instances(
                            dag_id=self.dag_id,
                            task_ids=[task.task_id],
                            states=self.STATES_COUNT_AS_RUNNING,
                        )

                        if num_running_task_instances_in_task >= task.task_concurrency:
                            raise TaskConcurrencyLimitReached(
                                "Not scheduling since Task concurrency limit "
                                "is reached."
                            )

                    _per_task_process(task, key, ti)
        except (NoAvailablePoolSlot, DagConcurrencyLimitReached, TaskConcurrencyLimitReached) as e:
            # A limit was hit: stop scheduling for this pass and retry on
            # the next iteration of the outer loop.
            self.log.debug(e)

        # execute the tasks in the queue
        self.heartbeat()
        executor.heartbeat()

        # If the set of tasks that aren't ready ever equals the set of
        # tasks to run and there are no running tasks then the backfill
        # is deadlocked
        if (ti_status.not_ready and
                ti_status.not_ready == set(ti_status.to_run) and
                len(ti_status.running) == 0):
            self.log.warning(
                "Deadlock discovered for ti_status.to_run=%s",
                ti_status.to_run.values()
            )
            ti_status.deadlocked.update(ti_status.to_run.values())
            ti_status.to_run.clear()

        # check executor state
        self._manage_executor_state(ti_status.running)

        # update the task counters
        self._update_counters(ti_status=ti_status)

        # update dag run state
        _dag_runs = ti_status.active_runs[:]
        for run in _dag_runs:
            run.update_state(session=session)
            if run.state in State.finished():
                ti_status.finished_runs += 1
                ti_status.active_runs.remove(run)
                executed_run_dates.append(run.execution_date)

        self._log_progress(ti_status)

    # return updated status
    return executed_run_dates
# NOTE(review): the two classmethods below presumably belong to the
# DummyStatsLogger class whose header precedes this fragment -- confirm the
# intended indentation against the full file.
@classmethod
def gauge(cls, stat, value, rate=1, delta=False):
    """Accept a gauge measurement and discard it (no-op)."""
    pass


@classmethod
def timing(cls, stat, dt):
    """Accept a timing measurement and discard it (no-op)."""
    pass


# Default to the no-op logger.
Stats = DummyStatsLogger

# Use a real statsd client when the ``statsd_on`` option of the
# ``[scheduler]`` section is enabled; otherwise keep the no-op logger.
if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=conf.get('scheduler', 'statsd_host'),
        port=conf.getint('scheduler', 'statsd_port'),
        prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger

# Banner text.
# NOTE(review): the line breaks of this literal appear to have been lost;
# restore the original layout before relying on its rendering.
HEADER = """\ ____________ _____________ ____ |__( )_________ __/__ /________ __ ____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / / ___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ / _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/ """

# URL path of the log view.
BASE_LOG_URL = '/admin/airflow/log'
def mesos_driver(self):
    """
    Lazily instantiates the Mesos scheduler driver if one was not injected in
    via the constructor

    The framework description is built from the ``[mesos]`` configuration
    section. The driver is created once, stored on ``self._mesos_driver``
    and returned on every later call.

    :return: the Mesos scheduler driver
    :raises AirflowException: if the Mesos master is not configured, or if
        authentication is enabled and the default principal or secret is
        missing
    """
    if self._mesos_driver is None:
        framework = Dict()
        framework.user = '******'

        if not configuration.get('mesos', 'MASTER'):
            logging.error("Expecting mesos master URL for mesos executor")
            raise AirflowException(
                "mesos.master not provided for mesos executor")

        master = configuration.get('mesos', 'MASTER')

        framework.name = get_framework_name()

        if configuration.getboolean('mesos', 'CHECKPOINT'):
            framework.checkpoint = True

            if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
                # Import here to work around a circular import error
                from airflow.models import Connection

                # Query the database to get the ID of the Mesos Framework, if available.
                conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
                session = Session()
                connection = session.query(Connection).filter_by(
                    conn_id=conn_id).first()
                if connection is not None:
                    # Set the Framework ID to let the scheduler reconnect with running tasks.
                    framework.id.value = connection.extra

                framework.failover_timeout = configuration.getint(
                    'mesos', 'FAILOVER_TIMEOUT')
        else:
            framework.checkpoint = False

        logging.info(
            'MesosFramework master : %s, name : %s, checkpoint : %s',
            master, framework.name, str(framework.checkpoint))

        if configuration.getboolean('mesos', 'AUTHENTICATE'):
            if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
                logging.error(
                    "Expecting authentication principal in the environment"
                )
                raise AirflowException(
                    "mesos.default_principal not provided in authenticated mode"
                )
            if not configuration.get('mesos', 'DEFAULT_SECRET'):
                logging.error(
                    "Expecting authentication secret in the environment")
                raise AirflowException(
                    "mesos.default_secret not provided in authenticated mode"
                )

            principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
            secret = configuration.get('mesos', 'DEFAULT_SECRET')

            # Use the principal read above; no ``credential`` object is
            # defined in this method.
            framework.principal = principal

            self._mesos_driver = MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue),
                framework,
                master,
                use_addict=True,
                principal=principal,
                secret=secret)
        else:
            framework.principal = 'Airflow'
            self._mesos_driver = MesosSchedulerDriver(
                AirflowMesosScheduler(self.task_queue, self.result_queue),
                framework,
                master,
                use_addict=True)
    return self._mesos_driver
broker_transport_options = configuration.getsection( 'celery_broker_transport_options') if broker_transport_options is None: broker_transport_options = {'visibility_timeout': 21600} DEFAULT_CELERY_CONFIG = { 'accept_content': ['json', 'pickle'], 'event_serializer': 'json', 'worker_prefetch_multiplier': 1, 'task_acks_late': True, 'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'), 'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'), 'broker_url': configuration.get('celery', 'BROKER_URL'), 'broker_transport_options': broker_transport_options, 'result_backend': configuration.get('celery', 'RESULT_BACKEND'), 'worker_concurrency': configuration.getint('celery', 'WORKER_CONCURRENCY'), } celery_ssl_active = False try: celery_ssl_active = configuration.getboolean('celery', 'SSL_ACTIVE') except AirflowConfigException as e: log.warning("Celery Executor will run without SSL") try: if celery_ssl_active: broker_use_ssl = { 'keyfile': configuration.get('celery', 'SSL_KEY'), 'certfile': configuration.get('celery', 'SSL_CERT'), 'ca_certs': configuration.get('celery', 'SSL_CACERT'), 'cert_reqs': ssl.CERT_REQUIRED
def __init__(self,
             dag_directory,
             file_paths,
             max_runs,
             processor_factory,
             processor_timeout,
             signal_conn,
             async_mode=True):
    """
    Store the constructor arguments, read the scheduler settings and set up
    the per-file bookkeeping; finally register the SIGINT/SIGTERM handlers.

    :param dag_directory: Directory where DAG definitions are kept. All
        files in file_paths should be under this directory
    :type dag_directory: unicode
    :param file_paths: list of file paths that contain DAG definitions
    :type file_paths: list[unicode]
    :param max_runs: The number of times to parse and schedule each file. -1
        for unlimited.
    :type max_runs: int
    :param processor_factory: function that creates processors for DAG
        definition files. Arguments are (dag_definition_path)
    :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
    :param processor_timeout: How long to wait before timing out a DAG file processor
    :type processor_timeout: timedelta
    :param signal_conn: connection to communicate signal with processor agent.
    :type signal_conn: airflow.models.connection.Connection
    :param async_mode: whether to start the manager in async mode
    :type async_mode: bool
    """
    # --- constructor arguments -------------------------------------------
    self._file_paths = file_paths
    self._file_path_queue = list()
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._async_mode = async_mode

    # --- parallelism (capped at one when the database is sqlite) ---------
    self._parallelism = conf.getint('scheduler', 'max_threads')
    db_is_sqlite = 'sqlite' in conf.get('core', 'sql_alchemy_conn')
    if db_is_sqlite and self._parallelism > 1:
        self.log.error("Cannot use more than 1 thread when using sqlite. "
                       "Setting parallelism to 1")
        self._parallelism = 1

    # --- intervals read from the [scheduler] section ---------------------
    # Minimum interval between two parse/schedule passes over one file.
    self._file_process_interval = conf.getint(
        'scheduler', 'min_file_process_interval')
    # Interval between two printouts of the DAG file processing stats.
    self.print_stats_interval = conf.getint(
        'scheduler', 'print_stats_interval')

    # --- per-file bookkeeping, all keyed by file path --------------------
    self._processors = dict()         # file path -> processor
    self._last_runtime = dict()       # file path -> last runtime
    self._last_finish_time = dict()   # file path -> last finish time

    # --- timestamps ------------------------------------------------------
    self._last_zombie_query_time = timezone.utcnow()
    # When the DAG directory was last traversed to look for files.
    self.last_dag_dir_refresh_time = timezone.utcnow()
    # When the stats were last printed.
    self.last_stat_print_time = timezone.datetime(2000, 1, 1)
    # TODO: Remove magic number
    self._zombie_query_interval = 10

    # file path -> number of runs
    self._run_count = defaultdict(int)
    # Key used for the manager heartbeat.
    self._heart_beat_key = 'heart-beat'
    # Time allowed for a process parsing a DAG file before it times out.
    self._processor_timeout = processor_timeout
    # Interval between two scans of the DAGs directory for new files.
    self.dag_dir_list_interval = conf.getint(
        'scheduler', 'dag_dir_list_interval')

    self._log = logging.getLogger('airflow.processor_manager')

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self._exit_gracefully)
from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State

# Shorthands for the declarative base and identifier length from models.
Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=configuration.get('scheduler', 'statsd_host'),
        port=configuration.getint('scheduler', 'statsd_port'),
        prefix=configuration.get('scheduler', 'statsd_prefix'))


class BaseJob(Base):
    """
    Abstract class to be derived for jobs. Jobs are processing items with state
    and duration that aren't task instances. For instance a BackfillJob is
    a collection of task instance runs, but should have its own state, start
    and end time.
    """

    # Rows are stored in the "job" table.
    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
    dag_id = Column(String(ID_LEN),)
broker_transport_options = {'visibility_timeout': 21600} DEFAULT_CELERY_CONFIG = { 'accept_content': ['json', 'pickle'], 'event_serializer': 'json', 'result_serializer': 'pickle', 'worker_prefetch_multiplier': 1, 'task_acks_late': True, 'task_default_queue': configuration.get('celery', 'DEFAULT_QUEUE'), 'task_default_exchange': configuration.get('celery', 'DEFAULT_QUEUE'), 'broker_url': configuration.get('celery', 'BROKER_URL'), 'broker_transport_options': { 'visibility_timeout': broker_transport_options }, 'result_backend': configuration.get('celery', 'CELERY_RESULT_BACKEND'), 'worker_concurrency': configuration.getint('celery', 'CELERYD_CONCURRENCY'), } celery_ssl_active = False try: celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE') except AirflowConfigException as e: log.warning("Celery Executor will run without SSL") try: if celery_ssl_active: broker_use_ssl = { 'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'), 'certfile': configuration.get('celery', 'CELERY_SSL_CERT'), 'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'), 'cert_reqs': ssl.CERT_REQUIRED
# NOTE(review): the two classmethods below presumably belong to the
# DummyStatsLogger class whose header precedes this fragment -- confirm the
# intended indentation against the full file.
@classmethod
def gauge(cls, stat, value, rate=1, delta=False):
    """Accept a gauge measurement and discard it (no-op)."""
    pass


@classmethod
def timing(cls, stat, dt):
    """Accept a timing measurement and discard it (no-op)."""
    pass


# Default to the no-op logger.
Stats = DummyStatsLogger

# Use a real statsd client when the ``statsd_on`` option of the
# ``[scheduler]`` section is enabled; otherwise keep the no-op logger.
if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(host=conf.get('scheduler', 'statsd_host'),
                         port=conf.getint('scheduler', 'statsd_port'),
                         prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger

# Banner text.
# NOTE(review): the line breaks of this literal appear to have been lost;
# restore the original layout before relying on its rendering.
HEADER = """\ ____________ _____________ ____ |__( )_________ __/__ /________ __ ____ /| |_ /__ ___/_ /_ __ /_ __ \_ | /| / / ___ ___ | / _ / _ __/ _ / / /_/ /_ |/ |/ / _/_/ |_/_/ /_/ /_/ /_/ \____/____/|__/ """

# URL path of the log view.
BASE_LOG_URL = '/admin/airflow/log'

# Logging level constant, set to INFO.
LOGGING_LEVEL = logging.INFO
def __init__(self,
             dag_directory,
             file_paths,
             max_runs,
             processor_factory,
             signal_conn,
             stat_queue,
             result_queue,
             async_mode=True):
    """
    Store the constructor arguments, read the scheduler settings and set up
    the per-file bookkeeping; finally register the SIGINT/SIGTERM handlers.

    :param dag_directory: Directory where DAG definitions are kept. All
        files in file_paths should be under this directory
    :type dag_directory: unicode
    :param file_paths: list of file paths that contain DAG definitions
    :type file_paths: list[unicode]
    :param max_runs: The number of times to parse and schedule each file. -1
        for unlimited.
    :type max_runs: int
    :param processor_factory: function that creates processors for DAG
        definition files. Arguments are (dag_definition_path)
    :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor)
    :param signal_conn: connection to communicate signal with processor agent.
    :type signal_conn: airflow.models.connection.Connection
    :param stat_queue: the queue to use for passing back parsing stat to agent.
    :type stat_queue: multiprocessing.Queue
    :param result_queue: the queue to use for passing back the result to agent.
    :type result_queue: multiprocessing.Queue
    :param async_mode: whether to start the manager in async mode
    :type async_mode: bool
    """
    # --- constructor arguments -------------------------------------------
    self._file_paths = file_paths
    self._file_path_queue = list()
    self._dag_directory = dag_directory
    self._max_runs = max_runs
    self._processor_factory = processor_factory
    self._signal_conn = signal_conn
    self._stat_queue = stat_queue
    self._result_queue = result_queue
    self._async_mode = async_mode

    # --- parallelism (capped at one when the database is sqlite) ---------
    self._parallelism = conf.getint('scheduler', 'max_threads')
    db_is_sqlite = 'sqlite' in conf.get('core', 'sql_alchemy_conn')
    if db_is_sqlite and self._parallelism > 1:
        self.log.error("Cannot use more than 1 thread when using sqlite. "
                       "Setting parallelism to 1")
        self._parallelism = 1

    # --- settings read from the [scheduler] section ----------------------
    # Minimum interval between two parse/schedule passes over one file.
    self._file_process_interval = conf.getint(
        'scheduler', 'min_file_process_interval')
    # Interval between two printouts of the DAG file processing stats.
    self.print_stats_interval = conf.getint(
        'scheduler', 'print_stats_interval')
    # Seconds to wait for a task heartbeat before marking it a zombie.
    self._zombie_threshold_secs = conf.getint(
        'scheduler', 'scheduler_zombie_task_threshold')

    # --- per-file bookkeeping, all keyed by file path --------------------
    self._processors = dict()         # file path -> processor
    self._last_runtime = dict()       # file path -> last runtime
    self._last_finish_time = dict()   # file path -> last finish time

    # --- timestamps ------------------------------------------------------
    self._last_zombie_query_time = timezone.utcnow()
    # When the DAG directory was last traversed to look for files.
    self.last_dag_dir_refresh_time = timezone.utcnow()
    # When the stats were last printed.
    self.last_stat_print_time = timezone.datetime(2000, 1, 1)
    # TODO: Remove magic number
    self._zombie_query_interval = 10

    # file path -> number of runs
    self._run_count = defaultdict(int)
    # Key used for the manager heartbeat.
    self._heart_beat_key = 'heart-beat'
    # Interval between two scans of the DAGs directory for new files.
    self.dag_dir_list_interval = conf.getint(
        'scheduler', 'dag_dir_list_interval')

    self._log = logging.getLogger('airflow.processor_manager')

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, self._exit_gracefully)
# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from builtins import range from airflow import configuration from airflow.utils.log.logging_mixin import LoggingMixin from airflow.utils.state import State PARALLELISM = configuration.getint('core', 'PARALLELISM') class BaseExecutor(LoggingMixin): def __init__(self, parallelism=PARALLELISM): """ Class to derive in order to interface with executor-type systems like Celery, Mesos, Yarn and the likes. :param parallelism: how many jobs should run at one time. Set to ``0`` for infinity :type parallelism: int """ self.parallelism = parallelism self.queued_tasks = {} self.running = {}
def start(self):
    """
    Create the task and result queues, describe the framework from the
    ``[mesos]`` configuration section, and create and start the Mesos
    scheduler driver, which is stored on ``self.mesos_driver``.

    :raises AirflowException: if the Mesos master is not configured, or if
        authentication is enabled and the default principal or secret is
        missing
    """
    self.task_queue = Queue()
    self.result_queue = Queue()

    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''

    # The master URL is mandatory.
    if not configuration.get('mesos', 'MASTER'):
        logging.error("Expecting mesos master URL for mesos executor")
        raise AirflowException("mesos.master not provided for mesos executor")
    master = configuration.get('mesos', 'MASTER')

    framework.name = get_framework_name()

    # Per-task resources; fall back to 1 cpu / 256 memory when unset.
    task_cpu = (configuration.getint('mesos', 'TASK_CPU')
                if configuration.get('mesos', 'TASK_CPU') else 1)
    task_memory = (configuration.getint('mesos', 'TASK_MEMORY')
                   if configuration.get('mesos', 'TASK_MEMORY') else 256)

    if not configuration.getboolean('mesos', 'CHECKPOINT'):
        framework.checkpoint = False
    else:
        framework.checkpoint = True

        if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
            # Imported locally because of a circular import otherwise.
            from airflow.models import Connection

            # Look up a previously stored framework ID in the database.
            stored = Session().query(Connection).filter_by(
                conn_id=FRAMEWORK_CONNID_PREFIX + framework.name).first()
            if stored is not None:
                # Reuse the framework ID so that the scheduler can
                # reconnect with running tasks.
                framework.id.value = stored.extra

            framework.failover_timeout = configuration.getint(
                'mesos', 'FAILOVER_TIMEOUT')

    logging.info(
        'MesosFramework master : %s, name : %s, cpu : %s, mem : %s, checkpoint : %s',
        master, framework.name,
        str(task_cpu), str(task_memory), str(framework.checkpoint))

    implicit_acknowledgements = 1

    if configuration.getboolean('mesos', 'AUTHENTICATE'):
        if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
            logging.error("Expecting authentication principal in the environment")
            raise AirflowException(
                "mesos.default_principal not provided in authenticated mode")
        if not configuration.get('mesos', 'DEFAULT_SECRET'):
            logging.error("Expecting authentication secret in the environment")
            raise AirflowException(
                "mesos.default_secret not provided in authenticated mode")

        credential = mesos_pb2.Credential()
        credential.principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
        credential.secret = configuration.get('mesos', 'DEFAULT_SECRET')

        framework.principal = credential.principal
        # The credential is passed as an extra trailing argument.
        driver_args = (framework, master, implicit_acknowledgements, credential)
    else:
        framework.principal = 'Airflow'
        driver_args = (framework, master, implicit_acknowledgements)

    scheduler = AirflowMesosScheduler(
        self.task_queue, self.result_queue, task_cpu, task_memory)
    self.mesos_driver = mesos.native.MesosSchedulerDriver(scheduler, *driver_args)
    self.mesos_driver.start()