def load_login():
    auth_backend = "airflow.default_login"
    try:
        if configuration.getboolean("webserver", "AUTHENTICATE"):
            auth_backend = configuration.get("webserver", "auth_backend")
    except configuration.AirflowConfigException:
        if configuration.getboolean("webserver", "AUTHENTICATE"):
            logging.warning(
                "auth_backend not found in webserver config reverting to *deprecated*"
                " behavior of importing airflow_login"
            )
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        logging.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend,
            err,
        )
        if configuration.getboolean("webserver", "AUTHENTICATE"):
            raise AirflowException("Failed to import authentication backend")
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
        if hasattr(login, 'login_manager') and not hasattr(login, 'LOGIN_MANAGER'):
            login.LOGIN_MANAGER = login.login_manager
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug(
            "No user/password found for SMTP, so logging in with no authentication."
        )

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(
            SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
def configure_orm(disable_connection_pool=False):
    pool_enabled = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    try:
        pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
    except AirflowConfigException:
        pool_size = 5
    try:
        pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
    except AirflowConfigException:
        pool_recycle = 1800
    try:
        encoding = conf.get('core', 'SQL_ENGINE_ENCODING')
    except AirflowConfigException:
        encoding = 'utf-8'
    echo = conf.getboolean('core', 'SQL_ALCHEMY_ECHO')
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    autocommit = False
    sql_alchemy_conn = conf.get('core', 'SQL_ALCHEMY_CONN')

    global engine
    global Session
    (engine, Session) = alchemy_orm.configure_orm(
        sql_alchemy_conn,
        pool_enabled=pool_enabled,
        pool_size=pool_size,
        pool_recycle=pool_recycle,
        reconnect_timeout=reconnect_timeout,
        autocommit=autocommit,
        disable_connection_pool=disable_connection_pool,
        encoding=encoding,
        echo=echo)
    alchemy_orm.Session = Session
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except conf.AirflowConfigException:
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
def load_login():
    log = LoggingMixin().log

    auth_backend = 'airflow.default_login'
    try:
        # Get the default web authentication backend
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            auth_backend = conf.get('webserver', 'auth_backend')
    except (AirflowConfigException, XToolConfigException):
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            log.warning(
                "auth_backend not found in webserver config reverting to "
                "*deprecated* behavior of importing airflow_login")
            auth_backend = "airflow_login"

    # Import the authentication module
    try:
        global login
        login = import_module(auth_backend)
    except ImportError as err:
        log.critical(
            "Cannot import authentication module %s. "
            "Please correct your authentication backend or disable authentication: %s",
            auth_backend, err
        )
        if conf.getboolean('webserver', 'AUTHENTICATE'):
            raise AirflowException("Failed to import authentication backend")
def write(self, log, remote_log_location, append=False):
    """
    Writes the log to the remote_log_location. Fails silently if no hook
    was created.

    :param log: the log to write to the remote_log_location
    :type log: string
    :param remote_log_location: the log's location in remote storage
    :type remote_log_location: string (path)
    :param append: if False, any existing log file is overwritten. If True,
        the new log is appended to any existing logs.
    :type append: bool
    """
    if self.hook:
        if append:
            old_log = self.read(remote_log_location)
            log = old_log + '\n' + log
        try:
            self.hook.load_string(
                log,
                key=remote_log_location,
                replace=True,
                encrypt=configuration.getboolean('core', 'ENCRYPT_S3_LOGS'))
            return
        except Exception:
            pass

    # raise/return error if we get here
    logging.error('Could not write logs to {}'.format(remote_log_location))
def test_kill_zombies_when_job_state_is_not_running(self, mock_ti_handle_failure):
    """
    Test that kill zombies calls TI's failure handler with proper context
    """
    dagbag = models.DagBag(dag_folder=self.empty_dir, include_examples=True)
    with create_session() as session:
        session.query(TI).delete()
        session.query(LJ).delete()
        dag = dagbag.get_dag('example_branch_operator')
        task = dag.get_task(task_id='run_this_first')

        ti = TI(task, DEFAULT_DATE, State.RUNNING)
        lj = LJ(ti)
        lj.state = State.SHUTDOWN
        lj.id = 1
        ti.job_id = lj.id

        session.add(lj)
        session.add(ti)
        session.commit()

        dagbag.kill_zombies()
        mock_ti_handle_failure.assert_called_with(
            ANY,
            configuration.getboolean('core', 'unit_test_mode'),
            ANY)
def kill_zombies(self, zombies, session=None):
    """
    Fail given zombie tasks, which are tasks that haven't
    had a heartbeat for too long, in the current DagBag.

    :param zombies: zombie task instances to kill.
    :type zombies: airflow.utils.dag_processing.SimpleTaskInstance
    :param session: DB session.
    :type session: sqlalchemy.orm.session.Session
    """
    from airflow.models.taskinstance import TaskInstance  # Avoid circular import

    for zombie in zombies:
        if zombie.dag_id in self.dags:
            dag = self.dags[zombie.dag_id]
            if zombie.task_id in dag.task_ids:
                task = dag.get_task(zombie.task_id)
                ti = TaskInstance(task, zombie.execution_date)
                # Get properties needed for failure handling from SimpleTaskInstance.
                ti.start_date = zombie.start_date
                ti.end_date = zombie.end_date
                ti.try_number = zombie.try_number
                ti.state = zombie.state
                ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
                ti.handle_failure("{} detected as zombie".format(ti),
                                  ti.test_mode, ti.get_template_context())
                self.log.info('Marked zombie job %s as %s', ti, ti.state)
                Stats.incr('zombies_killed')
    session.commit()
class CeleryConfig(object):
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    celery_ssl_active = False
    try:
        celery_ssl_active = configuration.getboolean('celery', 'CELERY_SSL_ACTIVE')
    except AirflowConfigException:
        log = LoggingMixin().log
        log.warning("Celery Executor will run without SSL")

    try:
        if celery_ssl_active:
            BROKER_USE_SSL = {'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                              'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                              'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                              'cert_reqs': ssl.CERT_REQUIRED}
    except AirflowConfigException:
        raise AirflowException('AirflowConfigException: CELERY_SSL_ACTIVE is True, '
                               'please ensure CELERY_SSL_KEY, '
                               'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
    except Exception as e:
        raise AirflowException('Exception: There was an unknown Celery SSL Error. '
                               'Please ensure you want to use '
                               'SSL and/or have all necessary certs and key.')
def write(self, log, remote_log_location, append=False):
    """
    Writes the log to the remote_log_location. Fails silently if no hook
    was created.

    :param log: the log to write to the remote_log_location
    :type log: string
    :param remote_log_location: the log's location in remote storage
    :type remote_log_location: string (path)
    :param append: if False, any existing log file is overwritten. If True,
        the new log is appended to any existing logs.
    :type append: bool
    """
    if self.hook:
        if append:
            old_log = self.read(remote_log_location)
            log = old_log + '\n' + log
        try:
            self.hook.load_string(log,
                                  key=remote_log_location,
                                  replace=True,
                                  encrypt=configuration.getboolean(
                                      'core', 'ENCRYPT_S3_LOGS'))
            return
        except Exception:
            pass

    # raise/return error if we get here
    logging.error('Could not write logs to {}'.format(remote_log_location))
class CeleryConfig(object):
    CELERY_ACCEPT_CONTENT = ['json', 'pickle']
    CELERY_EVENT_SERIALIZER = 'json'
    CELERY_RESULT_SERIALIZER = 'pickle'
    CELERY_TASK_SERIALIZER = 'pickle'
    CELERYD_PREFETCH_MULTIPLIER = 1
    CELERY_ACKS_LATE = True
    BROKER_URL = configuration.get('celery', 'BROKER_URL')
    CELERY_RESULT_BACKEND = configuration.get('celery', 'CELERY_RESULT_BACKEND')
    CELERYD_CONCURRENCY = configuration.getint('celery', 'CELERYD_CONCURRENCY')
    CELERY_DEFAULT_QUEUE = DEFAULT_QUEUE
    CELERY_DEFAULT_EXCHANGE = DEFAULT_QUEUE

    if configuration.getboolean('celery', 'CELERY_SSL_ACTIVE'):
        try:
            BROKER_USE_SSL = {
                'keyfile': configuration.get('celery', 'CELERY_SSL_KEY'),
                'certfile': configuration.get('celery', 'CELERY_SSL_CERT'),
                'ca_certs': configuration.get('celery', 'CELERY_SSL_CACERT'),
                'cert_reqs': ssl.CERT_REQUIRED
            }
        except ValueError:
            raise AirflowException(
                'ValueError: CELERY_SSL_ACTIVE is True, please ensure CELERY_SSL_KEY, '
                'CELERY_SSL_CERT and CELERY_SSL_CACERT are set')
        except Exception as e:
            raise AirflowException(
                'Exception: There was an unknown Celery SSL Error. Please ensure you want to use '
                'SSL and/or have all necessary certs and key.')
def s3_write(self, log, remote_log_location, append=True):
    """
    Writes the log to the remote_log_location. Fails silently if no hook
    was created.

    :param log: the log to write to the remote_log_location
    :type log: string
    :param remote_log_location: the log's location in remote storage
    :type remote_log_location: string (path)
    :param append: if False, any existing log file is overwritten. If True,
        the new log is appended to any existing logs.
    :type append: bool
    """
    if append:
        old_log = self.read(remote_log_location)
        log = '\n'.join([old_log, log])

    try:
        self.hook.load_string(
            log,
            key=remote_log_location,
            replace=True,
            encrypt=configuration.getboolean('core', 'ENCRYPT_S3_LOGS'),
        )
    except Exception:
        self.logger.error('Could not write logs to %s', remote_log_location)
def validate_session():
    """Check whether the database is reachable."""
    try:
        worker_precheck = conf.getboolean('core', 'worker_precheck')
    except AirflowConfigException:
        worker_precheck = False
    return alchemy_orm.validate_session(engine, worker_precheck)
def get_boolean_default(key, default):
    import airflow.configuration as conf

    if conf.has_option("airflowdocker", key):
        return conf.getboolean("airflowdocker", key)
    else:
        return default
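# Usage sketch (not from the source): "force_pull" is a hypothetical key in an
# optional [airflowdocker] config section; the helper above falls back to the
# given default when the option is absent, instead of raising.
force_pull = get_boolean_default("force_pull", False)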
def backfill(args, dag=None):
    logging.basicConfig(
        level=settings.LOGGING_LEVEL,
        format=settings.SIMPLE_LOG_FORMAT)

    dag = dag or get_dag(args)

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    # If only one date is passed, use it as both start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.sub_dag(
            task_regex=args.task_regex,
            include_upstream=not args.ignore_dependencies)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            include_adhoc=args.include_adhoc,
            local=args.local,
            donot_pickle=(args.donot_pickle or
                          conf.getboolean('core', 'donot_pickle')),
            ignore_first_depends_on_past=args.ignore_first_depends_on_past,
            ignore_task_deps=args.ignore_dependencies,
            pool=args.pool)
def s3_write(self, log, remote_log_location, append=True):
    """
    Writes the log to the remote_log_location. Fails silently if no hook
    was created.

    :param log: the log to write to the remote_log_location
    :type log: string
    :param remote_log_location: the log's location in remote storage
    :type remote_log_location: string (path)
    :param append: if False, any existing log file is overwritten. If True,
        the new log is appended to any existing logs.
    :type append: bool
    """
    if append and self.s3_log_exists(remote_log_location):
        old_log = self.s3_read(remote_log_location)
        log = '\n'.join([old_log, log]) if old_log else log

    try:
        self.hook.load_string(
            log,
            key=remote_log_location,
            replace=True,
            encrypt=configuration.getboolean('core', 'ENCRYPT_S3_LOGS'),
        )
    except Exception:
        self.log.exception('Could not write logs to %s', remote_log_location)
def registered(self, driver, frameworkId, masterInfo):
    self.log.info(
        "AirflowScheduler registered to Mesos with framework ID %s",
        frameworkId.value)

    if (configuration.getboolean('mesos', 'CHECKPOINT') and
            configuration.get('mesos', 'FAILOVER_TIMEOUT')):
        # Import here to work around a circular import error
        from airflow.models import Connection

        # Update the Framework ID in the database.
        session = Session()
        conn_id = FRAMEWORK_CONNID_PREFIX + get_framework_name()
        connection = Session.query(Connection).filter_by(conn_id=conn_id).first()
        if connection is None:
            connection = Connection(conn_id=conn_id,
                                    conn_type='mesos_framework-id',
                                    extra=frameworkId.value)
        else:
            connection.extra = frameworkId.value

        session.add(connection)
        session.commit()
        Session.remove()
def backfill(args, dag=None):
    logging.basicConfig(
        level=settings.LOGGING_LEVEL,
        format=settings.SIMPLE_LOG_FORMAT)

    dag = dag or get_dag(args)

    if not args.start_date and not args.end_date:
        raise AirflowException("Provide a start_date and/or end_date")

    # If only one date is passed, use it as both start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.sub_dag(
            task_regex=args.task_regex,
            include_upstream=not args.ignore_dependencies)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            include_adhoc=args.include_adhoc,
            local=args.local,
            donot_pickle=(args.donot_pickle or
                          conf.getboolean("core", "donot_pickle")),
            ignore_dependencies=args.ignore_dependencies,
            ignore_first_depends_on_past=args.ignore_first_depends_on_past,
            pool=args.pool,
        )
def test_kill_zombie_when_job_received_no_heartbeat(self, mock_ti_handle_failure):
    """
    Test that kill zombies calls TI's failure handler with proper context
    """
    zombie_threshold_secs = (
        configuration.getint('scheduler', 'scheduler_zombie_task_threshold'))
    dagbag = models.DagBag(dag_folder=self.empty_dir, include_examples=True)
    with create_session() as session:
        session.query(TI).delete()
        session.query(LJ).delete()
        dag = dagbag.get_dag('example_branch_operator')
        task = dag.get_task(task_id='run_this_first')

        ti = TI(task, DEFAULT_DATE, State.RUNNING)
        lj = LJ(ti)
        lj.latest_heartbeat = utcnow() - timedelta(seconds=zombie_threshold_secs)
        lj.state = State.RUNNING
        lj.id = 1
        ti.job_id = lj.id

        session.add(lj)
        session.add(ti)
        session.commit()

        dagbag.kill_zombies()
        mock_ti_handle_failure.assert_called_with(
            ANY,
            configuration.getboolean('core', 'unit_test_mode'),
            ANY)
def enabled(metric='', default=True):
    if metric:
        metric = '{}_'.format(metric)
    metric = 'airflow_metrics_{}enabled'.format(metric)
    try:
        return conf.getboolean('airflow_metrics', metric)
    except AirflowConfigException:
        return default
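# Usage sketch (not from the source): "requests" is a hypothetical metric name.
# enabled('requests') reads [airflow_metrics] airflow_metrics_requests_enabled,
# while enabled() reads airflow_metrics_enabled; both return the given default
# when the option is missing from the config.
if enabled('requests', default=False):
    pass  # emit the metric here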
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_USER = configuration.get('smtp', 'SMTP_USER')
    SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        logging.info("Sent an alert email to " + str(e_to))
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
def heartbeat(self):
    """
    Heartbeats update the job's entry in the database with a timestamp
    for the latest_heartbeat and allow the job to be killed
    externally. This makes it possible to monitor, at the system level,
    what is actually active.

    For instance, an old heartbeat for SchedulerJob would mean something
    is wrong.

    This also allows any job to be killed externally, regardless
    of who is running it or on which machine it is running.

    Note that if your heartbeat is set to 60 seconds and you call this
    method after 10 seconds of processing since the last heartbeat, it
    will sleep 50 seconds to complete the 60 seconds and keep a steady
    heart rate. If you go over 60 seconds before calling it, it won't
    sleep at all.
    """
    try:
        with create_session() as session:
            job = session.query(BaseJob).filter_by(id=self.id).one()
            make_transient(job)
            session.commit()

        if job.state == State.SHUTDOWN:
            self.kill()

        is_unit_test = conf.getboolean('core', 'unit_test_mode')
        if not is_unit_test:
            # Figure out how long to sleep for
            sleep_for = 0
            if job.latest_heartbeat:
                seconds_remaining = self.heartrate - \
                    (timezone.utcnow() - job.latest_heartbeat)\
                    .total_seconds()
                sleep_for = max(0, seconds_remaining)
            sleep(sleep_for)

        # Update last heartbeat time
        with create_session() as session:
            job = session.query(BaseJob).filter(BaseJob.id == self.id).first()
            job.latest_heartbeat = timezone.utcnow()
            session.merge(job)
            session.commit()

            self.heartbeat_callback(session=session)
            self.log.debug('[heartbeat]')
    except OperationalError:
        Stats.incr(
            convert_camel_to_snake(self.__class__.__name__) + '_heartbeat_failure', 1, 1)
        self.log.exception("%s heartbeat got an exception", self.__class__.__name__)
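# Worked sketch (not from the source) of the sleep computation described in the
# docstring above: with heartrate=60 and 10s elapsed since the last heartbeat,
# the job sleeps the remaining 50s; past 60s elapsed, it does not sleep at all.
heartrate = 60
for elapsed in (10, 75):
    sleep_for = max(0, heartrate - elapsed)
    print(sleep_for)  # prints 50, then 0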
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Pool size engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info(
            "settings.configure_orm(): Using pool settings. pool_size={}, "
            "pool_recycle={}, pid={}".format(pool_size, pool_recycle, os.getpid()))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    try:
        # Allow the user to specify an encoding for their DB otherwise default
        # to utf-8 so jobs & users with non-latin1 characters can still use us.
        engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING')
    except conf.AirflowConfigException:
        engine_args['encoding'] = 'utf-8'
    # For Python2 we get back a newstr and need a str
    engine_args['encoding'] = engine_args['encoding'].__str__()

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False,
                     autoflush=False,
                     bind=engine,
                     expire_on_commit=False))
def execute(self, context):
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        ssh_client = self.ssh_hook.get_conn()

        if not self.command:
            raise AirflowException("no command specified so nothing to execute here.")

        # Auto apply tty when it is required in case of sudo
        get_pty = False
        if self.command.startswith('sudo'):
            get_pty = True

        # set timeout taken as params
        stdin, stdout, stderr = ssh_client.exec_command(
            command=self.command,
            get_pty=get_pty,
            timeout=self.timeout)
        stdin.close()

        output = b''
        for line in stdout:
            output += line.encode('utf-8')
            self.log.info(line.strip('\n'))

        exit_status = stdout.channel.recv_exit_status()
        if exit_status == 0:
            # only return the output if do_xcom_push is set;
            # otherwise it is not supposed to be disclosed
            if self.do_xcom_push:
                enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
                if enable_pickling:
                    return output
                else:
                    return b64encode(output).decode('utf-8')
        else:
            error_msg = stderr.read()
            raise AirflowException("error running cmd: {0}, error: {1}".format(
                self.command, error_msg))
    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
def schedule_job(self, context):
    hook = HttpHook(method='POST', http_conn_id=self.http_conn_id)
    headers = {
        'content-type': 'application/json',
        'Accept': 'text/plain',
        'X-Bluecore-Token': configuration.get('appengine', 'token').strip(),
        'X-Airflow-Dag-Id': self.dag_id,
        'X-Airflow-Task-Id': self.task_id,
        'X-Airflow-Execution-Date': context['execution_date'].isoformat(),
        'X-Airflow-Enable-Xcom-Pickling':
            str(configuration.getboolean('core', 'enable_xcom_pickling')),
        'X-Airflow-Mysql-Db': configuration.get('mysql', 'db').strip(),
        'X-Airflow-Mysql-User': configuration.get('mysql', 'username').strip(),
        'X-Airflow-Mysql-Password': configuration.get('mysql', 'password').strip(),
        'X-Airflow-Fernet-Key': configuration.get('core', 'fernet_key').strip(),
    }
    mysql_host = safe_config_get('mysql', 'host')
    if mysql_host is not None:
        headers['X-Airflow-Mysql-Host'] = mysql_host
    mysql_cloudsql_instance = safe_config_get('mysql', 'cloudsql_instance')
    if mysql_cloudsql_instance is not None:
        headers['X-Airflow-Mysql-Cloudsql-Instance'] = mysql_cloudsql_instance

    # generate a unique job name for the command to be added to the App Engine task queue
    job_id = uniquify_job_name(self, context)
    logging.info("Job ID: %s", job_id)

    instance_params = evaluate_xcoms(self.command_params, self, context)
    post_data = {
        'params_dict': instance_params,
        'appengine_queue': self.appengine_queue,
        'job_id': job_id
    }
    hook.run(endpoint='/api/airflow_v2/async/%s' % self.command_name,
             headers=headers,
             data=json.dumps(post_data),
             extra_options=None)
def init_on_load(self):
    enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
    if enable_pickling:
        self.value = pickle.loads(self.value)
    else:
        try:
            self.value = json.loads(self.value.decode('UTF-8'))
        except (UnicodeEncodeError, ValueError):
            # For backward-compatibility.
            # Preventing errors in webserver
            # due to XComs mixed with pickled and unpickled.
            self.value = pickle.loads(self.value)
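# Standalone sketch (not from the source) of the JSON-first decode with a pickle
# fallback used above; runnable without Airflow. Note that bytes.decode raises
# UnicodeDecodeError, which is what this sketch catches.
import json
import pickle

def decode_xcom_value(raw):
    """Try JSON first; fall back to pickle for legacy rows."""
    try:
        return json.loads(raw.decode('UTF-8'))
    except (UnicodeDecodeError, ValueError):
        return pickle.loads(raw)

assert decode_xcom_value(b'{"a": 1}') == {'a': 1}
assert decode_xcom_value(pickle.dumps({'a': 1})) == {'a': 1}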
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Pool size engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info("settings.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}, pid={}".format(pool_size, pool_recycle, os.getpid()))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    try:
        # Allow the user to specify an encoding for their DB otherwise default
        # to utf-8 so jobs & users with non-latin1 characters can still use us.
        engine_args['encoding'] = conf.get('core', 'SQL_ENGINE_ENCODING')
    except conf.AirflowConfigException:
        engine_args['encoding'] = 'utf-8'
    # For Python2 we get back a newstr and need a str
    engine_args['encoding'] = engine_args['encoding'].__str__()

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False,
                     autoflush=False,
                     bind=engine,
                     expire_on_commit=False))
def kill_zombies(self, session=None):
    """
    Fails tasks that haven't had a heartbeat in too long
    """
    secs = configuration.conf.getint('scheduler', 'scheduler_zombie_task_threshold')
    now = datetime.now()
    limit_dttm = now - timedelta(seconds=secs)
    self.log.info("Finding 'running' jobs without a recent heartbeat after %s", limit_dttm)

    # A task instance is a zombie when it is still running but its job has
    # stopped, or the job's heartbeat has not been updated for too long.
    begin_time = now - timedelta(
        days=configuration.conf.getint('core', 'sql_query_history_days'))

    TI = TaskInstance
    from airflow.jobs import LocalTaskJob as LJ

    # The query below is equivalent to:
    #   SELECT task_instance.* FROM task_instance
    #   INNER JOIN job ON task_instance.job_id = job.id
    #       AND job.job_type IN ('LocalTaskJob')
    #   WHERE task_instance.execution_date > %s
    #       AND task_instance.state = 'running'
    #       AND (job.latest_heartbeat < %s OR job.state != 'running')
    #
    # i.e. select task instances that are RUNNING whose job either missed its
    # heartbeat (now > LJ.latest_heartbeat + scheduler_zombie_task_threshold)
    # or is no longer in the RUNNING state.
    tis = (
        session.query(TI)
        .join(LJ, TI.job_id == LJ.id)
        .filter(TI.execution_date > begin_time)
        .filter(TI.state == State.RUNNING)  # the task instance is running
        .filter(
            or_(
                LJ.latest_heartbeat < limit_dttm,  # no recent heartbeat
                LJ.state != State.RUNNING,         # or the job is not running
            )).all())

    # Run the failure handler for each zombie task instance.
    for ti in tis:
        if ti.dag_id in self.dags:
            # Look up the DAG for the task instance
            dag = self.dags[ti.dag_id]
            # and check its tasks
            if ti.task_id in dag.task_ids:
                # Fetch the task
                task = dag.get_task(ti.task_id)

                # now set non db backed vars on ti
                ti.task = task
                ti.test_mode = configuration.getboolean('core', 'unit_test_mode')
                # Mark the task instance as failed, send notifications, and
                # run the failure callback.
                ti.handle_failure("{} detected as zombie".format(ti),
                                  ti.test_mode, ti.get_template_context())
                self.log.info('Marked zombie job %s as %s', ti, ti.state)
                Stats.incr('zombies_killed')
    session.commit()
def test_login_logout_ldap(self):
    assert configuration.getboolean('webserver', 'authenticate') is True

    response = self.login('user1', 'userx')
    assert 'Incorrect login details' in response.data.decode('utf-8')

    response = self.login('userz', 'user1')
    assert 'Incorrect login details' in response.data.decode('utf-8')

    response = self.login('user1', 'user1')
    assert 'Data Profiling' in response.data.decode('utf-8')

    response = self.logout()
    assert 'form-signin' in response.data.decode('utf-8')
def test_login_logout_password_auth(self):
    assert configuration.getboolean('webserver', 'authenticate') is True

    response = self.login('user1', 'whatever')
    assert 'Incorrect login details' in response.data.decode('utf-8')

    response = self.login('airflow_passwordauth', 'wrongpassword')
    assert 'Incorrect login details' in response.data.decode('utf-8')

    response = self.login('airflow_passwordauth', 'password')
    assert 'Data Profiling' in response.data.decode('utf-8')

    response = self.logout()
    assert 'form-signin' in response.data.decode('utf-8')
def set(cls, key, value, execution_date, task_id, dag_id, session=None):
    """
    Store an XCom value.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: None
    """
    # Drop all instances tracked by the underlying session.
    session.expunge_all()

    enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
    # Serialize the intermediate result.
    if enable_pickling:
        value = pickle.dumps(value)
    else:
        try:
            # Mind the encoding conversion.
            value = json.dumps(value).encode('UTF-8')
        except ValueError:
            log = LoggingMixin().log
            log.error("Could not serialize the XCOM value into JSON. "
                      "If you are using pickles instead of JSON "
                      "for XCOM, then you need to enable pickle "
                      "support for XCOM in your airflow config.")
            raise

    # remove any duplicate XComs with the same key
    session.query(cls).filter(
        cls.key == key,
        cls.execution_date == execution_date,
        cls.task_id == task_id,
        cls.dag_id == dag_id).delete()
    session.commit()

    # insert new XCom
    session.add(XCom(
        key=key,
        value=value,
        execution_date=execution_date,
        task_id=task_id,
        dag_id=dag_id))

    session.commit()
def test_login_logout_ldap(self): assert configuration.getboolean("webserver", "authenticate") is True response = self.login("user1", "userx") assert "Incorrect login details" in response.data.decode("utf-8") response = self.login("userz", "user1") assert "Incorrect login details" in response.data.decode("utf-8") response = self.login("user1", "user1") assert "Data Profiling" in response.data.decode("utf-8") response = self.logout() assert "form-signin" in response.data.decode("utf-8")
def test_login_logout_password_auth(self):
    assert configuration.getboolean("webserver", "authenticate") is True

    response = self.login("user1", "whatever")
    assert "Incorrect login details" in response.data.decode("utf-8")

    response = self.login("airflow_passwordauth", "wrongpassword")
    assert "Incorrect login details" in response.data.decode("utf-8")

    response = self.login("airflow_passwordauth", "password")
    assert "Data Profiling" in response.data.decode("utf-8")

    response = self.logout()
    assert "form-signin" in response.data.decode("utf-8")
def set(cls, key, value, execution_date, task_id, dag_id, session=None):
    """
    Store an XCom value.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: None
    """
    session.expunge_all()

    enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
    if enable_pickling:
        value = pickle.dumps(value)
    else:
        try:
            value = json.dumps(value).encode('UTF-8')
        except ValueError:
            log = LoggingMixin().log
            log.error("Could not serialize the XCOM value into JSON. "
                      "If you are using pickles instead of JSON "
                      "for XCOM, then you need to enable pickle "
                      "support for XCOM in your airflow config.")
            raise

    # remove any duplicate XComs
    session.query(cls).filter(
        cls.key == key,
        cls.execution_date == execution_date,
        cls.task_id == task_id,
        cls.dag_id == dag_id).delete()
    session.commit()

    # insert new XCom
    session.add(XCom(
        key=key,
        value=value,
        execution_date=execution_date,
        task_id=task_id,
        dag_id=dag_id))

    session.commit()
def validate_session():
    worker_precheck = conf.getboolean('core', 'worker_precheck', fallback=False)
    if not worker_precheck:
        return True
    else:
        check_session = sessionmaker(bind=engine)
        session = check_session()
        try:
            session.execute("select 1")
            conn_status = True
        except exc.DBAPIError as err:
            log.error(err)
            conn_status = False
        session.close()
        return conn_status
def serialize_value(value):
    # TODO: "pickling" has been deprecated and JSON is preferred.
    # "pickling" will be removed in Airflow 2.0.
    if configuration.getboolean('core', 'enable_xcom_pickling'):
        return pickle.dumps(value)

    try:
        return json.dumps(value).encode('UTF-8')
    except ValueError:
        log = LoggingMixin().log
        log.error("Could not serialize the XCOM value into JSON. "
                  "If you are using pickles instead of JSON "
                  "for XCOM, then you need to enable pickle "
                  "support for XCOM in your airflow config.")
        raise
def try_get_one(execution_date, key=None, task_id=None, dag_id=None,
                include_prior_dates=False, enable_pickling=None, session=None):
    """
    Retrieve an XCom value, optionally meeting certain criteria.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :param enable_pickling: If pickling is not enabled, the XCOM value
        will be parsed as JSON instead.
    :return: XCom value
    """
    filters = []
    if key:
        filters.append(XCom.key == key)
    if task_id:
        filters.append(XCom.task_id == task_id)
    if dag_id:
        filters.append(XCom.dag_id == dag_id)
    if include_prior_dates:
        filters.append(XCom.execution_date <= execution_date)
    else:
        filters.append(XCom.execution_date == execution_date)

    query = (session.query(XCom.value)
             .filter(and_(*filters))
             .order_by(XCom.execution_date.desc(), XCom.timestamp.desc()))
    result = query.first()
    if result:
        if enable_pickling is None:
            enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')

        if enable_pickling:
            return (True, pickle.loads(result.value))
        else:
            try:
                return (True, json.loads(result.value.decode('UTF-8')))
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not deserialize the XCOM value from JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise

    return (False, None)
def execute(self, context):
    try:
        if self.ssh_conn_id and not self.ssh_hook:
            self.ssh_hook = SSHHook(ssh_conn_id=self.ssh_conn_id)

        if not self.ssh_hook:
            raise AirflowException("can not operate without ssh_hook or ssh_conn_id")

        if self.remote_host is not None:
            self.ssh_hook.remote_host = self.remote_host

        ssh_client = self.ssh_hook.get_conn()

        if not self.command:
            raise AirflowException("no command specified so nothing to execute here.")

        # Auto apply tty when it is required in case of sudo
        get_pty = False
        if self.command.startswith('sudo'):
            get_pty = True

        # set timeout taken as params
        stdin, stdout, stderr = ssh_client.exec_command(command=self.command,
                                                        get_pty=get_pty,
                                                        timeout=self.timeout)
        exit_status = stdout.channel.recv_exit_status()
        if exit_status == 0:
            # only return the output if do_xcom_push is set;
            # otherwise it is not supposed to be disclosed
            if self.do_xcom_push:
                enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
                if enable_pickling:
                    return stdout.read()
                else:
                    return b64encode(stdout.read()).decode('utf-8')
        else:
            error_msg = stderr.read()
            raise AirflowException("error running cmd: {0}, error: {1}"
                                   .format(self.command, error_msg))
    except Exception as e:
        raise AirflowException("SSH operator error: {0}".format(str(e)))

    return True
def get_one(cls, execution_date, key=None, task_id=None, dag_id=None,
            include_prior_dates=False, session=None):
    """
    Retrieve an XCom value, optionally meeting certain criteria.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: XCom value
    """
    # Build the filter conditions.
    filters = []
    if key:
        filters.append(cls.key == key)
    if task_id:
        filters.append(cls.task_id == task_id)
    if dag_id:
        filters.append(cls.dag_id == dag_id)
    if include_prior_dates:
        filters.append(cls.execution_date <= execution_date)
    else:
        filters.append(cls.execution_date == execution_date)

    query = (
        session.query(cls.value).filter(and_(*filters))
        .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

    # Take the most recent record.
    result = query.first()
    if result:
        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        if enable_pickling:
            return pickle.loads(result.value)
        else:
            try:
                # Mind the encoding conversion.
                return json.loads(result.value.decode('UTF-8'))
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not deserialize the XCOM value from JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise
def get_one(cls, execution_date, key=None, task_id=None, dag_id=None,
            include_prior_dates=False, session=None):
    """
    Retrieve an XCom value, optionally meeting certain criteria.

    TODO: "pickling" has been deprecated and JSON is preferred.
    "pickling" will be removed in Airflow 2.0.

    :return: XCom value
    """
    filters = []
    if key:
        filters.append(cls.key == key)
    if task_id:
        filters.append(cls.task_id == task_id)
    if dag_id:
        filters.append(cls.dag_id == dag_id)
    if include_prior_dates:
        filters.append(cls.execution_date <= execution_date)
    else:
        filters.append(cls.execution_date == execution_date)

    query = (
        session.query(cls.value).filter(and_(*filters))
        .order_by(cls.execution_date.desc(), cls.timestamp.desc()))

    result = query.first()
    if result:
        enable_pickling = configuration.getboolean('core', 'enable_xcom_pickling')
        if enable_pickling:
            return pickle.loads(result.value)
        else:
            try:
                return json.loads(result.value.decode('UTF-8'))
            except ValueError:
                log = LoggingMixin().log
                log.error("Could not deserialize the XCOM value from JSON. "
                          "If you are using pickles instead of JSON "
                          "for XCOM, then you need to enable pickle "
                          "support for XCOM in your airflow config.")
                raise
def registered(self, driver, frameworkId, masterInfo):
    logging.info("AirflowScheduler registered to mesos with framework ID %s",
                 frameworkId.value)

    if (configuration.getboolean('mesos', 'CHECKPOINT') and
            configuration.get('mesos', 'FAILOVER_TIMEOUT')):
        # Import here to work around a circular import error
        from airflow.models import Connection

        # Update the Framework ID in the database.
        session = Session()
        conn_id = FRAMEWORK_CONNID_PREFIX + get_framework_name()
        connection = Session.query(Connection).filter_by(conn_id=conn_id).first()
        if connection is None:
            connection = Connection(conn_id=conn_id,
                                    conn_type='mesos_framework-id',
                                    extra=frameworkId.value)
        else:
            connection.extra = frameworkId.value

        session.add(connection)
        session.commit()
        Session.remove()
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite
        engine_args['pool_size'] = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        engine_args['pool_recycle'] = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
def backfill(args):
    logging.basicConfig(
        level=settings.LOGGING_LEVEL,
        format=settings.SIMPLE_LOG_FORMAT)
    dagbag = DagBag(process_subdir(args.subdir))
    if args.dag_id not in dagbag.dags:
        raise AirflowException('dag_id could not be found')
    dag = dagbag.dags[args.dag_id]

    if args.start_date:
        args.start_date = dateutil.parser.parse(args.start_date)
    if args.end_date:
        args.end_date = dateutil.parser.parse(args.end_date)

    # If only one date is passed, use it as both start and end
    args.end_date = args.end_date or args.start_date
    args.start_date = args.start_date or args.end_date

    if args.task_regex:
        dag = dag.sub_dag(
            task_regex=args.task_regex,
            include_upstream=not args.ignore_dependencies)

    if args.dry_run:
        print("Dry run of DAG {0} on {1}".format(args.dag_id, args.start_date))
        for task in dag.tasks:
            print("Task {0}".format(task.task_id))
            ti = TaskInstance(task, args.start_date)
            ti.dry_run()
    else:
        dag.run(
            start_date=args.start_date,
            end_date=args.end_date,
            mark_success=args.mark_success,
            include_adhoc=args.include_adhoc,
            local=args.local,
            donot_pickle=(args.donot_pickle or
                          configuration.getboolean('core', 'donot_pickle')),
            ignore_dependencies=args.ignore_dependencies,
            pool=args.pool)
def test_kill_zombies(self, mock_ti_handle_failure):
    """
    Test that kill zombies calls TI's failure handler with proper context
    """
    dagbag = models.DagBag()
    with create_session() as session:
        session.query(TI).delete()
        dag = dagbag.get_dag('example_branch_operator')
        task = dag.get_task(task_id='run_this_first')

        ti = TI(task, DEFAULT_DATE, State.RUNNING)

        session.add(ti)
        session.commit()

        zombies = [SimpleTaskInstance(ti)]
        dagbag.kill_zombies(zombies)
        mock_ti_handle_failure.assert_called_with(
            ANY,
            configuration.getboolean('core', 'unit_test_mode'),
            ANY)
def configure_orm(disable_connection_pool=False):
    log.debug("Setting up DB connection pool (PID %s)" % os.getpid())
    global engine
    global Session
    engine_args = {}

    pool_connections = conf.getboolean('core', 'SQL_ALCHEMY_POOL_ENABLED')
    if disable_connection_pool or not pool_connections:
        engine_args['poolclass'] = NullPool
        log.debug("settings.configure_orm(): Using NullPool")
    elif 'sqlite' not in SQL_ALCHEMY_CONN:
        # Engine args not supported by sqlite.
        # If no config value is defined for the pool size, select a reasonable value.
        # 0 means no limit, which could lead to exceeding the Database connection limit.
        try:
            pool_size = conf.getint('core', 'SQL_ALCHEMY_POOL_SIZE')
        except conf.AirflowConfigException:
            pool_size = 5

        # The DB server already has a value for wait_timeout (number of seconds after
        # which an idle sleeping connection should be killed). Since other DBs may
        # co-exist on the same server, SQLAlchemy should set its
        # pool_recycle to an equal or smaller value.
        try:
            pool_recycle = conf.getint('core', 'SQL_ALCHEMY_POOL_RECYCLE')
        except conf.AirflowConfigException:
            pool_recycle = 1800

        log.info("settings.configure_orm(): Using pool settings. pool_size={}, "
                 "pool_recycle={}".format(pool_size, pool_recycle))
        engine_args['pool_size'] = pool_size
        engine_args['pool_recycle'] = pool_recycle

    engine = create_engine(SQL_ALCHEMY_CONN, **engine_args)
    reconnect_timeout = conf.getint('core', 'SQL_ALCHEMY_RECONNECT_TIMEOUT')
    setup_event_handlers(engine, reconnect_timeout)

    Session = scoped_session(
        sessionmaker(autocommit=False, autoflush=False, bind=engine))
def list_py_file_paths(directory, safe_mode=True, include_examples=None):
    """
    Traverse a directory and look for Python files.

    :param directory: the directory to traverse
    :type directory: unicode
    :param safe_mode: whether to use a heuristic to determine whether a file
        contains Airflow DAG definitions
    :return: a list of paths to Python files in the specified directory
    :rtype: list[unicode]
    """
    if include_examples is None:
        include_examples = conf.getboolean('core', 'LOAD_EXAMPLES')
    file_paths = []
    if directory is None:
        return []
    elif os.path.isfile(directory):
        return [directory]
    elif os.path.isdir(directory):
        patterns_by_dir = {}
        for root, dirs, files in os.walk(directory, followlinks=True):
            patterns = patterns_by_dir.get(root, [])
            ignore_file = os.path.join(root, '.airflowignore')
            if os.path.isfile(ignore_file):
                with open(ignore_file, 'r') as f:
                    # If we have new patterns create a copy so we don't change
                    # the previous list (which would affect other subdirs)
                    patterns += [re.compile(p) for p in f.read().split('\n') if p]

            # If we can ignore any subdirs entirely we should - fewer paths
            # to walk is better. We have to modify the ``dirs`` array in
            # place for this to affect os.walk
            dirs[:] = [
                d
                for d in dirs
                if not any(p.search(os.path.join(root, d)) for p in patterns)
            ]

            # We want patterns defined in a parent folder's .airflowignore to
            # apply to subdirs too
            for d in dirs:
                patterns_by_dir[os.path.join(root, d)] = patterns

            for f in files:
                try:
                    file_path = os.path.join(root, f)
                    if not os.path.isfile(file_path):
                        continue
                    mod_name, file_ext = os.path.splitext(
                        os.path.split(file_path)[-1])
                    if file_ext != '.py' and not zipfile.is_zipfile(file_path):
                        continue
                    if any([re.findall(p, file_path) for p in patterns]):
                        continue

                    # Heuristic that guesses whether a Python file contains an
                    # Airflow DAG definition.
                    might_contain_dag = True
                    if safe_mode and not zipfile.is_zipfile(file_path):
                        with open(file_path, 'rb') as fp:
                            content = fp.read()
                            might_contain_dag = all(
                                [s in content for s in (b'DAG', b'airflow')])

                    if not might_contain_dag:
                        continue

                    file_paths.append(file_path)
                except Exception:
                    log = LoggingMixin().log
                    log.exception("Error while examining %s", f)
    if include_examples:
        import airflow.example_dags
        example_dag_folder = airflow.example_dags.__path__[0]
        file_paths.extend(list_py_file_paths(example_dag_folder, safe_mode, False))
    return file_paths
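# Illustrative sketch (not from the source): each non-empty line of .airflowignore
# is compiled as a regular expression and matched against full paths, as in
# list_py_file_paths above. A hypothetical dags/.airflowignore containing the lines
#
#     scratch
#     .*_wip\.py$
#
# would prune any directory or file whose path contains "scratch" and skip files
# ending in _wip.py; pruned directories are dropped from os.walk entirely, so
# their subtrees are never scanned.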
from io import BytesIO as IO
import functools
import gzip
import dateutil.parser as dateparser
import json
import time

from flask import after_this_request, request, Response
from flask_login import current_user
import wtforms
from wtforms.compat import text_type

from airflow import configuration, models, settings
from airflow.utils.json import AirflowJsonEncoder

AUTHENTICATE = configuration.getboolean('webserver', 'AUTHENTICATE')


class LoginMixin(object):
    def is_accessible(self):
        return (
            not AUTHENTICATE or (
                not current_user.is_anonymous() and
                current_user.is_authenticated()
            )
        )


class SuperUserMixin(object):
    def is_accessible(self):
        # The original snippet was truncated here; the body below follows the
        # LoginMixin pattern above, gated on a superuser check.
        return (
            not AUTHENTICATE or (
                not current_user.is_anonymous() and
                current_user.is_superuser()
            )
        )
from time import sleep

from sqlalchemy import Column, Integer, String, DateTime, func, Index, and_
from sqlalchemy.orm.session import make_transient

from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State

Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=configuration.get('scheduler', 'statsd_host'),
        port=configuration.getint('scheduler', 'statsd_port'),
        prefix=configuration.get('scheduler', 'statsd_prefix'))


class BaseJob(Base):
    """
    Abstract class to be derived for jobs. Jobs are processing items with
    state and duration that aren't task instances. For instance a
    BackfillJob is a collection of task instance runs, but should have its
    own state, start and end time.
    """
def create_app(config=None, session=None, testing=False, app_name="Airflow"):
    global app, appbuilder
    app = Flask(__name__)
    if conf.getboolean('webserver', 'ENABLE_PROXY_FIX'):
        app.wsgi_app = ProxyFix(app.wsgi_app)
    app.secret_key = conf.get('webserver', 'SECRET_KEY')

    airflow_home_path = conf.get('core', 'AIRFLOW_HOME')
    webserver_config_path = airflow_home_path + '/webserver_config.py'
    app.config.from_pyfile(webserver_config_path, silent=True)
    app.config['SQLALCHEMY_TRACK_MODIFICATIONS'] = False
    app.config['APP_NAME'] = app_name
    app.config['TESTING'] = testing

    csrf.init_app(app)

    db = SQLA(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    # flake8: noqa: F841
    cache = Cache(app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    from airflow.www_rbac.blueprints import routes
    app.register_blueprint(routes)

    configure_logging()
    configure_manifest_files(app)

    with app.app_context():
        from airflow.www_rbac.security import AirflowSecurityManager
        security_manager_class = app.config.get('SECURITY_MANAGER_CLASS') or \
            AirflowSecurityManager

        if not issubclass(security_manager_class, AirflowSecurityManager):
            raise Exception(
                """Your CUSTOM_SECURITY_MANAGER must now extend AirflowSecurityManager,
                 not FAB's security manager.""")

        appbuilder = AppBuilder(
            app,
            db.session if not session else session,
            security_manager_class=security_manager_class,
            base_template='appbuilder/baselayout.html')

        def init_views(appbuilder):
            from airflow.www_rbac import views
            appbuilder.add_view_no_menu(views.Airflow())
            appbuilder.add_view_no_menu(views.DagModelView())
            appbuilder.add_view_no_menu(views.ConfigurationView())
            appbuilder.add_view_no_menu(views.VersionView())
            appbuilder.add_view(views.DagRunModelView,
                                "DAG Runs",
                                category="Browse",
                                category_icon="fa-globe")
            appbuilder.add_view(views.JobModelView, "Jobs", category="Browse")
            appbuilder.add_view(views.LogModelView, "Logs", category="Browse")
            appbuilder.add_view(views.SlaMissModelView, "SLA Misses", category="Browse")
            appbuilder.add_view(views.TaskInstanceModelView, "Task Instances", category="Browse")
            appbuilder.add_link("Configurations",
                                href='/configuration',
                                category="Admin",
                                category_icon="fa-user")
            appbuilder.add_view(views.ConnectionModelView, "Connections", category="Admin")
            appbuilder.add_view(views.PoolModelView, "Pools", category="Admin")
            appbuilder.add_view(views.VariableModelView, "Variables", category="Admin")
            appbuilder.add_view(views.XComModelView, "XComs", category="Admin")
            appbuilder.add_link("Documentation",
                                href='https://airflow.apache.org/',
                                category="Docs",
                                category_icon="fa-cube")
            appbuilder.add_link("Github",
                                href='https://github.com/apache/incubator-airflow',
                                category="Docs")
            appbuilder.add_link('Version',
                                href='/version',
                                category='About',
                                category_icon='fa-th')

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (
                flask_appbuilder_views, flask_appbuilder_menu_links)

            for v in flask_appbuilder_views:
                log.debug("Adding view %s", v["name"])
                appbuilder.add_view(v["view"],
                                    v["name"],
                                    category=v["category"])
            for ml in sorted(flask_appbuilder_menu_links, key=lambda x: x["name"]):
                log.debug("Adding menu link %s", ml["name"])
                appbuilder.add_link(ml["name"],
                                    href=ml["href"],
                                    category=ml["category"],
                                    category_icon=ml["category_icon"])

        integrate_plugins()
        # Garbage collect old permissions/views after they have been modified.
        # Otherwise, when the name of a view or menu is changed, the framework
        # will add the new Views and Menus names to the backend, but will not
        # delete the old ones.
        init_views(appbuilder)
        security_manager = appbuilder.sm
        security_manager.sync_roles()

        from airflow.www_rbac.api.experimental import endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)  # noqa
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental, url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
                'navbar_color': conf.get('webserver', 'NAVBAR_COLOR'),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

    return app, appbuilder
    @classmethod
    def decr(cls, stat, count=1, rate=1):
        pass

    @classmethod
    def gauge(cls, stat, value, rate=1, delta=False):
        pass

    @classmethod
    def timing(cls, stat, dt):
        pass


Stats = DummyStatsLogger

if conf.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient

    statsd = StatsClient(
        host=conf.get('scheduler', 'statsd_host'),
        port=conf.getint('scheduler', 'statsd_port'),
        prefix=conf.get('scheduler', 'statsd_prefix'))
    Stats = statsd
else:
    Stats = DummyStatsLogger

# The ASCII banner below was truncated in the snippet; it is completed here to
# match the standard Airflow CLI header.
HEADER = """\
  ____________       _____________
 ____    |__( )_________  __/__  /________      __
____  /| |_  /__  ___/_  /_ __  /_  __ \_ | /| / /
___  ___ |  / _  /   _  __/ _  / / /_/ /_ |/ |/ /
 _/_/  |_/_/  /_/    /_/    /_/  \____/____/|__/
"""
def create_app(config=None):
    app = Flask(__name__)
    app.secret_key = configuration.get('webserver', 'SECRET_KEY')
    app.config['LOGIN_DISABLED'] = not configuration.getboolean('webserver', 'AUTHENTICATE')

    csrf.init_app(app)

    # app.config = config
    airflow.load_login()
    airflow.login.login_manager.init_app(app)

    cache = Cache(
        app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    app.register_blueprint(ck, url_prefix='/ck')
    app.register_blueprint(routes)
    app.jinja_env.add_extension("chartkick.ext.charts")

    with app.app_context():
        from airflow.www import views

        admin = Admin(
            app, name='Airflow',
            static_url_path='/admin',
            index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"),
            template_mode='bootstrap3',
        )
        av = admin.add_view
        vs = views
        av(vs.Airflow(name='DAGs', category='DAGs'))

        av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling"))
        av(vs.ChartModelView(
            models.Chart, Session, name="Charts", category="Data Profiling"))
        av(vs.KnowEventView(
            models.KnownEvent, Session, name="Known Events", category="Data Profiling"))
        av(vs.SlaMissModelView(
            models.SlaMiss, Session, name="SLA Misses", category="Browse"))
        av(vs.TaskInstanceModelView(
            models.TaskInstance, Session, name="Task Instances", category="Browse"))
        av(vs.LogModelView(
            models.Log, Session, name="Logs", category="Browse"))
        av(vs.JobModelView(
            jobs.BaseJob, Session, name="Jobs", category="Browse"))
        av(vs.PoolModelView(
            models.Pool, Session, name="Pools", category="Admin"))
        av(vs.ConfigurationView(
            name='Configuration', category="Admin"))
        av(vs.UserModelView(
            models.User, Session, name="Users", category="Admin"))
        av(vs.ConnectionModelView(
            models.Connection, Session, name="Connections", category="Admin"))
        av(vs.VariableView(
            models.Variable, Session, name="Variables", category="Admin"))

        admin.add_link(base.MenuLink(
            category='Docs',
            name='Documentation',
            url='http://pythonhosted.org/airflow/'))
        admin.add_link(base.MenuLink(
            category='Docs',
            name='Github',
            url='https://github.com/airbnb/airflow'))

        av(vs.DagRunModelView(
            models.DagRun, Session, name="DAG Runs", category="Browse"))
        av(vs.DagModelView(models.DagModel, Session, name=None))
        # Hack to not add this view to the menu
        admin._menu = admin._menu[:-1]

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (
                admin_views, flask_blueprints, menu_links)
            for v in admin_views:
                admin.add_view(v)
            for bp in flask_blueprints:
                app.register_blueprint(bp)
            for ml in menu_links:
                admin.add_link(ml)

        integrate_plugins()

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.gethostname(),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

        return app
def create_app(config=None, testing=False):
    log = LoggingMixin().log

    app = Flask(__name__)
    app.wsgi_app = ProxyFix(app.wsgi_app)
    if configuration.conf.get('webserver', 'SECRET_KEY') == "temporary_key":
        log.info("SECRET_KEY for Flask App is not specified. Using a random one.")
        app.secret_key = os.urandom(16)
    else:
        app.secret_key = configuration.conf.get('webserver', 'SECRET_KEY')

    app.config['LOGIN_DISABLED'] = not configuration.conf.getboolean(
        'webserver', 'AUTHENTICATE')

    csrf.init_app(app)

    app.config['TESTING'] = testing

    airflow.load_login()
    airflow.login.login_manager.init_app(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    cache = Cache(
        app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    app.register_blueprint(routes)

    configure_logging()

    with app.app_context():
        from airflow.www import views

        admin = Admin(
            app, name='Airflow',
            static_url_path='/admin',
            index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"),
            template_mode='bootstrap3',
        )
        av = admin.add_view
        vs = views
        av(vs.Airflow(name='DAGs', category='DAGs'))

        if not conf.getboolean('core', 'secure_mode'):
            av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling"))
            av(vs.ChartModelView(
                models.Chart, Session, name="Charts", category="Data Profiling"))
        av(vs.KnownEventView(
            models.KnownEvent, Session, name="Known Events", category="Data Profiling"))
        av(vs.SlaMissModelView(
            models.SlaMiss, Session, name="SLA Misses", category="Browse"))
        av(vs.TaskInstanceModelView(
            models.TaskInstance, Session, name="Task Instances", category="Browse"))
        av(vs.LogModelView(
            models.Log, Session, name="Logs", category="Browse"))
        av(vs.JobModelView(
            jobs.BaseJob, Session, name="Jobs", category="Browse"))
        av(vs.PoolModelView(
            models.Pool, Session, name="Pools", category="Admin"))
        av(vs.ConfigurationView(
            name='Configuration', category="Admin"))
        av(vs.UserModelView(
            models.User, Session, name="Users", category="Admin"))
        av(vs.ConnectionModelView(
            models.Connection, Session, name="Connections", category="Admin"))
        av(vs.VariableView(
            models.Variable, Session, name="Variables", category="Admin"))
        av(vs.XComView(
            models.XCom, Session, name="XComs", category="Admin"))

        admin.add_link(base.MenuLink(
            category='Docs',
            name='Documentation',
            url='https://airflow.incubator.apache.org/'))
        admin.add_link(base.MenuLink(
            category='Docs',
            name='Github',
            url='https://github.com/apache/incubator-airflow'))

        av(vs.VersionView(name='Version', category="About"))

        av(vs.DagRunModelView(
            models.DagRun, Session, name="DAG Runs", category="Browse"))
        av(vs.DagModelView(models.DagModel, Session, name=None))
        # Hack to not add this view to the menu
        admin._menu = admin._menu[:-1]

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (
                admin_views, flask_blueprints, menu_links)
            for v in admin_views:
                log.debug('Adding view %s', v.name)
                admin.add_view(v)
            for bp in flask_blueprints:
                log.debug('Adding blueprint %s', bp.name)
                app.register_blueprint(bp)
            for ml in sorted(menu_links, key=lambda x: x.name):
                log.debug('Adding menu link %s', ml.name)
                admin.add_link(ml)

        integrate_plugins()

        import airflow.www.api.experimental.endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental, url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': get_hostname(),
                'navbar_color': configuration.get('webserver', 'NAVBAR_COLOR'),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

        return app
def start(self):
    self.task_queue = Queue()
    self.result_queue = Queue()
    framework = mesos_pb2.FrameworkInfo()
    framework.user = ''

    if not configuration.get('mesos', 'MASTER'):
        logging.error("Expecting mesos master URL for mesos executor")
        raise AirflowException("mesos.master not provided for mesos executor")

    master = configuration.get('mesos', 'MASTER')

    framework.name = get_framework_name()

    if not configuration.get('mesos', 'TASK_CPU'):
        task_cpu = 1
    else:
        task_cpu = configuration.getint('mesos', 'TASK_CPU')

    if not configuration.get('mesos', 'TASK_MEMORY'):
        task_memory = 256
    else:
        task_memory = configuration.getint('mesos', 'TASK_MEMORY')

    if configuration.getboolean('mesos', 'CHECKPOINT'):
        framework.checkpoint = True

        if configuration.get('mesos', 'FAILOVER_TIMEOUT'):
            # Import here to work around a circular import error
            from airflow.models import Connection

            # Query the database to get the ID of the Mesos Framework, if available.
            conn_id = FRAMEWORK_CONNID_PREFIX + framework.name
            session = Session()
            connection = session.query(Connection).filter_by(conn_id=conn_id).first()
            if connection is not None:
                # Set the Framework ID to let the scheduler reconnect with running tasks.
                framework.id.value = connection.extra

            framework.failover_timeout = configuration.getint('mesos', 'FAILOVER_TIMEOUT')
    else:
        framework.checkpoint = False

    logging.info(
        'MesosFramework master : %s, name : %s, cpu : %s, mem : %s, checkpoint : %s',
        master, framework.name, str(task_cpu), str(task_memory), str(framework.checkpoint))

    implicit_acknowledgements = 1

    if configuration.getboolean('mesos', 'AUTHENTICATE'):
        if not configuration.get('mesos', 'DEFAULT_PRINCIPAL'):
            logging.error("Expecting authentication principal in the environment")
            raise AirflowException("mesos.default_principal not provided in authenticated mode")
        if not configuration.get('mesos', 'DEFAULT_SECRET'):
            logging.error("Expecting authentication secret in the environment")
            raise AirflowException("mesos.default_secret not provided in authenticated mode")

        credential = mesos_pb2.Credential()
        credential.principal = configuration.get('mesos', 'DEFAULT_PRINCIPAL')
        credential.secret = configuration.get('mesos', 'DEFAULT_SECRET')

        framework.principal = credential.principal

        driver = mesos.native.MesosSchedulerDriver(
            AirflowMesosScheduler(self.task_queue, self.result_queue, task_cpu, task_memory),
            framework,
            master,
            implicit_acknowledgements,
            credential)
    else:
        framework.principal = 'Airflow'
        driver = mesos.native.MesosSchedulerDriver(
            AirflowMesosScheduler(self.task_queue, self.result_queue, task_cpu, task_memory),
            framework,
            master,
            implicit_acknowledgements)

    self.mesos_driver = driver
    self.mesos_driver.start()