def flower(args):
    broka = conf.get('celery', 'BROKER_URL')
    args.port = args.port or conf.get('celery', 'FLOWER_PORT')
    port = '--port=' + args.port
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api

    if not args.foreground:
        pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout,
                                                        args.stderr, args.log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            stdout=stdout,
            stderr=stderr,
        )

        with ctx:
            sp = subprocess.Popen(['flower', '-b', broka, port, api])
            sp.wait()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        sp = subprocess.Popen(['flower', '-b', broka, port, api])
        sp.wait()
def renew_from_kt():
    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", socket.getfqdn())

    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-r", renewal_lifetime,
            "-k",  # host ticket
            "-t", configuration.get('kerberos', 'keytab'),   # specify keytab
            "-c", configuration.get('kerberos', 'ccache'),   # specify credentials cache
            principal]
    LOG.info("Reinitting kerberos from keytab: " + " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1)
    subp.wait()
    if subp.returncode != 0:
        LOG.error("Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s" % (
            subp.returncode,
            "\n".join(subp.stdout.readlines()),
            "\n".join(subp.stderr.readlines())))
        sys.exit(subp.returncode)

    global NEED_KRB181_WORKAROUND
    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clocks have second-level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
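The function above delegates to perform_krb181_workaround(), which is not shown here. A minimal sketch of what such a helper could look like, assuming the workaround simply renews the ticket already sitting in the credentials cache with kinit -R; the exact flags, config keys and error handling are assumptions, not taken from this snippet:

def perform_krb181_workaround():
    # Ask the KDC to renew the ticket already present in the credentials cache.
    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-c", configuration.get('kerberos', 'ccache'),
            "-R"]
    LOG.info("Renewing kerberos ticket to work around kerberos 1.8.1: " + " ".join(cmdv))
    ret = subprocess.call(cmdv, close_fds=True)
    if ret != 0:
        LOG.error("Couldn't renew kerberos ticket to work around Kerberos 1.8.1. "
                  "Please check that the ticket for %r is still renewable.",
                  configuration.get('kerberos', 'principal'))
        sys.exit(ret)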
def try_login(username, password):
    conn = get_ldap_connection(configuration.get("ldap", "bind_user"),
                               configuration.get("ldap", "bind_password"))

    search_filter = "(&({0})({1}={2}))".format(
        configuration.get("ldap", "user_filter"),
        configuration.get("ldap", "user_name_attr"),
        username
    )

    # todo: BASE or ONELEVEL?
    res = conn.search(configuration.get("ldap", "basedn"), search_filter,
                      search_scope=LEVEL)

    # todo: use list or result?
    if not res:
        LOG.info("Cannot find user %s", username)
        raise AuthenticationError("Invalid username or password")

    entry = conn.response[0]

    conn.unbind()
    conn = get_ldap_connection(entry['dn'], password)

    if not conn:
        LOG.info("Password incorrect for user %s", username)
        raise AuthenticationError("Invalid username or password")
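try_login() depends on a get_ldap_connection() helper that is not part of this excerpt. A hedged sketch of one way to write it with the ldap3 library; the [ldap] uri key and the error handling are assumptions for illustration:

from ldap3 import Server, Connection

def get_ldap_connection(dn=None, password=None):
    # Bind to the LDAP server configured under the (assumed) [ldap] uri key.
    server = Server(configuration.get("ldap", "uri"))
    conn = Connection(server, dn, password)
    if not conn.bind():
        LOG.error("Cannot bind to ldap server: %s", conn.result)
        raise AuthenticationError("Cannot bind to ldap server")
    return conn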
def webserver(args):
    print(settings.HEADER)

    from airflow.www.app import cached_app
    app = cached_app(conf)
    workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'webserver_worker_timeout'))
    if args.debug:
        print(
            "Starting the web server on port {0} and host {1}.".format(
                args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            'Running the Gunicorn server with {workers} {args.workerclass} '
            'workers on host {args.hostname} and port '
            '{args.port} with a timeout of {worker_timeout}...'.format(**locals()))
        sp = subprocess.Popen([
            'gunicorn',
            '-w', str(args.workers),
            '-k', str(args.workerclass),
            '-t', str(args.worker_timeout),
            '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver',
            '--pid', pid,
            'airflow.www.app:cached_app()'])

        if args.foreground:
            sp.wait()
def configure_vars():
    global AIRFLOW_HOME
    global SQL_ALCHEMY_CONN
    global DAGS_FOLDER
    AIRFLOW_HOME = os.path.expanduser(conf.get('core', 'AIRFLOW_HOME'))
    SQL_ALCHEMY_CONN = conf.get('core', 'SQL_ALCHEMY_CONN')
    DAGS_FOLDER = os.path.expanduser(conf.get('core', 'DAGS_FOLDER'))
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    log = LoggingMixin().log

    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')
    SMTP_SSL = configuration.getboolean('smtp', 'SMTP_SSL')
    SMTP_USER = None
    SMTP_PASSWORD = None

    try:
        SMTP_USER = configuration.get('smtp', 'SMTP_USER')
        SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    except AirflowConfigException:
        log.debug("No user/password found for SMTP, so logging in with no authentication.")

    if not dryrun:
        s = smtplib.SMTP_SSL(SMTP_HOST, SMTP_PORT) if SMTP_SSL else smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        log.info("Sent an alert email to %s", e_to)
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
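A short usage sketch for send_MIME_email(); the addresses and subject are placeholders, and dryrun=True skips the SMTP round trip, so it is safe to run as-is:

from email.mime.text import MIMEText

msg = MIMEText("<b>Job finished</b>", "html")
msg["Subject"] = "Airflow alert"
msg["From"] = "airflow@example.com"
msg["To"] = "ops@example.com"

send_MIME_email("airflow@example.com", ["ops@example.com"], msg, dryrun=True)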
def get_metastore_client(self):
    """
    Returns a Hive thrift client.
    """
    from thrift.transport import TSocket, TTransport
    from thrift.protocol import TBinaryProtocol
    from hive_service import ThriftHive
    ms = self.metastore_conn
    auth_mechanism = ms.extra_dejson.get('authMechanism', 'NOSASL')
    if configuration.get('core', 'security') == 'kerberos':
        auth_mechanism = ms.extra_dejson.get('authMechanism', 'GSSAPI')
        kerberos_service_name = ms.extra_dejson.get('kerberos_service_name', 'hive')

    socket = TSocket.TSocket(ms.host, ms.port)
    if configuration.get('core', 'security') == 'kerberos' and auth_mechanism == 'GSSAPI':
        try:
            import saslwrapper as sasl
        except ImportError:
            import sasl

        def sasl_factory():
            sasl_client = sasl.Client()
            sasl_client.setAttr("host", ms.host)
            sasl_client.setAttr("service", kerberos_service_name)
            sasl_client.init()
            return sasl_client

        from thrift_sasl import TSaslClientTransport
        transport = TSaslClientTransport(sasl_factory, "GSSAPI", socket)
    else:
        transport = TTransport.TBufferedTransport(socket)

    protocol = TBinaryProtocol.TBinaryProtocol(transport)

    return ThriftHive.Client(protocol)
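Assuming this method lives on Airflow's HiveMetastoreHook, a hedged usage sketch might look as follows; the import path, connection id, database and table names are placeholders, and the thrift transport is opened by hand because the returned client does not open it for you:

from airflow.hooks.hive_hooks import HiveMetastoreHook  # import path may differ by version

hook = HiveMetastoreHook(metastore_conn_id='metastore_default')
client = hook.get_metastore_client()

client._iprot.trans.open()                      # open the underlying thrift transport
table = client.get_table(dbname='default', tbl_name='some_table')
print([field.name for field in table.sd.cols])  # column names from the table schema
client._iprot.trans.close()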
def flower(args): broka = configuration.get("celery", "BROKER_URL") args.port = args.port or configuration.get("celery", "FLOWER_PORT") port = "--port=" + args.port api = "" if args.broker_api: api = "--broker_api=" + args.broker_api sp = subprocess.Popen(["flower", "-b", broka, port, api]) sp.wait()
def flower(args):
    broka = configuration.get('celery', 'BROKER_URL')
    args.port = args.port or configuration.get('celery', 'FLOWER_PORT')
    port = '--port=' + args.port
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api
    flower = distutils.spawn.find_executable('flower')
    os.execv(flower, [flower, '-b', broka, port, api])
def __init__(self, cluster_address=None):
    if cluster_address is None:
        cluster_address = configuration.conf.get('dask', 'cluster_address')
    if not cluster_address:
        raise ValueError(
            'Please provide a Dask cluster address in airflow.cfg')
    self.cluster_address = cluster_address

    # ssl / tls parameters
    self.tls_ca = configuration.get('dask', 'tls_ca')
    self.tls_key = configuration.get('dask', 'tls_key')
    self.tls_cert = configuration.get('dask', 'tls_cert')

    super(DaskExecutor, self).__init__(parallelism=0)
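The tls_* values read above are presumably handed to the distributed client when the executor starts. A minimal sketch of how that could be done with distributed's Security object; the method name and its placement are assumptions:

from distributed import Client
from distributed.security import Security

def _dask_client(self):
    # Hypothetical helper: build a TLS-aware client for self.cluster_address.
    security = None
    if self.tls_ca or self.tls_key or self.tls_cert:
        security = Security(tls_client_key=self.tls_key,
                            tls_client_cert=self.tls_cert,
                            tls_ca_file=self.tls_ca,
                            require_encryption=True)
    return Client(self.cluster_address, security=security)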
def webserver(args):
    print(settings.HEADER)

    from airflow.www.app import cached_app
    app = cached_app(conf)
    access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile')
    workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'webserver_worker_timeout'))

    if args.debug:
        print(
            "Starting the web server on port {0} and host {1}.".format(
                args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        pid, stdout, stderr, log_file = setup_locations("webserver", pid=args.pid)
        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {workers} {args.workerclass}
                Host: {args.hostname}:{args.port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(**locals())))

        run_args = [
            'gunicorn',
            '-w ' + str(args.workers),
            '-k ' + str(args.workerclass),
            '-t ' + str(args.worker_timeout),
            '-b ' + args.hostname + ':' + str(args.port),
            '-n ' + 'airflow-webserver',
            '-p ' + str(pid),
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ["-D"]

        module = "airflow.www.app:cached_app()".encode()
        run_args += [module]
        os.execvp(
            'gunicorn', run_args
        )
def authenticate(username, password):
    service_principal = "%s/%s" % (configuration.get('kerberos', 'principal'),
                                   utils.get_fqdn())
    realm = configuration.get("kerberos", "default_realm")
    user_principal = utils.principal_from_username(username)

    try:
        # this is pykerberos specific, verify = True is needed to prevent KDC spoofing
        if not kerberos.checkPassword(user_principal, password,
                                      service_principal, realm, True):
            raise AuthenticationError()
    except kerberos.KrbError as e:
        logging.error('Password validation for principal %s failed %s', user_principal, e)
        raise AuthenticationError(e)

    return
def __init__(self, task_queue, result_queue, task_cpu=1, task_mem=256):
    self.task_queue = task_queue
    self.result_queue = result_queue
    self.task_cpu = task_cpu
    self.task_mem = task_mem
    self.task_counter = 0
    self.task_key_map = {}
    if configuration.get('mesos', 'DOCKER_IMAGE_SLAVE'):
        self.mesos_slave_docker_image = configuration.get(
            'mesos', 'DOCKER_IMAGE_SLAVE'
        )
def get_conn(self):
    """
    Returns a snakebite HDFSClient object.
    """
    connections = self.get_connections(self.hdfs_conn_id)

    use_sasl = False
    if configuration.get('core', 'security') == 'kerberos':
        use_sasl = True

    # When using HAClient, proxy_user must be the same, so is ok to always take the first.
    effective_user = self.proxy_user or connections[0].login
    if len(connections) == 1:
        autoconfig = connections[0].extra_dejson.get('autoconfig', False)
        if autoconfig:
            client = AutoConfigClient(effective_user=effective_user,
                                      use_sasl=use_sasl)
        else:
            hdfs_namenode_principal = connections[0].extra_dejson.get(
                'hdfs_namenode_principal')
            client = Client(connections[0].host, connections[0].port,
                            effective_user=effective_user, use_sasl=use_sasl,
                            hdfs_namenode_principal=hdfs_namenode_principal)
    elif len(connections) > 1:
        hdfs_namenode_principal = connections[0].extra_dejson.get(
            'hdfs_namenode_principal')
        nn = [Namenode(conn.host, conn.port) for conn in connections]
        client = HAClient(nn, effective_user=effective_user, use_sasl=use_sasl,
                          hdfs_namenode_principal=hdfs_namenode_principal)
    else:
        raise HDFSHookException("conn_id doesn't exist in the repository")

    return client
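A short usage sketch for the client returned by get_conn(); the connection id and HDFS path are placeholders, and snakebite's ls() yields plain dicts:

hook = HDFSHook(hdfs_conn_id='hdfs_default')   # placeholder conn id
client = hook.get_conn()

# List a directory; each entry is a dict with keys such as 'path' and 'length'.
for entry in client.ls(['/user/airflow'], include_toplevel=False):
    print(entry['path'], entry['length'])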
def flower(args): broka = conf.get("celery", "BROKER_URL") address = "--address={}".format(args.hostname) port = "--port={}".format(args.port) api = "" if args.broker_api: api = "--broker_api=" + args.broker_api flower_conf = "" if args.flower_conf: flower_conf = "--conf=" + args.flower_conf if args.daemon: pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout, args.stderr, args.log_file) stdout = open(stdout, "w+") stderr = open(stderr, "w+") ctx = daemon.DaemonContext(pidfile=TimeoutPIDLockFile(pid, -1), stdout=stdout, stderr=stderr) with ctx: os.execvp("flower", ["flower", "-b", broka, address, port, api, flower_conf]) stdout.close() stderr.close() else: signal.signal(signal.SIGINT, sigint_handler) signal.signal(signal.SIGTERM, sigint_handler) os.execvp("flower", ["flower", "-b", broka, address, port, api, flower_conf])
def send_email(to, subject, html_content, files=None, dryrun=False):
    """
    Send an email with html content

    >>> send_email('*****@*****.**', 'foo', '<b>Foo</b> bar', ['/dev/null'], dryrun=True)
    """
    SMTP_MAIL_FROM = configuration.get('smtp', 'SMTP_MAIL_FROM')

    if isinstance(to, basestring):
        if ',' in to:
            to = to.split(',')
        elif ';' in to:
            to = to.split(';')
        else:
            to = [to]

    msg = MIMEMultipart('alternative')
    msg['Subject'] = subject
    msg['From'] = SMTP_MAIL_FROM
    msg['To'] = ", ".join(to)
    mime_text = MIMEText(html_content, 'html')
    msg.attach(mime_text)

    for fname in files or []:
        basename = os.path.basename(fname)
        with open(fname, "rb") as f:
            msg.attach(MIMEApplication(
                f.read(),
                Content_Disposition='attachment; filename="%s"' % basename,
                Name=basename
            ))

    send_MIME_email(SMTP_MAIL_FROM, to, msg, dryrun)
def send_MIME_email(e_from, e_to, mime_msg, dryrun=False):
    SMTP_HOST = configuration.get('smtp', 'SMTP_HOST')
    SMTP_PORT = configuration.getint('smtp', 'SMTP_PORT')
    SMTP_USER = configuration.get('smtp', 'SMTP_USER')
    SMTP_PASSWORD = configuration.get('smtp', 'SMTP_PASSWORD')
    SMTP_STARTTLS = configuration.getboolean('smtp', 'SMTP_STARTTLS')

    if not dryrun:
        s = smtplib.SMTP(SMTP_HOST, SMTP_PORT)
        if SMTP_STARTTLS:
            s.starttls()
        if SMTP_USER and SMTP_PASSWORD:
            s.login(SMTP_USER, SMTP_PASSWORD)
        logging.info("Sent an alert email to " + str(e_to))
        s.sendmail(e_from, e_to, mime_msg.as_string())
        s.quit()
def flower(args):
    broka = conf.get('celery', 'BROKER_URL')
    port = '--port={}'.format(args.port)
    api = ''
    if args.broker_api:
        api = '--broker_api=' + args.broker_api

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations("flower", args.pid, args.stdout,
                                                        args.stderr, args.log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            stdout=stdout,
            stderr=stderr,
        )

        with ctx:
            os.execvp("flower", ['flower', '-b', broka, port, api])

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        os.execvp("flower", ['flower', '-b', broka, port, api])
def get_results(self, ti=None, fp=None, inline=True, delim=None, fetch=True): """ Get results (or just s3 locations) of a command from Qubole and save into a file :param ti: Task Instance of the dag, used to determine the Quboles command id :param fp: Optional file pointer, will create one and return if None passed :param inline: True to download actual results, False to get s3 locations only :param delim: Replaces the CTL-A chars with the given delim, defaults to ',' :param fetch: when inline is True, get results directly from s3 (if large) :return: file location containing actual results or s3 locations of results """ if fp is None: iso = datetime.datetime.utcnow().isoformat() logpath = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER')) resultpath = logpath + '/' + self.dag_id + '/' + self.task_id + '/results' configuration.mkdir_p(resultpath) fp = open(resultpath + '/' + iso, 'wb') if self.cmd is None: cmd_id = ti.xcom_pull(key="qbol_cmd_id", task_ids=self.task_id) self.cmd = self.cls.find(cmd_id) self.cmd.get_results(fp, inline, delim, fetch) fp.flush() fp.close() return fp.name
def __init__( self, dag_id=None, dag_ids=None, subdir=None, test_mode=False, refresh_dags_every=10, num_runs=None, do_pickle=False, *args, **kwargs): # for BaseJob compatibility self.dag_id = dag_id self.dag_ids = [dag_id] if dag_id else [] if dag_ids: self.dag_ids.extend(dag_ids) self.subdir = subdir if test_mode: self.num_runs = 1 else: self.num_runs = num_runs self.refresh_dags_every = refresh_dags_every self.do_pickle = do_pickle super(SchedulerJob, self).__init__(*args, **kwargs) self.heartrate = conf.getint('scheduler', 'SCHEDULER_HEARTBEAT_SEC') self.max_threads = min(conf.getint('scheduler', 'max_threads'), multiprocessing.cpu_count()) if 'sqlite' in conf.get('core', 'sql_alchemy_conn'): if self.max_threads > 1: self.logger.error("Cannot use more than 1 thread when using sqlite. Setting max_threads to 1") self.max_threads = 1
def __init__( self, hql, hive_cli_conn_id='hive_cli_default', schema='default', hiveconfs=None, hiveconf_jinja_translate=False, script_begin_tag=None, run_as_owner=False, mapred_queue=None, mapred_queue_priority=None, mapred_job_name=None, *args, **kwargs): super(HiveOperator, self).__init__(*args, **kwargs) self.hql = hql self.hive_cli_conn_id = hive_cli_conn_id self.schema = schema self.hiveconfs = hiveconfs or {} self.hiveconf_jinja_translate = hiveconf_jinja_translate self.script_begin_tag = script_begin_tag self.run_as = None if run_as_owner: self.run_as = self.dag.owner self.mapred_queue = mapred_queue self.mapred_queue_priority = mapred_queue_priority self.mapred_job_name = mapred_job_name self.mapred_job_name_template = configuration.get('hive', 'mapred_job_name_template') # assigned lazily - just for consistency we can create the attribute with a # `None` initial value, later it will be populated by the execute method. # This also makes `on_kill` implementation consistent since it assumes `self.hook` # is defined. self.hook = None
def load_login(): log = LoggingMixin().log auth_backend = 'airflow.default_login' try: if conf.getboolean('webserver', 'AUTHENTICATE'): auth_backend = conf.get('webserver', 'auth_backend') except conf.AirflowConfigException: if conf.getboolean('webserver', 'AUTHENTICATE'): log.warning( "auth_backend not found in webserver config reverting to " "*deprecated* behavior of importing airflow_login") auth_backend = "airflow_login" try: global login login = import_module(auth_backend) except ImportError as err: log.critical( "Cannot import authentication module %s. " "Please correct your authentication backend or disable authentication: %s", auth_backend, err ) if conf.getboolean('webserver', 'AUTHENTICATE'): raise AirflowException("Failed to import authentication backend")
def resetdb(args):
    print("DB: " + configuration.get("core", "SQL_ALCHEMY_CONN"))
    if input("This will drop existing tables if they exist. "
             "Proceed? (y/n)").upper() == "Y":
        logging.basicConfig(level=settings.LOGGING_LEVEL,
                            format=settings.SIMPLE_LOG_FORMAT)
        utils.resetdb()
    else:
        print("Bail.")
def validate_logging_config(logging_config):
    # Now lets validate the other logging-related settings
    task_log_reader = conf.get('core', 'task_log_reader')

    logger = logging.getLogger('airflow.task')

    def _get_handler(name):
        return next((h for h in logger.handlers if h.name == name), None)

    if _get_handler(task_log_reader) is None:
        # Check for pre 1.10 setting that might be in deployed airflow.cfg files
        if task_log_reader == "file.task" and _get_handler("task"):
            warnings.warn(
                "task_log_reader setting in [core] has a deprecated value of "
                "{!r}, but no handler with this name was found. Please update "
                "your config to use {!r}. Running config has been adjusted to "
                "match".format(
                    task_log_reader,
                    "task",
                ),
                DeprecationWarning,
            )
            conf.set('core', 'task_log_reader', 'task')
        else:
            raise AirflowConfigException(
                "Configured task_log_reader {!r} was not a handler of the 'airflow.task' "
                "logger.".format(task_log_reader)
            )
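For reference, a minimal dictConfig-style sketch of what this check expects: a handler registered on the airflow.task logger whose key (and therefore handler name) matches task_log_reader. The handler class and file path below are illustrative assumptions, not Airflow's actual defaults:

LOGGING_CONFIG = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'airflow': {
            'format': '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
        },
    },
    'handlers': {
        # dictConfig sets handler.name to this key, which is what
        # validate_logging_config() compares against task_log_reader.
        'task': {
            'class': 'logging.FileHandler',
            'formatter': 'airflow',
            'filename': '/tmp/airflow-task.log',
        },
    },
    'loggers': {
        'airflow.task': {
            'handlers': ['task'],
            'level': 'INFO',
            'propagate': False,
        },
    },
}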
def webserver(args):
    print(settings.HEADER)
    log_to_stdout()

    from airflow.www.app import cached_app
    app = cached_app(configuration)
    threads = args.threads or configuration.get("webserver", "threads")
    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(
            args.port, args.hostname))
        app.run(debug=True, port=args.port, host=args.hostname)
    else:
        print(
            "Running the Gunicorn server with {threads} threads "
            "on host {args.hostname} and port "
            "{args.port}...".format(**locals())
        )
        sp = subprocess.Popen(
            [
                "gunicorn",
                "-w", str(args.threads),
                "-t", "120",
                "-b", args.hostname + ":" + str(args.port),
                "airflow.www.app:cached_app()",
            ]
        )
        sp.wait()
def serve_logs(filename):  # noqa
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(
        log, filename,
        mimetype="application/json",
        as_attachment=False)
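serve_logs() above is a bare Flask view; a minimal sketch of how it might be mounted and served so the webserver can fetch worker logs over HTTP. The URL rule and port are assumptions here (8793 is commonly used as the worker log-server port):

import flask

flask_app = flask.Flask(__name__)
flask_app.add_url_rule('/log/<path:filename>', 'serve_logs', serve_logs)

if __name__ == '__main__':
    flask_app.run(host='0.0.0.0', port=8793)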
def write(self, log, remote_log_location, append=False): """ Writes the log to the remote_log_location. Fails silently if no hook was created. :param log: the log to write to the remote_log_location :type log: string :param remote_log_location: the log's location in remote storage :type remote_log_location: string (path) :param append: if False, any existing log file is overwritten. If True, the new log is appended to any existing logs. :type append: bool """ if self.hook: if append: old_log = self.read(remote_log_location) log = old_log + '\n' + log try: self.hook.load_string( log, key=remote_log_location, replace=True, encrypt=configuration.get('core', 'ENCRYPT_S3_LOGS')) return except: pass # raise/return error if we get here logging.error('Could not write logs to {}'.format(remote_log_location))
def __init__( self, hive_cli_conn_id="hive_cli_default", run_as=None, mapred_queue=None, mapred_queue_priority=None, mapred_job_name=None): conn = self.get_connection(hive_cli_conn_id) self.hive_cli_params = conn.extra_dejson.get('hive_cli_params', '') self.use_beeline = conn.extra_dejson.get('use_beeline', False) self.auth = conn.extra_dejson.get('auth', 'noSasl') self.conn = conn self.run_as = run_as if mapred_queue_priority: mapred_queue_priority = mapred_queue_priority.upper() if mapred_queue_priority not in HIVE_QUEUE_PRIORITIES: raise AirflowException( "Invalid Mapred Queue Priority. Valid values are: " "{}".format(', '.join(HIVE_QUEUE_PRIORITIES))) self.mapred_queue = mapred_queue or configuration.get('hive', 'default_hive_mapred_queue') self.mapred_queue_priority = mapred_queue_priority self.mapred_job_name = mapred_job_name
def run(args): utils.pessimistic_connection_handling() # Setting up logging log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER')) directory = log + "/{args.dag_id}/{args.task_id}".format(args=args) if not os.path.exists(directory): os.makedirs(directory) args.execution_date = dateutil.parser.parse(args.execution_date) iso = args.execution_date.isoformat() filename = "{directory}/{iso}".format(**locals()) # store old log (to help with S3 appends) if os.path.exists(filename): with open(filename, 'r') as logfile: old_log = logfile.read() else: old_log = None subdir = process_subdir(args.subdir) logging.basicConfig(filename=filename, level=settings.LOGGING_LEVEL, format=settings.LOG_FORMAT) if not args.pickle: dagbag = DagBag(subdir) if args.dag_id not in dagbag.dags: msg = 'DAG [{0}] could not be found in {1}'.format( args.dag_id, subdir) logging.error(msg) raise AirflowException(msg) dag = dagbag.dags[args.dag_id] task = dag.get_task(task_id=args.task_id) else: session = settings.Session() logging.info('Loading pickle id {args.pickle}'.format(**locals())) dag_pickle = session.query(DagPickle).filter( DagPickle.id == args.pickle).first() if not dag_pickle: raise AirflowException("Who hid the pickle!? [missing pickle]") dag = dag_pickle.pickle task = dag.get_task(task_id=args.task_id) task_start_date = None if args.task_start_date: task_start_date = dateutil.parser.parse(args.task_start_date) task.start_date = task_start_date ti = TaskInstance(task, args.execution_date) if args.local: print("Logging into: " + filename) run_job = jobs.LocalTaskJob( task_instance=ti, mark_success=args.mark_success, force=args.force, pickle_id=args.pickle, task_start_date=task_start_date, ignore_dependencies=args.ignore_dependencies, pool=args.pool) run_job.run() elif args.raw: ti.run( mark_success=args.mark_success, force=args.force, ignore_dependencies=args.ignore_dependencies, job_id=args.job_id, pool=args.pool, ) else: pickle_id = None if args.ship_dag: try: # Running remotely, so pickling the DAG session = settings.Session() pickle = DagPickle(dag) session.add(pickle) session.commit() pickle_id = pickle.id print(('Pickled dag {dag} ' 'as pickle_id:{pickle_id}').format(**locals())) except Exception as e: print('Could not pickle the DAG') print(e) raise e executor = DEFAULT_EXECUTOR executor.start() print("Sending to executor.") executor.queue_task_instance( ti, mark_success=args.mark_success, pickle_id=pickle_id, ignore_dependencies=args.ignore_dependencies, force=args.force) executor.heartbeat() executor.end() if configuration.get('core', 'S3_LOG_FOLDER').startswith('s3:'): import boto s3_log = filename.replace(log, configuration.get('core', 'S3_LOG_FOLDER')) bucket, key = s3_log.lstrip('s3:/').split('/', 1) if os.path.exists(filename): # get logs with open(filename, 'r') as logfile: new_log = logfile.read() # remove old logs (since they are already in S3) if old_log: new_log.replace(old_log, '') try: s3 = boto.connect_s3() s3_key = boto.s3.key.Key(s3.get_bucket(bucket), key) # append new logs to old S3 logs, if available if s3_key.exists(): old_s3_log = s3_key.get_contents_as_string().decode() new_log = old_s3_log + '\n' + new_log # send log to S3 s3_key.set_contents_from_string(new_log) except: print('Could not send logs to S3.')
from airflow import DAG from datetime import datetime, timedelta from airflow.contrib.operators.kubernetes_pod_operator import KubernetesPodOperator from airflow import configuration as conf default_args = { 'owner': 'airflow', 'depends_on_past': False, 'start_date': datetime(2019, 1, 1), 'email_on_failure': False, 'email_on_retry': False, 'retries': 1, 'retry_delay': timedelta(minutes=5), } namespace = conf.get('kubernetes', 'NAMESPACE') # This will detect the default namespace locally and read the # environment namespace when deployed to Astronomer. if namespace == 'default': config_file = '/usr/local/airflow/include/.kube/config' in_cluster = False else: in_cluster = True config_file = None dag = DAG('example_kubernetes_pod', schedule_interval='@once', default_args=default_args) compute_resource = {
import traceback import time import psutil import airflow from airflow import jobs, settings from airflow import configuration as conf from airflow.exceptions import AirflowException from airflow.executors import DEFAULT_EXECUTOR from airflow.models import DagModel, DagBag, TaskInstance, DagPickle, DagRun, Variable from airflow.utils import db as db_utils from airflow.utils import logging as logging_utils from airflow.utils.state import State from airflow.www.app import cached_app DAGS_FOLDER = os.path.expanduser(conf.get('core', 'DAGS_FOLDER')) def sigint_handler(sig, frame): sys.exit(0) def sigquit_handler(sig, frame): """Helps debug deadlocks by printing stacktraces when this gets a SIGQUIT e.g. kill -s QUIT <PID> or CTRL+\ """ print("Dumping stack traces for all threads in PID {}".format(os.getpid())) id_to_name = dict([(th.ident, th.name) for th in threading.enumerate()]) code = [] for thread_id, stack in sys._current_frames().items(): code.append("\n# Thread: {}({})"
from airflow import DAG
from airflow.operators import RedshiftVacuumOperator
from airflow.operators.dummy_operator import DummyOperator
from airflow import configuration
from datetime import datetime

args = {
    'owner': 'scopeworker',
    'provide_context': True
}

dag = DAG('redshift_vacuum_plugin',
          description='REDSHIFT VACUUM DAG',
          schedule_interval='0 5 * * *',
          start_date=datetime(2017, 3, 20),
          catchup=False,
          default_args=args)

redshift_operator = RedshiftVacuumOperator(
    task_id="vacumming_task",
    redshift_connection_id=configuration.get("postgresql", "postgresql_conn_id"),
    query="COMMIT;vacuum; ANALYZE; COMMIT;",
    dag=dag)

dummy_operator = DummyOperator(task_id='dummy_task', retries=3, dag=dag)

dummy_operator >> redshift_operator
from airflow.operators import python_operator from airflow.utils.trigger_rule import TriggerRule from airflow.operators import email_operator # We set the start_date of the DAG to the previous date. This will # make the DAG immediately available for scheduling. YESTERDAY = datetime.datetime.combine( datetime.datetime.today() - datetime.timedelta(1), datetime.datetime.min.time()) # We define some variables that we will use in the DAG tasks. SUCCESS_TAG = 'success' FAILURE_TAG = 'failure' DS_TAG = '{{ ds }}' DATAFLOW_FILE = os.path.join(configuration.get('core', 'dags_folder'), 'dataflow', 'process_json.py') DEFAULT_DAG_ARGS = { 'start_date': YESTERDAY, 'email': models.Variable.get('email'), 'email_on_failure': True, 'email_on_retry': False, 'retries': 0, 'project_id': models.Variable.get('gcp_project'), 'dataflow_default_options': { 'project': models.Variable.get('gcp_project'), 'temp_location': models.Variable.get('gcp_temp_location'), 'runner': 'DataflowRunner' } }
def get_es_hosts():
    result = conf.get("core", "elasticsearch_hosts")
    assert result
    return [x.strip().split(':') for x in result.split(',')]
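A small worked example of the parsing above, with the config value inlined as a placeholder string so it can run on its own:

result = "es1.example.com:9200, es2.example.com:9200"   # what conf.get(...) might return
hosts = [x.strip().split(':') for x in result.split(',')]
assert hosts == [['es1.example.com', '9200'], ['es2.example.com', '9200']]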
def verify_s3_prefix():
    reach_s3_prefix = conf.get("core", "reach_s3_prefix")
    assert reach_s3_prefix.startswith('s3://')
    assert not reach_s3_prefix.endswith('/')
import unittest from airflow import configuration from airflow.models import DAG, DagBag, TaskInstance, State from airflow.jobs import BackfillJob from airflow.operators.python_operator import PythonOperator try: from airflow.executors.dask_executor import DaskExecutor from distributed import LocalCluster SKIP_DASK = False except ImportError: logging.error('Dask unavailable, skipping DaskExecutor tests') SKIP_DASK = True if 'sqlite' in configuration.get('core', 'sql_alchemy_conn'): logging.error('sqlite does not support concurrent access') SKIP_DASK = True DEFAULT_DATE = datetime.datetime(2017, 1, 1) class DaskExecutorTest(unittest.TestCase): def setUp(self): self.dagbag = DagBag(include_examples=True) @unittest.skipIf(SKIP_DASK, 'Dask unsupported by this configuration') def test_dask_executor_functions(self): cluster = LocalCluster() executor = DaskExecutor(cluster_address=cluster.scheduler_address)
import os import pendulum from sqlalchemy import create_engine from sqlalchemy.orm import scoped_session, sessionmaker from sqlalchemy.pool import NullPool from airflow import configuration as conf from airflow.logging_config import configure_logging log = logging.getLogger(__name__) TIMEZONE = pendulum.timezone('UTC') try: tz = conf.get("core", "default_timezone") if tz == "system": TIMEZONE = pendulum.local_timezone() else: TIMEZONE = pendulum.timezone(tz) except: pass log.info("Configured default timezone %s" % TIMEZONE) class DummyStatsLogger(object): @classmethod def incr(cls, stat, count=1, rate=1): pass @classmethod
class CLIFactory(object): args = { # Shared 'dag_id': Arg(("dag_id", ), "The id of the dag"), 'task_id': Arg(("task_id", ), "The id of the task"), 'execution_date': Arg(("execution_date", ), help="The execution date of the DAG", type=parsedate), 'task_regex': Arg(("-t", "--task_regex"), "The regex to filter specific task_ids to backfill (optional)"), 'subdir': Arg(("-sd", "--subdir"), "File location or directory from which to look for the dag", default=DAGS_FOLDER), 'start_date': Arg(("-s", "--start_date"), "Override start_date YYYY-MM-DD", type=parsedate), 'end_date': Arg(("-e", "--end_date"), "Override end_date YYYY-MM-DD", type=parsedate), 'dry_run': Arg(("-dr", "--dry_run"), "Perform a dry run", "store_true"), 'pid': Arg(("--pid", ), "PID file location", nargs='?'), 'foreground': Arg(("-f", "--foreground"), "Do not detach. Run in foreground", "store_true"), 'stderr': Arg(("--stderr", ), "Redirect stderr to this file"), 'stdout': Arg(("--stdout", ), "Redirect stdout to this file"), 'log_file': Arg(("-l", "--log-file"), "Location of the log file"), # backfill 'mark_success': Arg(("-m", "--mark_success"), "Mark jobs as succeeded without running them", "store_true"), 'local': Arg(("-l", "--local"), "Run the task using the LocalExecutor", "store_true"), 'donot_pickle': Arg(("-x", "--donot_pickle"), ("Do not attempt to pickle the DAG object to send over " "to the workers, just tell the workers to run their version " "of the code."), "store_true"), 'include_adhoc': Arg(("-a", "--include_adhoc"), "Include dags with the adhoc parameter.", "store_true"), 'bf_ignore_dependencies': Arg(("-i", "--ignore_dependencies"), ("Skip upstream tasks, run only the tasks " "matching the regexp. Only works in conjunction " "with task_regex"), "store_true"), 'bf_ignore_first_depends_on_past': Arg(("-I", "--ignore_first_depends_on_past"), ("Ignores depends_on_past dependencies for the first " "set of tasks only (subsequent executions in the backfill " "DO respect depends_on_past)."), "store_true"), 'pool': Arg(("--pool", ), "Resource pool to use"), # list_dags 'tree': Arg(("-t", "--tree"), "Tree view", "store_true"), # clear 'upstream': Arg(("-u", "--upstream"), "Include upstream tasks", "store_true"), 'only_failed': Arg(("-f", "--only_failed"), "Only failed jobs", "store_true"), 'only_running': Arg(("-r", "--only_running"), "Only running jobs", "store_true"), 'downstream': Arg(("-d", "--downstream"), "Include downstream tasks", "store_true"), 'no_confirm': Arg(("-c", "--no_confirm"), "Do not request confirmation", "store_true"), # trigger_dag 'run_id': Arg(("-r", "--run_id"), "Helps to indentify this run"), 'conf': Arg(('-c', '--conf'), "json string that gets pickled into the DagRun's conf attribute"), # kerberos 'principal': Arg(("principal", ), "kerberos principal", nargs='?', default=conf.get('kerberos', 'principal')), 'keytab': Arg(("-kt", "--keytab"), "keytab", nargs='?', default=conf.get('kerberos', 'keytab')), # run 'force': Arg(("-f", "--force"), "Force a run regardless or previous success", "store_true"), 'raw': Arg(("-r", "--raw"), argparse.SUPPRESS, "store_true"), 'ignore_dependencies': Arg(("-i", "--ignore_dependencies"), "Ignore upstream and depends_on_past dependencies", "store_true"), 'ignore_depends_on_past': Arg(("-I", "--ignore_depends_on_past"), "Ignore depends_on_past dependencies (but respect " "upstream dependencies)", "store_true"), 'ship_dag': Arg(("--ship_dag", ), "Pickles (serializes) the DAG and ships it to the worker", "store_true"), 'pickle': Arg(("-p", "--pickle"), "Serialized 
pickle object of the entire dag (used internally)"), 'job_id': Arg(("-j", "--job_id"), argparse.SUPPRESS), # webserver 'port': Arg(("-p", "--port"), default=conf.get('webserver', 'WEB_SERVER_PORT'), type=int, help="The port on which to run the server"), 'workers': Arg(("-w", "--workers"), default=conf.get('webserver', 'WORKERS'), type=int, help="Number of workers to run the webserver on"), 'workerclass': Arg(("-k", "--workerclass"), default=conf.get('webserver', 'WORKER_CLASS'), choices=['sync', 'eventlet', 'gevent', 'tornado'], help="The worker class to use for gunicorn"), 'worker_timeout': Arg(("-t", "--worker_timeout"), default=conf.get('webserver', 'WEB_SERVER_WORKER_TIMEOUT'), type=int, help="The timeout for waiting on webserver workers"), 'hostname': Arg(("-hn", "--hostname"), default=conf.get('webserver', 'WEB_SERVER_HOST'), help="Set the hostname on which to run the web server"), 'debug': Arg(("-d", "--debug"), "Use the server that ships with Flask in debug mode", "store_true"), # resetdb 'yes': Arg(("-y", "--yes"), "Do not prompt to confirm reset. Use with care!", "store_true", default=False), # scheduler 'dag_id_opt': Arg(("-d", "--dag_id"), help="The id of the dag to run"), 'num_runs': Arg(("-n", "--num_runs"), default=None, type=int, help="Set the number of runs to execute before exiting"), # worker 'do_pickle': Arg(("-p", "--do_pickle"), default=False, help=( "Attempt to pickle the DAG object to send over " "to the workers, instead of letting workers run their version " "of the code."), action="store_true"), 'queues': Arg(("-q", "--queues"), help="Comma delimited list of queues to serve", default=conf.get('celery', 'DEFAULT_QUEUE')), 'concurrency': Arg(("-c", "--concurrency"), type=int, help="The number of worker processes", default=conf.get('celery', 'celeryd_concurrency')), # flower 'broker_api': Arg(("-a", "--broker_api"), help="Broker api"), 'flower_port': Arg(("-p", "--port"), default=conf.get('webserver', 'WEB_SERVER_PORT'), type=int, help="The port on which to run the server"), 'task_params': Arg(("-tp", "--task_params"), help="Sends a JSON params dict to the task"), } subparsers = ( { 'func': backfill, 'help': "Run subsections of a DAG for a specified date range", 'args': ('dag_id', 'task_regex', 'start_date', 'end_date', 'mark_success', 'local', 'donot_pickle', 'include_adhoc', 'bf_ignore_dependencies', 'bf_ignore_first_depends_on_past', 'subdir', 'pool', 'dry_run') }, { 'func': list_tasks, 'help': "List the tasks within a DAG", 'args': ('dag_id', 'tree', 'subdir'), }, { 'func': clear, 'help': "Clear a set of task instance, as if they never ran", 'args': ('dag_id', 'task_regex', 'start_date', 'end_date', 'subdir', 'upstream', 'downstream', 'no_confirm'), }, { 'func': pause, 'help': "Pause a DAG", 'args': ('dag_id', 'subdir'), }, { 'func': unpause, 'help': "Pause a DAG", 'args': ('dag_id', 'subdir'), }, { 'func': trigger_dag, 'help': "Trigger a DAG run", 'args': ('dag_id', 'subdir', 'run_id', 'conf'), }, { 'func': kerberos, 'help': "Start a kerberos ticket renewer", 'args': ('principal', 'keytab', 'pid', 'foreground', 'stdout', 'stderr', 'log_file'), }, { 'func': render, 'help': "Render a task instance's template(s)", 'args': ('dag_id', 'task_id', 'execution_date', 'subdir'), }, { 'func': run, 'help': "Run a single task instance", 'args': ('dag_id', 'task_id', 'execution_date', 'subdir', 'mark_success', 'force', 'pool', 'local', 'raw', 'ignore_dependencies', 'ignore_depends_on_past', 'ship_dag', 'pickle', 'job_id'), }, { 'func': initdb, 'help': "Initialize the metadata 
database", 'args': tuple(), }, { 'func': list_dags, 'help': "List all the DAGs", 'args': ('subdir', ), }, { 'func': task_state, 'help': "Get the status of a task instance", 'args': ('dag_id', 'task_id', 'execution_date', 'subdir'), }, { 'func': serve_logs, 'help': "Serve logs generate by worker", 'args': tuple(), }, { 'func': test, 'help': ("Test a task instance. This will run a task without checking for " "dependencies or recording it's state in the database."), 'args': ('dag_id', 'task_id', 'execution_date', 'subdir', 'dry_run', 'task_params'), }, { 'func': webserver, 'help': "Start a Airflow webserver instance", 'args': ('port', 'workers', 'workerclass', 'worker_timeout', 'hostname', 'pid', 'foreground', 'stdout', 'stderr', 'log_file', 'debug'), }, { 'func': resetdb, 'help': "Burn down and rebuild the metadata database", 'args': ('yes', ), }, { 'func': upgradedb, 'help': "Upgrade metadata database to latest version", 'args': tuple(), }, { 'func': scheduler, 'help': "Start a scheduler scheduler instance", 'args': ('dag_id_opt', 'subdir', 'num_runs', 'do_pickle', 'pid', 'foreground', 'stdout', 'stderr', 'log_file'), }, { 'func': worker, 'help': "Start a Celery worker node", 'args': ('do_pickle', 'queues', 'concurrency', 'pid', 'foreground', 'stdout', 'stderr', 'log_file'), }, { 'func': flower, 'help': "Start a Celery Flower", 'args': ('flower_port', 'broker_api', 'pid', 'foreground', 'stdout', 'stderr', 'log_file'), }, { 'func': version, 'help': "Show the version", 'args': tuple(), }, ) subparsers_dict = {sp['func'].__name__: sp for sp in subparsers} dag_subparsers = ('list_tasks', 'backfill', 'test', 'run', 'pause', 'unpause') @classmethod def get_parser(cls, dag_parser=False): parser = argparse.ArgumentParser() subparsers = parser.add_subparsers(help='sub-command help', dest='subcommand') subparsers.required = True subparser_list = cls.dag_subparsers if dag_parser else cls.subparsers_dict.keys( ) for sub in subparser_list: sub = cls.subparsers_dict[sub] sp = subparsers.add_parser(sub['func'].__name__, help=sub['help']) for arg in sub['args']: if 'dag_id' in arg and dag_parser: continue arg = cls.args[arg] kwargs = { f: getattr(arg, f) for f in arg._fields if f != 'flags' and getattr(arg, f) } sp.add_argument(*arg.flags, **kwargs) sp.set_defaults(func=sub['func']) return parser
def get_service_token(self, method, json): if method == 'POST': request_func = requests.post endpoint = self.service_connection.extra_dejson.get( 'POST_END_POINT', None) if endpoint is None: endpoint = Service_Token_EndPoint else: raise AirflowException('Unexpected HTTP Method: ' + method) url = 'https://{host}/{endpoint}'.format(host=self.parse_host( self.service_connection.host), endpoint=endpoint) logging.info('URL :: ' + url) logging.info(json) for attempt_num in range(1, self.retry_limit + 1): try: if os.getenv("id") is not None: id = os.environ['id'] else: id = configuration.get('service', 'id') if os.getenv("service") is not None: service = os.environ['service'] else: service = configuration.get('service', 'service') if os.getenv("code") is not None: code = os.environ['code'] else: code = configuration.get('service', 'code') if os.getenv("type") is not None: type = os.environ['type'] else: type = configuration.get('service', 'type') logging.info('URL :: ' + url) logging.info(' id :: ' + id) logging.info(' service :: ' + service) logging.info(' code :: ' + code) logging.info(' type :: ' + type) query_params = '?type=%s&id=%s&service=%s&code=%s' % ( type, id, service, code) logging.info('Final query_params :: ' + query_params) url = url + query_params logging.info('Final Appended URL :: ' + url) response = request_func(url, json=json, headers=User_Headers, timeout=self.timeout_seconds) if response.status_code == 200: return response.json() else: raise AirflowException( 'Response: {0}, Status Code: {1}'.format( response.content, response.status_code)) except (requests_exceptions.ConnectionError, requests_exceptions.Timeout) as e: logging.info( 'Attempt %s API Request to Query Service failed with reason: %s', attempt_num, e) raise AirflowException( ('API requests to IMS Gateway Service failed {} times. ' + 'Giving up.').format(self.retry_limit))
def __init__(self, dag_directory, file_paths, max_runs, processor_factory, processor_timeout, signal_conn, async_mode=True): """ :param dag_directory: Directory where DAG definitions are kept. All files in file_paths should be under this directory :type dag_directory: unicode :param file_paths: list of file paths that contain DAG definitions :type file_paths: list[unicode] :param max_runs: The number of times to parse and schedule each file. -1 for unlimited. :type max_runs: int :param processor_factory: function that creates processors for DAG definition files. Arguments are (dag_definition_path) :type processor_factory: (unicode, unicode, list) -> (AbstractDagFileProcessor) :param processor_timeout: How long to wait before timing out a DAG file processor :type processor_timeout: timedelta :param signal_conn: connection to communicate signal with processor agent. :type signal_conn: airflow.models.connection.Connection :param async_mode: whether to start the manager in async mode :type async_mode: bool """ self._file_paths = file_paths self._file_path_queue = [] self._dag_directory = dag_directory self._max_runs = max_runs self._processor_factory = processor_factory self._signal_conn = signal_conn self._async_mode = async_mode self._parallelism = conf.getint('scheduler', 'max_threads') if 'sqlite' in conf.get('core', 'sql_alchemy_conn') and self._parallelism > 1: self.log.error("Cannot use more than 1 thread when using sqlite. " "Setting parallelism to 1") self._parallelism = 1 # Parse and schedule each file no faster than this interval. self._file_process_interval = conf.getint('scheduler', 'min_file_process_interval') # How often to print out DAG file processing stats to the log. Default to # 30 seconds. self.print_stats_interval = conf.getint('scheduler', 'print_stats_interval') # Map from file path to the processor self._processors = {} # Map from file path to the last runtime self._last_runtime = {} # Map from file path to the last finish time self._last_finish_time = {} self._last_zombie_query_time = timezone.utcnow() # Last time that the DAG dir was traversed to look for files self.last_dag_dir_refresh_time = timezone.utcnow() # Last time stats were printed self.last_stat_print_time = timezone.datetime(2000, 1, 1) # TODO: Remove magic number self._zombie_query_interval = 10 # Map from file path to the number of runs self._run_count = defaultdict(int) # Manager heartbeat key. self._heart_beat_key = 'heart-beat' # How long to wait before timing out a process to parse a DAG file self._processor_timeout = processor_timeout # How often to scan the DAGs directory for new files. Default to 5 minutes. self.dag_dir_list_interval = conf.getint('scheduler', 'dag_dir_list_interval') self._log = logging.getLogger('airflow.processor_manager') signal.signal(signal.SIGINT, self._exit_gracefully) signal.signal(signal.SIGTERM, self._exit_gracefully)
def run(args, dag=None): db_utils.pessimistic_connection_handling() if dag: args.dag_id = dag.dag_id # Setting up logging log_base = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER')) directory = log_base + "/{args.dag_id}/{args.task_id}".format(args=args) if not os.path.exists(directory): os.makedirs(directory) iso = args.execution_date.isoformat() filename = "{directory}/{iso}".format(**locals()) logging.root.handlers = [] logging.basicConfig(filename=filename, level=settings.LOGGING_LEVEL, format=settings.LOG_FORMAT) if not args.pickle and not dag: dag = get_dag(args) elif not dag: session = settings.Session() logging.info('Loading pickle id {args.pickle}'.format(**locals())) dag_pickle = session.query(DagPickle).filter( DagPickle.id == args.pickle).first() if not dag_pickle: raise AirflowException("Who hid the pickle!? [missing pickle]") dag = dag_pickle.pickle task = dag.get_task(task_id=args.task_id) ti = TaskInstance(task, args.execution_date) if args.local: print("Logging into: " + filename) run_job = jobs.LocalTaskJob( task_instance=ti, mark_success=args.mark_success, force=args.force, pickle_id=args.pickle, ignore_dependencies=args.ignore_dependencies, ignore_depends_on_past=args.ignore_depends_on_past, pool=args.pool) run_job.run() elif args.raw: ti.run( mark_success=args.mark_success, force=args.force, ignore_dependencies=args.ignore_dependencies, ignore_depends_on_past=args.ignore_depends_on_past, job_id=args.job_id, pool=args.pool, ) else: pickle_id = None if args.ship_dag: try: # Running remotely, so pickling the DAG session = settings.Session() pickle = DagPickle(dag) session.add(pickle) session.commit() pickle_id = pickle.id print(('Pickled dag {dag} ' 'as pickle_id:{pickle_id}').format(**locals())) except Exception as e: print('Could not pickle the DAG') print(e) raise e executor = DEFAULT_EXECUTOR executor.start() print("Sending to executor.") executor.queue_task_instance( ti, mark_success=args.mark_success, pickle_id=pickle_id, ignore_dependencies=args.ignore_dependencies, ignore_depends_on_past=args.ignore_depends_on_past, force=args.force, pool=args.pool) executor.heartbeat() executor.end() # store logs remotely remote_base = conf.get('core', 'REMOTE_BASE_LOG_FOLDER') # deprecated as of March 2016 if not remote_base and conf.get('core', 'S3_LOG_FOLDER'): warnings.warn( 'The S3_LOG_FOLDER conf key has been replaced by ' 'REMOTE_BASE_LOG_FOLDER. Your conf still works but please ' 'update airflow.cfg to ensure future compatibility.', DeprecationWarning) remote_base = conf.get('core', 'S3_LOG_FOLDER') if os.path.exists(filename): # read log and remove old logs to get just the latest additions with open(filename, 'r') as logfile: log = logfile.read() remote_log_location = filename.replace(log_base, remote_base) # S3 if remote_base.startswith('s3:/'): logging_utils.S3Log().write(log, remote_log_location) # GCS elif remote_base.startswith('gs:/'): logging_utils.GCSLog().write(log, remote_log_location, append=True) # Other elif remote_base and remote_base != 'None': logging.error( 'Unsupported remote log location: {}'.format(remote_base))
def __init__(self): configuration_dict = configuration.as_dict(display_sensitive=True) self.core_configuration = configuration_dict['core'] self.kube_secrets = configuration_dict.get('kubernetes_secrets', {}) self.airflow_home = configuration.get(self.core_section, 'airflow_home') self.dags_folder = configuration.get(self.core_section, 'dags_folder') self.parallelism = configuration.getint(self.core_section, 'PARALLELISM') self.worker_container_repository = configuration.get( self.kubernetes_section, 'worker_container_repository') self.worker_container_tag = configuration.get(self.kubernetes_section, 'worker_container_tag') self.kube_image = '{}:{}'.format(self.worker_container_repository, self.worker_container_tag) self.kube_image_pull_policy = configuration.get( self.kubernetes_section, "worker_container_image_pull_policy") self.kube_node_selectors = configuration_dict.get( 'kubernetes_node_selectors', {}) self.delete_worker_pods = conf.getboolean(self.kubernetes_section, 'delete_worker_pods') self.worker_pods_creation_batch_size = conf.getint( self.kubernetes_section, 'worker_pods_creation_batch_size') self.worker_service_account_name = conf.get( self.kubernetes_section, 'worker_service_account_name') self.image_pull_secrets = conf.get(self.kubernetes_section, 'image_pull_secrets') # NOTE: user can build the dags into the docker image directly, # this will set to True if so self.dags_in_image = conf.getboolean(self.kubernetes_section, 'dags_in_image') # NOTE: `git_repo` and `git_branch` must be specified together as a pair # The http URL of the git repository to clone from self.git_repo = conf.get(self.kubernetes_section, 'git_repo') # The branch of the repository to be checked out self.git_branch = conf.get(self.kubernetes_section, 'git_branch') # Optionally, the directory in the git repository containing the dags self.git_subpath = conf.get(self.kubernetes_section, 'git_subpath') # Optionally, the root directory for git operations self.git_sync_root = conf.get(self.kubernetes_section, 'git_sync_root') # Optionally, the name at which to publish the checked-out files under --root self.git_sync_dest = conf.get(self.kubernetes_section, 'git_sync_dest') # Optionally, if git_dags_folder_mount_point is set the worker will use # {git_dags_folder_mount_point}/{git_sync_dest}/{git_subpath} as dags_folder self.git_dags_folder_mount_point = conf.get( self.kubernetes_section, 'git_dags_folder_mount_point') # Optionally a user may supply a `git_user` and `git_password` for private # repositories self.git_user = conf.get(self.kubernetes_section, 'git_user') self.git_password = conf.get(self.kubernetes_section, 'git_password') # NOTE: The user may optionally use a volume claim to mount a PV containing # DAGs directly self.dags_volume_claim = conf.get(self.kubernetes_section, 'dags_volume_claim') # This prop may optionally be set for PV Claims and is used to write logs self.logs_volume_claim = conf.get(self.kubernetes_section, 'logs_volume_claim') # This prop may optionally be set for PV Claims and is used to locate DAGs # on a SubPath self.dags_volume_subpath = conf.get(self.kubernetes_section, 'dags_volume_subpath') # This prop may optionally be set for PV Claims and is used to locate logs # on a SubPath self.logs_volume_subpath = conf.get(self.kubernetes_section, 'logs_volume_subpath') # Optionally, hostPath volume containing DAGs self.dags_volume_host = conf.get(self.kubernetes_section, 'dags_volume_host') # Optionally, write logs to a hostPath Volume self.logs_volume_host = 
conf.get(self.kubernetes_section, 'logs_volume_host') # This prop may optionally be set for PV Claims and is used to write logs self.base_log_folder = configuration.get(self.core_section, 'base_log_folder') # The Kubernetes Namespace in which the Scheduler and Webserver reside. Note # that if your # cluster has RBAC enabled, your scheduler may need service account permissions to # create, watch, get, and delete pods in this namespace. self.kube_namespace = conf.get(self.kubernetes_section, 'namespace') # The Kubernetes Namespace in which pods will be created by the executor. Note # that if your # cluster has RBAC enabled, your workers may need service account permissions to # interact with cluster components. self.executor_namespace = conf.get(self.kubernetes_section, 'namespace') # Task secrets managed by KubernetesExecutor. self.gcp_service_account_keys = conf.get(self.kubernetes_section, 'gcp_service_account_keys') # If the user is using the git-sync container to clone their repository via git, # allow them to specify repository, tag, and pod name for the init container. self.git_sync_container_repository = conf.get( self.kubernetes_section, 'git_sync_container_repository') self.git_sync_container_tag = conf.get(self.kubernetes_section, 'git_sync_container_tag') self.git_sync_container = '{}:{}'.format( self.git_sync_container_repository, self.git_sync_container_tag) self.git_sync_init_container_name = conf.get( self.kubernetes_section, 'git_sync_init_container_name') # The worker pod may optionally have a valid Airflow config loaded via a # configmap self.airflow_configmap = conf.get(self.kubernetes_section, 'airflow_configmap') affinity_json = conf.get(self.kubernetes_section, 'affinity') if affinity_json: self.kube_affinity = json.loads(affinity_json) else: self.kube_affinity = None tolerations_json = conf.get(self.kubernetes_section, 'tolerations') if tolerations_json: self.kube_tolerations = json.loads(tolerations_json) else: self.kube_tolerations = None self._validate()
def create_app(config=None): app = Flask(__name__) app.secret_key = configuration.get('webserver', 'SECRET_KEY') app.config['LOGIN_DISABLED'] = not configuration.getboolean('webserver', 'AUTHENTICATE') csrf.init_app(app) #app.config = config airflow.load_login() airflow.login.login_manager.init_app(app) cache = Cache( app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'}) app.register_blueprint(ck, url_prefix='/ck') app.register_blueprint(routes) app.jinja_env.add_extension("chartkick.ext.charts") with app.app_context(): from airflow.www import views admin = Admin( app, name='Airflow', static_url_path='/admin', index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"), template_mode='bootstrap3', ) av = admin.add_view vs = views av(vs.Airflow(name='DAGs', category='DAGs')) av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling")) av(vs.ChartModelView( models.Chart, Session, name="Charts", category="Data Profiling")) av(vs.KnowEventView( models.KnownEvent, Session, name="Known Events", category="Data Profiling")) av(vs.SlaMissModelView( models.SlaMiss, Session, name="SLA Misses", category="Browse")) av(vs.TaskInstanceModelView(models.TaskInstance, Session, name="Task Instances", category="Browse")) av(vs.LogModelView( models.Log, Session, name="Logs", category="Browse")) av(vs.JobModelView( jobs.BaseJob, Session, name="Jobs", category="Browse")) av(vs.PoolModelView( models.Pool, Session, name="Pools", category="Admin")) av(vs.ConfigurationView( name='Configuration', category="Admin")) av(vs.UserModelView( models.User, Session, name="Users", category="Admin")) av(vs.ConnectionModelView( models.Connection, Session, name="Connections", category="Admin")) av(vs.VariableView( models.Variable, Session, name="Variables", category="Admin")) admin.add_link(base.MenuLink( category='Docs', name='Documentation', url='http://pythonhosted.org/airflow/')) admin.add_link( base.MenuLink(category='Docs', name='Github',url='https://github.com/airbnb/airflow')) av(vs.VersionView(name='Version', category="About")) av(vs.DagRunModelView( models.DagRun, Session, name="DAG Runs", category="Browse")) av(vs.DagModelView(models.DagModel, Session, name=None)) # Hack to not add this view to the menu admin._menu = admin._menu[:-1] def integrate_plugins(): """Integrate plugins to the context""" from airflow.plugins_manager import ( admin_views, flask_blueprints, menu_links) for v in admin_views: admin.add_view(v) for bp in flask_blueprints: app.register_blueprint(bp) for ml in sorted(menu_links, key=lambda x: x.name): admin.add_link(ml) integrate_plugins() @app.context_processor def jinja_globals(): return { 'hostname': socket.getfqdn(), } @app.teardown_appcontext def shutdown_session(exception=None): settings.Session.remove() return app
class BackfillJobTest(unittest.TestCase):

    def setUp(self):
        self.parser = cli.CLIFactory.get_parser()
        self.dagbag = DagBag(include_examples=True)

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_trigger_controller_dag(self):
        dag = self.dagbag.get_dag('example_trigger_controller_dag')
        target_dag = self.dagbag.get_dag('example_trigger_target_dag')
        dag.clear()
        target_dag.clear()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertFalse(queue.append.called)

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE,
            ignore_first_depends_on_past=True)
        job.run()

        scheduler = SchedulerJob()
        queue = mock.Mock()
        scheduler._process_task_instances(target_dag, queue=queue)
        self.assertTrue(queue.append.called)

        target_dag.clear()
        dag.clear()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_multi_dates(self):
        dag = self.dagbag.get_dag('example_bash_operator')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE + datetime.timedelta(days=1),
            ignore_first_depends_on_past=True)
        job.run()

        session = settings.Session()
        drs = session.query(DagRun).filter(
            DagRun.dag_id == 'example_bash_operator').order_by(
            DagRun.execution_date).all()

        self.assertTrue(drs[0].execution_date == DEFAULT_DATE)
        self.assertTrue(drs[0].state == State.SUCCESS)
        self.assertTrue(
            drs[1].execution_date == DEFAULT_DATE + datetime.timedelta(days=1))
        self.assertTrue(drs[1].state == State.SUCCESS)

        dag.clear()
        session.close()

    @unittest.skipIf('sqlite' in configuration.get('core', 'sql_alchemy_conn'),
                     "concurrent access not supported in sqlite")
    def test_backfill_examples(self):
        """
        Test backfilling example dags
        """
        # some DAGs really are just examples... but try to make them work!
        skip_dags = [
            'example_http_operator',
            'example_twitter_dag',
            'example_trigger_target_dag',
            'example_trigger_controller_dag',  # tested above
            'test_utils',  # sleeps forever
        ]

        logger = logging.getLogger('BackfillJobTest.test_backfill_examples')
        dags = [
            dag for dag in self.dagbag.dags.values()
            if 'example_dags' in dag.full_filepath and dag.dag_id not in skip_dags
        ]

        for dag in dags:
            dag.clear(
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE)

        for i, dag in enumerate(sorted(dags, key=lambda d: d.dag_id)):
            logger.info('*** Running example DAG #{}: {}'.format(i, dag.dag_id))
            job = BackfillJob(
                dag=dag,
                start_date=DEFAULT_DATE,
                end_date=DEFAULT_DATE,
                ignore_first_depends_on_past=True)
            job.run()

    def test_backfill_pooled_tasks(self):
        """
        Test that queued tasks are executed by BackfillJob

        Test for https://github.com/airbnb/airflow/pull/1225
        """
        session = settings.Session()
        pool = Pool(pool='test_backfill_pooled_task_pool', slots=1)
        session.add(pool)
        session.commit()

        dag = self.dagbag.get_dag('test_backfill_pooled_task_dag')
        dag.clear()

        job = BackfillJob(
            dag=dag,
            start_date=DEFAULT_DATE,
            end_date=DEFAULT_DATE)

        # run with timeout because this creates an infinite loop if not
        # caught
        with timeout(seconds=30):
            job.run()

        ti = TI(
            task=dag.get_task('test_backfill_pooled_task'),
            execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_backfill_depends_on_past(self):
        """
        Test that backfill respects ignore_depends_on_past
        """
        dag = self.dagbag.get_dag('test_depends_on_past')
        dag.clear()
        run_date = DEFAULT_DATE + datetime.timedelta(days=5)

        # backfill should deadlock
        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            BackfillJob(dag=dag, start_date=run_date, end_date=run_date).run)

        BackfillJob(
            dag=dag,
            start_date=run_date,
            end_date=run_date,
            ignore_first_depends_on_past=True).run()

        # ti should have succeeded
        ti = TI(dag.tasks[0], run_date)
        ti.refresh_from_db()
        self.assertEqual(ti.state, State.SUCCESS)

    def test_cli_backfill_depends_on_past(self):
        """
        Test that CLI respects -I argument
        """
        dag_id = 'test_dagrun_states_deadlock'
        run_date = DEFAULT_DATE + datetime.timedelta(days=1)
        args = [
            'backfill',
            dag_id,
            '-l',
            '-s',
            run_date.isoformat(),
        ]
        dag = self.dagbag.get_dag(dag_id)
        dag.clear()

        self.assertRaisesRegexp(
            AirflowException,
            'BackfillJob is deadlocked',
            cli.backfill,
            self.parser.parse_args(args))

        cli.backfill(self.parser.parse_args(args + ['-I']))
        ti = TI(dag.get_task('test_depends_on_past'), run_date)
        ti.refresh_from_db()
        # task ran
        self.assertEqual(ti.state, State.SUCCESS)
        dag.clear()
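test_backfill_pooled_tasks wraps job.run() in a timeout context manager whose import is not shown in this excerpt. A minimal, self-contained sketch of how such a helper can be built with SIGALRM (Unix only; illustrative rather than the project's actual utility):

import signal
from contextlib import contextmanager


@contextmanager
def timeout(seconds=1):
    """Raise an error if the wrapped block runs longer than `seconds`."""
    def handler(signum, frame):
        raise RuntimeError("Timed out after %s seconds" % seconds)

    old_handler = signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, old_handler)


# Usage mirroring the test above:
# with timeout(seconds=30):
#     job.run()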
def index(self, session=None):
    TASK_NAME = "Task Name"
    COMMAND = "Command"

    request_args_filter = RequestArgsFilter(DcmpDag, request.args, (
        ("Category", {"operations": ["contains"]}),
        (TASK_NAME, {"operations": ["contains"], "no_filters": True}),
        (COMMAND, {"operations": ["contains"], "no_filters": True}),
    ))

    confs = OrderedDict()

    current_user = get_current_user()
    curr_user = airflow.login.current_user
    do_filter = FILTER_BY_OWNER and (not curr_user.is_superuser())
    owner_mode = conf.get('webserver', 'OWNER_MODE').strip().lower()

    if wwwutils.get_filter_by_user():
        dcmp_dags = session.query(DcmpDag).order_by(DcmpDag.dag_name).filter(
            DcmpDag.last_editor_user_name == curr_user.user.username,
            *request_args_filter.filters)
    else:
        dcmp_dags = session.query(DcmpDag).order_by(DcmpDag.dag_name).filter(
            *request_args_filter.filters)

    dcmp_dags_count = dcmp_dags.count()
    dcmp_dags = dcmp_dags[:]

    for dcmp_dag in dcmp_dags:
        dcmp_dag.conf = dcmp_dag.get_conf(session=session)

    if request_args_filter.filters_dict.get(TASK_NAME):
        task_name_value = request_args_filter.filters_dict.get(TASK_NAME)["value"]

        def filter_dcmp_dags_by_task_name(dcmp_dag):
            for task in dcmp_dag.conf["tasks"]:
                if task_name_value in task["task_name"]:
                    return True
            return False

        dcmp_dags = filter(filter_dcmp_dags_by_task_name, dcmp_dags)

    if request_args_filter.filters_dict.get(COMMAND):
        command_value = request_args_filter.filters_dict.get(COMMAND)["value"]

        def filter_dcmp_dags_by_command(dcmp_dag):
            for task in dcmp_dag.conf["tasks"]:
                if command_value in task["command"]:
                    return True
            return False

        dcmp_dags = filter(filter_dcmp_dags_by_command, dcmp_dags)

    search = request.args.get("search", "")
    if search:
        searched_dcmp_dags = []
        for dcmp_dag in dcmp_dags:
            dcmp_dag.search_results = []
            for result_task_name, result_key, result_line in search_conf_iter(
                    search, dcmp_dag.conf):
                dcmp_dag.search_results.append({
                    "key": result_key,
                    "full_key": "%s__%s" % (result_task_name, result_key),
                    "line": result_line,
                    "html_line": (
                        '<span class="nb">[%s]</span> ' % result_key
                        if result_key else ""
                    ) + result_line.replace(
                        search,
                        '<span class="highlighted">%s</span>' % search),
                })
            if dcmp_dag.search_results:
                searched_dcmp_dags.append(dcmp_dag)
        dcmp_dags = searched_dcmp_dags

    return self.render(
        "dcmp/index.html",
        can_access_approver=can_access_approver(),
        dcmp_dags=dcmp_dags,
        dcmp_dags_count=dcmp_dags_count,
        filter_groups=request_args_filter.filter_groups,
        active_filters=request_args_filter.active_filters,
        search=search,
    )
def create_app(config=None, testing=False):
    app = Flask(__name__)
    app.secret_key = configuration.get('webserver', 'SECRET_KEY')
    app.config['LOGIN_DISABLED'] = not configuration.getboolean(
        'webserver', 'AUTHENTICATE')
    app.config['PREFERRED_URL_SCHEME'] = configuration.get(
        'webserver', 'WEB_PREFERRED_URL_SCHEME')

    csrf.init_app(app)

    app.config['TESTING'] = testing

    airflow.load_login()
    airflow.login.login_manager.init_app(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    cache = Cache(
        app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    app.register_blueprint(routes)

    log_format = airflow.settings.LOG_FORMAT_WITH_PID
    airflow.settings.configure_logging(log_format=log_format)

    with app.app_context():
        from airflow.www import views

        admin = Admin(
            app, name='Airflow',
            static_url_path='/admin',
            index_view=views.HomeView(endpoint='', url='/admin', name="DAGs"),
            template_mode='bootstrap3',
        )
        av = admin.add_view
        vs = views
        av(vs.Airflow(name='DAGs', category='DAGs'))
        av(vs.QueryView(name='Ad Hoc Query', category="Data Profiling"))
        av(vs.ChartModelView(
            models.Chart, Session, name="Charts", category="Data Profiling"))
        av(vs.KnowEventView(
            models.KnownEvent, Session, name="Known Events",
            category="Data Profiling"))
        av(vs.SlaMissModelView(
            models.SlaMiss, Session, name="SLA Misses", category="Browse"))
        av(vs.TaskInstanceModelView(
            models.TaskInstance, Session, name="Task Instances",
            category="Browse"))
        av(vs.LogModelView(
            models.Log, Session, name="Logs", category="Browse"))
        av(vs.JobModelView(
            jobs.BaseJob, Session, name="Jobs", category="Browse"))
        av(vs.PoolModelView(
            models.Pool, Session, name="Pools", category="Admin"))
        av(vs.ConfigurationView(
            name='Configuration', category="Admin"))
        av(vs.UserModelView(
            models.User, Session, name="Users", category="Admin"))
        av(vs.ConnectionModelView(
            models.Connection, Session, name="Connections", category="Admin"))
        av(vs.VariableView(
            models.Variable, Session, name="Variables", category="Admin"))
        av(vs.XComView(
            models.XCom, Session, name="XComs", category="Admin"))

        admin.add_link(base.MenuLink(
            category='Docs',
            name='Documentation',
            url='http://pythonhosted.org/airflow/'))
        admin.add_link(base.MenuLink(
            category='Docs',
            name='Github',
            url='https://github.com/apache/incubator-airflow'))

        av(vs.VersionView(name='Version', category="About"))

        av(vs.DagRunModelView(
            models.DagRun, Session, name="DAG Runs", category="Browse"))
        av(vs.DagModelView(models.DagModel, Session, name=None))
        # Hack to not add this view to the menu
        admin._menu = admin._menu[:-1]

        def integrate_plugins():
            """Integrate plugins to the context"""
            from airflow.plugins_manager import (
                admin_views, flask_blueprints, menu_links)
            for v in admin_views:
                logging.debug('Adding view ' + v.name)
                admin.add_view(v)
            for bp in flask_blueprints:
                logging.debug('Adding blueprint ' + bp.name)
                app.register_blueprint(bp)
            for ml in sorted(menu_links, key=lambda x: x.name):
                logging.debug('Adding menu link ' + ml.name)
                admin.add_link(ml)

        integrate_plugins()

        import airflow.www.api.experimental.endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental, url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

    return app
def create_app(config=None, session=None, testing=False, app_name="Airflow"):
    global app, appbuilder
    app = Flask(__name__)
    if conf.getboolean('webserver', 'ENABLE_PROXY_FIX'):
        app.wsgi_app = ProxyFix(app.wsgi_app)
    app.secret_key = conf.get('webserver', 'SECRET_KEY')

    airflow_home_path = conf.get('core', 'AIRFLOW_HOME')
    webserver_config_path = airflow_home_path + '/webserver_config.py'
    app.config.from_pyfile(webserver_config_path, silent=True)
    app.config['APP_NAME'] = app_name
    app.config['TESTING'] = testing

    csrf.init_app(app)

    db = SQLA(app)

    from airflow import api
    api.load_auth()
    api.api_auth.init_app(app)

    # flake8: noqa: F841
    cache = Cache(
        app=app, config={'CACHE_TYPE': 'filesystem', 'CACHE_DIR': '/tmp'})

    from airflow.www.blueprints import routes
    app.register_blueprint(routes)

    configure_logging()
    configure_manifest_files(app)

    with app.app_context():

        from airflow.www.security import AirflowSecurityManager
        security_manager_class = app.config.get('SECURITY_MANAGER_CLASS') or \
            AirflowSecurityManager

        if not issubclass(security_manager_class, AirflowSecurityManager):
            raise Exception(
                """Your CUSTOM_SECURITY_MANAGER must now extend AirflowSecurityManager,
                not FAB's security manager.""")

        appbuilder = AppBuilder(
            app,
            db.session if not session else session,
            security_manager_class=security_manager_class,
            base_template='appbuilder/baselayout.html')

        def init_views(appbuilder):
            from airflow.www import views
            appbuilder.add_view_no_menu(views.Airflow())
            appbuilder.add_view_no_menu(views.DagModelView())
            appbuilder.add_view_no_menu(views.ConfigurationView())
            appbuilder.add_view_no_menu(views.VersionView())
            appbuilder.add_view(views.DagRunModelView,
                                "DAG Runs",
                                category="Browse",
                                category_icon="fa-globe")
            appbuilder.add_view(views.JobModelView, "Jobs", category="Browse")
            appbuilder.add_view(views.LogModelView, "Logs", category="Browse")
            appbuilder.add_view(views.SlaMissModelView,
                                "SLA Misses",
                                category="Browse")
            appbuilder.add_view(views.TaskInstanceModelView,
                                "Task Instances",
                                category="Browse")
            appbuilder.add_link("Configurations",
                                href='/configuration',
                                category="Admin",
                                category_icon="fa-user")
            appbuilder.add_view(views.ConnectionModelView,
                                "Connections",
                                category="Admin")
            appbuilder.add_view(views.PoolModelView, "Pools", category="Admin")
            appbuilder.add_view(views.VariableModelView,
                                "Variables",
                                category="Admin")
            appbuilder.add_view(views.XComModelView, "XComs", category="Admin")
            appbuilder.add_link("Documentation",
                                href='https://airflow.apache.org/',
                                category="Docs",
                                category_icon="fa-cube")
            appbuilder.add_link("Github",
                                href='https://github.com/apache/airflow',
                                category="Docs")
            appbuilder.add_link('Version',
                                href='/version',
                                category='About',
                                category_icon='fa-th')

            def integrate_plugins():
                """Integrate plugins to the context"""
                from airflow.plugins_manager import (
                    flask_appbuilder_views, flask_appbuilder_menu_links)

                for v in flask_appbuilder_views:
                    log.debug("Adding view %s", v["name"])
                    appbuilder.add_view(v["view"],
                                        v["name"],
                                        category=v["category"])
                for ml in sorted(flask_appbuilder_menu_links,
                                 key=lambda x: x["name"]):
                    log.debug("Adding menu link %s", ml["name"])
                    appbuilder.add_link(ml["name"],
                                        href=ml["href"],
                                        category=ml["category"],
                                        category_icon=ml["category_icon"])

            integrate_plugins()
            # Garbage collect old permissions/views after they have been modified.
            # Otherwise, when the name of a view or menu is changed, the framework
            # will add the new Views and Menus names to the backend, but will not
            # delete the old ones.
        init_views(appbuilder)

        security_manager = appbuilder.sm
        security_manager.sync_roles()

        from airflow.www.api.experimental import endpoints as e
        # required for testing purposes otherwise the module retains
        # a link to the default_auth
        if app.config['TESTING']:
            if six.PY2:
                reload(e)  # noqa
            else:
                import importlib
                importlib.reload(e)

        app.register_blueprint(e.api_experimental, url_prefix='/api/experimental')

        @app.context_processor
        def jinja_globals():
            return {
                'hostname': socket.getfqdn(),
                'navbar_color': conf.get('webserver', 'NAVBAR_COLOR'),
            }

        @app.teardown_appcontext
        def shutdown_session(exception=None):
            settings.Session.remove()

    return app, appbuilder
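The integrate_plugins helper in the factory above reads plugin views as dicts with "view"/"name"/"category" keys and menu links with "name"/"href"/"category"/"category_icon" keys. A hedged sketch of what a plugin module might expose in that shape; the view class, names, and URL are illustrative, not taken from the source:

from flask_appbuilder import BaseView, expose


class MyPluginView(BaseView):
    route_base = "/my_plugin"

    @expose("/")
    def list(self):
        # Template path is illustrative; it would ship with the plugin.
        return self.render_template("my_plugin/index.html")


# Shaped the way integrate_plugins() consumes them above.
flask_appbuilder_views = [{
    "view": MyPluginView(),
    "name": "My Plugin",
    "category": "Plugins",
}]

flask_appbuilder_menu_links = [{
    "name": "External Dashboard",
    "href": "https://example.com/dashboard",
    "category": "Plugins",
    "category_icon": "fa-th",
}]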
def jinja_globals():
    return {
        'hostname': get_hostname(),
        'navbar_color': configuration.get('webserver', 'NAVBAR_COLOR'),
    }
# specific language governing permissions and limitations
# under the License.
"""Default configuration for the Airflow webserver"""
import os

from flask_appbuilder.security.manager import AUTH_DB
# from flask_appbuilder.security.manager import AUTH_LDAP
# from flask_appbuilder.security.manager import AUTH_OAUTH
# from flask_appbuilder.security.manager import AUTH_OID
# from flask_appbuilder.security.manager import AUTH_REMOTE_USER

from airflow import configuration as conf

basedir = os.path.abspath(os.path.dirname(__file__))

# The SQLAlchemy connection string.
SQLALCHEMY_DATABASE_URI = conf.get('core', 'SQL_ALCHEMY_CONN')

# Flask-WTF flag for CSRF
CSRF_ENABLED = True

# ----------------------------------------------------
# AUTHENTICATION CONFIG
# ----------------------------------------------------
# For details on how to set up each of the following authentication methods, see
# http://flask-appbuilder.readthedocs.io/en/latest/security.html#authentication-methods
#
# The authentication type
# AUTH_OID : Is for OpenID
# AUTH_DB : Is for database
# AUTH_LDAP : Is for LDAP
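The excerpt cuts off before the authentication type is actually selected. A minimal, hedged sketch of how such a webserver_config.py is typically completed, choosing database-backed auth via the AUTH_DB constant imported above (the commented options are standard Flask-AppBuilder settings, shown here as an assumption about a sensible default, not as this file's contents):

# Select database authentication; other values (AUTH_LDAP, AUTH_OAUTH, ...)
# would require uncommenting the corresponding import above and adding
# provider-specific settings.
AUTH_TYPE = AUTH_DB

# Optionally allow self-registration and give new users a default role.
# AUTH_USER_REGISTRATION = True
# AUTH_USER_REGISTRATION_ROLE = "Viewer"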
def upgradedb(args):
    print("DB: " + configuration.get('core', 'SQL_ALCHEMY_CONN'))
    utils.upgradedb()
def serve_logs(filename):
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(
        log, filename,
        mimetype="application/json",
        as_attachment=False)
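serve_logs is a bare view function; to be reachable it has to be bound to a route on a Flask app and served on a port workers and the webserver agree on. A minimal hedged wiring sketch (the URL rule and port are illustrative, not taken from the source):

import os

import flask
from airflow import configuration

flask_app = flask.Flask(__name__)


@flask_app.route('/log/<path:filename>')
def serve_logs(filename):
    """Stream a task log file from BASE_LOG_FOLDER back to the caller."""
    log = os.path.expanduser(configuration.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(
        log, filename, mimetype="application/json", as_attachment=False)


if __name__ == '__main__':
    # Port is illustrative; any free port works for this sketch.
    flask_app.run(host='0.0.0.0', port=8793)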
def get_parser():
    parser = argparse.ArgumentParser()
    subparsers = parser.add_subparsers(help='sub-command help')

    ht = "Run subsections of a DAG for a specified date range"
    parser_backfill = subparsers.add_parser('backfill', help=ht)
    parser_backfill.add_argument("dag_id", help="The id of the dag to run")
    parser_backfill.add_argument(
        "-t", "--task_regex",
        help="The regex to filter specific task_ids to backfill (optional)")
    parser_backfill.add_argument(
        "-s", "--start_date", help="Override start_date YYYY-MM-DD")
    parser_backfill.add_argument(
        "-e", "--end_date", help="Override end_date YYYY-MM-DD")
    parser_backfill.add_argument(
        "-m", "--mark_success", help=mark_success_help, action="store_true")
    parser_backfill.add_argument(
        "-l", "--local",
        help="Run the task using the LocalExecutor", action="store_true")
    parser_backfill.add_argument(
        "-x", "--donot_pickle",
        help=("Do not attempt to pickle the DAG object to send over "
              "to the workers, just tell the workers to run their version "
              "of the code."),
        action="store_true")
    parser_backfill.add_argument(
        "-a", "--include_adhoc",
        help="Include dags with the adhoc parameter.", action="store_true")
    parser_backfill.add_argument(
        "-i", "--ignore_dependencies",
        help=("Skip upstream tasks, run only the tasks "
              "matching the regexp. Only works in conjunction with task_regex"),
        action="store_true")
    parser_backfill.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_backfill.add_argument(
        "-p", "--pool", help="Pool to use to run the backfill")
    parser_backfill.add_argument(
        "-dr", "--dry_run", help="Perform a dry run", action="store_true")
    parser_backfill.set_defaults(func=backfill)

    ht = "Clear a set of task instances, as if they never ran"
    parser_clear = subparsers.add_parser('clear', help=ht)
    parser_clear.add_argument("dag_id", help="The id of the dag to run")
    parser_clear.add_argument(
        "-t", "--task_regex",
        help="The regex to filter specific task_ids to clear (optional)")
    parser_clear.add_argument(
        "-s", "--start_date", help="Override start_date YYYY-MM-DD")
    parser_clear.add_argument(
        "-e", "--end_date", help="Override end_date YYYY-MM-DD")
    ht = "Include upstream tasks"
    parser_clear.add_argument(
        "-u", "--upstream", help=ht, action="store_true")
    ht = "Only failed jobs"
    parser_clear.add_argument(
        "-f", "--only_failed", help=ht, action="store_true")
    ht = "Only running jobs"
    parser_clear.add_argument(
        "-r", "--only_running", help=ht, action="store_true")
    ht = "Include downstream tasks"
    parser_clear.add_argument(
        "-d", "--downstream", help=ht, action="store_true")
    parser_clear.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_clear.add_argument(
        "-c", "--no_confirm",
        help="Do not request confirmation", action="store_true")
    parser_clear.set_defaults(func=clear)

    ht = "Trigger a DAG"
    parser_trigger_dag = subparsers.add_parser('trigger_dag', help=ht)
    parser_trigger_dag.add_argument("dag_id", help="The id of the dag to run")
    parser_trigger_dag.add_argument(
        "-r", "--run_id", help="Helps to identify this run")
    parser_trigger_dag.set_defaults(func=trigger_dag)

    ht = "Run a single task instance"
    parser_run = subparsers.add_parser('run', help=ht)
    parser_run.add_argument("dag_id", help="The id of the dag to run")
    parser_run.add_argument("task_id", help="The task_id to run")
    parser_run.add_argument(
        "execution_date", help="The execution date to run")
    parser_run.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_run.add_argument(
        "-s", "--task_start_date",
        help="Override the task's start_date (used internally)",
    )
    parser_run.add_argument(
        "-m", "--mark_success", help=mark_success_help, action="store_true")
    parser_run.add_argument(
        "-f", "--force",
        help="Force a run regardless of previous success",
        action="store_true")
    parser_run.add_argument(
        "-l", "--local",
        help="Runs the task locally, don't use the executor",
        action="store_true")
    parser_run.add_argument(
        "-r", "--raw", help=argparse.SUPPRESS, action="store_true")
    parser_run.add_argument(
        "--pool", help="Pool to use to run the task instance")
    parser_run.add_argument(
        "-i", "--ignore_dependencies",
        help="Ignore upstream and depends_on_past dependencies",
        action="store_true")
    parser_run.add_argument(
        "--ship_dag",
        help="Pickles (serializes) the DAG and ships it to the worker",
        action="store_true")
    parser_run.add_argument(
        "-p", "--pickle",
        help="Serialized pickle object of the entire dag (used internally)")
    parser_run.add_argument("-j", "--job_id", help=argparse.SUPPRESS)
    parser_run.set_defaults(func=run)

    ht = ("Test a task instance. This will run a task without checking for "
          "dependencies or recording its state in the database.")
    parser_test = subparsers.add_parser('test', help=ht)
    parser_test.add_argument("dag_id", help="The id of the dag to run")
    parser_test.add_argument("task_id", help="The task_id to run")
    parser_test.add_argument(
        "execution_date", help="The execution date to run")
    parser_test.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_test.add_argument(
        "-dr", "--dry_run", help="Perform a dry run", action="store_true")
    parser_test.set_defaults(func=test)

    ht = "Get the status of a task instance."
    parser_task_state = subparsers.add_parser('task_state', help=ht)
    parser_task_state.add_argument("dag_id", help="The id of the dag to check")
    parser_task_state.add_argument("task_id", help="The task_id to check")
    parser_task_state.add_argument(
        "execution_date", help="The execution date to check")
    parser_task_state.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_task_state.set_defaults(func=task_state)

    ht = "Start an Airflow webserver instance"
    parser_webserver = subparsers.add_parser('webserver', help=ht)
    parser_webserver.add_argument(
        "-p", "--port",
        default=configuration.get('webserver', 'WEB_SERVER_PORT'),
        type=int,
        help="Set the port on which to run the web server")
    parser_webserver.add_argument(
        "-w", "--workers",
        default=configuration.get('webserver', 'WORKERS'),
        type=int,
        help="Number of workers to run the webserver on")
    parser_webserver.add_argument(
        "-k", "--workerclass",
        default=configuration.get('webserver', 'WORKER_CLASS'),
        choices=['sync', 'eventlet', 'gevent', 'tornado'],
        help="The worker class to use for gunicorn")
    parser_webserver.add_argument(
        "-hn", "--hostname",
        default=configuration.get('webserver', 'WEB_SERVER_HOST'),
        help="Set the hostname on which to run the web server")
    ht = "Use the server that ships with Flask in debug mode"
    parser_webserver.add_argument(
        "-d", "--debug", help=ht, action="store_true")
    parser_webserver.set_defaults(func=webserver)

    ht = "Start a scheduler instance"
    parser_scheduler = subparsers.add_parser('scheduler', help=ht)
    parser_scheduler.add_argument(
        "-d", "--dag_id", help="The id of the dag to run")
    parser_scheduler.add_argument(
        "-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER)
    parser_scheduler.add_argument(
        "-n", "--num_runs",
        default=None,
        type=int,
        help="Set the number of runs to execute before exiting")
    parser_scheduler.add_argument(
        "-p", "--do_pickle",
        default=False,
        help=("Attempt to pickle the DAG object to send over "
"to the workers, instead of letting workers run their version " "of the code."), action="store_true") parser_scheduler.set_defaults(func=scheduler) ht = "Initialize the metadata database" parser_initdb = subparsers.add_parser('initdb', help=ht) parser_initdb.set_defaults(func=initdb) ht = "Burn down and rebuild the metadata database" parser_resetdb = subparsers.add_parser('resetdb', help=ht) parser_resetdb.add_argument( "-y", "--yes", default=False, help="Do not prompt to confirm reset. Use with care!", action="store_true") parser_resetdb.set_defaults(func=resetdb) ht = "Upgrade metadata database to latest version" parser_upgradedb = subparsers.add_parser('upgradedb', help=ht) parser_upgradedb.set_defaults(func=upgradedb) ht = "List the DAGs" parser_list_dags = subparsers.add_parser('list_dags', help=ht) parser_list_dags.add_argument("-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER) parser_list_dags.set_defaults(func=list_dags) ht = "List the tasks within a DAG" parser_list_tasks = subparsers.add_parser('list_tasks', help=ht) parser_list_tasks.add_argument("-t", "--tree", help="Tree view", action="store_true") parser_list_tasks.add_argument("dag_id", help="The id of the dag") parser_list_tasks.add_argument("-sd", "--subdir", help=subdir_help, default=DAGS_FOLDER) parser_list_tasks.set_defaults(func=list_tasks) ht = "Start a Celery worker node" parser_worker = subparsers.add_parser('worker', help=ht) parser_worker.add_argument("-q", "--queues", help="Comma delimited list of queues to serve", default=configuration.get( 'celery', 'DEFAULT_QUEUE')) parser_worker.add_argument("-c", "--concurrency", type=int, help="The number of worker processes", default=configuration.get( 'celery', 'celeryd_concurrency')) parser_worker.set_defaults(func=worker) ht = "Serve logs generate by worker" parser_logs = subparsers.add_parser('serve_logs', help=ht) parser_logs.set_defaults(func=serve_logs) ht = "Start a Celery Flower" parser_flower = subparsers.add_parser('flower', help=ht) parser_flower.add_argument("-p", "--port", help="The port") parser_flower.add_argument("-a", "--broker_api", help="Broker api") parser_flower.set_defaults(func=flower) parser_version = subparsers.add_parser('version', help="Show version") parser_version.set_defaults(func=version) ht = "Start a kerberos ticket renewer" parser_kerberos = subparsers.add_parser('kerberos', help=ht) parser_kerberos.add_argument("-kt", "--keytab", help="keytab", nargs='?', default=configuration.get( 'kerberos', 'keytab')) parser_kerberos.add_argument("principal", help="kerberos principal", nargs='?', default=configuration.get( 'kerberos', 'principal')) parser_kerberos.set_defaults(func=kerberos) return parser
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import multiprocessing
import subprocess
import time

from builtins import range

from airflow import configuration
from airflow.executors.base_executor import BaseExecutor
from airflow.utils.log.logging_mixin import LoggingMixin
from airflow.utils.state import State

# Read as an int so it can be used directly as a worker count.
PARALLELISM = configuration.getint('core', 'PARALLELISM')


class LocalWorker(multiprocessing.Process, LoggingMixin):

    def __init__(self, task_queue, result_queue):
        multiprocessing.Process.__init__(self)
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.daemon = True

    def run(self):
        while True:
            key, command = self.task_queue.get()
            if key is None:
                # Received poison pill, no more tasks to run
                self.task_queue.task_done()
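LocalWorker's run loop illustrates the poison-pill protocol: a (None, None) item on a joinable queue tells a worker to acknowledge the item and exit. The excerpt is truncated before the shutdown branch, so here is a hedged, self-contained sketch of the same pattern outside Airflow (class and queue names are illustrative):

import multiprocessing


class EchoWorker(multiprocessing.Process):
    """Consume (key, command) pairs until a (None, None) poison pill arrives."""

    def __init__(self, task_queue, result_queue):
        super(EchoWorker, self).__init__()
        self.task_queue = task_queue
        self.result_queue = result_queue
        self.daemon = True

    def run(self):
        while True:
            key, command = self.task_queue.get()
            if key is None:
                # Poison pill: acknowledge it and exit the loop.
                self.task_queue.task_done()
                break
            # Stand-in for actually executing the command.
            self.result_queue.put((key, 'ran %s' % command))
            self.task_queue.task_done()


if __name__ == '__main__':
    tasks = multiprocessing.JoinableQueue()
    results = multiprocessing.Queue()
    workers = [EchoWorker(tasks, results) for _ in range(2)]
    for w in workers:
        w.start()
    for i in range(4):
        tasks.put((i, 'task-%d' % i))
    for _ in workers:
        tasks.put((None, None))  # one pill per worker
    tasks.join()                 # blocks until every item is task_done()'d
    for _ in range(4):
        print(results.get())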
def initdb(args):
    print("DB: " + configuration.get('core', 'SQL_ALCHEMY_CONN'))
    utils.initdb()
    print("Done.")
from sqlalchemy.orm.session import make_transient

from airflow import executors, models, settings, utils
from airflow import configuration
from airflow.utils import AirflowException, State, LoggingMixin

Base = models.Base
ID_LEN = models.ID_LEN

# Setting up a statsd client if needed
statsd = None
if configuration.getboolean('scheduler', 'statsd_on'):
    from statsd import StatsClient
    statsd = StatsClient(
        host=configuration.get('scheduler', 'statsd_host'),
        port=configuration.getint('scheduler', 'statsd_port'),
        prefix=configuration.get('scheduler', 'statsd_prefix'))


class BaseJob(Base, LoggingMixin):
    """
    Abstract class to be derived for jobs. Jobs are processing items with
    state and duration that aren't task instances. For instance, a BackfillJob
    is a collection of task instance runs, but should have its own state,
    start and end time.
    """

    __tablename__ = "job"

    id = Column(Integer, primary_key=True)
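Once the statsd client is built as above, call sites typically guard each emission on the client being configured. A hedged, self-contained sketch using the standard counter and timing calls of the python statsd client; the flag, metric names, and helper are illustrative, not taken from the source:

import time

from statsd import StatsClient

# Illustrative stand-ins for the [scheduler] statsd settings read above.
STATSD_ON = False
statsd = StatsClient(host='localhost', port=8125, prefix='airflow') if STATSD_ON else None


def run_and_report(job_name, fn):
    """Run fn() and emit a counter plus a duration metric when statsd is enabled."""
    if statsd:
        statsd.incr('%s_start' % job_name)
    start = time.time()
    try:
        return fn()
    finally:
        if statsd:
            # timing() takes a value in milliseconds
            statsd.timing('%s_duration' % job_name, (time.time() - start) * 1000)


print(run_and_report('noop_job', lambda: 'done'))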
import os
import subprocess
from datetime import datetime

from builtins import input
import argparse
import dateutil.parser

import airflow
from airflow import jobs, settings, utils
from airflow import configuration
from airflow.executors import DEFAULT_EXECUTOR
from airflow.models import DagBag, TaskInstance, DagPickle, DagRun
from airflow.utils import AirflowException, State

DAGS_FOLDER = os.path.expanduser(configuration.get('core', 'DAGS_FOLDER'))

# Common help text across subcommands
mark_success_help = "Mark jobs as succeeded without running them"
subdir_help = "File location or directory from which to look for the dag"


def process_subdir(subdir):
    dags_folder = configuration.get("core", "DAGS_FOLDER")
    dags_folder = os.path.expanduser(dags_folder)
    if subdir:
        subdir = os.path.expanduser(subdir)
        if "DAGS_FOLDER" in subdir:
            subdir = subdir.replace("DAGS_FOLDER", dags_folder)
        if dags_folder not in subdir:
            raise AirflowException(
def check_hive_conf():
    from airflow import configuration as conf
    assert conf.get('hive', 'default_hive_mapred_queue') == 'airflow'
def jinja_globals():
    return {
        'hostname': socket.getfqdn(),
        'navbar_color': conf.get('webserver', 'NAVBAR_COLOR'),
    }
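Values returned by a context_processor like the jinja_globals variants above become plain template variables. A small hedged demonstration with a throwaway Flask app; the app, template string, and hard-coded color are illustrative only:

import socket

from flask import Flask, render_template_string

app = Flask(__name__)


@app.context_processor
def jinja_globals():
    # Same shape as the processors above; NAVBAR_COLOR is hard-coded here
    # instead of being read from the Airflow config.
    return {
        'hostname': socket.getfqdn(),
        'navbar_color': '#007A87',
    }


with app.test_request_context():
    print(render_template_string(
        "Served by {{ hostname }} with navbar {{ navbar_color }}"))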