def get_webserver_url():
    return "{}:{}".format(
        conf.get('webserver', 'WEB_SERVER_HOST'),
        conf.get('webserver', 'WEB_SERVER_PORT'),
    )

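# A brief usage sketch: web_server_host and web_server_port are read from the
# [webserver] section of airflow.cfg; the value in the comment is illustrative,
# not taken from the snippet above.
webserver_url = get_webserver_url()  # e.g. "localhost:8080"
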
def config():
    # Return the CWL-related settings from the [cwl] section of airflow.cfg.
    return conf.get('cwl', 'jobs'), conf.get('cwl', 'limit')

def general_paths():
    paths([
        conf.get('cwl', 'jobs'),
        DAGS_FOLDER,
        os.path.join(DAGS_FOLDER, "cwl_dag.py"),
    ])

default_args = {
    'owner': 'xingya-zhou',
    'depends_on_past': False,
    'start_date': datetime.datetime.now(),
    'email_on_failure': False,
    'email_on_retry': False,
    'retries': 3,
    'retry_delay': timedelta(minutes=5),
    'catchup': False,
}

dag = DAG('sparkify_dag',
          default_args=default_args,
          start_date=datetime.datetime.now())

with open(os.path.join(conf.get('core', 'dags_folder'), 'create_tables.sql')) as f:
    create_tables_sql = f.read()

create_trips_table = PostgresOperator(task_id="create_trips_table",
                                      dag=dag,
                                      postgres_conn_id="redshift",
                                      sql=create_tables_sql)

start_operator = DummyOperator(task_id='Begin_execution', dag=dag)

stage_events_to_redshift = StageToRedshiftOperator(
    task_id='Stage_events',
    dag=dag,
    redshift_conn_id="redshift",
    aws_credentials_id="aws_credentials",
    table="staging_events",
    # remaining keyword arguments are truncated in the original snippet
)

def create_folders():
    logging.info("Create folders for jobs and dags\n- {}\n- {}".format(
        conf.get('cwl', 'jobs'), DAGS_FOLDER))
    get_folder(conf.get('cwl', 'jobs'))
    get_folder(DAGS_FOLDER)

def worker(args):
    """Starts Airflow Celery worker"""
    env = os.environ.copy()
    env['AIRFLOW_HOME'] = settings.AIRFLOW_HOME

    if not settings.validate_session():
        log = LoggingMixin().log
        log.error("Worker exiting... database connection precheck failed! ")
        sys.exit(1)

    # Celery worker
    from airflow.executors.celery_executor import app as celery_app
    from celery.bin import worker  # pylint: disable=redefined-outer-name

    autoscale = args.autoscale
    if autoscale is None and conf.has_option("celery", "worker_autoscale"):
        autoscale = conf.get("celery", "worker_autoscale")
    worker = worker.worker(app=celery_app)  # pylint: disable=redefined-outer-name
    options = {
        'optimization': 'fair',
        'O': 'fair',
        'queues': args.queues,
        'concurrency': args.concurrency,
        'autoscale': autoscale,
        'hostname': args.celery_hostname,
        'loglevel': conf.get('core', 'LOGGING_LEVEL'),
    }

    if conf.has_option("celery", "pool"):
        options["pool"] = conf.get("celery", "pool")

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "worker", args.pid, args.stdout, args.stderr, args.log_file)
        handle = setup_logging(log_file)
        stdout = open(stdout, 'w+')
        stderr = open(stderr, 'w+')

        ctx = daemon.DaemonContext(
            pidfile=TimeoutPIDLockFile(pid, -1),
            files_preserve=[handle],
            stdout=stdout,
            stderr=stderr,
        )
        with ctx:
            sub_proc = subprocess.Popen(['airflow', 'serve_logs'], env=env, close_fds=True)
            worker.run(**options)
            sub_proc.kill()

        stdout.close()
        stderr.close()
    else:
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)

        sub_proc = subprocess.Popen(['airflow', 'serve_logs'], env=env, close_fds=True)

        worker.run(**options)

        sub_proc.kill()

# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""Airflow logging settings"""

import os
from typing import Any, Dict, Union

from airflow import AirflowException, conf
from airflow.utils.file import mkdirs

# TODO: Logging format and level should be configured
# in this file instead of from airflow.cfg. Currently
# there are other log format and level configurations in
# settings.py and cli.py. Please see AIRFLOW-1455.
LOG_LEVEL: str = conf.get('logging', 'LOGGING_LEVEL').upper()

# Flask appbuilder's info level log is very verbose,
# so it's set to 'WARN' by default.
FAB_LOG_LEVEL: str = conf.get('logging', 'FAB_LOGGING_LEVEL').upper()

LOG_FORMAT: str = conf.get('logging', 'LOG_FORMAT')

COLORED_LOG_FORMAT: str = conf.get('logging', 'COLORED_LOG_FORMAT')

COLORED_LOG: bool = conf.getboolean('logging', 'COLORED_CONSOLE_LOG')

COLORED_FORMATTER_CLASS: str = conf.get('logging', 'COLORED_FORMATTER_CLASS')

BASE_LOG_FOLDER: str = conf.get('logging', 'BASE_LOG_FOLDER')

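# A minimal sketch (not this module's full logging config) of how the constants
# above typically feed a standard logging dictConfig; the formatter/handler names
# here are illustrative assumptions.
import logging.config

SKETCH_LOGGING_CONFIG: Dict[str, Any] = {
    'version': 1,
    'disable_existing_loggers': False,
    'formatters': {
        'airflow': {'format': LOG_FORMAT},
    },
    'handlers': {
        'console': {
            'class': 'logging.StreamHandler',
            'formatter': 'airflow',
            'level': LOG_LEVEL,
        },
    },
    'root': {'handlers': ['console'], 'level': LOG_LEVEL},
}
# logging.config.dictConfig(SKETCH_LOGGING_CONFIG)
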
""" Module for Airtunnel's paths, i.e. to the declaration, data and scripts store folders. """ from os import path from airflow import conf P_DECLARATIONS = conf.get(section="airtunnel", key="declarations_folder") P_DATA = conf.get(section="airtunnel", key="data_store_folder") P_SCRIPTS = conf.get(section="airtunnel", key="scripts_folder") P_SCRIPTS_SQL = path.join(P_SCRIPTS, "sql") P_SCRIPTS_PY = path.join(P_SCRIPTS, "py") # define data paths based on data store root: P_DATA_ARCHIVE = path.join(P_DATA, "archive") P_DATA_INGEST = path.join(P_DATA, "ingest") P_DATA_READY = path.join(P_DATA, "ready") P_DATA_STAGING = path.join(P_DATA, "staging") P_DATA_STAGING_PICKEDUP = path.join(P_DATA_STAGING, "pickedup") P_DATA_STAGING_READY = path.join(P_DATA_STAGING, "ready") P_DATA_STAGING_INTERMEDIATE = path.join(P_DATA_STAGING, "intermediate") P_DATA_INGEST_LANDING = path.join(P_DATA_INGEST, "landing") P_DATA_INGEST_ARCHIVE = path.join(P_DATA_INGEST, "archive")
def webserver(args):
    """Starts Airflow Webserver"""
    print(settings.HEADER)

    access_logfile = args.access_logfile or conf.get('webserver', 'access_logfile')
    error_logfile = args.error_logfile or conf.get('webserver', 'error_logfile')
    num_workers = args.workers or conf.get('webserver', 'workers')
    worker_timeout = (args.worker_timeout or
                      conf.get('webserver', 'web_server_worker_timeout'))
    ssl_cert = args.ssl_cert or conf.get('webserver', 'web_server_ssl_cert')
    ssl_key = args.ssl_key or conf.get('webserver', 'web_server_ssl_key')
    if not ssl_cert and ssl_key:
        raise AirflowException(
            'An SSL certificate must also be provided for use with ' + ssl_key)
    if ssl_cert and not ssl_key:
        raise AirflowException(
            'An SSL key must also be provided for use with ' + ssl_cert)

    if args.debug:
        print("Starting the web server on port {0} and host {1}.".format(
            args.port, args.hostname))
        app, _ = create_app(None, testing=conf.getboolean('core', 'unit_test_mode'))
        app.run(debug=True, use_reloader=not app.config['TESTING'],
                port=args.port, host=args.hostname,
                ssl_context=(ssl_cert, ssl_key) if ssl_cert and ssl_key else None)
    else:
        os.environ['SKIP_DAGS_PARSING'] = 'True'
        app = cached_app(None)
        pid, stdout, stderr, log_file = setup_locations(
            "webserver", args.pid, args.stdout, args.stderr, args.log_file)
        os.environ.pop('SKIP_DAGS_PARSING')
        if args.daemon:
            handle = setup_logging(log_file)
            stdout = open(stdout, 'w+')
            stderr = open(stderr, 'w+')

        print(
            textwrap.dedent('''\
                Running the Gunicorn Server with:
                Workers: {num_workers} {workerclass}
                Host: {hostname}:{port}
                Timeout: {worker_timeout}
                Logfiles: {access_logfile} {error_logfile}
                =================================================================\
            '''.format(num_workers=num_workers, workerclass=args.workerclass,
                       hostname=args.hostname, port=args.port,
                       worker_timeout=worker_timeout,
                       access_logfile=access_logfile,
                       error_logfile=error_logfile)))

        run_args = [
            'gunicorn',
            '-w', str(num_workers),
            '-k', str(args.workerclass),
            '-t', str(worker_timeout),
            '-b', args.hostname + ':' + str(args.port),
            '-n', 'airflow-webserver',
            '-p', str(pid),
            '-c', 'python:airflow.www.gunicorn_config',
        ]

        if args.access_logfile:
            run_args += ['--access-logfile', str(args.access_logfile)]

        if args.error_logfile:
            run_args += ['--error-logfile', str(args.error_logfile)]

        if args.daemon:
            run_args += ['-D']

        if ssl_cert:
            run_args += ['--certfile', ssl_cert, '--keyfile', ssl_key]

        webserver_module = 'www'
        run_args += ["airflow." + webserver_module + ".app:cached_app()"]

        gunicorn_master_proc = None

        def kill_proc(dummy_signum, dummy_frame):  # pylint: disable=unused-argument
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
            sys.exit(0)

        def monitor_gunicorn(gunicorn_master_proc):
            # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
            if conf.getint('webserver', 'worker_refresh_interval') > 0:
                master_timeout = conf.getint('webserver', 'web_server_master_timeout')
                restart_workers(gunicorn_master_proc, num_workers, master_timeout)
            else:
                while gunicorn_master_proc.poll() is None:
                    time.sleep(1)

                sys.exit(gunicorn_master_proc.returncode)

        if args.daemon:
            base, ext = os.path.splitext(pid)
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(base + "-monitor" + ext, -1),
                files_preserve=[handle],
                stdout=stdout,
                stderr=stderr,
                signal_map={
                    signal.SIGINT: kill_proc,
                    signal.SIGTERM: kill_proc
                },
            )
            with ctx:
                subprocess.Popen(run_args, close_fds=True)

                # Reading pid file directly, since Popen#pid doesn't
                # seem to return the right value with DaemonContext.
                while True:
                    try:
                        with open(pid) as file:
                            gunicorn_master_proc_pid = int(file.read())
                            break
                    except OSError:
                        LOG.debug("Waiting for gunicorn's pid file to be created.")
                        time.sleep(0.1)

                gunicorn_master_proc = psutil.Process(gunicorn_master_proc_pid)

                monitor_gunicorn(gunicorn_master_proc)

            stdout.close()
            stderr.close()
        else:
            gunicorn_master_proc = subprocess.Popen(run_args, close_fds=True)

            signal.signal(signal.SIGINT, kill_proc)
            signal.signal(signal.SIGTERM, kill_proc)

            monitor_gunicorn(gunicorn_master_proc)

class DeploymentView(BaseView):
    plugins_folder = conf.get("core", "plugins_folder")
    template_folder = os.path.join(plugins_folder, "deploy-plugin")
    repo = git.Repo(conf.get("core", "dags_folder"))
    route_base = "/deployment"

    def render(self, template, **context):
        return render_template(
            template,
            base_template=self.appbuilder.base_template,
            appbuilder=self.appbuilder,
            **context,
        )

    @expose("/status")
    @has_access
    @action_logging
    def list(self):
        title = "Deployment"
        data = dict()
        remotes = list()
        for rem in self.repo.remotes:
            remotes.append((rem.name, rem.url))
            try:
                rem.fetch(prune=True)
            except GitCommandError as gexc:
                flash(str(gexc), "error")
        data["remotes"] = remotes
        data["active_branch"] = self.repo.active_branch.name
        data["sha"] = self.repo.head.object.hexsha
        data["commit_message"] = self.repo.head.object.message
        data["author"] = self.repo.head.object.author
        data["committed_date"] = datetime.fromtimestamp(
            self.repo.head.object.committed_date).strftime("%Y-%m-%d %H:%M:%S")
        data["local_branches"] = [brn.name for brn in self.repo.branches]
        remote_branches = [
            ref.name for ref in self.repo.remotes.origin.refs if "HEAD" not in ref.name
        ]
        form = GitBranchForm()
        form.git_branches.choices = [(brn, brn) for brn in remote_branches]
        return self.render_template("deploy.html", title=title, form=form, data=data)

    @expose("/deploy", methods=["POST"])
    @has_access
    @action_logging
    def deploy(self):
        new_branch = request.form.get("git_branches")
        new_local_branch = new_branch.replace("origin/", "")
        try:
            self.repo.git.checkout(new_local_branch)
            self.repo.git.pull()
            if new_local_branch == self.repo.active_branch.name:
                flash(f"Successfully updated branch: {new_local_branch}")
            else:
                flash(f"Successfully changed to branch: {new_local_branch}")
        except GitCommandError as gexc:
            flash(str(gexc), "error")

        return redirect("/deployment/status")

class DAGDependenciesView(BaseView):
    dagbag = None
    plugins_folder = conf.get("core", "plugins_folder")
    template_folder = os.path.join(plugins_folder, "dag-dependencies-plugin")
    route_base = "/"
    refresh_interval = conf.getint(
        "dag_dependencies_plugin", "refresh_interval", fallback=300
    )
    last_refresh = datetime(2000, 1, 1)
    nodes = []
    edges = []

    def render(self, template, **context):
        return render_template(
            template,
            base_template=self.appbuilder.base_template,
            appbuilder=self.appbuilder,
            **context,
        )

    @expose("/dag-dependencies")
    @has_access
    def list(self):
        title = "DAG Dependencies"

        if DAGDependenciesView.dagbag is None:
            DAGDependenciesView.dagbag = models.DagBag(settings.DAGS_FOLDER)

        if datetime.utcnow() > self.last_refresh + timedelta(
            seconds=self.refresh_interval
        ):
            DAGDependenciesView.dagbag.collect_dags()
            self.nodes, self.edges = self._generate_graph()
            self.last_refresh = datetime.utcnow()

        return self.render_template(
            "dag_dependencies.html",
            title=title,
            nodes=self.nodes,
            edges=self.edges,
            last_refresh=self.last_refresh.strftime("%Y-%m-%d %H:%M:%S"),
            arrange=conf.get("webserver", "dag_orientation"),
            width=request.args.get("width", "100%"),
            height=request.args.get("height", "800"),
        )

    @staticmethod
    def _generate_graph():
        nodes = {}
        edges = []

        for dag_id, dag in DAGDependenciesView.dagbag.dags.items():
            dag_node_id = "d--" + dag_id
            nodes[dag_node_id] = DAGDependenciesView._node_dict(
                dag_node_id, dag_id, "fill: rgb(232, 247, 228)"
            )

            for task in dag.tasks:
                task_node_id = "t--" + dag_id + "--" + task.task_id
                if isinstance(task, TriggerDagRunOperator):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(255, 239, 235)"
                    )

                    edges.append({"u": dag_node_id, "v": task_node_id})
                    edges.append({"u": task_node_id, "v": "d--" + task.trigger_dag_id})

                elif isinstance(task, ExternalTaskSensor):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(230, 241, 242)"
                    )

                    edges.append({"u": task_node_id, "v": dag_node_id})
                    edges.append({"u": "d--" + task.external_dag_id, "v": task_node_id})

            implicit = getattr(dag, "implicit_dependencies", None)
            if isinstance(implicit, list):
                for dep in implicit:
                    dep_node_id = "i--" + dag_id + "--" + dep
                    nodes[dep_node_id] = DAGDependenciesView._node_dict(
                        dep_node_id, "implicit", "fill: gold"
                    )

                    edges.append({"u": dep_node_id, "v": dag_node_id})
                    edges.append({"u": "d--" + dep, "v": dep_node_id})

        return list(nodes.values()), edges

    @staticmethod
    def _node_dict(node_id, label, style):
        return {
            "id": node_id,
            "value": {"label": label, "style": style, "rx": 5, "ry": 5},
        }

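# A minimal sketch of how a Flask-AppBuilder view like the one above is typically
# exposed to the Airflow webserver through the plugin mechanism. The plugin and menu
# names below are illustrative assumptions, not taken from the class above.
from airflow.plugins_manager import AirflowPlugin


class DagDependenciesPlugin(AirflowPlugin):
    name = "dag_dependencies_plugin"
    appbuilder_views = [
        {
            "name": "DAG Dependencies",
            "category": "Browse",
            "view": DAGDependenciesView(),
        }
    ]
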
def serve_logs(filename):  # pylint: disable=unused-variable, redefined-outer-name
    log = os.path.expanduser(conf.get('core', 'BASE_LOG_FOLDER'))
    return flask.send_from_directory(
        log, filename, mimetype="application/json", as_attachment=False)

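# A minimal sketch of the context this handler typically runs in: a small Flask app
# that maps /log/<path:filename> to the function above and serves files out of the
# configured log folder. The route, endpoint name, and port lookup are assumptions.
import flask

flask_app = flask.Flask(__name__)
flask_app.add_url_rule('/log/<path:filename>', 'serve_logs', serve_logs)

if __name__ == '__main__':
    worker_log_server_port = int(conf.get('celery', 'WORKER_LOG_SERVER_PORT'))
    flask_app.run(host='0.0.0.0', port=worker_log_server_port)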