def scheduler(args):
    """Starts Airflow Scheduler.

    Builds a ``SchedulerJob`` from the parsed command-line arguments and runs
    it, either inside a daemon context (when ``args.daemon`` is truthy) or in
    the foreground with signal handlers installed.

    :param args: parsed CLI namespace. Reads ``subdir``, ``num_runs``,
        ``do_pickle`` and ``daemon``; in daemon mode also ``pid``, ``stdout``,
        ``stderr`` and ``log_file``.
    """
    print(settings.HEADER)
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file
        )
        handle = setup_logging(log_file)
        # Context-managed so both files are closed on every exit path,
        # including when job.run() or the second open() raises.
        with open(stdout, 'w+') as stdout_handle, open(stderr, 'w+') as stderr_handle:
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(pid, -1),
                # Keep the logging handle open across daemonization.
                files_preserve=[handle],
                stdout=stdout_handle,
                stderr=stderr_handle,
            )
            with ctx:
                job.run()
    else:
        # Foreground mode: install the signal handlers before running.
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        job.run()
def main():
    """Run the scheduler perf-experiment scenario in-process.

    Bootstraps dbnd, switches Airflow's ``core.unit_test_mode`` on, attaches a
    file handler for the scenario's scheduler log to the root logger, and runs
    a ``SchedulerJob`` (``num_runs=3``, no pickling) restricted to the scenario
    DAG with an ``InProcessExecutor``.
    """
    # Fall back to airflow.configuration when the top-level package does not
    # expose ``conf``.
    try:
        from airflow import conf
    except ImportError:
        from airflow.configuration import conf

    from airflow.jobs.scheduler_job import SchedulerJob
    from airflow.models import DagBag

    from dbnd import dbnd_bootstrap
    from dbnd._core.log.logging_utils import create_file_handler
    from dbnd_airflow.executors.simple_executor import InProcessExecutor
    from test_dbnd_airflow.scenarios.scheduler_perf_experiment import (
        dag_folder,
        dag_id,
        log_scheduler,
    )

    dbnd_bootstrap()
    conf.set("core", "unit_test_mode", "True")

    # Send everything logged through the root logger to the scenario log file.
    file_handler = create_file_handler(log_file=log_scheduler)
    logging.root.addHandler(file_handler)

    bag = DagBag(dag_folder=dag_folder)
    in_process_executor = InProcessExecutor(dag_bag=bag)

    SchedulerJob(
        dag_ids=[dag_id],
        subdir=dag_folder,
        do_pickle=False,
        num_runs=3,
        executor=in_process_executor,
    ).run()
def scheduler(args):
    """Starts Airflow Scheduler.

    Builds a ``SchedulerJob`` from the parsed command-line arguments, starts
    the helper returned by ``_serve_logs`` and runs the job, either inside a
    daemon context (when ``args.daemon`` is truthy) or in the foreground with
    signal handlers installed. The helper is terminated when the job finishes
    or fails.

    :param args: parsed CLI namespace. Reads ``skip_serve_logs``, ``subdir``,
        ``num_runs``, ``do_pickle`` and ``daemon``; in daemon mode also
        ``pid``, ``stdout``, ``stderr`` and ``log_file``.
    """
    skip_serve_logs = args.skip_serve_logs

    print(settings.HEADER)
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )

    # Pre-bound so the cleanup in ``finally`` is safe even when we fail before
    # _serve_logs() has returned.
    sub_proc = None
    try:
        if args.daemon:
            pid, stdout, stderr, log_file = setup_locations(
                "scheduler", args.pid, args.stdout, args.stderr, args.log_file
            )
            handle = setup_logging(log_file)
            with open(stdout, 'w+') as stdout_handle, open(stderr, 'w+') as stderr_handle:
                ctx = daemon.DaemonContext(
                    pidfile=TimeoutPIDLockFile(pid, -1),
                    # Keep the logging handle open across daemonization.
                    files_preserve=[handle],
                    stdout=stdout_handle,
                    stderr=stderr_handle,
                )
                with ctx:
                    # Started inside the daemon context, i.e. after it has
                    # been entered, so the helper belongs to this process.
                    sub_proc = _serve_logs(skip_serve_logs)
                    job.run()
        else:
            signal.signal(signal.SIGINT, sigint_handler)
            signal.signal(signal.SIGTERM, sigint_handler)
            signal.signal(signal.SIGQUIT, sigquit_handler)
            sub_proc = _serve_logs(skip_serve_logs)
            job.run()
    finally:
        # Terminate the helper on every exit path: previously it was left
        # running whenever job.run() raised (including SystemExit).
        # _serve_logs() may hand back a falsy value, hence the guard.
        if sub_proc:
            sub_proc.terminate()
def scheduler(args):
    """Starts Airflow Scheduler.

    Picks the job to run from ``SchedulerFactory.get_scheduler_name()`` and
    runs it, either inside a daemon context (when ``args.daemon`` is truthy)
    or in the foreground with signal handlers installed.

    :param args: parsed CLI namespace. Reads ``subdir``, ``num_runs``,
        ``do_pickle`` and ``daemon``; ``server_uri`` for the event-based
        scheduler; in daemon mode also ``pid``, ``stdout``, ``stderr`` and
        ``log_file``.
    """
    print(settings.HEADER)
    # The default job is always constructed first and replaced below when a
    # different scheduler is configured.
    job = SchedulerJob(
        subdir=process_subdir(args.subdir),
        num_runs=args.num_runs,
        do_pickle=args.do_pickle,
    )

    scheduler_name = SchedulerFactory.get_scheduler_name()
    if scheduler_name == SchedulerFactory.DEFAULT_SCHEDULER:
        pass  # keep the SchedulerJob built above
    elif scheduler_name == SchedulerFactory.EVENT_BASED_SCHEDULER:
        job = EventBasedSchedulerJob(
            dag_directory=process_subdir(args.subdir),
            server_uri=args.server_uri,
        )
    else:
        # NOTE(review): any other name instantiates whatever
        # get_default_scheduler() returns, with no arguments - confirm that
        # "default" (rather than the named scheduler) is intended here.
        scheduler_class = SchedulerFactory.get_default_scheduler()
        job = scheduler_class()

    if args.daemon:
        pid, stdout, stderr, log_file = setup_locations(
            "scheduler", args.pid, args.stdout, args.stderr, args.log_file
        )
        handle = setup_logging(log_file)
        # Context-managed so both files are closed on every exit path,
        # including when job.run() or the second open() raises.
        with open(stdout, 'w+') as stdout_handle, open(stderr, 'w+') as stderr_handle:
            ctx = daemon.DaemonContext(
                pidfile=TimeoutPIDLockFile(pid, -1),
                # Keep the logging handle open across daemonization.
                files_preserve=[handle],
                stdout=stdout_handle,
                stderr=stderr_handle,
            )
            with ctx:
                job.run()
    else:
        # Foreground mode: install the signal handlers before running.
        signal.signal(signal.SIGINT, sigint_handler)
        signal.signal(signal.SIGTERM, sigint_handler)
        signal.signal(signal.SIGQUIT, sigquit_handler)
        job.run()
def main(num_runs, repeat, pre_create_dag_runs, executor_class, dag_ids):
    """
    This script can be used to measure the total "scheduler overhead" of Airflow.

    By overhead we mean if the tasks executed instantly as soon as they are
    executed (i.e. they do nothing) how quickly could we schedule them.

    It will monitor the task completion of the Mock/stub executor (no actual
    tasks are run) and after the required number of dag runs for all the
    specified dags have completed all their tasks, it will cleanly shut down
    the scheduler.

    The dags you run with need to have an early enough start_date to create the
    desired number of runs.

    Care should be taken that other limits (DAG concurrency, pool size etc) are
    not the bottleneck. This script doesn't help you in that regard.

    It is recommended to repeat the test at least 3 times (`--repeat=3`, the
    default) so that you can get somewhat-accurate variance on the reported
    timing numbers, but this can be disabled for longer runs if needed.
    """
    # Parameters (documented here because the docstring above reads like CLI
    # help text and may be shown to users verbatim - TODO confirm):
    #   num_runs            - dag runs expected per dag; also exported to the
    #                         environment and handed to the executor.
    #   repeat              - how many times the scheduler run is timed.
    #   pre_create_dag_runs - when truthy, dag runs are created up front via
    #                         create_dag_runs() and USE_JOB_SCHEDULE is set to
    #                         'False'.
    #   executor_class      - passed to get_executor_under_test().
    #   dag_ids             - ids of the dags to load, reset and schedule.
    # Exits through sys.exit() with a message when a dag's end_date does not
    # match the schedule date computed for ``num_runs`` runs.

    # Turn on unit test mode so that we don't do any sleep() in the scheduler
    # loop - not needed on master, but this script can run against older
    # releases too!
    os.environ['AIRFLOW__CORE__UNIT_TEST_MODE'] = 'True'

    os.environ['AIRFLOW__CORE__DAG_CONCURRENCY'] = '500'

    # Set this so that dags can dynamically configure their end_date
    os.environ['AIRFLOW_BENCHMARK_MAX_DAG_RUNS'] = str(num_runs)
    os.environ['PERF_MAX_RUNS'] = str(num_runs)

    if pre_create_dag_runs:
        os.environ['AIRFLOW__SCHEDULER__USE_JOB_SCHEDULE'] = 'False'

    # Deliberately imported after the environment overrides above -
    # presumably so that Airflow's configuration picks them up on import.
    from airflow.jobs.scheduler_job import SchedulerJob
    from airflow.models.dagbag import DagBag
    from airflow.utils import db

    dagbag = DagBag()

    dags = []

    with db.create_session() as session:
        pause_all_dags(session)
        for dag_id in dag_ids:
            dag = dagbag.get_dag(dag_id)
            dag.sync_to_db(session=session)
            dags.append(dag)
            reset_dag(dag, session)

            # Walk the schedule forward from the first run to find the date of
            # run number ``num_runs``.
            next_run_date = dag.normalize_schedule(dag.start_date or min(t.start_date for t in dag.tasks))

            for _ in range(num_runs - 1):
                next_run_date = dag.following_schedule(next_run_date)

            end_date = dag.end_date or dag.default_args.get('end_date')
            if end_date != next_run_date:
                message = (
                    f"DAG {dag_id} has incorrect end_date ({end_date}) for number of runs! "
                    f"It should be "
                    f" {next_run_date}"
                )
                sys.exit(message)

            if pre_create_dag_runs:
                create_dag_runs(dag, num_runs, session)

    ShortCircuitExecutor = get_executor_under_test(executor_class)

    executor = ShortCircuitExecutor(dag_ids_to_watch=dag_ids, num_runs=num_runs)
    scheduler_job = SchedulerJob(dag_ids=dag_ids, do_pickle=False, executor=executor)
    executor.scheduler_job = scheduler_job

    total_tasks = sum(len(dag.tasks) for dag in dags)

    # Optional profiling: when PYSPY is set, attach py-spy to this process and
    # write to PYSPY_O (default 'flame-<pid>.html').
    if 'PYSPY' in os.environ:
        pid = str(os.getpid())
        filename = os.environ.get('PYSPY_O', 'flame-' + pid + '.html')
        os.spawnlp(os.P_NOWAIT, 'sudo', 'sudo', 'py-spy', 'record', '-o', filename, '-p', pid, '--idle')

    times = []

    # Need a lambda to refer to the _latest_ value for scheduler_job, not just
    # the initial one
    code_to_test = lambda: scheduler_job.run()  # pylint: disable=unnecessary-lambda

    for count in range(repeat):
        # Garbage collection is switched off only around the timed call; the
        # ``finally`` re-enables it even if the scheduler run raises.
        gc.disable()
        try:
            start = time.perf_counter()

            code_to_test()
            times.append(time.perf_counter() - start)
        finally:
            gc.enable()
        print("Run %d time: %.5f" % (count + 1, times[-1]))

        if count + 1 != repeat:
            # Prepare a fresh job for the next repetition: reset the dags and
            # the executor, then rebind scheduler_job for the lambda above.
            with db.create_session() as session:
                for dag in dags:
                    reset_dag(dag, session)

            executor.reset(dag_ids)
            scheduler_job = SchedulerJob(dag_ids=dag_ids, do_pickle=False, executor=executor)
            executor.scheduler_job = scheduler_job

    print()
    print()
    msg = "Time for %d dag runs of %d dags with %d total tasks: %.4fs"

    if len(times) > 1:
        # Mean and standard deviation are reported only when there is more
        # than one sample.
        print(
            (msg + " (±%.3fs)")
            % (num_runs, len(dags), total_tasks, statistics.mean(times), statistics.stdev(times))
        )
    else:
        print(msg % (num_runs, len(dags), total_tasks, times[0]))

    print()
    print()