def execution_parallelism(self, parallelism=0):
    """Run a batch of succeeding commands plus one failing command and verify
    the executor's bookkeeping (event buffer, running set, workers used).

    :param parallelism: 0 means unlimited — one worker per command.
    """
    executor = LocalExecutor(parallelism=parallelism)
    executor.start()

    success_key = 'success {}'
    success_command = ['true', 'some_parameter']
    fail_command = ['false', 'some_parameter']
    self.assertTrue(executor.result_queue.empty())

    execution_date = datetime.datetime.now()

    def make_key(key_id):
        # Task-instance key shape: (id, task, execution date, try number).
        return key_id, 'fake_ti', execution_date, 0

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        task_key = make_key(success_key.format(idx))
        executor.running.add(task_key)
        executor.execute_async(key=task_key, command=success_command)

    fail_key = make_key('fail')
    executor.running.add(fail_key)
    executor.execute_async(key=fail_key, command=fail_command)
    executor.end()

    # By that time Queues are already shutdown so we cannot check if they are empty
    self.assertEqual(len(executor.running), 0)
    for idx in range(self.TEST_SUCCESS_COMMANDS):
        self.assertEqual(executor.event_buffer[make_key(success_key.format(idx))],
                         State.SUCCESS)
    self.assertEqual(executor.event_buffer[fail_key], State.FAILED)

    # Unlimited parallelism spawns one worker per command; otherwise the pool is capped.
    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    self.assertEqual(executor.workers_used, expected)
def test_gauge_executor_metrics(self, mock_stats_gauge, mock_trigger_tasks, mock_sync):
    """A single heartbeat should gauge open slots, queued tasks and running tasks."""
    executor = LocalExecutor()
    executor.heartbeat()
    expected_calls = [
        mock.call('executor.open_slots', mock.ANY),
        mock.call('executor.queued_tasks', mock.ANY),
        mock.call('executor.running_tasks', mock.ANY),
    ]
    mock_stats_gauge.assert_has_calls(expected_calls)
def _test_execute(self, parallelism, success_command, fail_command):
    """Drive the executor with TEST_SUCCESS_COMMANDS succeeding commands and one
    failing command, then check recorded states, the running set and workers used.

    :param parallelism: 0 means unlimited — one worker per command.
    :param success_command: command expected to succeed.
    :param fail_command: command expected to fail.
    """
    executor = LocalExecutor(parallelism=parallelism)
    executor.start()

    success_key = 'success {}'
    assert executor.result_queue.empty()

    execution_date = datetime.datetime.now()

    def make_key(key_id):
        # Task-instance key shape: (id, task, execution date, try number).
        return key_id, 'fake_ti', execution_date, 0

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        task_key = make_key(success_key.format(idx))
        executor.running.add(task_key)
        executor.execute_async(key=task_key, command=success_command)

    fail_key = make_key('fail')
    executor.running.add(fail_key)
    executor.execute_async(key=fail_key, command=fail_command)
    executor.end()

    # By that time Queues are already shutdown so we cannot check if they are empty
    assert len(executor.running) == 0
    for idx in range(self.TEST_SUCCESS_COMMANDS):
        # event_buffer values are (state, info) tuples; [0] is the state.
        assert executor.event_buffer[make_key(success_key.format(idx))][0] == State.SUCCESS
    assert executor.event_buffer[fail_key][0] == State.FAILED

    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    assert executor.workers_used == expected
def _get_executor(executor_name):
    """
    Create a new instance of the named executor.

    If the name is not a built-in executor, fall back to looking it up among
    the loaded plugins (expected format: ``plugin_module.executor``).
    """
    if executor_name == 'LocalExecutor':
        return LocalExecutor()
    if executor_name == 'SequentialExecutor':
        return SequentialExecutor()
    if executor_name == 'CeleryExecutor':
        # Imported lazily so the dependency is only required when selected.
        from airflow.executors.celery_executor import CeleryExecutor
        return CeleryExecutor()
    if executor_name == 'DaskExecutor':
        from airflow.executors.dask_executor import DaskExecutor
        return DaskExecutor()
    if executor_name == 'MesosExecutor':
        from airflow.contrib.executors.mesos_executor import MesosExecutor
        return MesosExecutor()

    # Not a built-in name: make plugin executors importable, then resolve.
    _integrate_plugins()
    executor_path = executor_name.split('.')
    if len(executor_path) != 2:
        raise AirflowException(
            "Executor {0} not supported: please specify in format plugin_module.executor"
            .format(executor_name))
    if executor_path[0] not in globals():
        raise AirflowException(
            "Executor {0} not supported.".format(executor_name))
    return globals()[executor_path[0]].__dict__[executor_path[1]]()
def _get_executor(executor_name: str) -> BaseExecutor:
    """
    Create a new instance of the named executor.

    If the executor name is unknown in airflow, look for it in the plugins
    (expected format: ``plugin_module.executor``).

    :param executor_name: one of the ``ExecutorLoader`` constants or a
        ``plugin_module.executor`` path.
    :raises AirflowException: if the name cannot be resolved to an executor.
    """
    if executor_name == ExecutorLoader.LOCAL_EXECUTOR:
        from airflow.executors.local_executor import LocalExecutor
        return LocalExecutor()
    elif executor_name == ExecutorLoader.SEQUENTIAL_EXECUTOR:
        from airflow.executors.sequential_executor import SequentialExecutor
        return SequentialExecutor()
    elif executor_name == ExecutorLoader.CELERY_EXECUTOR:
        from airflow.executors.celery_executor import CeleryExecutor
        return CeleryExecutor()
    elif executor_name == ExecutorLoader.DASK_EXECUTOR:
        from airflow.executors.dask_executor import DaskExecutor
        return DaskExecutor()
    elif executor_name == ExecutorLoader.KUBERNETES_EXECUTOR:
        from airflow.executors.kubernetes_executor import KubernetesExecutor
        return KubernetesExecutor()
    else:
        # Load plugins here for executors as at that time the plugins might not have been initialized yet
        # TODO: verify the above and remove two lines below in case plugins are always initialized first
        from airflow import plugins_manager
        plugins_manager.integrate_executor_plugins()
        executor_path = executor_name.split('.')
        # Validate with real exceptions, not `assert`: assertions are stripped
        # under `python -O`, which would silently disable this check. Raising
        # AirflowException also matches the behavior of the other loader versions.
        if len(executor_path) != 2:
            raise AirflowException(
                f"Executor {executor_name} not supported: "
                f"please specify in format plugin_module.executor")
        if executor_path[0] not in globals():
            raise AirflowException(f"Executor {executor_name} not supported")
        return globals()[executor_path[0]].__dict__[executor_path[1]]()
def _get_executor(executor_name):
    """
    Create a new instance of the named executor.

    If the executor name is not known to airflow, look for it in the plugins.
    """
    parallelism = PARALLELISM
    if executor_name == Executors.LocalExecutor:
        return LocalExecutor(parallelism)
    if executor_name == Executors.SequentialExecutor:
        return SequentialExecutor(parallelism)
    if executor_name == Executors.CeleryExecutor:
        from airflow.executors.celery_executor import CeleryExecutor, execute_command
        return CeleryExecutor(parallelism, execute_command)
    if executor_name == Executors.DaskExecutor:
        from airflow.executors.dask_executor import DaskExecutor
        # All Dask connection settings come from the [dask] config section.
        cluster_address = configuration.conf.get('dask', 'cluster_address')
        tls_ca = configuration.conf.get('dask', 'tls_ca')
        tls_key = configuration.conf.get('dask', 'tls_key')
        tls_cert = configuration.conf.get('dask', 'tls_cert')
        return DaskExecutor(parallelism, cluster_address, tls_ca, tls_key, tls_cert)
    if executor_name == Executors.MesosExecutor:
        from airflow.contrib.executors.mesos_executor import MesosExecutor
        return MesosExecutor(parallelism)
    if executor_name == Executors.KubernetesExecutor:
        from airflow.contrib.executors.kubernetes_executor import KubernetesExecutor
        return KubernetesExecutor()

    # Unknown name: load the plugins, then instantiate the requested class
    # from the plugin module.
    _integrate_plugins()
    return create_object_from_plugin_module(executor_name, parallelism=PARALLELISM)
def subdag_task(database):
    """Wrap the per-database sub-DAG in a SubDagOperator bound to ``main_dag``.

    Each sub-DAG runs under its own LocalExecutor and a shared throttling pool.
    """
    return SubDagOperator(
        subdag=database_sub_dag(parent_dag_name, database, '@once'),
        task_id=database,
        dag=main_dag,
        pool='Pool_max_parallel_500',
        executor=LocalExecutor(),
    )
def start_scheduler(self, file_path):
    """Create an event-based scheduler over ``file_path`` and run it (blocking)."""
    self.scheduler = EventBasedSchedulerJob(
        dag_directory=file_path,
        server_uri="localhost:{}".format(self.port),
        executor=LocalExecutor(3),
        max_runs=-1,
        refresh_dag_dir_interval=30,
    )
    print("scheduler starting")
    self.scheduler.run()
def execution_parallelism(self, mock_check_call, parallelism=0):
    """Exercise the executor with mocked subprocess calls: every command other
    than the canonical success command is made to raise CalledProcessError.

    :param mock_check_call: patched subprocess call whose side effect we set.
    :param parallelism: 0 means unlimited — one worker per command.
    """
    success_command = ['airflow', 'tasks', 'run', 'true', 'some_parameter']
    fail_command = ['airflow', 'tasks', 'run', 'false']

    def fake_execute_command(command, close_fds=True):  # pylint: disable=unused-argument
        if command == success_command:
            return 0
        raise subprocess.CalledProcessError(returncode=1, cmd=command)

    mock_check_call.side_effect = fake_execute_command

    executor = LocalExecutor(parallelism=parallelism)
    executor.start()
    success_key = 'success {}'
    self.assertTrue(executor.result_queue.empty())

    execution_date = datetime.datetime.now()

    def make_key(key_id):
        # Task-instance key shape: (id, task, execution date, try number).
        return key_id, 'fake_ti', execution_date, 0

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        task_key = make_key(success_key.format(idx))
        executor.running.add(task_key)
        executor.execute_async(key=task_key, command=success_command)

    fail_key = make_key('fail')
    executor.running.add(fail_key)
    executor.execute_async(key=fail_key, command=fail_command)
    executor.end()

    # By that time Queues are already shutdown so we cannot check if they are empty
    self.assertEqual(len(executor.running), 0)
    for idx in range(self.TEST_SUCCESS_COMMANDS):
        # event_buffer values are (state, info) tuples; [0] is the state.
        self.assertEqual(executor.event_buffer[make_key(success_key.format(idx))][0],
                         State.SUCCESS)
    self.assertEqual(executor.event_buffer[fail_key][0], State.FAILED)

    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    self.assertEqual(executor.workers_used, expected)
def start_scheduler(cls, file_path, executor=None):
    """Run an event-based scheduler over ``file_path`` (blocking).

    :param file_path: DAG directory to schedule.
    :param executor: executor to use; defaults to a 3-worker LocalExecutor.
    """
    if executor is None:
        executor = LocalExecutor(3)
    scheduler = EventBasedSchedulerJob(
        dag_directory=file_path,
        server_uri="localhost:{}".format(master_port()),
        executor=executor,
        max_runs=-1,
        refresh_dag_dir_interval=30,
    )
    print("scheduler starting")
    scheduler.run()
def execution_parallelism(self, parallelism=0):
    """Submit successes plus one failure and verify buffers, queues and the
    number of workers the executor spawned.

    :param parallelism: 0 means unlimited — one worker per command.
    """
    executor = LocalExecutor(parallelism=parallelism)
    executor.start()

    success_key = 'success {}'
    success_command = ['true', 'some_parameter']
    fail_command = ['false', 'some_parameter']
    self.assertTrue(executor.result_queue.empty())

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        task_key = success_key.format(idx)
        executor.running[task_key] = True
        executor.execute_async(key=task_key, command=success_command)
    executor.running['fail'] = True
    executor.execute_async(key='fail', command=fail_command)
    executor.end()

    # Only the limited-parallelism implementation keeps a task queue to inspect.
    if isinstance(executor.impl, LocalExecutor._LimitedParallelism):
        self.assertTrue(executor.queue.empty())
    self.assertEqual(len(executor.running), 0)
    self.assertTrue(executor.result_queue.empty())

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        self.assertEqual(executor.event_buffer[success_key.format(idx)], State.SUCCESS)
    self.assertEqual(executor.event_buffer['fail'], State.FAILED)

    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    self.assertEqual(executor.workers_used, expected)
# NOTE(review): fragment — the statements up to `return one_dag` are the tail of
# a sub-DAG builder function whose `def` lies outside this view, so this span is
# left byte-identical. The remainder wires up the main DAG and creates one
# SubDagOperator per entry in database_list, each with its own LocalExecutor.
start_task >> dt_s3 dt_s3 >> dt_sf dt_sf >> end return one_dag ############################################################################# #Defining Main Dag structure ############################################################################# main_dag = DAG( dag_id=parent_dag_name, default_args=default_args, schedule_interval='@once' #schedule_interval=timedelta(minutes=5), #max_active_runs=1 , concurrency=35) database_list = ['database'] #Each database is an independant task that will run in parallel4 for i in database_list: sub_dag = SubDagOperator(subdag=database_sub_dag(parent_dag_name, i, '@once'), task_id=i, dag=main_dag, pool='Pool_max_parallel_5', executor=LocalExecutor())
from airflow.exceptions import AirflowException


def _integrate_plugins():
    """Integrate plugins to the context."""
    from airflow.plugins_manager import executors_modules
    for mod in executors_modules:
        # Register each plugin module both as an importable module and as a
        # module-level name so the executor lookup below can resolve it.
        sys.modules[mod.__name__] = mod
        globals()[mod._name] = mod


# Pick the process-wide default executor from configuration.
_EXECUTOR = configuration.get('core', 'EXECUTOR')

if _EXECUTOR == 'LocalExecutor':
    DEFAULT_EXECUTOR = LocalExecutor()
elif _EXECUTOR == 'CeleryExecutor':
    DEFAULT_EXECUTOR = CeleryExecutor()
elif _EXECUTOR == 'SequentialExecutor':
    DEFAULT_EXECUTOR = SequentialExecutor()
elif _EXECUTOR == 'MesosExecutor':
    # Imported lazily: the Mesos dependency is only needed when selected.
    from airflow.contrib.executors.mesos_executor import MesosExecutor
    DEFAULT_EXECUTOR = MesosExecutor()
else:
    # Unknown name: load the plugins and retry the lookup among globals().
    _integrate_plugins()
    if _EXECUTOR in globals():
        DEFAULT_EXECUTOR = globals()[_EXECUTOR]()
    else:
        raise AirflowException("Executor {0} not supported.".format(_EXECUTOR))
def execution_parallelism(self, parallelism=0):
    """Run echo commands plus one failing shell command and verify the executor
    records the right states, clears its running map and sizes its worker pool.

    :param parallelism: 0 means unlimited — one worker per command.
    """
    executor = LocalExecutor(parallelism=parallelism)
    executor.start()
    success_key = 'success {}'
    success_command = 'echo {}'
    fail_command = 'exit 1'
    for i in range(self.TEST_SUCCESS_COMMANDS):
        key, command = success_key.format(i), success_command.format(i)
        executor.execute_async(key=key, command=command)
        executor.running[key] = True
    # errors are propagated for some reason — swallow them so the test can
    # still assert the FAILED state below.
    try:
        executor.execute_async(key='fail', command=fail_command)
    except Exception:  # narrowed from a bare `except:`; kept best-effort
        pass
    executor.running['fail'] = True
    if parallelism == 0:
        # Unlimited parallelism has no fixed pool to join; guard against hangs.
        with timeout(seconds=5):
            executor.end()
    else:
        executor.end()
    for i in range(self.TEST_SUCCESS_COMMANDS):
        key = success_key.format(i)
        # BUG FIX: assertTrue(x, State.SUCCESS) only checked truthiness — the
        # state was silently used as the failure *message*. assertEqual actually
        # compares against the recorded state (matches the sibling test versions).
        self.assertEqual(executor.event_buffer[key], State.SUCCESS)
    self.assertEqual(executor.event_buffer['fail'], State.FAILED)
    for i in range(self.TEST_SUCCESS_COMMANDS):
        self.assertNotIn(success_key.format(i), executor.running)
    self.assertNotIn('fail', executor.running)
    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    self.assertEqual(executor.workers_used, expected)
def test_scheduler_task(self):
    """End-to-end check of START/RESTART/STOP scheduling actions against a
    long-sleeping task: verifies task-instance state, the spawned process
    tree (worker -> child -> grandchild) and the TaskExecution rows.

    NOTE(review): the original was whitespace-mangled; the session block is
    assumed to end after the setup commit — confirm against the project history.
    """
    TEST_DAG_FOLDER = os.environ['AIRFLOW__CORE__DAGS_FOLDER']
    DEFAULT_DATE = timezone.datetime(2020, 1, 1)
    dag_id = 'test_event_based_dag'
    task_id = 'sleep_1000_secs'

    # --- setup: create a running dag run with one SCHEDULED task instance ---
    with create_session() as session:
        dag_bag = DagBag(
            dag_folder=TEST_DAG_FOLDER,
            include_examples=False,
        )
        dag = dag_bag.get_dag(dag_id)
        task = dag.get_task(task_id)
        dag.create_dagrun(
            run_id="sleep_1000_secs_run",
            state=State.RUNNING,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            session=session,
        )
        ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
        ti.state = State.SCHEDULED
        dag_model = DagModel(
            dag_id=dag_id,
            is_paused=False,
            concurrency=5,
            has_task_concurrency_limits=False,
        )
        session.merge(dag_model)
        session.merge(ti)
        session.commit()

    # --- START: task should come up RUNNING with a 3-level process tree ---
    executor = LocalExecutor(2)
    executor.start()
    executor.heartbeat()
    executor.schedule_task(ti.key, SchedulingAction.START)
    executor.heartbeat()
    time.sleep(30)  # wait for task instance started
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.RUNNING)
    process = psutil.Process(ti.pid)
    self.assertIsNotNone(process)
    child = process.children(recursive=False)
    self.assertEqual(1, len(child))
    grandchild = child[0].children(recursive=False)
    self.assertEqual(1, len(grandchild))
    tes = self._check_task_execution(ti)
    self.assertEqual(1, len(tes))

    # --- RESTART: old process tree must die, a fresh one must appear ---
    executor.schedule_task(ti.key, SchedulingAction.RESTART)
    executor.heartbeat()
    time.sleep(30)
    self.assertFalse(self._check_process_exist(process.pid))
    self.assertFalse(self._check_process_exist(child[0].pid))
    self.assertFalse(self._check_process_exist(grandchild[0].pid))
    ti.refresh_from_db()
    self.assertEqual(ti.state, State.RUNNING)
    process = psutil.Process(ti.pid)
    self.assertIsNotNone(process)
    child = process.children(recursive=False)
    self.assertEqual(1, len(child))
    grandchild = child[0].children(recursive=False)
    self.assertEqual(1, len(grandchild))
    tes = self._check_task_execution(ti)
    self.assertEqual(2, len(tes))
    self.assertEqual(2, tes[0].seq_num)

    # --- STOP: the task is KILLED and the whole process tree is gone ---
    executor.schedule_task(ti.key, SchedulingAction.STOP)
    ti.refresh_from_db()
    time.sleep(10)
    self.assertEqual(State.KILLED, ti.state)
    self.assertFalse(self._check_process_exist(process.pid))
    self.assertFalse(self._check_process_exist(child[0].pid))
    self.assertFalse(self._check_process_exist(grandchild[0].pid))
    self._check_task_execution(ti)
    executor.end()
def execution_parallelism(self, parallelism=0):
    """Submit a batch of succeeding commands and one failing command, then
    verify the recorded states and the number of workers used.

    :param parallelism: 0 means unlimited — one worker per command.
    """
    executor = LocalExecutor(parallelism=parallelism)
    executor.start()

    success_key = 'success {}'
    success_command = ['true', 'some_parameter']
    fail_command = ['false', 'some_parameter']
    self.assertTrue(executor.result_queue.empty())

    for idx in range(self.TEST_SUCCESS_COMMANDS):
        task_key = success_key.format(idx)
        executor.running[task_key] = True
        executor.execute_async(key=task_key, command=success_command)
    executor.running['fail'] = True
    executor.execute_async(key='fail', command=fail_command)
    executor.end()

    # By that time Queues are already shutdown so we cannot check if they are empty
    self.assertEqual(len(executor.running), 0)
    for idx in range(self.TEST_SUCCESS_COMMANDS):
        self.assertEqual(executor.event_buffer[success_key.format(idx)], State.SUCCESS)
    self.assertEqual(executor.event_buffer['fail'], State.FAILED)

    expected = self.TEST_SUCCESS_COMMANDS + 1 if parallelism == 0 else parallelism
    self.assertEqual(executor.workers_used, expected)