def monitor_gunicorn(gunicorn_master_proc):
    # These run forever until SIG{INT, TERM, KILL, ...} signal is sent
    if conf.getint('webserver', 'worker_refresh_interval') > 0:
        master_timeout = conf.getint('webserver', 'web_server_master_timeout')
        restart_workers(gunicorn_master_proc, num_workers, master_timeout)
    else:
        while gunicorn_master_proc.poll() is None:
            time.sleep(1)

        sys.exit(gunicorn_master_proc.returncode)
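# A minimal wiring sketch (an assumption, not the original CLI code): in the
# Airflow CLI, monitor_gunicorn closes over `num_workers` and `restart_workers`
# from the enclosing command function. The gunicorn argument list below is
# illustrative only.
import subprocess

num_workers = conf.getint('webserver', 'workers')

def run_webserver_sketch():
    gunicorn_master_proc = subprocess.Popen([
        'gunicorn',
        '--workers', str(num_workers),
        'airflow.www.app:cached_app()',
    ])
    # Blocks until the gunicorn master exits or the refresh loop shuts it down
    monitor_gunicorn(gunicorn_master_proc)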
def default_pool_open_slots(session):
    from airflow.models import TaskInstance as TI  # To avoid circular imports
    total_slots = conf.getint('core', 'non_pooled_task_slot_count')
    used_slots = session.query(func.count()).filter(
        TI.pool == Pool.default_pool_name).filter(
        TI.state.in_([State.RUNNING, State.QUEUED])).scalar()
    return total_slots - used_slots
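# A usage sketch, assuming a SQLAlchemy session obtained from airflow.settings
# and the same `func`, `Pool`, and `State` imports the snippet above relies on.
from airflow import settings

def print_default_pool_open_slots():
    session = settings.Session()
    try:
        # Configured non-pooled slots minus RUNNING/QUEUED tasks in the default pool
        print(default_pool_open_slots(session))
    finally:
        session.close()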
def start_refresh(gunicorn_master_proc):
    batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
    log.debug('%s doing a refresh of %s workers', state, batch_size)
    sys.stdout.flush()
    sys.stderr.flush()

    excess = 0
    for _ in range(batch_size):
        gunicorn_master_proc.send_signal(signal.SIGTTIN)
        excess += 1
        wait_until_true(lambda: num_workers_expected + excess ==
                        get_num_workers_running(gunicorn_master_proc),
                        master_timeout)
def test_send_mime_ssl(self, mock_smtp, mock_smtp_ssl):
    mock_smtp.return_value = mock.Mock()
    mock_smtp_ssl.return_value = mock.Mock()
    with conf_vars({('smtp', 'smtp_ssl'): 'True'}):
        utils.email.send_MIME_email('from', 'to', MIMEMultipart(), dryrun=False)
    self.assertFalse(mock_smtp.called)
    mock_smtp_ssl.assert_called_once_with(
        conf.get('smtp', 'SMTP_HOST'),
        conf.getint('smtp', 'SMTP_PORT'),
    )
def test_send_mime_noauth(self, mock_smtp, mock_smtp_ssl):
    mock_smtp.return_value = mock.Mock()
    mock_smtp_ssl.return_value = mock.Mock()
    with conf_vars({
        ('smtp', 'smtp_user'): None,
        ('smtp', 'smtp_password'): None,
    }):
        utils.email.send_MIME_email('from', 'to', MIMEMultipart(), dryrun=False)
    self.assertFalse(mock_smtp_ssl.called)
    mock_smtp.assert_called_once_with(
        conf.get('smtp', 'SMTP_HOST'),
        conf.getint('smtp', 'SMTP_PORT'),
    )
    self.assertFalse(mock_smtp.login.called)
def test_send_mime(self, mock_smtp, mock_smtp_ssl):
    mock_smtp.return_value = mock.Mock()
    mock_smtp_ssl.return_value = mock.Mock()
    msg = MIMEMultipart()
    utils.email.send_MIME_email('from', 'to', msg, dryrun=False)
    mock_smtp.assert_called_once_with(
        conf.get('smtp', 'SMTP_HOST'),
        conf.getint('smtp', 'SMTP_PORT'),
    )
    self.assertTrue(mock_smtp.return_value.starttls.called)
    mock_smtp.return_value.login.assert_called_once_with(
        conf.get('smtp', 'SMTP_USER'),
        conf.get('smtp', 'SMTP_PASSWORD'),
    )
    mock_smtp.return_value.sendmail.assert_called_once_with(
        'from', 'to', msg.as_string())
    self.assertTrue(mock_smtp.return_value.quit.called)
def restart_workers(gunicorn_master_proc, num_workers_expected, master_timeout):
    """
    Runs forever, monitoring the child processes of @gunicorn_master_proc and
    restarting workers occasionally.

    Each iteration of the loop traverses one edge of this state transition
    diagram, where each state (node) represents
    [ num_ready_workers_running / num_workers_running ]. We expect most time to
    be spent in [n / n]. `bs` is the setting webserver.worker_refresh_batch_size.

    The horizontal transition at ? happens after the new worker parses all the
    dags (so it could take a while!)

       V ────────────────────────────────────────────────────────────────────┐
    [n / n] ──TTIN──> [ [n, n+bs) / n + bs ] ────?───> [n + bs / n + bs] ──TTOU─┘
       ^                          ^───────────────┘
       │
       │      ┌────────────────v
       └──────┴────── [ [0, n) / n ] <─── start

    We change the number of workers by sending TTIN and TTOU to the gunicorn
    master process, which increases and decreases the number of child workers
    respectively. Gunicorn guarantees that on TTOU workers are terminated
    gracefully and that the oldest worker is terminated.
    """
    def wait_until_true(fn, timeout=0):
        """Sleeps until fn is true"""
        start_time = time.time()
        while not fn():
            if 0 < timeout <= time.time() - start_time:
                raise AirflowWebServerTimeout(
                    "No response from gunicorn master within {0} seconds"
                    .format(timeout))
            time.sleep(0.1)

    def start_refresh(gunicorn_master_proc):
        batch_size = conf.getint('webserver', 'worker_refresh_batch_size')
        log.debug('%s doing a refresh of %s workers', state, batch_size)
        sys.stdout.flush()
        sys.stderr.flush()

        excess = 0
        for _ in range(batch_size):
            gunicorn_master_proc.send_signal(signal.SIGTTIN)
            excess += 1
            wait_until_true(lambda: num_workers_expected + excess ==
                            get_num_workers_running(gunicorn_master_proc),
                            master_timeout)

    try:  # pylint: disable=too-many-nested-blocks
        wait_until_true(lambda: num_workers_expected ==
                        get_num_workers_running(gunicorn_master_proc),
                        master_timeout)
        while True:
            num_workers_running = get_num_workers_running(gunicorn_master_proc)
            num_ready_workers_running = \
                get_num_ready_workers_running(gunicorn_master_proc)

            state = '[{0} / {1}]'.format(num_ready_workers_running,
                                         num_workers_running)

            # Whenever some workers are not ready, wait until all workers are ready
            if num_ready_workers_running < num_workers_running:
                log.debug('%s some workers are starting up, waiting...', state)
                sys.stdout.flush()
                time.sleep(1)

            # Kill a worker gracefully by asking gunicorn to reduce number of workers
            elif num_workers_running > num_workers_expected:
                excess = num_workers_running - num_workers_expected
                log.debug('%s killing %s workers', state, excess)

                for _ in range(excess):
                    gunicorn_master_proc.send_signal(signal.SIGTTOU)
                    excess -= 1
                    wait_until_true(lambda: num_workers_expected + excess ==
                                    get_num_workers_running(gunicorn_master_proc),
                                    master_timeout)

            # Start a new worker by asking gunicorn to increase number of workers
            elif num_workers_running == num_workers_expected:
                refresh_interval = conf.getint('webserver',
                                               'worker_refresh_interval')
                log.debug(
                    '%s sleeping for %ss before doing a refresh...',
                    state, refresh_interval
                )
                time.sleep(refresh_interval)
                start_refresh(gunicorn_master_proc)

            else:
                # num_ready_workers_running == num_workers_running < num_workers_expected
                log.error((
                    "%s some workers seem to have died and gunicorn "
                    "did not restart them as expected"
                ), state)
                time.sleep(10)
                if len(
                    psutil.Process(gunicorn_master_proc.pid).children()
                ) < num_workers_expected:
                    start_refresh(gunicorn_master_proc)
    except (AirflowWebServerTimeout, OSError) as err:
        log.error(err)
        log.error("Shutting down webserver")
        try:
            gunicorn_master_proc.terminate()
            gunicorn_master_proc.wait()
        finally:
            sys.exit(1)
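# Hedged sketches of the two helpers restart_workers depends on, modeled on the
# psutil-based approach used elsewhere in the Airflow CLI; treat them as
# assumptions rather than the exact upstream implementations.
import psutil
from airflow import settings

def get_num_workers_running(gunicorn_master_proc):
    # Every direct child of the gunicorn master counts as a worker
    return len(psutil.Process(gunicorn_master_proc.pid).children())

def get_num_ready_workers_running(gunicorn_master_proc):
    # Workers that finished booting advertise readiness in their process
    # title (via gunicorn's proc_name / setproctitle support)
    workers = psutil.Process(gunicorn_master_proc.pid).children()

    def ready_prefix_on_cmdline(proc):
        try:
            cmdline = proc.cmdline()
            if cmdline:
                return settings.GUNICORN_WORKER_READY_PREFIX in cmdline[0]
        except psutil.NoSuchProcess:
            pass
        return False

    return len([proc for proc in workers if ready_prefix_on_cmdline(proc)])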
# KIND, either express or implied.  See the License for the
# specific language governing permissions and limitations
# under the License.
"""
Base executor - this is the base class for all the implemented executors.
"""
from collections import OrderedDict
from typing import Any, Dict, List, Optional, Set, Tuple, Union

from airflow import LoggingMixin, conf
from airflow.models import TaskInstance
from airflow.models.taskinstance import SimpleTaskInstance, TaskInstanceKeyType
from airflow.stats import Stats
from airflow.utils.state import State

PARALLELISM: int = conf.getint('core', 'PARALLELISM')

NOT_STARTED_MESSAGE = "The executor should be started first!"

# Command to execute - list of strings
# the first element is always "airflow".
# It should be the result of the TaskInstance.generate_command method.
CommandType = List[str]

# Task that is queued. It contains all the information that is
# needed to run the task.
#
# Tuple of: command, priority, queue name, SimpleTaskInstance
QueuedTaskInstanceType = Tuple[CommandType, int, Optional[str],
                               Union[SimpleTaskInstance, TaskInstance]]
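# A hedged illustration of the QueuedTaskInstanceType shape: command list,
# priority weight, optional queue name, and the (simple) task instance. The
# helper below is hypothetical; TaskInstance.generate_command also accepts
# further keyword arguments in practice.
def make_queued_task(ti: TaskInstance, priority: int,
                     queue: Optional[str]) -> QueuedTaskInstanceType:
    command: CommandType = TaskInstance.generate_command(
        ti.dag_id, ti.task_id, ti.execution_date)  # first element is "airflow"
    return command, priority, queue, SimpleTaskInstance(ti)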
class DAGDependenciesView(BaseView):
    dagbag = None
    plugins_folder = conf.get("core", "plugins_folder")
    template_folder = os.path.join(plugins_folder, "dag-dependencies-plugin")
    route_base = "/"
    refresh_interval = conf.getint(
        "dag_dependencies_plugin",
        "refresh_interval",
        fallback=conf.getint("scheduler", "dag_dir_list_interval"),
    )
    last_refresh = datetime.utcnow() - timedelta(seconds=refresh_interval)
    nodes = []
    edges = []

    def render(self, template, **context):
        return render_template(
            template,
            base_template=self.appbuilder.base_template,
            appbuilder=self.appbuilder,
            **context,
        )

    @expose("/dag-dependencies")
    @has_access
    def list(self):
        title = "DAG Dependencies"

        if self.dagbag is None:
            from airflow.www_rbac.views import dagbag
            self.dagbag = dagbag

        if datetime.utcnow() > self.last_refresh + timedelta(
                seconds=self.refresh_interval):
            self.nodes, self.edges = self._generate_graph()
            self.last_refresh = datetime.utcnow()

        return self.render_template(
            "dag_dependencies.html",
            title=title,
            nodes=self.nodes,
            edges=self.edges,
            last_refresh=self.last_refresh.strftime("%Y-%m-%d %H:%M:%S"),
            arrange=conf.get("webserver", "dag_orientation"),
            width=request.args.get("width", "100%"),
            height=request.args.get("height", "800"),
        )

    def _generate_graph(self):
        nodes = {}
        edges = []

        for dag_id, dag in self.dagbag.dags.items():
            dag_node_id = f"d--{dag_id}"
            nodes[dag_node_id] = DAGDependenciesView._node_dict(
                dag_node_id, dag_id, "fill: rgb(232, 247, 228)")

            for task in dag.tasks:
                task_node_id = f"t--{dag_id}--{task.task_id}"
                if isinstance(task, TriggerDagRunOperator):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(255, 239, 235)")
                    edges.extend([
                        {"u": dag_node_id, "v": task_node_id},
                        {"u": task_node_id, "v": f"d--{task.trigger_dag_id}"},
                    ])
                elif isinstance(task, ExternalTaskSensor):
                    nodes[task_node_id] = DAGDependenciesView._node_dict(
                        task_node_id, task.task_id, "fill: rgb(230, 241, 242)")
                    edges.extend([
                        {"u": task_node_id, "v": dag_node_id},
                        {"u": f"d--{task.external_dag_id}", "v": task_node_id},
                    ])

            implicit = getattr(dag, "implicit_dependencies", None)
            if isinstance(implicit, list):
                for dep in implicit:
                    dep_node_id = f"i--{dag_id}--{dep}"
                    nodes[dep_node_id] = DAGDependenciesView._node_dict(
                        dep_node_id, "implicit", "fill: gold")
                    edges.extend([
                        {"u": dep_node_id, "v": dag_node_id},
                        {"u": f"d--{dep}", "v": dep_node_id},
                    ])

        return list(nodes.values()), edges

    @staticmethod
    def _node_dict(node_id, label, style):
        return {
            "id": node_id,
            "value": {"label": label, "style": style, "rx": 5, "ry": 5},
        }
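# A registration sketch, assuming the view is exposed through the standard
# Airflow plugin mechanism; the plugin class name and menu placement are
# illustrative, not taken from the original source.
from airflow.plugins_manager import AirflowPlugin

class DagDependenciesPlugin(AirflowPlugin):
    name = "dag_dependencies_plugin"
    appbuilder_views = [{
        "category": "Browse",
        "name": "DAG Dependencies",
        "view": DAGDependenciesView(),
    }]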