def get_fqdn(hostname_or_ip=None):
    """Return a host name for ``hostname_or_ip`` or for the local machine.

    A truthy ``hostname_or_ip`` is resolved with ``socket.gethostbyaddr``;
    when the resolved name is ``'localhost'``, or when no argument is
    supplied, the result of ``get_hostname()`` is returned instead.

    If an ``IOError`` is raised along the way, the argument is returned
    unchanged (which is ``None`` when no argument was supplied).
    """
    try:
        if not hostname_or_ip:
            return get_hostname()
        resolved = socket.gethostbyaddr(hostname_or_ip)[0]
        return get_hostname() if resolved == 'localhost' else resolved
    except IOError:
        return hostname_or_ip
def test_trigger_dag(self):
    """Trigger a DAG run through the experimental API with and without auth headers.

    The first POST is sent without credentials and must be rejected with 401.
    The 401 response is then dressed up with the attributes that
    ``client_auth.handle_response`` is given here, the handler is expected to
    add an ``Authorization`` header, and a second POST carrying those headers
    must succeed with 200.
    """
    with self.app.test_client() as c:
        url_template = '/api/experimental/dags/{}/dag_runs'
        # Unauthenticated request: expected to be refused.
        response = c.post(
            url_template.format('example_bash_operator'),
            data=json.dumps(dict(run_id='my_run' + datetime.now().isoformat())),
            content_type="application/json"
        )
        self.assertEqual(401, response.status_code)

        response.url = 'http://{}'.format(get_hostname())

        # Minimal stand-in for a request object; it only needs a ``headers`` dict
        # for the handler to write into.
        class Request():
            headers = {}

        response.request = Request()
        response.content = ''
        response.raw = mock.MagicMock()
        response.connection = mock.MagicMock()
        response.connection.send = mock.MagicMock()

        # disable mutual authentication for testing
        client_auth.mutual_authentication = 3

        # case can influence the results
        client_auth.hostname_override = get_hostname()

        client_auth.handle_response(response)
        self.assertIn('Authorization', response.request.headers)

        # Replay the request with the headers produced by the auth handler.
        response2 = c.post(
            url_template.format('example_bash_operator'),
            data=json.dumps(dict(run_id='my_run' + datetime.now().isoformat())),
            content_type="application/json",
            headers=response.request.headers
        )
        self.assertEqual(200, response2.status_code)
def heartbeat_callback(self, session=None):
    """Self destruct task if state has been moved away from running externally.

    While the task instance is still ``RUNNING`` this verifies that the
    recorded hostname and pid match this job's host and task-runner process.
    Otherwise, if the task runner has not returned yet, the runner is
    terminated and the finish callback is invoked.

    :param session: accepted for interface compatibility; not used here
    :raises AirflowException: when the recorded hostname or pid does not
        match this job runner
    """
    if self.terminating:
        # ensure termination if processes are created later
        self.task_runner.terminate()
        return

    self.task_instance.refresh_from_db()
    ti = self.task_instance

    if ti.state == State.RUNNING:
        fqdn = get_hostname()
        same_hostname = fqdn == ti.hostname
        if not same_hostname:
            self.log.warning(
                "The recorded hostname %s does not match this instance's hostname %s",
                ti.hostname,
                fqdn,
            )
            raise AirflowException("Hostname of job runner does not match")

        current_pid = self.task_runner.process.pid
        recorded_pid = ti.pid
        same_process = recorded_pid == current_pid

        # When running as another user the recorded pid is compared through
        # its parent. Skip this when no pid is recorded: psutil.Process(None)
        # inspects the *current* process, which would turn a missing pid into
        # a bogus mismatch instead of passing None through to the check below.
        if recorded_pid is not None and (ti.run_as_user or self.task_runner.run_as_user):
            recorded_pid = psutil.Process(recorded_pid).ppid()
            same_process = recorded_pid == current_pid

        if recorded_pid is not None and not same_process:
            self.log.warning(
                "Recorded pid %s does not match the current pid %s", recorded_pid, current_pid)
            raise AirflowException("PID of job runner does not match")
    elif self.task_runner.return_code() is None and hasattr(
            self.task_runner, 'process'):
        self.log.warning(
            "State of this instance has been externally set to %s. Terminating instance.",
            ti.state)
        self.task_runner.terminate()
        if ti.state == State.SUCCESS:
            error = None
        else:
            # if ti.state is not set by taskinstance.handle_failure, then
            # error file will not be populated and it must be updated by
            # external source such as web UI
            error = self.task_runner.deserialize_run_error(
            ) or "task marked as failed externally"
        ti._run_finished_callback(error=error)
        self.terminating = True
def task_run(args, dag=None):
    """Run one task instance described by the parsed CLI ``args``.

    An optional JSON config file named by ``args.cfg_path`` is loaded into
    ``conf`` and deleted. The DAG comes from the ``dag`` argument, from a
    stored pickle (``args.pickle``) or from ``get_dag(args)``. Unless
    ``args.interactive`` is set, stdout/stderr are redirected into the task
    instance log while the task runs.
    """
    if dag:
        args.dag_id = dag.dag_id

    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            loaded_conf = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(loaded_conf, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if not dag:
        if not args.pickle:
            dag = get_dag(args)
        else:
            with db.create_session() as session:
                print(f'Loading pickle id {args.pickle}')
                pickled = session.query(DagPickle).filter(
                    DagPickle.id == args.pickle).first()
                if not pickled:
                    raise AirflowException("Who hid the pickle!? [missing pickle]")
                dag = pickled.pickle

    ti = TaskInstance(dag.get_task(task_id=args.task_id), args.execution_date)
    ti.refresh_from_db()
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    print(f"Running {ti} on host {hostname}")

    if not args.interactive:
        # Route everything the task prints into the task instance log.
        with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
            _run(args, dag, ti)
    else:
        _run(args, dag, ti)
    logging.shutdown()
def execute_command(command_to_exec: CommandType) -> None:
    """Validate and run ``command_to_exec`` in a subprocess.

    The command is checked with ``BaseExecutor.validate_command`` and run with
    a copy of the current environment. On a non-zero exit the captured output
    is logged and an ``AirflowException`` naming this host is raised.
    """
    BaseExecutor.validate_command(command_to_exec)
    log.info("Executing command in Celery: %s", command_to_exec)
    child_env = os.environ.copy()
    try:
        # pylint: disable=unexpected-keyword-arg
        subprocess.check_output(
            command_to_exec,
            stderr=subprocess.STDOUT,
            close_fds=True,
            env=child_env,
        )
        # pylint: disable=unexpected-keyword-arg
    except subprocess.CalledProcessError as err:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(err.output)
        raise AirflowException('Celery command failed on host: ' + get_hostname())
def __init__(self, executor=None, heartrate=None, *args, **kwargs):
    """Initialise job bookkeeping attributes.

    :param executor: executor instance to use; when ``None`` the default
        executor is looked up at call time. It used to be evaluated once as a
        default argument, i.e. when the function was defined.
    :param heartrate: optional override for ``self.heartrate``
    :param args: forwarded to the parent initialiser
    :param kwargs: forwarded to the parent initialiser
    """
    if executor is None:
        # Resolve lazily so that defining this class does not build an
        # executor and the default is not frozen into the signature.
        executor = executors.get_default_executor()
    self.hostname = get_hostname()
    self.executor = executor
    self.executor_class = executor.__class__.__name__
    self.start_date = timezone.utcnow()
    self.latest_heartbeat = timezone.utcnow()
    if heartrate is not None:
        self.heartrate = heartrate
    self.unixname = getpass.getuser()
    self.max_tis_per_query = conf.getint('scheduler', 'max_tis_per_query')
    super().__init__(*args, **kwargs)
def __init__(self, executor=None, heartrate=None, *args, **kwargs):
    """Initialise job bookkeeping attributes.

    With a truthy ``executor`` the instance and its class name are stored;
    otherwise only ``executor_class`` is set, read from the ``core.EXECUTOR``
    configuration option. ``heartrate`` overrides ``self.heartrate`` when it is
    not ``None``. Remaining arguments go to the parent initialiser.
    """
    self.hostname = get_hostname()
    if not executor:
        self.executor_class = conf.get('core', 'EXECUTOR')
    else:
        self.executor = executor
        self.executor_class = executor.__class__.__name__

    self.start_date = timezone.utcnow()
    self.latest_heartbeat = timezone.utcnow()

    if heartrate is not None:
        self.heartrate = heartrate

    self.unixname = getuser()
    self.max_tis_per_query: int = conf.getint('scheduler', 'max_tis_per_query')
    super().__init__(*args, **kwargs)
def execute_command(command_to_exec: CommandType) -> None:
    """Executes command.

    Only commands starting with ``["airflow", "tasks", "run"]`` are accepted.
    The command runs with a copy of the current environment, with stderr
    merged into stdout.

    :param command_to_exec: the command, as a list of tokens
    :raises ValueError: if the command does not start with the expected prefix
    :raises AirflowException: if the command exits with a non-zero status
    """
    if command_to_exec[0:3] != ["airflow", "tasks", "run"]:
        raise ValueError('The command must start with ["airflow", "tasks", "run"].')
    log.info("Executing command in Celery: %s", command_to_exec)
    env = os.environ.copy()
    try:
        # check_output (unlike check_call) captures the child's output, so
        # CalledProcessError.output holds it and the log.error call below
        # reports it instead of None.
        subprocess.check_output(command_to_exec, stderr=subprocess.STDOUT, close_fds=True, env=env)
    except subprocess.CalledProcessError as e:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(e.output)
        msg = 'Celery command failed on host: ' + get_hostname()
        raise AirflowException(msg)
def _execute_in_subprocess(command_to_exec: CommandType, celery_task_id: Optional[str] = None) -> None:
    """Run ``command_to_exec`` in a subprocess with a copy of the environment.

    A truthy ``celery_task_id`` is exported to the child as the
    ``external_executor_id`` environment variable. On a non-zero exit the
    captured output is logged and an ``AirflowException`` is raised.
    """
    child_env = os.environ.copy()
    if celery_task_id:
        child_env["external_executor_id"] = celery_task_id

    try:
        subprocess.check_output(
            command_to_exec,
            stderr=subprocess.STDOUT,
            close_fds=True,
            env=child_env,
        )
    except subprocess.CalledProcessError as err:
        log.exception('execute_command encountered a CalledProcessError')
        log.error(err.output)
        raise AirflowException('Celery command failed on host: ' + get_hostname())
def task_run(args, dag=None):
    """Run one task instance described by the parsed CLI ``args``.

    An optional JSON config file named by ``args.cfg_path`` is loaded into
    ``conf`` and deleted. The DAG is taken from the ``dag`` argument, from a
    pickle (``args.pickle``) or looked up by ``args.subdir``/``args.dag_id``;
    combining ``dag`` with ``args.pickle`` raises ``AirflowException``.
    Unless ``args.interactive`` is set, stdout/stderr are redirected into the
    task instance log while the task runs.
    """
    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            loaded_conf = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(loaded_conf, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if args.pickle:
        if dag:
            raise AirflowException(
                "You cannot use the --pickle option when using DAG.cli() method.")
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    # otherwise: use DAG from parameter

    ti = TaskInstance(dag.get_task(task_id=args.task_id), args.execution_date)
    ti.refresh_from_db()
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()
    print(f"Running {ti} on host {hostname}")

    if not args.interactive:
        # Route everything the task prints into the task instance log.
        with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
            _run_task_by_selected_method(args, dag, ti)
    else:
        _run_task_by_selected_method(args, dag, ti)
    logging.shutdown()
def do_GET(self):
    """Serve ``GET /health``.

    Responds 200 when the ``SchedulerJob`` with the latest heartbeat recorded
    for this hostname exists and reports itself alive, 503 when it does not or
    when any exception occurs, and 404 for every other path.
    """
    if self.path != '/health':
        self.send_error(404)
        return

    try:
        with create_session() as session:
            query = session.query(SchedulerJob).filter_by(hostname=get_hostname())
            latest_job = query.order_by(
                SchedulerJob.latest_heartbeat.desc()).limit(1).first()
        if latest_job and latest_job.is_alive():
            self.send_response(200)
            self.end_headers()
        else:
            self.send_error(503)
    except Exception:
        self.send_error(503)
def jinja_globals():
    """Build the dict of extra globals for Jinja templates.

    The hostname is replaced by the literal ``'redact'`` when the
    ``webserver.EXPOSE_HOSTNAME`` option (default ``True``) is false.
    """
    if conf.getboolean('webserver', 'EXPOSE_HOSTNAME', fallback=True):
        shown_hostname = get_hostname()
    else:
        shown_hostname = 'redact'

    template_globals = {'hostname': shown_hostname}
    template_globals['navbar_color'] = conf.get('webserver', 'NAVBAR_COLOR')
    template_globals['log_fetch_delay_sec'] = conf.getint(
        'webserver', 'log_fetch_delay_sec', fallback=2)
    template_globals['log_auto_tailing_offset'] = conf.getint(
        'webserver', 'log_auto_tailing_offset', fallback=30)
    template_globals['log_animation_speed'] = conf.getint(
        'webserver', 'log_animation_speed', fallback=1000)
    return template_globals
def test_heartbeat_failed_fast(self, mock_getpid):
    """
    Check that consecutive heartbeats of a LocalTaskJob are spaced by its
    heartrate (within 0.05s) when the heartbeat callback returns immediately.
    """
    mock_getpid.return_value = 1

    beat_times = []

    def heartbeat_recorder(**kwargs):
        beat_times.append(timezone.utcnow())

    with create_session() as session:
        dag_id = 'test_heartbeat_failed_fast'
        task_id = 'test_heartbeat_failed_fast_op'
        bag = models.DagBag(
            dag_folder=TEST_DAG_FOLDER,
            include_examples=False,
        )
        dag = bag.get_dag(dag_id)
        task = dag.get_task(task_id)

        dag.create_dagrun(
            run_id="test_heartbeat_failed_fast_run",
            state=State.RUNNING,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            session=session,
        )
        ti = TI(task=task, execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.state = State.RUNNING
        ti.hostname = get_hostname()
        ti.pid = 1
        session.commit()

        job = LocalTaskJob(task_instance=ti, executor=MockExecutor(do_update=False))
        job.heartrate = 2
        job.heartbeat_callback = heartbeat_recorder
        job._execute()
        self.assertGreater(len(beat_times), 1)
        # Walk over neighbouring pairs of recorded timestamps.
        for earlier, later in zip(beat_times, beat_times[1:]):
            # Assert that difference small enough
            gap = (later - earlier).total_seconds()
            self.assertAlmostEqual(gap, job.heartrate, delta=0.05)
def run_command(self, run_with=None):
    """
    Launch the task command as a subprocess and start reading its output.

    :param run_with: list of tokens to run the task command with e.g. ``['bash', '-c']``
    :return: the process that was run
    :rtype: subprocess.Popen
    """
    full_cmd = (run_with or []) + self._command

    self.log.info("Running on host: %s", get_hostname())
    self.log.info('Running: %s', full_cmd)

    # Options shared by every platform; stderr is folded into stdout.
    popen_kwargs = dict(
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        universal_newlines=True,
        close_fds=True,
    )
    with _airflow_parsing_context_manager(
        dag_id=self._task_instance.dag_id,
        task_id=self._task_instance.task_id,
    ):
        if not IS_WINDOWS:
            # Only passed off Windows, exactly as in the two-branch original.
            popen_kwargs['preexec_fn'] = os.setsid
        proc = subprocess.Popen(full_cmd, env=os.environ.copy(), **popen_kwargs)

    # Start daemon thread to read subprocess logging output
    log_reader = threading.Thread(target=self._read_task_logs, args=(proc.stdout,))
    log_reader.daemon = True
    log_reader.start()
    return proc
def heartbeat_callback(self, session=None):
    """Self destruct task if state has been moved away from running externally.

    For a ``RUNNING`` task instance the recorded hostname and pid must match
    this host and process, otherwise ``AirflowException`` is raised. A
    ``KILLING`` instance is only logged. In any other state, if the task
    runner has not returned yet, the matching failure/success callback is
    invoked (when defined) and the runner is terminated.
    """
    if self.terminating:
        # ensure termination if processes are created later
        self.task_runner.terminate()
        return

    self.task_instance.refresh_from_db()
    ti = self.task_instance

    if ti.state == State.RUNNING:
        this_host = get_hostname()
        if this_host != ti.hostname:
            self.log.warning(
                "The recorded hostname %s does not match this instance's hostname %s",
                ti.hostname,
                this_host,
            )
            raise AirflowException("Hostname of job runner does not match")

        this_pid = os.getpid()
        if ti.pid != this_pid:
            self.log.warning(
                "Recorded pid %s does not match the current pid %s", ti.pid, this_pid)
            raise AirflowException("PID of job runner does not match")
    elif ti.state == State.KILLING:
        self.log.warning("This instance is being killed %s", ti)
    elif self.task_runner.return_code() is None and hasattr(self.task_runner, 'process'):
        self.log.warning(
            "State of this instance has been externally set to %s. Terminating instance.",
            ti.state)
        if ti.state == State.FAILED and ti.task.on_failure_callback:
            failure_context = ti.get_template_context()
            ti.task.on_failure_callback(failure_context)
        if ti.state == State.SUCCESS and ti.task.on_success_callback:
            success_context = ti.get_template_context()
            ti.task.on_success_callback(success_context)
        self.task_runner.terminate()
        self.terminating = True
def test_heartbeat_failed_fast(self):
    """
    Check that consecutive ``latest_heartbeat`` values of a LocalTaskJob are
    spaced by its heartrate (within 0.05s) when the callback returns at once.
    """
    self.mock_base_job_sleep.side_effect = time.sleep

    with create_session() as session:
        dag_id = 'test_heartbeat_failed_fast'
        task_id = 'test_heartbeat_failed_fast_op'
        bag = DagBag(
            dag_folder=TEST_DAG_FOLDER,
            include_examples=False,
        )
        dag = bag.get_dag(dag_id)
        task = dag.get_task(task_id)

        dag.create_dagrun(
            run_id="test_heartbeat_failed_fast_run",
            state=State.RUNNING,
            execution_date=DEFAULT_DATE,
            start_date=DEFAULT_DATE,
            session=session,
        )
        ti = TaskInstance(task=task, execution_date=DEFAULT_DATE)
        ti.refresh_from_db()
        ti.state = State.RUNNING
        ti.hostname = get_hostname()
        ti.pid = 1
        session.commit()

        job = LocalTaskJob(task_instance=ti, executor=MockExecutor(do_update=False))
        job.heartrate = 2
        beat_times = []

        def record_heartbeat(session):
            return beat_times.append(job.latest_heartbeat)

        job.heartbeat_callback = record_heartbeat
        job._execute()
        self.assertGreater(len(beat_times), 2)
        # Walk over neighbouring pairs of recorded timestamps.
        for earlier, later in zip(beat_times, beat_times[1:]):
            # Assert that difference small enough
            gap = (later - earlier).total_seconds()
            self.assertAlmostEqual(gap, job.heartrate, delta=0.05)
def test_localtaskjob_heartbeat(self):
    """Exercise ``LocalTaskJob.heartbeat_callback`` against hostname and pid checks.

    Three phases: a wrong recorded hostname must raise, a matching hostname
    and pid must pass, and a task-runner pid that differs from the recorded
    pid must raise again.
    """
    session = settings.Session()
    dag = DAG('test_localtaskjob_heartbeat', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='op1')

    dag.clear()
    dr = dag.create_dagrun(
        run_id="test",
        state=State.SUCCESS,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        session=session,
    )

    # Phase 1: the task instance is recorded as running on a bogus host.
    ti = dr.get_task_instance(task_id=op1.task_id, session=session)
    ti.state = State.RUNNING
    ti.hostname = "blablabla"
    session.commit()

    job1 = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())
    ti.task = op1
    ti.refresh_from_task(op1)
    job1.task_runner = StandardTaskRunner(job1)
    # The runner process is mocked so its pid can be set freely below.
    job1.task_runner.process = mock.Mock()
    with pytest.raises(AirflowException):
        job1.heartbeat_callback()  # pylint: disable=no-value-for-parameter

    # Phase 2: hostname and pid (1) agree with the mocked runner process.
    job1.task_runner.process.pid = 1
    ti.state = State.RUNNING
    ti.hostname = get_hostname()
    ti.pid = 1
    session.merge(ti)
    session.commit()
    # Guard: the recorded pid must differ from the pid of the test process itself.
    assert ti.pid != os.getpid()
    job1.heartbeat_callback(session=None)

    # Phase 3: the runner pid no longer matches the recorded pid.
    job1.task_runner.process.pid = 2
    with pytest.raises(AirflowException):
        job1.heartbeat_callback()  # pylint: disable=no-value-for-parameter
def from_dag(
    dag,
    dm,
    dag_folder,
    include_task_args,
    git_commit,
    is_committed,
    raw_data_only=False,
    include_source=True,
):
    # type: (DAG, DagModel, str, bool, str, bool, bool, bool) -> EDag
    """Build an ``EDag`` from a DAG object and its ``DagModel`` row.

    With ``raw_data_only`` the task list and the source code are left empty;
    the source code is also omitted when ``include_source`` is false. The
    source hash is always computed from the DAG file contents.
    """
    # Can be Dag from DagBag or from DB, therefore not all attributes may exist
    source_code = _read_dag_file(dag.fileloc)

    root_task_ids = [root.task_id for root in getattr(dag, "roots", [])]

    if raw_data_only:
        tasks = []
    else:
        tasks = [
            ETask.from_task(task, include_task_args, dag, include_source)
            for task in getattr(dag, "tasks", [])
        ]

    if include_source and not raw_data_only:
        exported_source = source_code
    else:
        exported_source = ""

    return EDag(
        description=dag.description or "",
        root_task_ids=root_task_ids,
        tasks=tasks,
        owner=resolve_attribute_or_default_attribute(dag, ["owner", "owners"]),
        dag_id=dag.dag_id,
        schedule_interval=interval_to_str(dag.schedule_interval),
        catchup=resolve_attribute_or_default_value(dag, "catchup", False),
        start_date=resolve_attribute_or_default_value(dag, "start_date", None),
        end_date=resolve_attribute_or_default_value(dag, "end_date", None),
        dag_folder=dag_folder,
        hostname=get_hostname(),
        source_code=exported_source,
        module_source_hash=source_md5(source_code),
        is_subdag=dag.is_subdag,
        tags=getattr(dm, "tags", []),
        task_type="DAG",
        task_args=_extract_args_from_dict(vars(dag)) if include_task_args else {},
        is_active=dm.is_active,
        is_paused=dm.is_paused,
        git_commit=git_commit,
        is_committed=is_committed,
    )
def renew_from_kt():
    """Re-initialise the kerberos ticket cache from the configured keytab.

    Runs ``kinit`` with the keytab, credentials cache and principal taken from
    the ``kerberos`` configuration section (``_HOST`` in the principal is
    replaced by this machine's hostname). If ``kinit`` fails, its output is
    logged and the process exits with ``kinit``'s return code. Afterwards the
    Kerberos 1.8.1 workaround is applied when ``detect_conf_var()`` says it is
    needed; the answer is cached in ``NEED_KRB181_WORKAROUND``.
    """
    global NEED_KRB181_WORKAROUND

    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", get_hostname())

    cmdv = [
        configuration.get('kerberos', 'kinit_path'),
        "-r", renewal_lifetime,
        "-k",  # host ticket
        "-t", configuration.get('kerberos', 'keytab'),  # specify keytab
        "-c", configuration.get('kerberos', 'ccache'),  # specify credentials cache
        principal
    ]
    log.info("Reinitting kerberos from keytab: %s", " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting and closes them afterwards.
    # A bare wait() with stdout/stderr=PIPE can deadlock once the child fills
    # a pipe buffer, and leaves the pipe objects open.
    stdout, stderr = subp.communicate()
    if subp.returncode != 0:
        log.error(
            "Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s",
            subp.returncode,
            # Same layout as the previous readlines()-based message.
            "\n".join(stdout.splitlines(True)),
            "\n".join(stderr.splitlines(True)),
        )
        sys.exit(subp.returncode)

    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clock have seconds level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
def from_dag(dag, dag_folder, include_task_args):
    # type: (DAG, str, bool) -> EDag
    """Build an ``EDag`` snapshot of ``dag``.

    The git commit and commit state are read for ``dag_folder``; a missing
    commit is exported as an empty string and a missing ``start_date`` as the
    current UTC time.

    :param dag: the DAG to export
    :param dag_folder: folder passed to the git status lookup and stored as-is
    :param include_task_args: forwarded to ``ETask.from_task`` for every task
    :return: the populated ``EDag``
    """
    git_commit, git_committed = _get_git_status(dag_folder)
    return EDag(
        description=dag.description,
        root_task_ids=[t.task_id for t in dag.roots],
        tasks=[ETask.from_task(t, include_task_args) for t in dag.tasks],
        owner=dag.owner,
        dag_id=dag.dag_id,
        schedule_interval=interval_to_str(dag.schedule_interval),
        catchup=dag.catchup,
        start_date=dag.start_date or utcnow(),
        end_date=dag.end_date,
        is_committed=git_committed,
        git_commit=git_commit or "",
        dag_folder=dag_folder,
        hostname=get_hostname(),
        source_code=_read_dag_file(dag.fileloc),
        is_subdag=dag.is_subdag,
    )
def test_localtaskjob_double_trigger(self):
    """Running a LocalTaskJob for an already-running task instance must not start it again.

    A task instance is stored as RUNNING on this host with pid 1; a second
    job for the same task is then run and ``StandardTaskRunner.start`` must
    not be called, while the stored pid and state stay untouched.
    """
    dagbag = DagBag(
        dag_folder=TEST_DAG_FOLDER,
        include_examples=False,
    )
    dag = dagbag.dags.get('test_localtaskjob_double_trigger')
    task = dag.get_task('test_localtaskjob_double_trigger_task')

    session = settings.Session()

    dag.clear()
    dr = dag.create_dagrun(
        run_id="test",
        state=State.SUCCESS,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        session=session,
    )
    # Persist the "first trigger": running on this host with pid 1.
    ti = dr.get_task_instance(task_id=task.task_id, session=session)
    ti.state = State.RUNNING
    ti.hostname = get_hostname()
    ti.pid = 1
    session.merge(ti)
    session.commit()

    # Second trigger: a fresh TaskInstance object loaded from the DB.
    ti_run = TaskInstance(task=task, execution_date=DEFAULT_DATE)
    ti_run.refresh_from_db()
    job1 = LocalTaskJob(task_instance=ti_run, executor=SequentialExecutor())
    from airflow.task.task_runner.standard_task_runner import StandardTaskRunner

    with patch.object(StandardTaskRunner, 'start', return_value=None) as mock_method:
        job1.run()
        mock_method.assert_not_called()

    # The stored record must be unchanged by the second job.
    ti = dr.get_task_instance(task_id=task.task_id, session=session)
    self.assertEqual(ti.pid, 1)
    self.assertEqual(ti.state, State.RUNNING)

    session.close()
def _execute_in_fork(command_to_exec: CommandType, celery_task_id: Optional[str] = None) -> None:
    """Run the airflow CLI command in a forked child process.

    The parent waits for the child and raises ``AirflowException`` when the
    status returned by ``os.waitpid`` is non-zero. The child parses the
    command (minus its first token) with the airflow CLI parser, runs the
    selected function and always leaves through ``os._exit``.
    """
    child_pid = os.fork()
    if child_pid:
        # In parent, wait for the child
        _, status = os.waitpid(child_pid, 0)
        if status != 0:
            raise AirflowException('Celery command failed on host: ' + get_hostname())
        return

    from airflow.sentry import Sentry

    exit_code = 1
    try:
        from airflow.cli.cli_parser import get_parser

        settings.engine.pool.dispose()
        settings.engine.dispose()

        # [1:] - remove "airflow" from the start of the command
        parsed_args = get_parser().parse_args(command_to_exec[1:])
        parsed_args.shut_down_logging = False
        if celery_task_id:
            parsed_args.external_executor_id = celery_task_id

        setproctitle(f"airflow task supervisor: {command_to_exec}")

        parsed_args.func(parsed_args)
        exit_code = 0
    except Exception as e:
        log.exception("Failed to execute task %s.", str(e))
        exit_code = 1
    finally:
        Sentry.flush()
        logging.shutdown()
        os._exit(exit_code)
def _get_sensor_logger(self, si):
    """Return the logger for sensor instance ``si``, creating its handler on first use."""
    # The created log_id is used inside of smart sensor as the key to fetch
    # the corresponding in memory log handler.
    si.raw = False  # Otherwise set_context will fail
    id_parts = [
        si.dag_id,
        si.task_id,
        si.execution_date.strftime("%Y_%m_%dT%H_%M_%S_%f"),
        str(si.try_number),
    ]
    log_id = "-".join(id_parts)
    sensor_logger = logging.getLogger(f"airflow.task.{log_id}")

    if sensor_logger.handlers:
        # Already set up by an earlier call.
        return sensor_logger

    sensor_logger.addHandler(self.create_new_task_handler())
    set_context(sensor_logger, si)

    separator = "-" * 120
    sensor_logger.info(separator)
    sensor_logger.info(
        "Processing sensor task %s in smart sensor service on host: %s",
        self.ti_key,
        get_hostname(),
    )
    sensor_logger.info(separator)
    return sensor_logger
def init_app(app):
    """Prepare Kerberos settings for ``app``.

    Sets the module-level ``_SERVICE_NAME`` to ``airflow@<hostname>`` (the
    hostname comes from the app's ``SERVER_NAME`` config or ``get_hostname()``),
    fills ``KRB5_KTNAME`` from the ``kerberos.keytab`` option when the variable
    is not set, and logs the server principal or the Kerberos error.
    """
    global _SERVICE_NAME

    hostname = app.config.get('SERVER_NAME') or get_hostname()
    log.info("Kerberos: hostname %s", hostname)

    service = 'airflow'
    _SERVICE_NAME = f"{service}@{hostname}"

    if 'KRB5_KTNAME' not in os.environ:
        os.environ['KRB5_KTNAME'] = conf.get('kerberos', 'keytab')

    try:
        log.info("Kerberos init: %s %s", service, hostname)
        server_principal = kerberos.getServerPrincipalDetails(service, hostname)
    except kerberos.KrbError as krb_err:
        log.warning("Kerberos: %s", krb_err)
    else:
        log.info("Kerberos API: server is %s", server_principal)
def test_localtaskjob_heartbeat(self, mock_pid):
    """Exercise ``LocalTaskJob.heartbeat_callback`` against hostname and pid checks.

    ``mock_pid`` is a mock supplied to the test (its patch target is outside
    this view); its return value is switched between 1 and 2 to produce a
    matching and a mismatching pid.
    """
    session = settings.Session()
    dag = DAG('test_localtaskjob_heartbeat', start_date=DEFAULT_DATE, default_args={'owner': 'owner1'})

    with dag:
        op1 = DummyOperator(task_id='op1')

    dag.clear()
    dr = dag.create_dagrun(
        run_id="test",
        state=State.SUCCESS,
        execution_date=DEFAULT_DATE,
        start_date=DEFAULT_DATE,
        session=session,
    )
    # Phase 1: recorded hostname is bogus, so the callback must raise.
    ti = dr.get_task_instance(task_id=op1.task_id, session=session)
    ti.state = State.RUNNING
    ti.hostname = "blablabla"
    session.commit()

    job1 = LocalTaskJob(task_instance=ti, ignore_ti_state=True, executor=SequentialExecutor())
    self.assertRaises(AirflowException, job1.heartbeat_callback)

    # Phase 2: hostname matches and the mocked pid equals the recorded pid.
    mock_pid.return_value = 1
    ti.state = State.RUNNING
    ti.hostname = get_hostname()
    ti.pid = 1
    session.merge(ti)
    session.commit()

    job1.heartbeat_callback(session=None)

    # Phase 3: the mocked pid differs from the recorded pid.
    mock_pid.return_value = 2
    self.assertRaises(AirflowException, job1.heartbeat_callback)
def perform_krb181_workaround():
    """Renew the kerberos ticket cache to work around the Kerberos 1.8.1 issue.

    Runs ``kinit -c <ccache> -R``. If that fails, an explanatory error naming
    the principal and the credentials cache is logged and the process exits
    with ``kinit``'s return code.
    """
    kinit_path = configuration.get('kerberos', 'kinit_path')
    ccache = configuration.get('kerberos', 'ccache')
    cmdv = [kinit_path,
            "-c", ccache,
            "-R"]  # Renew ticket_cache

    log.info("Renewing kerberos ticket to work around kerberos 1.8.1: " +
             " ".join(cmdv))

    ret = subprocess.call(cmdv)

    if ret != 0:
        principal = "%s/%s" % (configuration.get('kerberos', 'principal'), get_hostname())
        # The hint below prints `kinit -f -c <ccache>`, so it needs the
        # credentials cache path. This used to read the 'principal' option.
        fmt_dict = dict(princ=principal, ccache=ccache)
        log.error("Couldn't renew kerberos ticket in order to work around "
                  "Kerberos 1.8.1 issue. Please check that the ticket for "
                  "'%(princ)s' is still renewable:\n"
                  "  $ kinit -f -c %(ccache)s\n"
                  "If the 'renew until' date is the same as the 'valid starting' "
                  "date, the ticket cannot be renewed. Please check your KDC "
                  "configuration, and the ticket renewal policy (maxrenewlife) "
                  "for the '%(princ)s' and `krbtgt' principals." % fmt_dict)
        sys.exit(ret)
def heartbeat_callback(self, session=None):
    """Self destruct task if state has been moved away from running externally.

    A ``RUNNING`` task instance must carry this host's name and this
    process's pid, otherwise ``AirflowException`` is raised. In any other
    state, a task runner that has not returned yet is terminated.
    """
    if self.terminating:
        # ensure termination if processes are created later
        self.task_runner.terminate()
        return

    self.task_instance.refresh_from_db()
    ti = self.task_instance

    this_host = get_hostname()
    host_matches = this_host == ti.hostname
    pid_matches = ti.pid == os.getpid()

    if ti.state == State.RUNNING:
        if not host_matches:
            self.log.warning(
                "The recorded hostname %s does not match this instance's hostname %s",
                ti.hostname, this_host)
            raise AirflowException("Hostname of job runner does not match")
        if not pid_matches:
            this_pid = os.getpid()
            self.log.warning(
                "Recorded pid %s does not match the current pid %s", ti.pid, this_pid)
            raise AirflowException("PID of job runner does not match")
        return

    runner_still_going = (
        self.task_runner.return_code() is None
        and hasattr(self.task_runner, 'process')
    )
    if runner_still_going:
        self.log.warning(
            "State of this instance has been externally set to %s. Taking the poison pill.",
            ti.state
        )
        self.task_runner.terminate()
        self.terminating = True
def renew_from_kt():
    """Re-initialise the kerberos ticket cache from the configured keytab.

    Runs ``kinit`` with the keytab, credentials cache and principal taken from
    the ``kerberos`` configuration section (``_HOST`` in the principal is
    replaced by this machine's hostname). If ``kinit`` fails, its output is
    logged and the process exits with ``kinit``'s return code. Afterwards the
    Kerberos 1.8.1 workaround is applied when ``detect_conf_var()`` says it is
    needed; the answer is cached in ``NEED_KRB181_WORKAROUND``.
    """
    global NEED_KRB181_WORKAROUND

    # The config is specified in seconds. But we ask for that same amount in
    # minutes to give ourselves a large renewal buffer.
    renewal_lifetime = "%sm" % configuration.getint('kerberos', 'reinit_frequency')
    principal = configuration.get('kerberos', 'principal').replace("_HOST", get_hostname())

    cmdv = [configuration.get('kerberos', 'kinit_path'),
            "-r", renewal_lifetime,
            "-k",  # host ticket
            "-t", configuration.get('kerberos', 'keytab'),  # specify keytab
            "-c", configuration.get('kerberos', 'ccache'),  # specify credentials cache
            principal]
    log.info("Reinitting kerberos from keytab: %s", " ".join(cmdv))

    subp = subprocess.Popen(cmdv,
                            stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE,
                            close_fds=True,
                            bufsize=-1,
                            universal_newlines=True)
    # communicate() drains both pipes while waiting and closes them afterwards.
    # A bare wait() with stdout/stderr=PIPE can deadlock once the child fills
    # a pipe buffer, and leaves the pipe objects open.
    stdout, stderr = subp.communicate()
    if subp.returncode != 0:
        log.error(
            "Couldn't reinit from keytab! `kinit' exited with %s.\n%s\n%s",
            subp.returncode,
            # Same layout as the previous readlines()-based message.
            "\n".join(stdout.splitlines(True)),
            "\n".join(stderr.splitlines(True)),
        )
        sys.exit(subp.returncode)

    if NEED_KRB181_WORKAROUND is None:
        NEED_KRB181_WORKAROUND = detect_conf_var()
    if NEED_KRB181_WORKAROUND:
        # (From: HUE-640). Kerberos clock have seconds level granularity. Make sure we
        # renew the ticket after the initial valid time.
        time.sleep(1.5)
        perform_krb181_workaround()
def task_run(args, dag=None):
    """Runs a single task instance.

    An optional JSON config file named by ``args.cfg_path`` is loaded into
    ``conf`` and deleted. The DAG is taken from the ``dag`` argument, from a
    pickle (``args.pickle``) or looked up by ``args.subdir``/``args.dag_id``.
    Unless ``args.interactive`` is set, stdout/stderr are redirected into the
    task instance log; unless ``settings.DONOT_MODIFY_HANDLERS`` is set, the
    root logger temporarily uses the ``airflow.task`` handlers as well.

    :param args: parsed command line arguments
    :param dag: optional DAG to use instead of loading one
    :raises AirflowException: when both ``dag`` and ``args.pickle`` are given
    """
    # Load custom airflow config
    if args.cfg_path:
        with open(args.cfg_path, 'r') as conf_file:
            conf_dict = json.load(conf_file)

        if os.path.exists(args.cfg_path):
            os.remove(args.cfg_path)

        conf.read_dict(conf_dict, source=args.cfg_path)
        settings.configure_vars()

    # IMPORTANT, have to use the NullPool, otherwise, each "run" command may leave
    # behind multiple open sleeping connections while heartbeating, which could
    # easily exceed the database connection limit when
    # processing hundreds of simultaneous tasks.
    settings.configure_orm(disable_connection_pool=True)

    if dag and args.pickle:
        raise AirflowException("You cannot use the --pickle option when using DAG.cli() method.")
    elif args.pickle:
        print(f'Loading pickle id: {args.pickle}')
        dag = get_dag_by_pickle(args.pickle)
    elif not dag:
        dag = get_dag(args.subdir, args.dag_id)
    else:
        # Use DAG from parameter
        pass

    task = dag.get_task(task_id=args.task_id)
    ti = TaskInstance(task, args.execution_date)
    ti.init_run_context(raw=args.raw)

    hostname = get_hostname()

    print(f"Running {ti} on host {hostname}")

    if args.interactive:
        _run_task_by_selected_method(args, dag, ti)
    else:
        if settings.DONOT_MODIFY_HANDLERS:
            with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                    redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                _run_task_by_selected_method(args, dag, ti)
        else:
            # Get all the Handlers from 'airflow.task' logger
            # Add these handlers to the root logger so that we can get logs from
            # any custom loggers defined in the DAG
            #
            # Both lists are copied: removeHandler()/addHandler() mutate
            # ``logger.handlers`` in place, so iterating the live list skips
            # entries and the "saved" root handlers would not be a snapshot.
            airflow_logger_handlers = list(logging.getLogger('airflow.task').handlers)
            root_logger = logging.getLogger()
            root_logger_handlers = list(root_logger.handlers)

            # Remove all handlers from Root Logger to avoid duplicate logs
            for handler in root_logger_handlers:
                root_logger.removeHandler(handler)

            for handler in airflow_logger_handlers:
                root_logger.addHandler(handler)
            root_logger.setLevel(logging.getLogger('airflow.task').level)

            try:
                with redirect_stdout(StreamLogWriter(ti.log, logging.INFO)), \
                        redirect_stderr(StreamLogWriter(ti.log, logging.WARN)):
                    _run_task_by_selected_method(args, dag, ti)
            finally:
                # We need to restore the handlers to the loggers as celery worker process
                # can call this command multiple times,
                # so if we don't reset this then logs from next task would go to the wrong place.
                # Done in ``finally`` so a failing task restores them too.
                for handler in airflow_logger_handlers:
                    root_logger.removeHandler(handler)
                for handler in root_logger_handlers:
                    root_logger.addHandler(handler)

    logging.shutdown()
def jinja_globals():
    """Build the dict of extra globals for Jinja templates (the hostname only)."""
    template_globals = {}
    template_globals['hostname'] = get_hostname()
    return template_globals
def replace_hostname_pattern(components, host=None):
    """Format ``components[0]/<host>@components[2]`` with a lower-cased host.

    ``get_hostname()`` supplies the host when ``host`` is falsy or
    ``'0.0.0.0'``.
    """
    use_given_host = bool(host) and host != '0.0.0.0'
    fqdn = host if use_given_host else get_hostname()
    primary, realm = components[0], components[2]
    return '{!s}/{!s}@{!s}'.format(primary, fqdn.lower(), realm)
def test_get_hostname_set_missing(self, patched_conf):
    """``net.get_hostname`` raises AttributeError when ``conf.get`` returns
    ``'tests.utils.test_net:missing_func'``."""
    configured_callable = 'tests.utils.test_net:missing_func'
    patched_conf.get = mock.Mock(return_value=configured_callable)
    self.assertRaises(AttributeError, net.get_hostname)
def test_get_hostname_set(self, patched_conf):
    """``net.get_hostname`` returns ``'awesomehostname'`` when ``conf.get``
    returns ``'tests.utils.test_net:get_hostname'``."""
    patched_conf.get = mock.Mock(
        return_value='tests.utils.test_net:get_hostname'
    )
    # assertEqual reports both values on failure; assertTrue(a == b) would
    # only say "False is not true".
    self.assertEqual(net.get_hostname(), 'awesomehostname')
def prepare_file_path_queue(self):
    """Generate more file paths to process. Result are saved in _file_path_queue.

    Paths from ``self._file_paths`` are ordered according to the
    ``[scheduler] file_parsing_sort_mode`` option, then filtered: paths that
    are currently being processed, were processed less than
    ``self._file_process_interval`` seconds ago, or have reached
    ``self._max_runs`` are left out. The remaining paths are appended to
    ``self._file_path_queue`` and get a zeroed ``DagFileStat`` entry if they
    have none yet.
    """
    self._parsing_start_time = time.perf_counter()
    # If the file path is already being processed, or if a file was
    # processed recently, wait until the next batch
    file_paths_in_progress = self._processors.keys()
    now = timezone.utcnow()

    # Sort the file paths by the parsing order mode
    list_mode = conf.get("scheduler", "file_parsing_sort_mode")

    # In "modified_time" mode the candidates are collected as keys of
    # files_with_mtime; in every other mode they are collected in file_paths.
    files_with_mtime = {}
    file_paths = []
    is_mtime_mode = list_mode == "modified_time"

    file_paths_recently_processed = []
    for file_path in self._file_paths:

        if is_mtime_mode:
            try:
                files_with_mtime[file_path] = os.path.getmtime(file_path)
            except FileNotFoundError:
                self.log.warning("Skipping processing of missing file: %s", file_path)
                continue
            file_modified_time = timezone.make_aware(
                datetime.fromtimestamp(files_with_mtime[file_path]))
        else:
            file_paths.append(file_path)
            file_modified_time = None

        # Find file paths that were recently processed to exclude them
        # from being added to file_path_queue
        # unless they were modified recently and parsing mode is "modified_time"
        # in which case we don't honor "self._file_process_interval" (min_file_process_interval)
        last_finish_time = self.get_last_finish_time(file_path)
        if (last_finish_time is not None and
                (now - last_finish_time).total_seconds() < self._file_process_interval and
                not (is_mtime_mode and file_modified_time and
                     (file_modified_time > last_finish_time))):
            file_paths_recently_processed.append(file_path)

    # Sort file paths via last modified time
    if is_mtime_mode:
        # Most recently modified first.
        file_paths = sorted(files_with_mtime, key=files_with_mtime.get, reverse=True)
    elif list_mode == "alphabetical":
        file_paths = sorted(file_paths)
    elif list_mode == "random_seeded_by_host":
        # Shuffle the list seeded by hostname so multiple schedulers can work on different
        # set of files. Since we set the seed, the sort order will remain same per host
        random.Random(get_hostname()).shuffle(file_paths)
    # Any other mode keeps the order of self._file_paths.

    files_paths_at_run_limit = [
        file_path for file_path, stat in self._file_stats.items()
        if stat.run_count == self._max_runs
    ]

    file_paths_to_exclude = set(file_paths_in_progress).union(
        file_paths_recently_processed, files_paths_at_run_limit)

    # Do not convert the following list to set as set does not preserve the order
    # and we need to maintain the order of file_paths for `[scheduler] file_parsing_sort_mode`
    files_paths_to_queue = [
        file_path for file_path in file_paths
        if file_path not in file_paths_to_exclude
    ]

    # Only ``processor`` is used in this loop; the path is read from the processor itself.
    for file_path, processor in self._processors.items():
        self.log.debug(
            "File path %s is still being processed (started: %s)",
            processor.file_path,
            processor.start_time.isoformat(),
        )

    self.log.debug("Queuing the following files for processing:\n\t%s",
                   "\n\t".join(files_paths_to_queue))

    # Make sure every queued path has a stats entry before it is processed.
    for file_path in files_paths_to_queue:
        if file_path not in self._file_stats:
            self._file_stats[file_path] = DagFileStat(
                num_dags=0,
                import_errors=0,
                last_finish_time=None,
                last_duration=None,
                run_count=0)

    self._file_path_queue.extend(files_paths_to_queue)
def jinja_globals():
    """Build the dict of extra globals for Jinja templates: hostname and navbar colour."""
    hostname = get_hostname()
    navbar_color = configuration.get('webserver', 'NAVBAR_COLOR')
    return dict(hostname=hostname, navbar_color=navbar_color)
def test_get_hostname_unset(self, patched_conf, patched_socket):
    """``net.get_hostname`` returns the value of the patched ``socket.getfqdn``
    when ``conf.get`` returns ``None``."""
    patched_conf.get = mock.Mock(return_value=None)
    patched_socket.getfqdn = mock.Mock(return_value='first')
    # assertEqual reports both values on failure; assertTrue(a == b) would
    # only say "False is not true".
    self.assertEqual(net.get_hostname(), 'first')
def replace_hostname_pattern(components, host=None):
    """Format ``components[0]/<host>@components[2]`` with a lower-cased host.

    ``get_hostname()`` supplies the host when ``host`` is falsy or
    ``'0.0.0.0'``.
    """
    use_given_host = bool(host) and host != '0.0.0.0'
    fqdn = host if use_given_host else get_hostname()
    primary, realm = components[0], components[2]
    return '{!s}/{!s}@{!s}'.format(primary, fqdn.lower(), realm)
def test_get_hostname_set_incorrect(self, patched_conf):
    """``net.get_hostname`` raises ValueError when ``conf.get`` returns
    ``'tests.utils.test_net'`` (no ``:callable`` part)."""
    configured_callable = 'tests.utils.test_net'
    patched_conf.get = mock.Mock(return_value=configured_callable)
    self.assertRaises(ValueError, net.get_hostname)
def _run_raw_task(
    self,
    mark_success: bool = False,
    test_mode: bool = False,
    job_id: Optional[str] = None,
    pool: Optional[str] = None,
    error_file: Optional[str] = None,
    session=None,
) -> None:
    """
    Immediately runs the task (without checking or changing db state
    before execution) and then sets the appropriate final state after
    completion and runs any post-execute callbacks. Meant to be called
    only after another function changes the state to running.

    :param mark_success: Don't run the task, mark its state as success
    :type mark_success: bool
    :param test_mode: Doesn't record success or failure in the DB
    :type test_mode: bool
    :param job_id: stored on the task instance as ``job_id``
    :type job_id: str
    :param pool: specifies the pool to use to run the task instance
    :type pool: str
    :param error_file: forwarded to ``handle_failure`` when the task fails
    :type error_file: str
    :param session: SQLAlchemy ORM Session
    :type session: Session
    """
    # NOTE(review): ``session`` defaults to None but is used unconditionally
    # below; presumably a session-providing decorator outside this view fills
    # it in - confirm.
    task = self.task
    self.test_mode = test_mode
    refresh_from_task(self, task, pool_override=pool)
    # self.refresh_from_db(session=session)
    self.job_id = job_id
    self.hostname = get_hostname()

    context = {}  # type: Dict
    # Only referenced by the commented-out reschedule handler below.
    actual_start_date = timezone.utcnow()
    try:
        if not mark_success:
            context = self.get_template_context()
            _prepare_and_execute_task_with_callbacks(self, context, task)
        # Re-read the row with a lock before writing the final state.
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SUCCESS
    except AirflowSkipException as e:
        # Recording SKIP
        # log only if exception has any arguments to prevent log flooding
        if e.args:
            self.log.info(e)
        self.refresh_from_db(lock_for_update=True)
        self.state = State.SKIPPED
        self.log.info(
            'Marking task as SKIPPED. '
            'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
            self.dag_id,
            self.task_id,
            _date_or_empty(self, 'execution_date'),
            _date_or_empty(self, 'start_date'),
            _date_or_empty(self, 'end_date'),
        )
    # except AirflowRescheduleException as reschedule_exception:
    #     self.refresh_from_db()
    #     self._handle_reschedule(actual_start_date, reschedule_exception, test_mode)
    #     return
    # except AirflowFailException as e:
    #     self.refresh_from_db()
    #     self.handle_failure(e, test_mode, force_fail=True, error_file=error_file)
    #     raise
    except AirflowException as e:
        self.refresh_from_db()
        # for case when task is marked as success/failed externally
        # current behavior doesn't hit the success callback
        if self.state in {State.SUCCESS, State.FAILED}:
            return
        else:
            self.handle_failure(e, test_mode, error_file=error_file)
            raise
    except (Exception, KeyboardInterrupt) as e:
        self.handle_failure(e, test_mode, error_file=error_file)
        raise

    # Recording SUCCESS
    # NOTE(review): the SKIPPED path above does not return, so it appears to
    # fall through to this "SUCCESS" log message as well - confirm intended.
    self.end_date = timezone.utcnow()
    self.log.info(
        'Marking task as SUCCESS. '
        'dag_id=%s, task_id=%s, execution_date=%s, start_date=%s, end_date=%s',
        self.dag_id,
        self.task_id,
        _date_or_empty(self, 'execution_date'),
        _date_or_empty(self, 'start_date'),
        _date_or_empty(self, 'end_date'),
    )
    self.set_duration()
    if not test_mode:
        session.add(Log(self.state, self))
        session.merge(self)

    session.commit()

    if not test_mode:
        _run_mini_scheduler_on_child_tasks(self, session)
def get_localhost_name():
    """Return the local host name as reported by ``get_hostname()``."""
    local_name = get_hostname()
    return local_name