def __init__(self, model=None):
    """
    Initialize per-run callback state.

    :param model: the Django model class this callback updates via
        ``update_model``; may be None.
    """
    self.model = model
    self.event_ct = 0
    self.safe_env = {}
    self.job_created = None
    self.parent_workflow_job_id = None
    # Maps ansible host names to Host primary keys for this run.
    self.host_map = {}
    # Request-correlation guid (from GuidMiddleware).
    self.guid = GuidMiddleware.get_guid()
    # Sliding window of recent websocket emission timestamps, used for
    # rate-limiting websocket messages.
    self.recent_event_timings = deque(maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
    self.dispatcher = CallbackQueueDispatcher()
def __init__(self, model=None):
    """
    Initialize per-run callback state.

    :param model: the Django model class this callback updates via
        ``update_model``; may be None.
    """
    self.model = model
    self.event_ct = 0
    self.safe_env = {}
    self.job_created = None
    self.parent_workflow_job_id = None
    # Maps ansible host names to Host primary keys for this run.
    self.host_map = {}
    self.guid = get_guid()
    # Sliding window of recent websocket emission timestamps, used for
    # rate-limiting websocket messages.
    self.recent_event_timings = deque(maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
    self.dispatcher = CallbackQueueDispatcher()
    # Number of model-update attempts tolerated while the DB is down,
    # assuming roughly one attempt every 5 seconds.
    # NOTE(review): the setting name is spelled this way ("DOWNTOWN",
    # "TOLLERANCE") in the project settings — do not "fix" it here.
    self.update_attempts = int(settings.DISPATCHER_DB_DOWNTOWN_TOLLERANCE / 5)
def run(self, instance, private_data_dir, playbook, module, module_args, event_data_key, ident=None):
    """
    Run a job on an isolated host.

    :param instance: a `model.Job` instance
    :param private_data_dir: an absolute path on the local file system
        where job-specific data should be written (i.e., `/tmp/awx_N_xyz/`)
    :param playbook: the playbook to run
    :param module: the module to run
    :param module_args: the module args to use
    :param event_data_key: e.g., job_id, inventory_id, ...
    :param ident: optional identifier for this runner invocation

    For a completed job run, this function returns (status, rc),
    representing the status and return code of the isolated
    `ansible-playbook` run.
    """
    # Stash run context on self for dispatch()/check() to use.
    self.event_data_key = event_data_key
    self.ident = ident
    self.instance = instance
    self.private_data_dir = private_data_dir
    self.runner_params = self.build_runner_params(
        [instance.execution_node],
        verbosity=min(5, self.instance.verbosity),
    )

    status, rc = self.dispatch(playbook, module, module_args)
    if status != 'successful':
        # The dispatch itself failed; close the event stream with an EOF
        # event so consumers know no further events will arrive.
        event_data = {'event': 'EOF', 'final_counter': 0}
        event_data.setdefault(self.event_data_key, self.instance.id)
        CallbackQueueDispatcher().dispatch(event_data)
        return status, rc
    # Dispatch succeeded — poll the isolated host until the job completes.
    return self.check()
def wrap_stdout_handle(instance, private_data_dir, stdout_handle, event_data_key='job_id'):
    """
    Wrap `stdout_handle` in an OutputEventFilter that dispatches each
    parsed event to the callback queue.

    :param instance: the job-like object whose id tags every event
    :param private_data_dir: runner private data dir containing
        `artifacts/job_events/<uuid>-partial.json` files
    :param stdout_handle: the file-like object receiving raw stdout
    :param event_data_key: key under which `instance.id` is stored on
        each event (default: 'job_id')
    """
    dispatcher = CallbackQueueDispatcher()

    def job_event_callback(event_data):
        event_data.setdefault(event_data_key, instance.id)
        if 'uuid' in event_data:
            # Merge in the structured data runner wrote for this event.
            partial_path = os.path.join(
                private_data_dir, 'artifacts', 'job_events',
                '{}-partial.json'.format(event_data['uuid']))
            try:
                with codecs.open(partial_path, 'r', encoding='utf-8') as f:
                    event_data.update(json.load(f))
            except IOError:
                # Partial data is routinely absent for verbose events;
                # anything else is worth logging.
                if event_data.get('event', '') != 'verbose':
                    logger.error(
                        'Missing callback data for event type `{}`, uuid {}, job {}.\nevent_data: {}'
                        .format(event_data.get('event', ''), event_data['uuid'], instance.id, event_data))
        dispatcher.dispatch(event_data)

    return OutputEventFilter(stdout_handle, job_event_callback)
def dispatch(self, playbook=None, module=None, module_args=None):
    '''
    Ship the runner payload to a remote host for isolated execution.

    :param playbook: the playbook to run on the isolated host (optional)
    :param module: the module to run (optional; only used together with
        `module_args`)
    :param module_args: the args for `module` (optional)

    Returns (status, rc) of the `run_isolated.yml` management playbook run.
    '''
    # Track which event files have already been ingested, so later polling
    # does not re-dispatch them.
    self.handled_events = set()
    self.started_at = time.time()
    self.build_isolated_job_data()
    extra_vars = {
        'src': self.private_data_dir,
        'dest': settings.AWX_PROOT_BASE_PATH,
        'ident': self.ident
    }
    if playbook:
        extra_vars['playbook'] = playbook
    if module and module_args:
        extra_vars['module'] = module
        extra_vars['module_args'] = module_args

    # Run ansible-playbook to launch a job on the isolated host. This:
    #
    # - sets up a temporary directory for proot/bwrap (if necessary)
    # - copies encrypted job data from the controlling host to the isolated host (with rsync)
    # - writes the encryption secret to a named pipe on the isolated host
    # - launches ansible-runner
    args = self._build_args('run_isolated.yml', '%s,' % self.host, extra_vars)
    if self.instance.verbosity:
        # cap ansible verbosity at -vvvvv
        args.append('-%s' % ('v' * min(5, self.instance.verbosity)))
    buff = StringIO()
    logger.debug(
        'Starting job {} on isolated host with `run_isolated.yml` playbook.'
        .format(self.instance.id))
    status, rc = IsolatedManager.run_pexpect(
        args, self.awx_playbook_path(), self.management_env, buff,
        idle_timeout=self.idle_timeout,
        job_timeout=settings.AWX_ISOLATED_LAUNCH_TIMEOUT,
        pexpect_timeout=5)
    output = buff.getvalue()
    playbook_logger.info('Isolated job {} dispatch:\n{}'.format(
        self.instance.id, output))
    if status != 'successful':
        # Dispatch failed: surface the captured playbook output as a
        # 'verbose' event, then close the event stream with an EOF event.
        for event_data in [
            {
                'event': 'verbose',
                'stdout': output
            },
            {
                'event': 'EOF',
                'final_counter': 1
            },
        ]:
            event_data.setdefault(self.event_data_key, self.instance.id)
            CallbackQueueDispatcher().dispatch(event_data)
    return status, rc
def check(self, interval=None):
    """
    Repeatedly poll the isolated node to determine if the job has run.

    On success, copy job artifacts to the controlling node.
    On failure, continue to poll the isolated node (until the job timeout
    is exceeded).

    For a completed job run, this function returns (status, rc),
    representing the status and return code of the isolated
    `ansible-playbook` run.

    :param interval: an interval (in seconds) to wait between status polls
    """
    interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
    extravars = {'src': self.private_data_dir}
    status = 'failed'
    rc = None
    last_check = time.time()
    dispatcher = CallbackQueueDispatcher()
    # `status` stays 'failed' until the check playbook reports the remote
    # runner process has finished (or the job is cancelled remotely).
    while status == 'failed':
        canceled = self.cancelled_callback() if self.cancelled_callback else False
        if not canceled and time.time() - last_check < interval:
            # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
            time.sleep(1)
            continue
        if canceled:
            logger.warning('Isolated job {} was manually cancelled.'.format(self.instance.id))
        logger.debug('Checking on isolated job {} with `check_isolated.yml`.'.format(self.instance.id))
        runner_obj = self.run_management_playbook('check_isolated.yml',
                                                  self.private_data_dir,
                                                  extravars=extravars)
        status, rc = runner_obj.status, runner_obj.rc

        # Capture the `command` artifact exactly once, the first time it is
        # rsynced back, and hand it to the check callback (e.g., to persist
        # job_args on the model).
        if self.check_callback is not None and not self.captured_command_artifact:
            command_path = self.path_to('artifacts', self.ident, 'command')
            # If the configuration artifact has been synced back, update the model
            if os.path.exists(command_path):
                try:
                    with open(command_path, 'r') as f:
                        data = json.load(f)
                    self.check_callback(data)
                    self.captured_command_artifact = True
                except json.decoder.JSONDecodeError:
                    # Just in case it's not fully here yet.
                    pass

        # Ingest any job events that were rsynced back on this poll.
        self.consume_events(dispatcher)
        last_check = time.time()

    if status == 'successful':
        status_path = self.path_to('artifacts', self.ident, 'status')
        rc_path = self.path_to('artifacts', self.ident, 'rc')
        if os.path.exists(status_path):
            with open(status_path, 'r') as f:
                status = f.readline()
            with open(rc_path, 'r') as f:
                rc = int(f.readline())
        else:
            # if there's no status file, it means that runner _probably_
            # exited with a traceback (which should be logged to
            # daemon.log) Record it so we can see how runner failed.
            daemon_path = self.path_to('daemon.log')
            if os.path.exists(daemon_path):
                with open(daemon_path, 'r') as f:
                    self.instance.result_traceback = f.read()
                self.instance.save(update_fields=['result_traceback'])
            else:
                logger.error('Failed to rsync daemon.log (is ansible-runner installed on the isolated host?)')
            status = 'failed'
            rc = 1

    # consume events one last time just to be sure we didn't miss anything
    # in the final sync
    self.consume_events(dispatcher)

    # emit an EOF event
    event_data = {
        'event': 'EOF',
        'final_counter': len(self.handled_events)
    }
    event_data.setdefault(self.event_data_key, self.instance.id)
    dispatcher.dispatch(event_data)

    return status, rc
def check(self, interval=None):
    """
    Repeatedly poll the isolated node to determine if the job has run.

    On success, copy job artifacts to the controlling node.
    On failure, continue to poll the isolated node (until the job timeout
    is exceeded).

    For a completed job run, this function returns (status, rc),
    representing the status and return code of the isolated
    `ansible-playbook` run.

    :param interval: an interval (in seconds) to wait between status polls
    """
    interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
    extra_vars = {'src': self.private_data_dir}
    args = self._build_args('check_isolated.yml', '%s,' % self.host, extra_vars)
    if self.instance.verbosity:
        # cap ansible verbosity at -vvvvv
        args.append('-%s' % ('v' * min(5, self.instance.verbosity)))

    status = 'failed'
    output = ''
    rc = None
    buff = StringIO()
    last_check = time.time()
    # job_timeout == 0 means "no timeout"; `remaining` counts down from it.
    job_timeout = remaining = self.job_timeout
    dispatcher = CallbackQueueDispatcher()

    while status == 'failed':
        if job_timeout != 0:
            remaining = max(0, job_timeout - (time.time() - self.started_at))
            if remaining == 0:
                # if it takes longer than $REMAINING_JOB_TIMEOUT to retrieve
                # job artifacts from the host, consider the job failed
                if isinstance(self.extra_update_fields, dict):
                    self.extra_update_fields[
                        'job_explanation'] = "Job terminated due to timeout"
                status = 'failed'
                break

        canceled = self.cancelled_callback(
        ) if self.cancelled_callback else False
        if not canceled and time.time() - last_check < interval:
            # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
            time.sleep(1)
            continue

        # Fresh buffer per poll so only this iteration's output is logged.
        buff = StringIO()
        logger.debug(
            'Checking on isolated job {} with `check_isolated.yml`.'.
            format(self.instance.id))
        status, rc = IsolatedManager.run_pexpect(
            args, self.awx_playbook_path(), self.management_env, buff,
            cancelled_callback=self.cancelled_callback,
            idle_timeout=remaining,
            job_timeout=remaining,
            pexpect_timeout=5,
            proot_cmd=self.proot_cmd)
        # NOTE(review): under Python 3 this binds `output` to bytes, so the
        # log line below renders a b'...' repr — confirm this is intended.
        output = buff.getvalue().encode('utf-8')
        playbook_logger.info('Isolated job {} check:\n{}'.format(
            self.instance.id, output))

        # discover new events and ingest them
        events_path = self.path_to('artifacts', self.ident, 'job_events')

        # it's possible that `events_path` doesn't exist *yet*, because runner
        # hasn't actually written any events yet (if you ran e.g., a sleep 30)
        # only attempt to consume events if any were rsynced back
        if os.path.exists(events_path):
            for event in set(
                    os.listdir(events_path)) - self.handled_events:
                path = os.path.join(events_path, event)
                if os.path.exists(path):
                    # NOTE(review): this open() has no matching close(),
                    # leaking a file handle per event — a `with` block
                    # would be preferable.
                    event_data = json.load(
                        open(os.path.join(events_path, event), 'r'))
                    event_data.setdefault(self.event_data_key,
                                          self.instance.id)
                    dispatcher.dispatch(event_data)
                    self.handled_events.add(event)

                    # handle artifacts
                    if event_data.get('event_data', {}).get('artifact_data', {}):
                        self.instance.artifacts = event_data['event_data'][
                            'artifact_data']
                        self.instance.save(update_fields=['artifacts'])

        last_check = time.time()

    if status == 'successful':
        # Replace the playbook-level status/rc with the values the remote
        # runner recorded for the actual job.
        status_path = self.path_to('artifacts', self.ident, 'status')
        rc_path = self.path_to('artifacts', self.ident, 'rc')
        with open(status_path, 'r') as f:
            status = f.readline()
        with open(rc_path, 'r') as f:
            rc = int(f.readline())

    # emit an EOF event
    event_data = {
        'event': 'EOF',
        'final_counter': len(self.handled_events)
    }
    event_data.setdefault(self.event_data_key, self.instance.id)
    dispatcher.dispatch(event_data)

    return status, rc
class RunnerCallback:
    """
    Collection of ansible-runner callbacks for a single unified job run.

    An instance is handed to ansible-runner, which invokes these hooks as
    the playbook executes. Events are tagged with the owning object's id
    and forwarded to the callback queue dispatcher; job state is persisted
    via ``update_model``.
    """

    # Key used to tag each event with the owning object's primary key
    # (subclasses may override, e.g. for inventory updates).
    event_data_key = 'job_id'

    def __init__(self, model=None):
        """
        :param model: the Django model class passed through to
            ``update_model``; may be None.
        """
        self.parent_workflow_job_id = None
        # Maps ansible host names to Host primary keys for this run.
        self.host_map = {}
        self.guid = GuidMiddleware.get_guid()
        self.job_created = None
        # Sliding window of recently-emitted websocket message timestamps,
        # used by event_handler() to rate-limit websocket traffic.
        self.recent_event_timings = deque(
            maxlen=settings.MAX_WEBSOCKET_EVENT_RATE)
        self.dispatcher = CallbackQueueDispatcher()
        self.safe_env = {}
        self.event_ct = 0
        self.model = model

    def update_model(self, pk, _attempt=0, **updates):
        """
        Persist `updates` to row `pk` of `self.model`, retrying per
        `_attempt`.

        Fix: forward the caller-supplied `_attempt` retry counter to the
        module-level ``update_model``; the previous code hard-coded
        ``_attempt=0``, silently discarding the argument.
        """
        return update_model(self.model, pk, _attempt=_attempt, **updates)

    def event_handler(self, event_data):
        #
        # ⚠️ D-D-D-DANGER ZONE ⚠️
        # This method is called once for *every event* emitted by Ansible
        # Runner as a playbook runs. That means that changes to the code in
        # this method are _very_ likely to introduce performance regressions.
        #
        # Even if this function is made on average .05s slower, it can have
        # devastating performance implications for playbooks that emit
        # tens or hundreds of thousands of events.
        #
        # Proceed with caution!
        #
        """
        Ansible runner puts a parent_uuid on each event, no matter what the type.
        AWX only saves the parent_uuid if the event is for a Job.
        """
        # cache end_line locally for RunInventoryUpdate tasks
        # which generate job events from two 'streams':
        # ansible-inventory and the awx.main.commands.inventory_import
        # logger
        if event_data.get(self.event_data_key, None):
            if self.event_data_key != 'job_id':
                event_data.pop('parent_uuid', None)
        if self.parent_workflow_job_id:
            event_data['workflow_job_id'] = self.parent_workflow_job_id
        event_data['job_created'] = self.job_created
        if self.host_map:
            host = event_data.get('event_data', {}).get('host', '').strip()
            if host:
                event_data['host_name'] = host
                if host in self.host_map:
                    event_data['host_id'] = self.host_map[host]
            else:
                event_data['host_name'] = ''
                event_data['host_id'] = ''
            if event_data.get('event') == 'playbook_on_stats':
                event_data['host_map'] = self.host_map

        if isinstance(self, RunnerCallbackForProjectUpdate):
            # need a better way to have this check.
            # it's common for Ansible's SCM modules to print
            # error messages on failure that contain the plaintext
            # basic auth credentials (username + password)
            # it's also common for the nested event data itself (['res']['...'])
            # to contain unredacted text on failure
            # this is a _little_ expensive to filter
            # with regex, but project updates don't have many events,
            # so it *should* have a negligible performance impact
            task = event_data.get('event_data', {}).get('task_action')
            try:
                if task in ('git', 'svn'):
                    event_data_json = json.dumps(event_data)
                    event_data_json = UriCleaner.remove_sensitive(
                        event_data_json)
                    event_data = json.loads(event_data_json)
            except json.JSONDecodeError:
                pass

        if 'event_data' in event_data:
            event_data['event_data']['guid'] = self.guid

        # To prevent overwhelming the broadcast queue, skip some websocket messages
        if self.recent_event_timings:
            # Wall-clock timestamp of this event (time.time()); the old name
            # `cpu_time` was misleading.
            emitted_at = time.time()
            first_window_time = self.recent_event_timings[0]
            last_window_time = self.recent_event_timings[-1]

            if event_data.get('event') in MINIMAL_EVENTS:
                should_emit = True  # always send some types like playbook_on_stats
            elif event_data.get('stdout') == '' and event_data[
                    'start_line'] == event_data['end_line']:
                should_emit = False  # exclude events with no output
            else:
                should_emit = any([
                    # if the oldest message in the window was sent over 1 second ago
                    emitted_at - first_window_time > 1.0,
                    # if the very last websocket message came in over 1/maxlen seconds ago
                    self.recent_event_timings.maxlen * (emitted_at - last_window_time) > 1.0,
                    # if the queue is not yet full
                    len(self.recent_event_timings) != self.recent_event_timings.maxlen,
                ])

            if should_emit:
                self.recent_event_timings.append(emitted_at)
            else:
                event_data.setdefault('event_data', {})
                event_data['skip_websocket_message'] = True
        elif self.recent_event_timings.maxlen:
            self.recent_event_timings.append(time.time())

        event_data.setdefault(self.event_data_key, self.instance.id)
        self.dispatcher.dispatch(event_data)
        self.event_ct += 1

        '''
        Handle artifacts
        '''
        if event_data.get('event_data', {}).get('artifact_data', {}):
            self.instance.artifacts = event_data['event_data']['artifact_data']
            self.instance.save(update_fields=['artifacts'])

        return False

    def cancel_callback(self):
        """
        Ansible runner callback to tell the job when/if it is canceled
        """
        unified_job_id = self.instance.pk
        self.instance.refresh_from_db()
        if not self.instance:
            logger.error(
                'unified job {} was deleted while running, canceling'.format(
                    unified_job_id))
            return True
        if self.instance.cancel_flag or self.instance.status == 'canceled':
            cancel_wait = (now() - self.instance.modified
                           ).seconds if self.instance.modified else 0
            if cancel_wait > 5:
                # logger.warn is a deprecated alias; use warning()
                logger.warning(
                    'Request to cancel {} took {} seconds to complete.'.format(
                        self.instance.log_format, cancel_wait))
            return True
        return False

    def finished_callback(self, runner_obj):
        """
        Ansible runner callback triggered on finished run
        """
        # Close the event stream: the final_counter lets consumers verify
        # they received every event.
        event_data = {
            'event': 'EOF',
            'final_counter': self.event_ct,
            'guid': self.guid,
        }
        event_data.setdefault(self.event_data_key, self.instance.id)
        self.dispatcher.dispatch(event_data)

    def status_handler(self, status_data, runner_config):
        """
        Ansible runner callback triggered on status transition
        """
        if status_data['status'] == 'starting':
            job_env = dict(runner_config.env)
            '''
            Take the safe environment variables and overwrite
            '''
            # Overwrite secrets in the recorded env with their redacted
            # counterparts before persisting.
            for k, v in self.safe_env.items():
                if k in job_env:
                    job_env[k] = v
            from awx.main.signals import disable_activity_stream  # Circular import

            with disable_activity_stream():
                self.instance = self.update_model(
                    self.instance.pk,
                    job_args=json.dumps(runner_config.command),
                    job_cwd=runner_config.cwd,
                    job_env=job_env,
                )
        elif status_data['status'] == 'failed':
            # For encrypted ssh_key_data, ansible-runner worker will open and write the
            # ssh_key_data to a named pipe. Then, once the podman container starts, ssh-agent will
            # read from this named pipe so that the key can be used in ansible-playbook.
            # Once the podman container exits, the named pipe is deleted.
            # However, if the podman container fails to start in the first place, e.g. the image
            # name is incorrect, then this pipe is not cleaned up. Eventually ansible-runner
            # processor will attempt to write artifacts to the private data dir via unstream_dir, requiring
            # that it open this named pipe. This leads to a hang. Thus, before any artifacts
            # are written by the processor, it's important to remove this ssh_key_data pipe.
            private_data_dir = self.instance.job_env.get(
                'AWX_PRIVATE_DATA_DIR', None)
            if private_data_dir:
                key_data_file = os.path.join(private_data_dir, 'artifacts',
                                             str(self.instance.id),
                                             'ssh_key_data')
                if os.path.exists(key_data_file) and stat.S_ISFIFO(
                        os.stat(key_data_file).st_mode):
                    os.remove(key_data_file)
        elif status_data['status'] == 'error':
            result_traceback = status_data.get('result_traceback', None)
            if result_traceback:
                from awx.main.signals import disable_activity_stream  # Circular import

                with disable_activity_stream():
                    self.instance = self.update_model(
                        self.instance.pk, result_traceback=result_traceback)
def check(self, interval=None):
    """
    Repeatedly poll the isolated node to determine if the job has run.

    On success, copy job artifacts to the controlling node.
    On failure, continue to poll the isolated node (until the job timeout
    is exceeded).

    For a completed job run, this function returns (status, rc),
    representing the status and return code of the isolated
    `ansible-playbook` run.

    :param interval: an interval (in seconds) to wait between status polls
    """
    interval = interval if interval is not None else settings.AWX_ISOLATED_CHECK_INTERVAL
    extravars = {'src': self.private_data_dir}
    status = 'failed'
    rc = None
    last_check = time.time()
    dispatcher = CallbackQueueDispatcher()
    # `status` stays 'failed' until the check playbook reports the remote
    # runner process has finished (or the job is cancelled remotely).
    while status == 'failed':
        canceled = self.cancelled_callback(
        ) if self.cancelled_callback else False
        if not canceled and time.time() - last_check < interval:
            # If the job isn't cancelled, but we haven't waited `interval` seconds, wait longer
            time.sleep(1)
            continue
        if canceled:
            logger.warning(
                'Isolated job {} was manually cancelled.'.format(
                    self.instance.id))
        logger.debug(
            'Checking on isolated job {} with `check_isolated.yml`.'.
            format(self.instance.id))
        runner_obj = self.run_management_playbook('check_isolated.yml',
                                                  self.private_data_dir,
                                                  extravars=extravars)
        status, rc = runner_obj.status, runner_obj.rc

        # Ingest any job events that were rsynced back on this poll.
        self.consume_events(dispatcher)
        last_check = time.time()

    if status == 'successful':
        # Replace the playbook-level status/rc with the values the remote
        # runner recorded for the actual job.
        status_path = self.path_to('artifacts', self.ident, 'status')
        rc_path = self.path_to('artifacts', self.ident, 'rc')
        # NOTE(review): no os.path.exists() guard here — if the status/rc
        # artifacts were not synced back, open() raises IOError; confirm
        # this is intended (a sibling variant of this method guards this
        # and falls back to daemon.log).
        with open(status_path, 'r') as f:
            status = f.readline()
        with open(rc_path, 'r') as f:
            rc = int(f.readline())

    # consume events one last time just to be sure we didn't miss anything
    # in the final sync
    self.consume_events(dispatcher)

    # emit an EOF event
    event_data = {
        'event': 'EOF',
        'final_counter': len(self.handled_events)
    }
    event_data.setdefault(self.event_data_key, self.instance.id)
    dispatcher.dispatch(event_data)

    return status, rc