def emit_job_event_detail(sender, **kwargs):
    instance = kwargs['instance']
    created = kwargs['created']
    if created:
        event_serialized = JobEventWebSocketSerializer(instance).data
        emit_channel_notification('job_events-' + str(instance.job.id), event_serialized)
def update_computed_fields(self):
    future_rs = Schedule.rrulestr(self.rrule)
    next_run_actual = future_rs.after(now())

    if next_run_actual is not None:
        if not datetime_exists(next_run_actual):
            # skip imaginary dates, like 2:30 on DST boundaries
            next_run_actual = future_rs.after(next_run_actual)
        next_run_actual = next_run_actual.astimezone(pytz.utc)

    self.next_run = next_run_actual
    try:
        self.dtstart = future_rs[0].astimezone(pytz.utc)
    except IndexError:
        self.dtstart = None
    self.dtend = None
    if 'until' in self.rrule.lower() or 'count' in self.rrule.lower():
        try:
            self.dtend = future_rs[-1].astimezone(pytz.utc)
        except IndexError:
            self.dtend = None

    emit_channel_notification('schedules-changed', dict(id=self.id, group_name='schedules'))
    with ignore_inventory_computed_fields():
        self.unified_job_template.update_computed_fields()
def send_metrics(self):
    # More than one thread could be calling this at the same time, so we
    # acquire a redis lock before sending metrics.
    lock = self.conn.lock(root_key + '_lock')
    if not lock.acquire(blocking=False):
        return
    try:
        current_time = time.time()
        if current_time - self.previous_send_metrics.decode(self.conn) > self.send_metrics_interval:
            payload = {
                'instance': self.instance_name,
                'metrics': self.serialize_local_metrics(),
            }
            # store a local copy as well
            self.store_metrics(json.dumps(payload))
            emit_channel_notification("metrics", payload)
            self.previous_send_metrics.set(current_time)
            self.previous_send_metrics.store_value(self.conn)
    finally:
        try:
            lock.release()
        except Exception as exc:
            # After system failures we might see redis.exceptions.LockNotOwnedError;
            # swallow it here to avoid printing a traceback and, importantly, to
            # avoid raising an exception into the parent context.
            logger.warning(f'Error releasing subsystem metrics redis lock, error: {str(exc)}')
def delete_inventory(inventory_id, user_id, retries=5):
    # Delete inventory as user
    if user_id is None:
        user = None
    else:
        try:
            user = User.objects.get(id=user_id)
        except Exception:
            user = None
    with ignore_inventory_computed_fields(), ignore_inventory_group_removal(), impersonate(user):
        try:
            i = Inventory.objects.get(id=inventory_id)
            for host in i.hosts.iterator():
                host.job_events_as_primary_host.update(host=None)
            i.delete()
            emit_channel_notification(
                'inventories-status_changed',
                {'group_name': 'inventories', 'inventory_id': inventory_id, 'status': 'deleted'}
            )
            logger.debug('Deleted inventory {} as user {}.'.format(inventory_id, user_id))
        except Inventory.DoesNotExist:
            logger.exception("Delete Inventory failed due to missing inventory: " + str(inventory_id))
            return
        except DatabaseError:
            logger.exception('Database error deleting inventory {}, but will retry.'.format(inventory_id))
            if retries > 0:
                time.sleep(10)
                delete_inventory(inventory_id, user_id, retries=retries - 1)
def save_user_session_membership(sender, **kwargs):
    session = kwargs.get('instance', None)
    if pkg_resources.get_distribution('channels').version >= '2':
        # If you get into this code block, it means we upgraded channels, but
        # didn't make the settings.SESSIONS_PER_USER feature work
        raise RuntimeError(
            'save_user_session_membership must be updated for channels>=2: '
            'http://channels.readthedocs.io/en/latest/one-to-two.html#requirements'
        )
    if 'runworker' in sys.argv:
        # don't track user session membership for websocket per-channel sessions
        return
    if not session:
        return
    user = session.get_decoded().get(SESSION_KEY, None)
    if not user:
        return
    user = User.objects.get(pk=user)
    if UserSessionMembership.objects.filter(user=user, session=session).exists():
        return
    UserSessionMembership(user=user, session=session, created=timezone.now()).save()
    expired = UserSessionMembership.get_memberships_over_limit(user)
    for membership in expired:
        Session.objects.filter(session_key__in=[membership.session_id]).delete()
        membership.delete()
    if len(expired):
        consumers.emit_channel_notification(
            'control-limit_reached_{}'.format(user.pk),
            dict(group_name='control', reason=unicode(_('limit_reached')))
        )
def emit_ad_hoc_command_event_detail(sender, **kwargs):
    instance = kwargs['instance']
    created = kwargs['created']
    if created:
        event_serialized = AdHocCommandEventWebSocketSerializer(instance).data
        emit_channel_notification('ad_hoc_command_events-' + str(instance.ad_hoc_command_id), event_serialized)
def emit_event_detail(serializer, relation, **kwargs):
    instance = kwargs['instance']
    created = kwargs['created']
    if created:
        event_serializer = serializer(instance)
        consumers.emit_channel_notification(
            '-'.join([event_serializer.get_group_name(instance), str(getattr(instance, relation))]),
            event_serializer.data
        )
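# How a generic receiver like emit_event_detail above can be registered -- a
# hedged sketch, not the source's own wiring code. It assumes Django's
# post_save signal and functools.partial to bind the serializer class and the
# foreign-key attribute name; JobEvent and JobEventWebSocketSerializer mirror
# the snippets in this listing.
from functools import partial

from django.db.models.signals import post_save

post_save.connect(
    partial(emit_event_detail, JobEventWebSocketSerializer, 'job_id'),
    sender=JobEvent,
    weak=False,  # partial objects would otherwise be garbage-collected out of the registry
)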
def awx_periodic_scheduler():
    with advisory_lock('awx_periodic_scheduler_lock', wait=False) as acquired:
        if acquired is False:
            logger.debug("Not running periodic scheduler, another task holds lock")
            return
        logger.debug("Starting periodic scheduler")

        run_now = now()
        state = TowerScheduleState.get_solo()
        last_run = state.schedule_last_run
        logger.debug("Last scheduler run was: %s", last_run)
        state.schedule_last_run = run_now
        state.save()

        old_schedules = Schedule.objects.enabled().before(last_run)
        for schedule in old_schedules:
            schedule.update_computed_fields()
        schedules = Schedule.objects.enabled().between(last_run, run_now)

        invalid_license = False
        try:
            access_registry[Job](None).check_license(quiet=True)
        except PermissionDenied as e:
            invalid_license = e

        for schedule in schedules:
            template = schedule.unified_job_template
            schedule.update_computed_fields()  # To update next_run timestamp.
            if template.cache_timeout_blocked:
                logger.warning("Cache timeout is in the future, bypassing schedule for template %s" % str(template.id))
                continue
            try:
                job_kwargs = schedule.get_job_kwargs()
                new_unified_job = schedule.unified_job_template.create_unified_job(**job_kwargs)
                logger.debug('Spawned {} from schedule {}-{}.'.format(new_unified_job.log_format, schedule.name, schedule.pk))

                if invalid_license:
                    new_unified_job.status = 'failed'
                    new_unified_job.job_explanation = str(invalid_license)
                    new_unified_job.save(update_fields=['status', 'job_explanation'])
                    new_unified_job.websocket_emit_status("failed")
                    raise invalid_license
                can_start = new_unified_job.signal_start()
            except Exception:
                logger.exception('Error spawning scheduled job.')
                continue
            if not can_start:
                new_unified_job.status = 'failed'
                new_unified_job.job_explanation = gettext_noop(
                    "Scheduled job could not start because it was not in the right state "
                    "or required manual credentials"
                )
                new_unified_job.save(update_fields=['status', 'job_explanation'])
                new_unified_job.websocket_emit_status("failed")
            emit_channel_notification('schedules-changed', dict(id=schedule.id, group_name="schedules"))
        state.save()
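# The scheduler above guards itself with an advisory lock so only one process
# runs it at a time. A minimal standalone sketch of that pattern, assuming
# advisory_lock is django_pglocks.advisory_lock (with wait=False the context
# manager yields whether the lock was acquired instead of blocking):
from django_pglocks import advisory_lock

with advisory_lock('my_singleton_task_lock', wait=False) as acquired:
    if not acquired:
        print('another process holds the lock; skipping this run')
    else:
        print('lock held; safe to do the singleton work')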
def emit_job_event_detail(sender, **kwargs):
    instance = kwargs['instance']
    created = kwargs['created']
    if created:
        event_serialized = JobEventSerializer(instance).data
        event_serialized['id'] = instance.id
        event_serialized["created"] = event_serialized["created"].isoformat()
        event_serialized["modified"] = event_serialized["modified"].isoformat()
        event_serialized["event_name"] = instance.event
        event_serialized["group_name"] = "job_events"
        emit_channel_notification('job_events-' + str(instance.job.id), event_serialized)
def emit_ad_hoc_command_event_detail(sender, **kwargs):
    instance = kwargs['instance']
    created = kwargs['created']
    if created:
        event_serialized = AdHocCommandEventSerializer(instance).data
        event_serialized['id'] = instance.id
        event_serialized["created"] = event_serialized["created"].isoformat()
        event_serialized["modified"] = event_serialized["modified"].isoformat()
        event_serialized["event_name"] = instance.event
        event_serialized["group_name"] = "ad_hoc_command_events"
        emit_channel_notification('ad_hoc_command_events-' + str(instance.ad_hoc_command_id), event_serialized)
def update_computed_fields(self):
    changed = self.update_computed_fields_no_save()
    if not changed:
        return
    emit_channel_notification('schedules-changed', dict(id=self.id, group_name='schedules'))
    # Must save self here before calling unified_job_template computed fields
    # in order for that method to be correct
    # by adding modified to update fields, we avoid updating modified time
    super(Schedule, self).save(update_fields=['next_run', 'dtstart', 'dtend', 'modified'])
    with ignore_inventory_computed_fields():
        self.unified_job_template.update_computed_fields()
def emit_event_detail(event):
    if settings.UI_LIVE_UPDATES_ENABLED is False and event.event not in MINIMAL_EVENTS:
        return
    cls = event.__class__
    relation = {
        JobEvent: 'job_id',
        AdHocCommandEvent: 'ad_hoc_command_id',
        ProjectUpdateEvent: 'project_update_id',
        InventoryUpdateEvent: 'inventory_update_id',
        SystemJobEvent: 'system_job_id',
    }[cls]
    url = ''
    if isinstance(event, JobEvent):
        url = '/api/v2/job_events/{}'.format(event.id)
    if isinstance(event, AdHocCommandEvent):
        url = '/api/v2/ad_hoc_command_events/{}'.format(event.id)
    group = camelcase_to_underscore(cls.__name__) + 's'
    timestamp = event.created.isoformat()
    consumers.emit_channel_notification(
        '-'.join([group, str(getattr(event, relation))]),
        {
            'id': event.id,
            relation.replace('_id', ''): getattr(event, relation),
            'created': timestamp,
            'modified': timestamp,
            'group_name': group,
            'url': url,
            'stdout': event.stdout,
            'counter': event.counter,
            'uuid': event.uuid,
            'parent_uuid': getattr(event, 'parent_uuid', ''),
            'start_line': event.start_line,
            'end_line': event.end_line,
            'event': event.event,
            'event_data': getattr(event, 'event_data', {}),
            'failed': event.failed,
            'changed': event.changed,
            'event_level': getattr(event, 'event_level', ''),
            'play': getattr(event, 'play', ''),
            'role': getattr(event, 'role', ''),
            'task': getattr(event, 'task', ''),
        }
    )
def _websocket_emit_status(self, status):
    try:
        status_data = dict(unified_job_id=self.id, status=status)
        if status == 'waiting':
            if self.instance_group:
                status_data['instance_group_name'] = self.instance_group.name
            else:
                status_data['instance_group_name'] = None
        status_data.update(self.websocket_emit_data())
        status_data['group_name'] = 'jobs'
        emit_channel_notification('jobs-status_changed', status_data)

        if self.spawned_by_workflow:
            status_data['group_name'] = "workflow_events"
            emit_channel_notification('workflow_events-' + str(self.workflow_job_id), status_data)
    except IOError:  # includes socket errors
        logger.exception('%s failed to emit channel msg about status change', self.log_format)
def save_user_session_membership(sender, **kwargs):
    session = kwargs.get('instance', None)
    if not session:
        return
    user_id = session.get_decoded().get(SESSION_KEY, None)
    if not user_id:
        return
    if UserSessionMembership.objects.filter(user=user_id, session=session).exists():
        return
    # check if user_id from session has an id match in User before saving
    if User.objects.filter(id=int(user_id)).exists():
        UserSessionMembership(user_id=user_id, session=session, created=timezone.now()).save()
        expired = UserSessionMembership.get_memberships_over_limit(user_id)
        for membership in expired:
            Session.objects.filter(session_key__in=[membership.session_id]).delete()
            membership.delete()
        if len(expired):
            consumers.emit_channel_notification(
                'control-limit_reached_{}'.format(user_id),
                dict(group_name='control', reason='limit_reached')
            )
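# How receivers like save_user_session_membership are typically hooked up --
# a sketch assuming Django's stock session model as the sender, not the
# source's own registration code:
from django.contrib.sessions.models import Session
from django.db.models.signals import post_save

post_save.connect(save_user_session_membership, sender=Session)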
def update_computed_fields(self):
    future_rs = dateutil.rrule.rrulestr(self.rrule, forceset=True)
    next_run_actual = future_rs.after(now())

    self.next_run = next_run_actual
    try:
        self.dtstart = future_rs[0]
    except IndexError:
        self.dtstart = None
    self.dtend = None
    if 'until' in self.rrule.lower():
        match_until = re.match(r".*?(UNTIL=[0-9]+T[0-9]+Z)", self.rrule)
        until_date = match_until.groups()[0].split("=")[1]
        self.dtend = make_aware(datetime.datetime.strptime(until_date, "%Y%m%dT%H%M%SZ"), get_default_timezone())
    if 'count' in self.rrule.lower():
        self.dtend = future_rs[-1]

    emit_channel_notification('schedules-changed', dict(id=self.id, group_name='schedules'))
    with ignore_inventory_computed_fields():
        self.unified_job_template.update_computed_fields()
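# A self-contained sketch of the rrule arithmetic the schedule snippets rely
# on, assuming only python-dateutil; the rule string is illustrative. With
# forceset=True, rrulestr returns an rruleset that supports .after() and
# indexing, and a COUNT (or UNTIL) makes the last occurrence -- the dtend
# computed above -- well defined.
import datetime

import dateutil.rrule
from dateutil.tz import tzutc

rule = dateutil.rrule.rrulestr(
    'DTSTART:20240101T000000Z\nRRULE:FREQ=DAILY;INTERVAL=1;COUNT=5',
    forceset=True,
)
print('dtstart :', rule[0])    # first occurrence
print('dtend   :', rule[-1])   # last of the five occurrences
print('next run:', rule.after(datetime.datetime.now(tzutc())))  # None once COUNT is exhausted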
def save_user_session_membership(sender, **kwargs):
    session = kwargs.get('instance', None)
    if not session:
        return
    user = session.get_decoded().get(SESSION_KEY, None)
    if not user:
        return
    user = User.objects.get(pk=user)
    if UserSessionMembership.objects.filter(user=user, session=session).exists():
        return
    UserSessionMembership.objects.create(user=user, session=session, created=timezone.now())
    for membership in UserSessionMembership.get_memberships_over_limit(user):
        consumers.emit_channel_notification(
            'control-limit_reached',
            dict(group_name='control',
                 reason=unicode(_('limit_reached')),
                 session_key=membership.session.session_key)
        )
def send_metrics(self):
    # More than one thread could be calling this at the same time, so
    # acquire a redis lock before sending metrics.
    lock = self.conn.lock(root_key + '_lock', thread_local=False)
    if not lock.acquire(blocking=False):
        return
    try:
        current_time = time.time()
        if current_time - self.previous_send_metrics.decode(self.conn) > self.send_metrics_interval:
            payload = {
                'instance': self.instance_name,
                'metrics': self.serialize_local_metrics(),
            }
            # store a local copy as well
            self.store_metrics(json.dumps(payload))
            emit_channel_notification("metrics", payload)
            self.previous_send_metrics.set(current_time)
            self.previous_send_metrics.store_value(self.conn)
    finally:
        lock.release()
def perform_work(self, body):
    try:
        flush = body.get('event') == 'FLUSH'
        if flush:
            self.last_event = ''
        if not flush:
            event_map = {
                'job_id': JobEvent,
                'ad_hoc_command_id': AdHocCommandEvent,
                'project_update_id': ProjectUpdateEvent,
                'inventory_update_id': InventoryUpdateEvent,
                'system_job_id': SystemJobEvent,
            }

            job_identifier = 'unknown job'
            for key, cls in event_map.items():
                if key in body:
                    job_identifier = body[key]
                    break

            self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})'  # noqa

            if body.get('event') == 'EOF':
                try:
                    if 'guid' in body:
                        GuidMiddleware.set_guid(body['guid'])
                    final_counter = body.get('final_counter', 0)
                    logger.info('Event processing is finished for Job {}, sending notifications'.format(job_identifier))
                    # EOF events are sent when stdout for the running task is
                    # closed. don't actually persist them to the database; we
                    # just use them to report `summary` websocket events as an
                    # approximation for when a job is "done"
                    emit_channel_notification(
                        'jobs-summary',
                        dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)
                    )
                    # Additionally, when we've processed all events, we should
                    # have all the data we need to send out success/failure
                    # notification templates
                    uj = UnifiedJob.objects.get(pk=job_identifier)
                    if isinstance(uj, Job):
                        # *actual playbooks* send their success/failure
                        # notifications in response to the playbook_on_stats
                        # event handling code in main.models.events
                        pass
                    elif hasattr(uj, 'send_notification_templates'):
                        handle_success_and_failure_notifications.apply_async([uj.id])
                except Exception:
                    logger.exception('Worker failed to emit notifications: Job {}'.format(job_identifier))
                finally:
                    self.subsystem_metrics.inc('callback_receiver_events_in_memory', -1)
                    GuidMiddleware.set_guid('')
                return

            skip_websocket_message = body.pop('skip_websocket_message', False)
            event = cls.create_from_data(**body)
            if skip_websocket_message:
                event._skip_websocket_message = True
            self.buff.setdefault(cls, []).append(event)

        retries = 0
        while retries <= self.MAX_RETRIES:
            try:
                self.flush(force=flush)
                break
            except (OperationalError, InterfaceError, InternalError):
                if retries >= self.MAX_RETRIES:
                    logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
                    return
                delay = 60 * retries
                logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
                django_connection.close()
                time.sleep(delay)
                retries += 1
            except DatabaseError:
                logger.exception('Database Error Saving Job Event')
                break
    except Exception as exc:
        tb = traceback.format_exc()
        logger.error('Callback Task Processor Raised Exception: %r', exc)
        logger.error('Detail: {}'.format(tb))
def run(self, job_id, speed=1.0, verbosity=0):
    stats = {
        'events_ontime': {
            'total': 0,
            'percentage': 0,
        },
        'events_late': {
            'total': 0,
            'percentage': 0,
            'lateness_total': 0,
            'lateness_average': 0,
        },
        'events_total': 0,
        'events_distance_total': 0,
        'events_distance_average': 0,
        'recording_start': 0,
        'recording_end': 0,
        'recording_duration': 0,
        'replay_start': 0,
        'replay_end': 0,
        'replay_duration': 0,
    }
    try:
        job = self.get_job(job_id)
        job_events = self.get_job_events(job)
        serializer = self.get_serializer(job)
    except RuntimeError as e:
        print("{}".format(e.message))
        sys.exit(1)

    je_previous = None
    for je_current in job_events:
        if not je_previous:
            stats['recording_start'] = je_current.created
            self.start(je_current.created)
            stats['replay_start'] = self.replay_start
            je_previous = je_current

        je_serialized = serializer(je_current).data
        emit_channel_notification('{}-{}'.format(je_serialized['group_name'], job.id), je_serialized)

        replay_offset = self.replay_offset(je_previous.created, speed)
        recording_diff = (je_current.created - je_previous.created).total_seconds() * (1.0 / speed)
        stats['events_distance_total'] += recording_diff
        if verbosity >= 3:
            print("recording: next job in {} seconds".format(recording_diff))
        if replay_offset >= 0:
            replay_diff = recording_diff - replay_offset
            if replay_diff > 0:
                stats['events_ontime']['total'] += 1
                if verbosity >= 3:
                    print("\treplay: sleep for {} seconds".format(replay_diff))
                self.sleep(replay_diff)
            else:
                stats['events_late']['total'] += 1
                stats['events_late']['lateness_total'] += (replay_diff * -1)
                if verbosity >= 3:
                    print("\treplay: too far behind to sleep {} seconds".format(replay_diff))
        else:
            replay_offset = self.replay_offset(je_current.created, speed)
            stats['events_late']['lateness_total'] += (replay_offset * -1)
            stats['events_late']['total'] += 1
            if verbosity >= 3:
                print("\treplay: behind by {} seconds".format(replay_offset))

        stats['events_total'] += 1
        je_previous = je_current

    stats['replay_end'] = self.now()
    stats['replay_duration'] = (stats['replay_end'] - stats['replay_start']).total_seconds()
    stats['replay_start'] = stats['replay_start'].isoformat()
    stats['replay_end'] = stats['replay_end'].isoformat()

    stats['recording_end'] = je_current.created
    stats['recording_duration'] = (stats['recording_end'] - stats['recording_start']).total_seconds()
    stats['recording_start'] = stats['recording_start'].isoformat()
    stats['recording_end'] = stats['recording_end'].isoformat()

    stats['events_ontime']['percentage'] = (stats['events_ontime']['total'] / float(stats['events_total'])) * 100.00
    stats['events_late']['percentage'] = (stats['events_late']['total'] / float(stats['events_total'])) * 100.00
    stats['events_distance_average'] = stats['events_distance_total'] / stats['events_total']
    stats['events_late']['lateness_average'] = stats['events_late']['lateness_total'] / stats['events_late']['total']

    if verbosity >= 2:
        print(json.dumps(stats, indent=4, sort_keys=True))
def perform_work(self, body):
    try:
        flush = body.get('event') == 'FLUSH'
        if flush:
            self.last_event = ''
        if not flush:
            job_identifier = 'unknown job'
            for cls in (JobEvent, AdHocCommandEvent, ProjectUpdateEvent, InventoryUpdateEvent, SystemJobEvent):
                if cls.JOB_REFERENCE in body:
                    job_identifier = body[cls.JOB_REFERENCE]
                    break

            self.last_event = f'\n\t- {cls.__name__} for #{job_identifier} ({body.get("event", "")} {body.get("uuid", "")})'  # noqa

            notification_trigger_event = bool(body.get('event') == cls.WRAPUP_EVENT)

            if body.get('event') == 'EOF':
                try:
                    if 'guid' in body:
                        set_guid(body['guid'])
                    final_counter = body.get('final_counter', 0)
                    logger.info('Starting EOF event processing for Job {}'.format(job_identifier))
                    # EOF events are sent when stdout for the running task is
                    # closed. don't actually persist them to the database; we
                    # just use them to report `summary` websocket events as an
                    # approximation for when a job is "done"
                    emit_channel_notification(
                        'jobs-summary',
                        dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)
                    )
                    if notification_trigger_event:
                        job_stats_wrapup(job_identifier)
                except Exception:
                    logger.exception('Worker failed to perform EOF tasks: Job {}'.format(job_identifier))
                finally:
                    self.subsystem_metrics.inc('callback_receiver_events_in_memory', -1)
                    set_guid('')
                return

            skip_websocket_message = body.pop('skip_websocket_message', False)
            event = cls.create_from_data(**body)
            if skip_websocket_message:
                # if this event sends websocket messages, fire them off on flush
                event._skip_websocket_message = True
            if notification_trigger_event:
                # if this is an Ansible stats event, ensure notifications on flush
                event._notification_trigger_event = True
            self.buff.setdefault(cls, []).append(event)

        retries = 0
        while retries <= self.MAX_RETRIES:
            try:
                self.flush(force=flush)
                break
            except (OperationalError, InterfaceError, InternalError):
                if retries >= self.MAX_RETRIES:
                    logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
                    return
                delay = 60 * retries
                logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
                django_connection.close()
                time.sleep(delay)
                retries += 1
            except DatabaseError:
                logger.exception('Database Error Saving Job Event')
                break
    except Exception as exc:
        tb = traceback.format_exc()
        logger.error('Callback Task Processor Raised Exception: %r', exc)
        logger.error('Detail: {}'.format(tb))
def run(self, job_id, speed=1.0, verbosity=0, skip_range=[], random_seed=0, final_status_delay=0, debug=False):
    stats = {
        'events_ontime': {
            'total': 0,
            'percentage': 0,
        },
        'events_late': {
            'total': 0,
            'percentage': 0,
            'lateness_total': 0,
            'lateness_average': 0,
        },
        'events_total': 0,
        'events_distance_total': 0,
        'events_distance_average': 0,
        'recording_start': 0,
        'recording_end': 0,
        'recording_duration': 0,
        'replay_start': 0,
        'replay_end': 0,
        'replay_duration': 0,
    }
    try:
        job = self.get_job(job_id)
        job_events, job_event_count = self.get_job_events(job)
        serializer = self.get_serializer(job)
    except RuntimeError as e:
        print("{}".format(e.message))
        sys.exit(1)

    je_previous = None

    self.emit_job_status(job, 'pending')
    self.emit_job_status(job, 'waiting')
    self.emit_job_status(job, 'running')

    finish_status_index = self.determine_job_event_finish_status_index(job_event_count, random_seed)

    for n, je_current in enumerate(job_events):
        if je_current.counter in skip_range:
            continue
        if debug:
            raw_input("{} of {}:".format(n, job_event_count))

        if not je_previous:
            stats['recording_start'] = je_current.created
            self.start(je_current.created)
            stats['replay_start'] = self.replay_start
            je_previous = je_current

        je_serialized = serializer(je_current).data
        emit_channel_notification('{}-{}'.format(je_serialized['group_name'], job.id), je_serialized)

        replay_offset = self.replay_offset(je_previous.created, speed)
        recording_diff = (je_current.created - je_previous.created).total_seconds() * (1.0 / speed)
        stats['events_distance_total'] += recording_diff
        if verbosity >= 3:
            print("recording: next job in {} seconds".format(recording_diff))
        if replay_offset >= 0:
            replay_diff = recording_diff - replay_offset
            if replay_diff > 0:
                stats['events_ontime']['total'] += 1
                if verbosity >= 3:
                    print("\treplay: sleep for {} seconds".format(replay_diff))
                self.sleep(replay_diff)
            else:
                stats['events_late']['total'] += 1
                stats['events_late']['lateness_total'] += (replay_diff * -1)
                if verbosity >= 3:
                    print("\treplay: too far behind to sleep {} seconds".format(replay_diff))
        else:
            replay_offset = self.replay_offset(je_current.created, speed)
            stats['events_late']['lateness_total'] += (replay_offset * -1)
            stats['events_late']['total'] += 1
            if verbosity >= 3:
                print("\treplay: behind by {} seconds".format(replay_offset))

        stats['events_total'] += 1
        je_previous = je_current

        if n == finish_status_index:
            if final_status_delay != 0:
                self.sleep(final_status_delay)
            self.emit_job_status(job, job.status)

    if stats['events_total'] > 2:
        stats['replay_end'] = self.now()
        stats['replay_duration'] = (stats['replay_end'] - stats['replay_start']).total_seconds()
        stats['replay_start'] = stats['replay_start'].isoformat()
        stats['replay_end'] = stats['replay_end'].isoformat()

        stats['recording_end'] = je_current.created
        stats['recording_duration'] = (stats['recording_end'] - stats['recording_start']).total_seconds()
        stats['recording_start'] = stats['recording_start'].isoformat()
        stats['recording_end'] = stats['recording_end'].isoformat()

        stats['events_ontime']['percentage'] = (stats['events_ontime']['total'] / float(stats['events_total'])) * 100.00
        stats['events_late']['percentage'] = (stats['events_late']['total'] / float(stats['events_total'])) * 100.00
        stats['events_distance_average'] = stats['events_distance_total'] / stats['events_total']
        stats['events_late']['lateness_average'] = stats['events_late']['lateness_total'] / stats['events_late']['total']
    else:
        stats = {'events_total': stats['events_total']}

    if verbosity >= 2:
        print(json.dumps(stats, indent=4, sort_keys=True))
def callback_worker(self, queue_actual, idx):
    signal_handler = WorkerSignalHandler()
    while not signal_handler.kill_now:
        try:
            body = queue_actual.get(block=True, timeout=1)
        except QueueEmpty:
            continue
        except Exception as e:
            logger.error("Exception on worker thread, restarting: " + str(e))
            continue
        try:
            event_map = {
                'job_id': JobEvent,
                'ad_hoc_command_id': AdHocCommandEvent,
                'project_update_id': ProjectUpdateEvent,
                'inventory_update_id': InventoryUpdateEvent,
                'system_job_id': SystemJobEvent,
            }

            if not any([key in body for key in event_map]):
                raise Exception('Payload does not have a job identifier')
            if settings.DEBUG:
                from pygments import highlight
                from pygments.lexers import PythonLexer
                from pygments.formatters import Terminal256Formatter
                from pprint import pformat
                logger.info('Body: {}'.format(
                    highlight(pformat(body, width=160), PythonLexer(), Terminal256Formatter(style='friendly'))
                )[:1024 * 4])

            def _save_event_data():
                for key, cls in event_map.items():
                    if key in body:
                        cls.create_from_data(**body)

            job_identifier = 'unknown job'
            for key in event_map.keys():
                if key in body:
                    job_identifier = body[key]
                    break

            if body.get('event') == 'EOF':
                try:
                    logger.info('Event processing is finished for Job {}, sending notifications'.format(job_identifier))
                    # EOF events are sent when stdout for the running task is
                    # closed. don't actually persist them to the database; we
                    # just use them to report `summary` websocket events as an
                    # approximation for when a job is "done"
                    emit_channel_notification(
                        'jobs-summary',
                        dict(group_name='jobs', unified_job_id=job_identifier)
                    )
                    # Additionally, when we've processed all events, we should
                    # have all the data we need to send out success/failure
                    # notification templates
                    uj = UnifiedJob.objects.get(pk=job_identifier)
                    if hasattr(uj, 'send_notification_templates'):
                        retries = 0
                        while retries < 5:
                            if uj.finished:
                                uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed')
                                break
                            else:
                                # wait a few seconds to avoid a race where the
                                # events are persisted _before_ the UJ.status
                                # changes from running -> successful
                                retries += 1
                                time.sleep(1)
                                uj = UnifiedJob.objects.get(pk=job_identifier)
                except Exception:
                    logger.exception('Worker failed to emit notifications: Job {}'.format(job_identifier))
                continue

            retries = 0
            while retries <= self.MAX_RETRIES:
                try:
                    _save_event_data()
                    break
                except (OperationalError, InterfaceError, InternalError) as e:
                    if retries >= self.MAX_RETRIES:
                        logger.exception('Worker could not re-establish database connectivity, shutting down gracefully: Job {}'.format(job_identifier))
                        os.kill(os.getppid(), signal.SIGINT)
                        return
                    delay = 60 * retries
                    logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
                    django_connection.close()
                    time.sleep(delay)
                    retries += 1
                except DatabaseError as e:
                    logger.exception('Database Error Saving Job Event for Job {}'.format(job_identifier))
                    break
        except Exception as exc:
            import traceback
            tb = traceback.format_exc()
            logger.error('Callback Task Processor Raised Exception: %r', exc)
            logger.error('Detail: {}'.format(tb))
def perform_work(self, body):
    try:
        flush = body.get('event') == 'FLUSH'
        if not flush:
            event_map = {
                'job_id': JobEvent,
                'ad_hoc_command_id': AdHocCommandEvent,
                'project_update_id': ProjectUpdateEvent,
                'inventory_update_id': InventoryUpdateEvent,
                'system_job_id': SystemJobEvent,
            }

            job_identifier = 'unknown job'
            for key, cls in event_map.items():
                if key in body:
                    job_identifier = body[key]
                    break

            if body.get('event') == 'EOF':
                try:
                    final_counter = body.get('final_counter', 0)
                    logger.info('Event processing is finished for Job {}, sending notifications'.format(job_identifier))
                    # EOF events are sent when stdout for the running task is
                    # closed. don't actually persist them to the database; we
                    # just use them to report `summary` websocket events as an
                    # approximation for when a job is "done"
                    emit_channel_notification(
                        'jobs-summary',
                        dict(group_name='jobs', unified_job_id=job_identifier, final_counter=final_counter)
                    )
                    # Additionally, when we've processed all events, we should
                    # have all the data we need to send out success/failure
                    # notification templates
                    uj = UnifiedJob.objects.get(pk=job_identifier)
                    if hasattr(uj, 'send_notification_templates'):
                        retries = 0
                        while retries < 5:
                            if uj.finished:
                                uj.send_notification_templates('succeeded' if uj.status == 'successful' else 'failed')
                                break
                            else:
                                # wait a few seconds to avoid a race where the
                                # events are persisted _before_ the UJ.status
                                # changes from running -> successful
                                retries += 1
                                time.sleep(1)
                                uj = UnifiedJob.objects.get(pk=job_identifier)
                except Exception:
                    logger.exception('Worker failed to emit notifications: Job {}'.format(job_identifier))
                return

            event = cls.create_from_data(**body)
            self.buff.setdefault(cls, []).append(event)

        retries = 0
        while retries <= self.MAX_RETRIES:
            try:
                self.flush(force=flush)
                break
            except (OperationalError, InterfaceError, InternalError):
                if retries >= self.MAX_RETRIES:
                    logger.exception('Worker could not re-establish database connectivity, giving up on one or more events.')
                    return
                delay = 60 * retries
                logger.exception('Database Error Saving Job Event, retry #{i} in {delay} seconds:'.format(i=retries + 1, delay=delay))
                django_connection.close()
                time.sleep(delay)
                retries += 1
            except DatabaseError:
                logger.exception('Database Error Saving Job Event')
                break
    except Exception as exc:
        tb = traceback.format_exc()
        logger.error('Callback Task Processor Raised Exception: %r', exc)
        logger.error('Detail: {}'.format(tb))
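# All of the snippets above share one contract: emit_channel_notification(
# group_name, payload) broadcasts a JSON-serializable dict to every websocket
# subscriber of group_name. A hedged sketch of such a function on top of
# Django Channels -- the "type" routing key and the json handling here are
# illustrative assumptions, not the source's exact implementation:
import json

from asgiref.sync import async_to_sync
from channels.layers import get_channel_layer

def emit_channel_notification(group_name, payload):
    channel_layer = get_channel_layer()
    async_to_sync(channel_layer.group_send)(
        group_name,
        {
            'type': 'internal.message',                # dispatched to the consumer's internal_message handler
            'text': json.dumps(payload, default=str),  # default=str covers datetime values
        },
    )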