def flush(self, force=False):
    """Bulk-save buffered job events to the database and broadcast them.

    Events accumulate in ``self.buff``, a dict mapping an event model class
    to a list of unsaved instances.  When ``force`` is True, or any model's
    pending list has reached 1000 events, all buffered events are persisted
    with ``bulk_create``, emitted over the websocket via
    ``emit_event_detail``, and the buffer is cleared.

    :param force: flush regardless of how many events are buffered.
    """
    now = tz_now()
    if force or any(len(events) >= 1000 for events in self.buff.values()):
        for cls, events in self.buff.items():
            logger.debug(f'{cls.__name__}.objects.bulk_create({len(events)})')
            # stamp timestamps expected by the DB layer before inserting
            for e in events:
                if not e.created:
                    e.created = now
                e.modified = now
            try:
                cls.objects.bulk_create(events)
            except Exception as exc:
                # if an exception occurs, we should re-attempt to save the
                # events one-by-one, because something in the list is
                # broken/stale (e.g., an IntegrityError on a specific event)
                for e in events:
                    try:
                        # BUG FIX: the original wrote this condition as the
                        # tuple (isinstance(exc, IntegrityError), getattr(...)),
                        # which is *always* truthy, so host_id was cleared on
                        # every event for any bulk_create failure.  A boolean
                        # conjunction is what was intended.
                        if isinstance(exc, IntegrityError) and getattr(e, 'host_id', ''):
                            # this is one potential IntegrityError we can
                            # work around - if the host disappears before
                            # the event can be processed
                            e.host_id = None
                        e.save()
                    except Exception:
                        logger.exception('Database Error Saving Job Event')
            for e in events:
                emit_event_detail(e)
        self.buff = {}
def flush(self, force=False):
    """Write buffered job events to the database, then broadcast them.

    ``self.buff`` maps each event model class to its list of unsaved
    instances.  Nothing happens unless ``force`` is set or some model has
    accumulated at least 1000 pending events; otherwise every pending
    event is timestamped, bulk-inserted (falling back to row-by-row saves
    on failure), emitted over the websocket, and the buffer is reset.

    :param force: flush even if no model has hit the 1000-event threshold.
    """
    now = tz_now()
    threshold_reached = any(len(events) >= 1000 for events in self.buff.values())
    if not force and not threshold_reached:
        return
    for cls, events in self.buff.items():
        logger.debug(f'{cls.__name__}.objects.bulk_create({len(events)})')
        # make sure created/modified are populated before the insert
        for event in events:
            if not event.created:
                event.created = now
            event.modified = now
        try:
            cls.objects.bulk_create(events)
        except Exception:
            # if an exception occurs, we should re-attempt to save the
            # events one-by-one, because something in the list is
            # broken/stale
            for event in events:
                try:
                    event.save()
                except Exception:
                    logger.exception('Database Error Saving Job Event')
        for event in events:
            emit_event_detail(event)
    self.buff = {}
def flush(self, force=False):
    """Flush buffered job events to the database and report metrics.

    Events are buffered in ``self.buff`` keyed by model class.  A flush
    happens when ``force`` is True, when more than
    ``settings.JOB_EVENT_BUFFER_SECONDS`` have elapsed since the last
    flush, or when any model class has accumulated >= 1000 pending
    events.  Save counts, batch-save errors, and insert duration are
    reported through ``self.subsystem_metrics``.

    :param force: flush regardless of buffer size / elapsed time.
    """
    now = tz_now()
    if force or (time.time() - self.last_flush) > settings.JOB_EVENT_BUFFER_SECONDS or any([len(events) >= 1000 for events in self.buff.values()]):
        bulk_events_saved = 0
        singular_events_saved = 0
        metrics_events_batch_save_errors = 0
        for cls, events in self.buff.items():
            logger.debug(f'{cls.__name__}.objects.bulk_create({len(events)})')
            for e in events:
                # ensure both timestamps are populated before the insert
                if not e.created:
                    e.created = now
                e.modified = now
            duration_to_save = time.perf_counter()
            try:
                cls.objects.bulk_create(events)
                bulk_events_saved += len(events)
            except Exception:
                # if an exception occurs, we should re-attempt to save the
                # events one-by-one, because something in the list is
                # broken/stale
                metrics_events_batch_save_errors += 1
                for e in events:
                    try:
                        e.save()
                        singular_events_saved += 1
                    except Exception:
                        logger.exception('Database Error Saving Job Event')
            # NOTE(review): duration_to_save is reassigned on every pass of
            # this loop, so the duration metric reported below reflects only
            # the last model class flushed, not the total — confirm intended.
            duration_to_save = time.perf_counter() - duration_to_save
            for e in events:
                # events flagged to skip the websocket are saved but not emitted
                if not getattr(e, '_skip_websocket_message', False):
                    emit_event_detail(e)
        self.buff = {}
        self.last_flush = time.time()
        # only update metrics if we saved events
        if (bulk_events_saved + singular_events_saved) > 0:
            self.subsystem_metrics.inc('callback_receiver_batch_events_errors', metrics_events_batch_save_errors)
            self.subsystem_metrics.inc('callback_receiver_events_insert_db_seconds', duration_to_save)
            self.subsystem_metrics.inc('callback_receiver_events_insert_db', bulk_events_saved + singular_events_saved)
            self.subsystem_metrics.observe('callback_receiver_batch_events_insert_db', bulk_events_saved)
            # negative increment: these events have left the in-memory buffer
            self.subsystem_metrics.inc('callback_receiver_events_in_memory', -(bulk_events_saved + singular_events_saved))
        if self.subsystem_metrics.should_pipe_execute() is True:
            self.subsystem_metrics.pipe_execute()
def run(self, job_id, speed=1.0, verbosity=0, skip_range=None, random_seed=0, final_status_delay=0, debug=False):
    """Replay the recorded events of a job in (scaled) real time.

    Fetches the job and its events, emits the pending/waiting/running
    status transitions, then re-emits each event, sleeping between events
    to reproduce the original timing (divided by ``speed``).  Collects
    on-time/late statistics and prints them as JSON at verbosity >= 2.

    :param job_id: primary key of the job whose events are replayed.
    :param speed: playback speed multiplier (2.0 = twice as fast).
    :param verbosity: 0-3; higher values print per-event timing details.
    :param skip_range: iterable of event counters to skip (default: none).
    :param random_seed: seed used to pick where the final job status fires.
    :param final_status_delay: optional sleep before emitting final status.
    :param debug: if True, wait for user input before each event.
    """
    # BUG FIX: the default was the mutable literal [] (shared across calls);
    # None-with-fallback is behaviorally identical for callers.
    if skip_range is None:
        skip_range = []
    stats = {
        'events_ontime': {
            'total': 0,
            'percentage': 0,
        },
        'events_late': {
            'total': 0,
            'percentage': 0,
            'lateness_total': 0,
            'lateness_average': 0,
        },
        'events_total': 0,
        'events_distance_total': 0,
        'events_distance_average': 0,
        'recording_start': 0,
        'recording_end': 0,
        'recording_duration': 0,
        'replay_start': 0,
        'replay_end': 0,
        'replay_duration': 0,
    }
    try:
        job = self.get_job(job_id)
        job_events, job_event_count = self.get_job_events(job)
    except RuntimeError as e:
        # BUG FIX: Python 3 exceptions have no .message attribute; the
        # original raised AttributeError here instead of printing the error.
        print("{}".format(e))
        sys.exit(1)

    je_previous = None

    self.emit_job_status(job, 'pending')
    self.emit_job_status(job, 'waiting')
    self.emit_job_status(job, 'running')

    finish_status_index = self.determine_job_event_finish_status_index(job_event_count, random_seed)

    for n, je_current in enumerate(job_events):
        if je_current.counter in skip_range:
            continue
        if debug:
            input("{} of {}:".format(n, job_event_count))
        if not je_previous:
            # first event: anchor both the recording and replay clocks
            stats['recording_start'] = je_current.created
            self.start(je_current.created)
            stats['replay_start'] = self.replay_start
            je_previous = je_current

        emit_event_detail(je_current)

        replay_offset = self.replay_offset(je_previous.created, speed)
        recording_diff = (je_current.created - je_previous.created).total_seconds() * (1.0 / speed)
        stats['events_distance_total'] += recording_diff
        if verbosity >= 3:
            print("recording: next job in {} seconds".format(recording_diff))
        if replay_offset >= 0:
            replay_diff = recording_diff - replay_offset
            if replay_diff > 0:
                # we are ahead of schedule; sleep to match the recording
                stats['events_ontime']['total'] += 1
                if verbosity >= 3:
                    print("\treplay: sleep for {} seconds".format(replay_diff))
                self.sleep(replay_diff)
            else:
                stats['events_late']['total'] += 1
                stats['events_late']['lateness_total'] += (replay_diff * -1)
                if verbosity >= 3:
                    print("\treplay: too far behind to sleep {} seconds".format(replay_diff))
        else:
            # replay clock is behind the recording clock entirely
            replay_offset = self.replay_offset(je_current.created, speed)
            stats['events_late']['lateness_total'] += (replay_offset * -1)
            stats['events_late']['total'] += 1
            if verbosity >= 3:
                print("\treplay: behind by {} seconds".format(replay_offset))

        stats['events_total'] += 1
        je_previous = je_current

        if n == finish_status_index:
            if final_status_delay != 0:
                self.sleep(final_status_delay)
            self.emit_job_status(job, job.status)

    if stats['events_total'] > 2:
        stats['replay_end'] = self.now()
        stats['replay_duration'] = (stats['replay_end'] - stats['replay_start']).total_seconds()
        stats['replay_start'] = stats['replay_start'].isoformat()
        stats['replay_end'] = stats['replay_end'].isoformat()

        stats['recording_end'] = je_current.created
        stats['recording_duration'] = (stats['recording_end'] - stats['recording_start']).total_seconds()
        stats['recording_start'] = stats['recording_start'].isoformat()
        stats['recording_end'] = stats['recording_end'].isoformat()

        stats['events_ontime']['percentage'] = (stats['events_ontime']['total'] / float(stats['events_total'])) * 100.00
        stats['events_late']['percentage'] = (stats['events_late']['total'] / float(stats['events_total'])) * 100.00
        stats['events_distance_average'] = stats['events_distance_total'] / stats['events_total']
        stats['events_late']['lateness_average'] = stats['events_late']['lateness_total'] / stats['events_late']['total']
    else:
        # too few events for meaningful timing stats
        stats = {'events_total': stats['events_total']}

    if verbosity >= 2:
        print(json.dumps(stats, indent=4, sort_keys=True))
def flush(self, force=False):
    """Flush buffered job events, broadcast them, and report metrics.

    Events are buffered in ``self.buff`` keyed by model class.  A flush
    happens when ``force`` is True, when more than
    ``settings.JOB_EVENT_BUFFER_SECONDS`` have elapsed since the last
    flush, or when any model class has accumulated >= 1000 pending
    events.  In addition to saving and emitting events, this variant
    tracks per-flush metrics (save counts, batch errors, broadcast count,
    and average in-memory processing time) via ``self.subsystem_metrics``.

    :param force: flush regardless of buffer size / elapsed time.
    """
    now = tz_now()
    if force or (time.time() - self.last_flush) > settings.JOB_EVENT_BUFFER_SECONDS or any(len(events) >= 1000 for events in self.buff.values()):
        metrics_bulk_events_saved = 0
        metrics_singular_events_saved = 0
        metrics_events_batch_save_errors = 0
        metrics_events_broadcast = 0
        metrics_events_missing_created = 0
        metrics_total_job_event_processing_seconds = datetime.timedelta(seconds=0)
        for cls, events in self.buff.items():
            logger.debug(f'{cls.__name__}.objects.bulk_create({len(events)})')
            for e in events:
                # modified can be stamped before created because both use
                # the single `now` captured above
                e.modified = now
                if not e.created:
                    e.created = now
                    metrics_events_missing_created += 1
                else:
                    # only calculate the seconds if the created time already has been set
                    metrics_total_job_event_processing_seconds += e.modified - e.created
            metrics_duration_to_save = time.perf_counter()
            try:
                cls.objects.bulk_create(events)
                metrics_bulk_events_saved += len(events)
            except Exception:
                # if an exception occurs, we should re-attempt to save the
                # events one-by-one, because something in the list is
                # broken/stale
                metrics_events_batch_save_errors += 1
                for e in events:
                    try:
                        e.save()
                        metrics_singular_events_saved += 1
                    except Exception:
                        logger.exception('Database Error Saving Job Event')
            metrics_duration_to_save = time.perf_counter() - metrics_duration_to_save
            for e in events:
                if not getattr(e, '_skip_websocket_message', False):
                    metrics_events_broadcast += 1
                    emit_event_detail(e)
                if getattr(e, '_notification_trigger_event', False):
                    job_stats_wrapup(getattr(e, e.JOB_REFERENCE), event=e)
        self.buff = {}
        self.last_flush = time.time()
        events_saved = metrics_bulk_events_saved + metrics_singular_events_saved
        # only update metrics if we saved events
        if events_saved > 0:
            self.subsystem_metrics.inc('callback_receiver_batch_events_errors', metrics_events_batch_save_errors)
            self.subsystem_metrics.inc('callback_receiver_events_insert_db_seconds', metrics_duration_to_save)
            self.subsystem_metrics.inc('callback_receiver_events_insert_db', events_saved)
            self.subsystem_metrics.observe('callback_receiver_batch_events_insert_db', metrics_bulk_events_saved)
            # negative increment: these events have left the in-memory buffer
            self.subsystem_metrics.inc('callback_receiver_events_in_memory', -events_saved)
            self.subsystem_metrics.inc('callback_receiver_events_broadcast', metrics_events_broadcast)
            # BUG FIX: only events that arrived with `created` already set
            # contribute processing time; if every saved event was missing
            # `created`, the original divided by zero here.
            timed_events = events_saved - metrics_events_missing_created
            if timed_events > 0:
                self.subsystem_metrics.set(
                    'callback_receiver_event_processing_avg_seconds',
                    metrics_total_job_event_processing_seconds.total_seconds() / timed_events,
                )
        if self.subsystem_metrics.should_pipe_execute() is True:
            self.subsystem_metrics.pipe_execute()