def log_queue_metrics_to_es(self, company_id: str, queues: Sequence[Queue]) -> bool:
    """
    Calculate and write queue statistics (avg waiting time and queue length) to Elastic
    :return: True if the write to es was successful, false otherwise
    """
    es_index = (
        self._queue_metrics_prefix_for_company(company_id)
        + self._get_es_index_suffix()
    )
    timestamp = es_factory.get_timestamp_millis()

    def make_doc(queue: Queue) -> dict:
        entries = [e for e in queue.entries if e.added]
        return dict(
            _index=es_index,
            _source={
                self.EsKeys.TIMESTAMP_FIELD: timestamp,
                self.EsKeys.QUEUE_FIELD: queue.id,
                self.EsKeys.WAITING_TIME_FIELD: self._calc_avg_waiting_time(entries),
                self.EsKeys.QUEUE_LENGTH_FIELD: len(entries),
            },
        )

    actions = list(map(make_doc, queues))
    es_res = elasticsearch.helpers.bulk(self.es, actions)
    added, errors = es_res[:2]
    return (added == len(actions)) and not errors
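
# _calc_avg_waiting_time is called above but not shown here. A minimal sketch,
# assuming it returns the mean number of seconds the still-queued entries have been
# waiting since they were added; the helper name matches the call site, but the
# entry fields and the unit (seconds) are assumptions. Sketched as a method of the
# same class.
from datetime import datetime

def _calc_avg_waiting_time(self, entries) -> float:
    # no entries means nothing is waiting
    if not entries:
        return 0.0
    now = datetime.utcnow()
    waiting_seconds = [(now - entry.added).total_seconds() for entry in entries]
    return sum(waiting_seconds) / len(waiting_seconds)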
def test_task_logs(self):
    task = self._temp_task()
    timestamp = es_factory.get_timestamp_millis()
    events = [
        self._create_task_event(
            "log",
            task=task,
            iteration=iter_,
            timestamp=timestamp + iter_ * 1000,
            msg=f"This is a log message from test task iter {iter_}",
        )
        for iter_ in range(10)
    ]
    self.send_batch(events)

    # test forward navigation
    ftime, ltime = None, None
    for page in range(2):
        ftime, ltime = self._assert_log_events(
            task=task, timestamp=ltime, expected_page=page
        )

    # test backwards navigation
    self._assert_log_events(task=task, timestamp=ftime, navigate_earlier=False)

    # test order
    self._assert_log_events(task=task, order="asc")
def _create_task_event(type_, task, iteration, **kwargs):
    return {
        "worker": "test",
        "type": type_,
        "task": task,
        "iter": iteration,
        "timestamp": kwargs.get("timestamp") or es_factory.get_timestamp_millis(),
        **kwargs,
    }
def create_event(self, task, type_, iteration, **kwargs) -> dict:
    return {
        "worker": "test",
        "type": type_,
        "task": task,
        "iter": iteration,
        "timestamp": es_factory.get_timestamp_millis(),
        "metric": "Metric1",
        "variant": "Variant1",
        **kwargs,
    }
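
# A minimal usage sketch for the create_event helper above, assuming it lives on the
# same test class as _temp_task and send_batch (used in test_task_logs). The test
# name, the scalar values, and the "training_stats_scalar" type string are
# illustrative assumptions, not taken from the source.
def test_scalar_batch_sketch(self):
    task = self._temp_task()
    # build a small batch of scalar events, one per iteration
    events = [
        self.create_event(task, "training_stats_scalar", iteration=i, value=i * 0.5)
        for i in range(5)
    ]
    self.send_batch(events)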
def add_events(
    self, company_id, events, worker, allow_locked_tasks=False
) -> Tuple[int, int, dict]:
    actions: List[dict] = []
    task_ids = set()
    task_iteration = defaultdict(lambda: 0)
    task_last_scalar_events = nested_dict(
        3, dict
    )  # task_id -> metric_hash -> variant_hash -> MetricEvent
    task_last_events = nested_dict(
        3, dict
    )  # task_id -> metric_hash -> event_type -> MetricEvent
    errors_per_type = defaultdict(int)
    invalid_iteration_error = f"Iteration number should not exceed {MAX_LONG}"
    valid_tasks = self._get_valid_tasks(
        company_id,
        task_ids={
            event["task"] for event in events if event.get("task") is not None
        },
        allow_locked_tasks=allow_locked_tasks,
    )

    for event in events:
        # remove spaces from event type
        event_type = event.get("type")
        if event_type is None:
            errors_per_type["Event must have a 'type' field"] += 1
            continue

        event_type = event_type.replace(" ", "_")
        if event_type not in EVENT_TYPES:
            errors_per_type[f"Invalid event type {event_type}"] += 1
            continue

        task_id = event.get("task")
        if task_id is None:
            errors_per_type["Event must have a 'task' field"] += 1
            continue

        if task_id not in valid_tasks:
            errors_per_type["Invalid task id"] += 1
            continue

        event["type"] = event_type

        # @timestamp indicates the time the event is written, not when it happened
        event["@timestamp"] = es_factory.get_es_timestamp_str()

        # for backward compatibility
        if "ts" in event:
            event["timestamp"] = event.pop("ts")

        # set timestamp and worker if not sent
        if "timestamp" not in event:
            event["timestamp"] = es_factory.get_timestamp_millis()
        if "worker" not in event:
            event["worker"] = worker

        # force iter to be a long int
        iter = event.get("iter")
        if iter is not None:
            iter = int(iter)
            if iter > MAX_LONG or iter < MIN_LONG:
                errors_per_type[invalid_iteration_error] += 1
                continue
            event["iter"] = iter

        # used to have "values" to indicate an array, no need anymore
        if "values" in event:
            event["value"] = event["values"]
            del event["values"]

        event["metric"] = event.get("metric") or ""
        event["variant"] = event.get("variant") or ""

        index_name = get_index_name(company_id, event_type)
        es_action = {
            "_op_type": "index",  # overwrite if exists with same ID
            "_index": index_name,
            "_source": event,
        }

        # for "log" events, don't assign a custom _id - whatever is sent, is written (not overwritten)
        if event_type != EventType.task_log.value:
            es_action["_id"] = self._get_event_id(event)
        else:
            es_action["_id"] = dbutils.id()

        task_ids.add(task_id)
        if (
            iter is not None
            and event.get("metric") not in self._skip_iteration_for_metric
        ):
            task_iteration[task_id] = max(iter, task_iteration[task_id])

        self._update_last_metric_events_for_task(
            last_events=task_last_events[task_id], event=event,
        )
        if event_type == EventType.metrics_scalar.value:
            self._update_last_scalar_events_for_task(
                last_events=task_last_scalar_events[task_id], event=event
            )

        actions.append(es_action)

    plot_actions = [
        action["_source"]
        for action in actions
        if action["_source"]["type"] == EventType.metrics_plot.value
    ]
    if plot_actions:
        self.validate_and_compress_plots(
            plot_actions,
            validate_json=config.get("services.events.validate_plot_str", False),
            compression_threshold=config.get(
                "services.events.plot_compression_threshold", 100_000
            ),
        )

    added = 0
    with translate_errors_context():
        if actions:
            chunk_size = 500
            with TimingContext("es", "events_add_batch"):
                # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
                with closing(
                    helpers.streaming_bulk(
                        self.es,
                        actions,
                        chunk_size=chunk_size,
                        # thread_count=8,
                        refresh=True,
                    )
                ) as it:
                    for success, info in it:
                        if success:
                            added += 1
                        else:
                            errors_per_type["Error when indexing events batch"] += 1

            remaining_tasks = set()
            now = datetime.utcnow()
            for task_id in task_ids:
                # Update related tasks. For performance reasons, we prefer to update
                # all of them and not only those whose events were successful
                updated = self._update_task(
                    company_id=company_id,
                    task_id=task_id,
                    now=now,
                    iter_max=task_iteration.get(task_id),
                    last_scalar_events=task_last_scalar_events.get(task_id),
                    last_events=task_last_events.get(task_id),
                )
                if not updated:
                    remaining_tasks.add(task_id)
                    continue

            if remaining_tasks:
                TaskBLL.set_last_update(
                    remaining_tasks, company_id, last_update=now
                )

    # this is for backwards compatibility with streaming bulk throwing an exception on those
    invalid_iterations_count = errors_per_type.get(invalid_iteration_error)
    if invalid_iterations_count:
        raise BulkIndexError(
            f"{invalid_iterations_count} document(s) failed to index.",
            [invalid_iteration_error],
        )

    if not added:
        raise errors.bad_request.EventsNotAdded(**errors_per_type)

    errors_count = sum(errors_per_type.values())
    return added, errors_count, errors_per_type
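
# _get_event_id is referenced in add_events but not shown here. A minimal sketch,
# assuming the goal is a deterministic document id so that re-sending an event for
# the same (task, type, metric, variant, iter) combination overwrites the previous
# copy in ES (log events instead get a fresh dbutils.id()). The field list and the
# hashing scheme are assumptions, not the actual implementation.
import hashlib

def _get_event_id(event: dict) -> str:
    # join the identifying fields into a stable string and hash it
    id_fields = ("task", "type", "metric", "variant", "iter")
    id_string = "-".join(str(event.get(field, "")) for field in id_fields)
    return hashlib.md5(id_string.encode("utf8")).hexdigest()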