def compare_scalar_metrics_average_per_iter(
    self,
    company_id,
    task_ids: Sequence[str],
    samples,
    key: ScalarKeyEnum,
    allow_public=True,
):
    """
    Compare scalar metrics for different tasks per metric and variant.
    The number of points in each histogram should not exceed the requested samples.
    """
    if len(task_ids) > self.MAX_TASKS_COUNT:
        raise errors.BadRequest(
            f"Up to {self.MAX_TASKS_COUNT} tasks supported for comparison",
            len(task_ids),
        )

    task_name_by_id = {}
    with translate_errors_context():
        task_objs = Task.get_many(
            company=company_id,
            query=Q(id__in=task_ids),
            allow_public=allow_public,
            override_projection=("id", "name", "company"),
            return_dicts=False,
        )
        if len(task_objs) < len(task_ids):
            invalid = tuple(set(task_ids) - set(r.id for r in task_objs))
            raise errors.bad_request.InvalidTaskId(company=company_id, ids=invalid)
        task_name_by_id = {t.id: t.name for t in task_objs}

    companies = {t.company for t in task_objs}
    if len(companies) > 1:
        raise errors.bad_request.InvalidTaskId(
            "only tasks from the same company are supported"
        )

    ret = self._run_get_scalar_metrics_as_parallel(
        next(iter(companies)),
        task_ids=task_ids,
        samples=samples,
        key=ScalarKey.resolve(key),
        get_func=self._get_scalar_average_per_task,
    )

    # attach the task display name to each per-task entry in the
    # metric -> variant -> task_id result structure
    for metric_data in ret.values():
        for variant_data in metric_data.values():
            for task_id, task_data in variant_data.items():
                task_data["name"] = task_name_by_id[task_id]

    return ret
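
# --- Illustrative sketch (not part of the original module) ---------------------------
# A minimal usage example for compare_scalar_metrics_average_per_iter above, assuming
# an instance of this class (`event_bll`) and a resolved company id. The task ids are
# hypothetical placeholders, and ScalarKeyEnum.iter is assumed to be a valid key value.
def _example_compare_two_tasks(event_bll, company_id):
    histograms = event_bll.compare_scalar_metrics_average_per_iter(
        company_id=company_id,
        task_ids=["<task_id_1>", "<task_id_2>"],
        samples=500,
        key=ScalarKeyEnum.iter,
    )
    # the result is nested metric -> variant -> task_id; each per-task entry is
    # augmented above with the task's display "name"
    for metric, variants in histograms.items():
        for variant, per_task in variants.items():
            for task_id, task_data in per_task.items():
                print(metric, variant, task_id, task_data.get("name"))
    return histograms
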
def add_events(self, company_id, events, worker, allow_locked_tasks=False):
    actions = []
    task_ids = set()
    task_iteration = defaultdict(lambda: 0)
    task_last_events = nested_dict(
        3, dict
    )  # task_id -> metric_hash -> variant_hash -> MetricEvent

    for event in events:
        if "type" not in event:
            raise errors.BadRequest("Event must have a 'type' field", event=event)

        # remove spaces from the event type
        event_type = event["type"].replace(" ", "_")
        if event_type not in EVENT_TYPES:
            raise errors.BadRequest(
                "Invalid event type {}".format(event_type),
                event=event,
                types=EVENT_TYPES,
            )

        event["type"] = event_type

        # @timestamp indicates the time the event is written, not when it happened
        event["@timestamp"] = es_factory.get_es_timestamp_str()

        # for backward compatibility, accept "ts" as an alias for "timestamp"
        if "ts" in event:
            event["timestamp"] = event.pop("ts")

        # set timestamp and worker if not sent
        if "timestamp" not in event:
            event["timestamp"] = es_factory.get_timestamp_millis()
        if "worker" not in event:
            event["worker"] = worker

        # force iter to be a long int
        iter = event.get("iter")
        if iter is not None:
            iter = int(iter)
            event["iter"] = iter

        # used to have "values" to indicate an array. no need anymore
        if "values" in event:
            event["value"] = event["values"]
            del event["values"]

        index_name = EventMetrics.get_index_name(company_id, event_type)
        es_action = {
            "_op_type": "index",  # overwrite if exists with same ID
            "_index": index_name,
            "_type": "event",
            "_source": event,
        }

        # for "log" events, don't assign a custom _id - whatever is sent is written (not overwritten)
        if event_type != "log":
            es_action["_id"] = self._get_event_id(event)
        else:
            es_action["_id"] = dbutils.id()

        task_id = event.get("task")
        if task_id is not None:
            es_action["_routing"] = task_id
            task_ids.add(task_id)
            if (
                iter is not None
                and event.get("metric") not in self._skip_iteration_for_metric
            ):
                task_iteration[task_id] = max(iter, task_iteration[task_id])
            if event_type == EventType.metrics_scalar.value:
                self._update_last_metric_event_for_task(
                    task_last_events=task_last_events, task_id=task_id, event=event
                )
        else:
            es_action["_routing"] = task_id

        actions.append(es_action)

    if task_ids:
        # verify that all referenced tasks exist (and are not locked, unless allowed)
        with translate_errors_context(), TimingContext("mongo", "task_by_ids"):
            extra_msg = None
            query = Q(id__in=task_ids, company=company_id)
            if not allow_locked_tasks:
                query &= Q(status__nin=LOCKED_TASK_STATUSES)
                extra_msg = "or task published"
            res = Task.objects(query).only("id")
            if len(res) < len(task_ids):
                invalid_task_ids = tuple(set(task_ids) - set(r.id for r in res))
                raise errors.bad_request.InvalidTaskId(
                    extra_msg, company=company_id, ids=invalid_task_ids
                )

    errors_in_bulk = []
    added = 0
    chunk_size = 500
    with translate_errors_context(), TimingContext("es", "events_add_batch"):
        # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
        with closing(
            helpers.streaming_bulk(
                self.es,
                actions,
                chunk_size=chunk_size,
                # thread_count=8,
                refresh=True,
            )
        ) as it:
            for success, info in it:
                if success:
                    added += chunk_size
                else:
                    errors_in_bulk.append(info)

        remaining_tasks = set()
        now = datetime.utcnow()
        for task_id in task_ids:
            # Update the related tasks. For performance reasons, we prefer to update
            # all of them and not only those whose events were written successfully
            updated = self._update_task(
                company_id=company_id,
                task_id=task_id,
                now=now,
                iter_max=task_iteration.get(task_id),
                last_events=task_last_events.get(task_id),
            )

            if not updated:
                remaining_tasks.add(task_id)
                continue

        if remaining_tasks:
            TaskBLL.set_last_update(remaining_tasks, company_id, last_update=now)

    # Compensate for always adding chunk_size on success (the last chunk is probably smaller)
    added = min(added, len(actions))

    return added, errors_in_bulk
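
# --- Illustrative sketch (not part of the original module) ---------------------------
# A minimal example of the event payload accepted by add_events above, assuming an
# instance of this class (`event_bll`). The field values are hypothetical, and
# "training_stats_scalar" is assumed to be a member of EVENT_TYPES; "ts" would also be
# accepted as a backward-compatible alias for "timestamp".
def _example_add_scalar_event(event_bll, company_id, task_id, worker):
    events = [
        {
            "type": "training_stats_scalar",  # assumed scalar event type
            "task": task_id,
            "metric": "loss",
            "variant": "total",
            "iter": 10,
            "value": 0.42,
        }
    ]
    added, errors_in_bulk = event_bll.add_events(
        company_id=company_id, events=events, worker=worker
    )
    # `added` is capped at len(events); `errors_in_bulk` holds per-document ES failures
    return added, errors_in_bulk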