Example 1
    def get_metrics_and_variants(self, company_id, task_id, event_type):

        es_index = EventMetrics.get_index_name(company_id, event_type)

        if not self.es.indices.exists(es_index):
            return {}

        es_req = {
            "size": 0,
            "aggs": {
                "metrics": {
                    "terms": {"field": "metric", "size": 200},
                    "aggs": {"variants": {"terms": {"field": "variant", "size": 200}}},
                }
            },
            "query": {"bool": {"must": [{"term": {"task": task_id}}]}},
        }

        with translate_errors_context(), TimingContext(
            "es", "events_get_metrics_and_variants"
        ):
            es_res = self.es.search(index=es_index, body=es_req, routing=task_id)

        metrics = {}
        for metric_bucket in es_res["aggregations"]["metrics"].get("buckets"):
            metric = metric_bucket["key"]
            metrics[metric] = [
                b["key"] for b in metric_bucket["variants"].get("buckets")
            ]

        return metrics
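
For reference, a standalone sketch of how the nested metrics/variants terms aggregation above gets parsed, run against a canned response of the same shape (no live Elasticsearch needed; bucket values are illustrative):

    # Canned response shaped like the "metrics" aggregation result above.
    es_res = {
        "aggregations": {
            "metrics": {
                "buckets": [
                    {
                        "key": "loss",
                        "variants": {"buckets": [{"key": "total"}, {"key": "mse"}]},
                    }
                ]
            }
        }
    }
    # Same extraction as the loop above, written as a dict comprehension.
    metrics = {
        mb["key"]: [vb["key"] for vb in mb["variants"]["buckets"]]
        for mb in es_res["aggregations"]["metrics"]["buckets"]
    }
    assert metrics == {"loss": ["total", "mse"]}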
Example 2
    def get_vector_metrics_per_iter(self, company_id, task_id, metric, variant):

        es_index = EventMetrics.get_index_name(company_id, "training_stats_vector")
        if not self.es.indices.exists(es_index):
            return [], []

        es_req = {
            "size": 10000,
            "query": {
                "bool": {
                    "must": [
                        {"term": {"task": task_id}},
                        {"term": {"metric": metric}},
                        {"term": {"variant": variant}},
                    ]
                }
            },
            "_source": ["iter", "value"],
            "sort": ["iter"],
        }
        with translate_errors_context(), TimingContext("es", "task_stats_vector"):
            es_res = self.es.search(index=es_index, body=es_req, routing=task_id)

        vectors = []
        iterations = []
        for hit in es_res["hits"]["hits"]:
            vectors.append(hit["_source"]["value"])
            iterations.append(hit["_source"]["iter"])

        return iterations, vectors
Example 3
 def __init__(self, events_es=None, redis=None):
     self.es = events_es or es_factory.connect("events")
     self._metrics = EventMetrics(self.es)
     self._skip_iteration_for_metric = set(
         config.get("services.events.ignore_iteration.metrics", [])
     )
     self.redis = redis or redman.connection("apiserver")
     self.debug_images_iterator = DebugImagesIterator(es=self.es, redis=self.redis)
Example 4
    def get_task_events(
        self,
        company_id: str,
        metrics: Sequence[Tuple[str, str]],
        iter_count: int,
        navigate_earlier: bool = True,
        refresh: bool = False,
        state_id: str = None,
    ) -> DebugImagesResult:
        es_index = EventMetrics.get_index_name(company_id, self.EVENT_TYPE)
        if not self.es.indices.exists(es_index):
            return DebugImagesResult()

        def init_state(state_: DebugImageEventsScrollState):
            unique_metrics = set(metrics)
            state_.metrics = self._init_metric_states(es_index,
                                                      list(unique_metrics))

        def validate_state(state_: DebugImageEventsScrollState):
            """
            Validate that the metrics stored in the state are the same
            as requested in the current call.
            Refresh the state if requested
            """
            state_metrics = set((m.task, m.name) for m in state_.metrics)
            if state_metrics != set(metrics):
                raise errors.bad_request.InvalidScrollId(
                    "Task metrics stored in the state do not match the passed ones",
                    scroll_id=state_.id,
                )
            if refresh:
                self._reinit_outdated_metric_states(company_id, es_index,
                                                    state_)
                for metric_state in state_.metrics:
                    metric_state.reset()

        with self.cache_manager.get_or_create_state(
                state_id=state_id,
                init_state=init_state,
                validate_state=validate_state) as state:
            res = DebugImagesResult(next_scroll_id=state.id)
            with ThreadPoolExecutor(self._max_workers) as pool:
                res.metric_events = list(
                    pool.map(
                        partial(
                            self._get_task_metric_events,
                            es_index=es_index,
                            iter_count=iter_count,
                            navigate_earlier=navigate_earlier,
                        ),
                        state.metrics,
                    ))

            return res
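
The fan-out here is plain ThreadPoolExecutor.map with the shared arguments bound once through functools.partial. A minimal standalone sketch of the same pattern (all names below are illustrative, not taken from this codebase):

    from concurrent.futures import ThreadPoolExecutor
    from functools import partial

    def fetch_metric_events(metric_state, es_index, iter_count):
        # stand-in for self._get_task_metric_events
        return (es_index, metric_state, iter_count)

    with ThreadPoolExecutor(max_workers=4) as pool:
        results = list(
            pool.map(
                partial(fetch_metric_events, es_index="events-debug_image", iter_count=3),
                ["metric_a", "metric_b"],  # stand-in for state.metrics
            )
        )
    assert len(results) == 2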
Example 5
    def delete_task_events(self, company_id, task_id):
        es_index = EventMetrics.get_index_name(company_id, "*")
        es_req = {"query": {"term": {"task": task_id}}}
        with translate_errors_context(), TimingContext("es",
                                                       "delete_task_events"):
            es_res = self.es.delete_by_query(index=es_index,
                                             body=es_req,
                                             routing=task_id,
                                             refresh=True)

        return es_res.get("deleted", 0)
Example 6
    def scroll_task_events(
        self,
        company_id,
        task_id,
        order,
        event_type=None,
        batch_size=10000,
        scroll_id=None,
    ):
        if scroll_id:
            with translate_errors_context(), TimingContext(
                    "es", "task_log_events"):
                es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
        else:
            size = min(batch_size, 10000)
            if event_type is None:
                event_type = "*"

            es_index = EventMetrics.get_index_name(company_id, event_type)

            if not self.es.indices.exists(es_index):
                return [], None, 0

            es_req = {
                "size": size,
                "sort": {
                    "timestamp": {
                        "order": order
                    }
                },
                "query": {
                    "bool": {
                        "must": [{
                            "term": {
                                "task": task_id
                            }
                        }]
                    }
                },
            }

            with translate_errors_context(), TimingContext(
                    "es", "scroll_task_events"):
                es_res = self.es.search(index=es_index,
                                        body=es_req,
                                        scroll="1h",
                                        routing=task_id)

        events = [hit["_source"] for hit in es_res["hits"]["hits"]]
        next_scroll_id = es_res["_scroll_id"]
        total_events = es_res["hits"]["total"]

        return events, next_scroll_id, total_events
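
A caller-side sketch of draining a task log with this method; `event_bll` and `process` are hypothetical (an instance of the surrounding class and some consumer of a page of events), and the loop stops once a page comes back empty:

    events, scroll_id, total = event_bll.scroll_task_events(
        company_id, task_id, order="asc", event_type="log")
    while events:
        process(events)  # hypothetical consumer of one page
        events, scroll_id, total = event_bll.scroll_task_events(
            company_id, task_id, order="asc", scroll_id=scroll_id)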
Example 7
    def get_task_events(
        self,
        company_id: str,
        metrics: Sequence[Tuple[str, str]],
        iter_count: int,
        navigate_earlier: bool = True,
        refresh: bool = False,
        state_id: str = None,
    ) -> DebugImagesResult:
        es_index = EventMetrics.get_index_name(company_id, self.EVENT_TYPE)
        if not self.es.indices.exists(es_index):
            return DebugImagesResult()

        unique_metrics = set(metrics)
        state = self.cache_manager.get_state(state_id) if state_id else None
        if not state:
            state = DebugImageEventsScrollState(
                id=database.utils.id(),
                metrics=self._init_metric_states(es_index,
                                                 list(unique_metrics)),
            )
        else:
            state_metrics = set((m.task, m.name) for m in state.metrics)
            if state_metrics != unique_metrics:
                raise errors.bad_request.InvalidScrollId(
                    "while getting debug images events", scroll_id=state_id)

            if refresh:
                self._reinit_outdated_metric_states(company_id, es_index,
                                                    state)
                for metric_state in state.metrics:
                    metric_state.reset()

        res = DebugImagesResult(next_scroll_id=state.id)
        try:
            with ThreadPoolExecutor(self._max_workers) as pool:
                res.metric_events = list(
                    pool.map(
                        partial(
                            self._get_task_metric_events,
                            es_index=es_index,
                            iter_count=iter_count,
                            navigate_earlier=navigate_earlier,
                        ),
                        state.metrics,
                    ))
        finally:
            self.cache_manager.set_state(state)

        return res
Example 8
def get_task_latest_scalar_values(call, company_id, req_model):
    task_id = call.data["task"]
    task = task_bll.assert_exists(company_id, task_id, allow_public=True)
    metrics, last_timestamp = event_bll.get_task_latest_scalar_values(
        company_id, task_id)
    es_index = EventMetrics.get_index_name(company_id, "*")
    last_iters = event_bll.get_last_iters(es_index, task_id, None, 1)
    call.result.data = dict(
        metrics=metrics,
        last_iter=last_iters[0] if last_iters else 0,
        name=task.name,
        status=task.status,
        last_timestamp=last_timestamp,
    )
Example 9
    def delete_task_events(self, company_id, task_id, allow_locked=False):
        with translate_errors_context():
            extra_msg = None
            query = Q(id=task_id, company=company_id)
            if not allow_locked:
                query &= Q(status__nin=LOCKED_TASK_STATUSES)
                extra_msg = "or task published"
            res = Task.objects(query).only("id").first()
            if not res:
                raise errors.bad_request.InvalidTaskId(
                    extra_msg, company=company_id, id=task_id
                )

        es_index = EventMetrics.get_index_name(company_id, "*")
        es_req = {"query": {"term": {"task": task_id}}}
        with translate_errors_context(), TimingContext("es", "delete_task_events"):
            es_res = self.es.delete_by_query(index=es_index, body=es_req, refresh=True)

        return es_res.get("deleted", 0)
Example 10
    def get_task_events(
        self,
        company_id: str,
        task_id: str,
        batch_size: int,
        navigate_earlier: bool = True,
        refresh: bool = False,
        state_id: str = None,
    ) -> TaskEventsResult:
        es_index = EventMetrics.get_index_name(company_id, self.EVENT_TYPE)
        if not self.es.indices.exists(es_index):
            return TaskEventsResult()

        def init_state(state_: LogEventsScrollState):
            state_.task = task_id

        def validate_state(state_: LogEventsScrollState):
            """
            Checks that the task id stored in the state
            is equal to the one passed with the current call
            Refresh the state if requested
            """
            if state_.task != task_id:
                raise errors.bad_request.InvalidScrollId(
                    "Task stored in the state does not match the passed one",
                    scroll_id=state_.id,
                )
            if refresh:
                state_.reset()

        with self.cache_manager.get_or_create_state(
            state_id=state_id, init_state=init_state, validate_state=validate_state,
        ) as state:
            res = TaskEventsResult(next_scroll_id=state.id)
            res.events, res.total_events = self._get_events(
                es_index=es_index,
                batch_size=batch_size,
                navigate_earlier=navigate_earlier,
                state=state,
            )
            return res
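
The cache_manager.get_or_create_state contract is not shown in these examples. A minimal sketch of what it plausibly does, assuming a dict-like cache (hypothetical, for illustration only):

    from contextlib import contextmanager

    @contextmanager
    def get_or_create_state(cache, state_factory, state_id, init_state, validate_state):
        # Load an existing state or build a fresh one, run the matching
        # callback, hand the state to the caller, then persist it on exit.
        state = cache.get(state_id) if state_id else None
        if state is None:
            state = state_factory()
            init_state(state)
        else:
            validate_state(state)
        try:
            yield state
        finally:
            cache[state.id] = state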
Example 11
    def get_task_events(
        self,
        company_id: str,
        task_id: str,
        batch_size: int,
        navigate_earlier: bool = True,
        from_timestamp: Optional[int] = None,
    ) -> TaskEventsResult:
        es_index = EventMetrics.get_index_name(company_id, self.EVENT_TYPE)
        if not self.es.indices.exists(es_index):
            return TaskEventsResult()

        res = TaskEventsResult()
        res.events, res.total_events = self._get_events(
            es_index=es_index,
            task_id=task_id,
            batch_size=batch_size,
            navigate_earlier=navigate_earlier,
            from_timestamp=from_timestamp,
        )
        return res
Example 12
    def get_task_latest_scalar_values(self, company_id, task_id):
        es_index = EventMetrics.get_index_name(company_id,
                                               "training_stats_scalar")

        if not self.es.indices.exists(es_index):
            return {}

        es_req = {
            "size": 0,
            "query": {
                "bool": {
                    "must": [
                        {
                            "query_string": {
                                "query": "value:>0"
                            }
                        },
                        {
                            "term": {
                                "task": task_id
                            }
                        },
                    ]
                }
            },
            "aggs": {
                "metrics": {
                    "terms": {
                        "field": "metric",
                        "size": EventMetrics.MAX_METRICS_COUNT,
                        "order": {
                            "_term": "asc"
                        },
                    },
                    "aggs": {
                        "variants": {
                            "terms": {
                                "field": "variant",
                                "size": EventMetrics.MAX_VARIANTS_COUNT,
                                "order": {
                                    "_term": "asc"
                                },
                            },
                            "aggs": {
                                "last_value": {
                                    "top_hits": {
                                        "docvalue_fields": ["value"],
                                        "_source": "value",
                                        "size": 1,
                                        "sort": [{
                                            "iter": {
                                                "order": "desc"
                                            }
                                        }],
                                    }
                                },
                                "last_timestamp": {
                                    "max": {
                                        "field": "@timestamp"
                                    }
                                },
                                "last_10_value": {
                                    "top_hits": {
                                        "docvalue_fields": ["value"],
                                        "_source": "value",
                                        "size": 10,
                                        "sort": [{
                                            "iter": {
                                                "order": "desc"
                                            }
                                        }],
                                    }
                                },
                            },
                        }
                    },
                }
            },
            "_source": {
                "excludes": []
            },
        }
        with translate_errors_context(), TimingContext(
                "es", "events_get_metrics_and_variants"):
            es_res = self.es.search(index=es_index,
                                    body=es_req,
                                    routing=task_id)

        metrics = []
        max_timestamp = 0
        for metric_bucket in es_res["aggregations"]["metrics"].get("buckets"):
            metric_summary = dict(name=metric_bucket["key"], variants=[])
            for variant_bucket in metric_bucket["variants"].get("buckets"):
                variant_name = variant_bucket["key"]
                last_value = variant_bucket["last_value"]["hits"]["hits"][0][
                    "fields"]["value"][0]
                last_10_value = variant_bucket["last_10_value"]["hits"][
                    "hits"][0]["fields"]["value"][0]
                timestamp = variant_bucket["last_timestamp"]["value"]
                max_timestamp = max(timestamp, max_timestamp)
                metric_summary["variants"].append(
                    dict(
                        name=variant_name,
                        last_value=last_value,
                        last_10_value=last_10_value,
                    ))
            metrics.append(metric_summary)
        return metrics, max_timestamp
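
The per-variant parsing above digs docvalue fields out of a top_hits sub-aggregation; a standalone sketch against a canned bucket of the same shape (values illustrative):

    variant_bucket = {
        "key": "total",
        "last_value": {"hits": {"hits": [{"fields": {"value": [0.42]}}]}},
        "last_timestamp": {"value": 1571000000000},
    }
    last_value = variant_bucket["last_value"]["hits"]["hits"][0]["fields"]["value"][0]
    assert last_value == 0.42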
Example 13
    def get_task_plots(
        self,
        company_id: str,
        tasks: Sequence[str],
        last_iterations_per_plot: int = None,
        sort=None,
        size: int = 500,
        scroll_id: str = None,
    ):
        if scroll_id == self.empty_scroll:
            # stay consistent with the other return paths of this method
            return TaskEventsResult(next_scroll_id=scroll_id)

        if scroll_id:
            with translate_errors_context(), TimingContext("es", "get_task_events"):
                es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
        else:
            event_type = "plot"
            es_index = EventMetrics.get_index_name(company_id, event_type)
            if not self.es.indices.exists(es_index):
                return TaskEventsResult()

            must = []
            if last_iterations_per_plot is None:
                must.append({"terms": {"task": tasks}})
            else:
                should = []
                for i, task_id in enumerate(tasks):
                    last_iters = self.get_last_iterations_per_event_metric_variant(
                        es_index, task_id, last_iterations_per_plot, event_type
                    )
                    if not last_iters:
                        continue

                    for metric, variant, iter in last_iters:
                        should.append(
                            {
                                "bool": {
                                    "must": [
                                        {"term": {"task": task_id}},
                                        {"term": {"metric": metric}},
                                        {"term": {"variant": variant}},
                                        {"term": {"iter": iter}},
                                    ]
                                }
                            }
                        )
                if not should:
                    return TaskEventsResult()
                must.append({"bool": {"should": should}})

            if sort is None:
                sort = [{"timestamp": {"order": "asc"}}]

            es_req = {
                "sort": sort,
                "size": min(size, 10000),
                "query": {"bool": {"must": must}},
            }

            with translate_errors_context(), TimingContext("es", "get_task_plots"):
                es_res = self.es.search(
                    index=es_index, body=es_req, ignore=404, scroll="1h",
                )

        events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
        return TaskEventsResult(
            events=events, next_scroll_id=next_scroll_id, total_events=total_events
        )
Example 14
 def __init__(self, events_es=None):
     self.es = events_es or es_factory.connect("events")
     self._metrics = EventMetrics(self.es)
     self._skip_iteration_for_metric = set(
         config.get("services.events.ignore_iteration.metrics", []))
Example 15
    def add_events(self, company_id, events, worker, allow_locked_tasks=False):
        actions = []
        task_ids = set()
        task_iteration = defaultdict(lambda: 0)
        task_last_events = nested_dict(
            3, dict)  # task_id -> metric_hash -> variant_hash -> MetricEvent

        for event in events:
            # remove spaces from event type
            if "type" not in event:
                raise errors.BadRequest("Event must have a 'type' field",
                                        event=event)

            event_type = event["type"].replace(" ", "_")
            if event_type not in EVENT_TYPES:
                raise errors.BadRequest(
                    "Invalid event type {}".format(event_type),
                    event=event,
                    types=EVENT_TYPES,
                )

            event["type"] = event_type

            # @timestamp indicates the time the event is written, not when it happened
            event["@timestamp"] = es_factory.get_es_timestamp_str()

            # for backward compatibility
            if "ts" in event:
                event["timestamp"] = event.pop("ts")

            # set timestamp and worker if not sent
            if "timestamp" not in event:
                event["timestamp"] = es_factory.get_timestamp_millis()

            if "worker" not in event:
                event["worker"] = worker

            # force iter to be a long int
            iter = event.get("iter")
            if iter is not None:
                iter = int(iter)
                event["iter"] = iter

            # older clients sent "values" to indicate an array; normalize to "value"
            if "values" in event:
                event["value"] = event["values"]
                del event["values"]

            index_name = EventMetrics.get_index_name(company_id, event_type)
            es_action = {
                "_op_type": "index",  # overwrite if exists with same ID
                "_index": index_name,
                "_type": "event",
                "_source": event,
            }

            # for "log" events, don't assing custom _id - whatever is sent, is written (not overwritten)
            if event_type != "log":
                es_action["_id"] = self._get_event_id(event)
            else:
                es_action["_id"] = dbutils.id()

            task_id = event.get("task")
            if task_id is not None:
                es_action["_routing"] = task_id
                task_ids.add(task_id)
                if (iter is not None and event.get("metric")
                        not in self._skip_iteration_for_metric):
                    task_iteration[task_id] = max(iter,
                                                  task_iteration[task_id])

                if event_type == EventType.metrics_scalar.value:
                    self._update_last_metric_event_for_task(
                        task_last_events=task_last_events,
                        task_id=task_id,
                        event=event)
            else:
                # task_id is None here, so the event is indexed without meaningful routing
                es_action["_routing"] = task_id

            actions.append(es_action)

        if task_ids:
            # verify task_ids
            with translate_errors_context(), TimingContext(
                    "mongo", "task_by_ids"):
                extra_msg = None
                query = Q(id__in=task_ids, company=company_id)
                if not allow_locked_tasks:
                    query &= Q(status__nin=LOCKED_TASK_STATUSES)
                    extra_msg = "or task published"
                res = Task.objects(query).only("id")
                if len(res) < len(task_ids):
                    invalid_task_ids = tuple(
                        set(task_ids) - set(r.id for r in res))
                    raise errors.bad_request.InvalidTaskId(
                        extra_msg, company=company_id, ids=invalid_task_ids)

        errors_in_bulk = []
        added = 0
        chunk_size = 500
        with translate_errors_context(), TimingContext("es",
                                                       "events_add_batch"):
            # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
            with closing(
                    helpers.streaming_bulk(
                        self.es,
                        actions,
                        chunk_size=chunk_size,
                        # thread_count=8,
                        refresh=True,
                    )) as it:
                for success, info in it:
                    if success:
                        added += chunk_size
                    else:
                        errors_in_bulk.append(info)

            remaining_tasks = set()
            now = datetime.utcnow()
            for task_id in task_ids:
                # Update related tasks. For performance reasons, we prefer to update
                # all of them and not only those whose events were successful

                updated = self._update_task(
                    company_id=company_id,
                    task_id=task_id,
                    now=now,
                    iter_max=task_iteration.get(task_id),
                    last_events=task_last_events.get(task_id),
                )

                if not updated:
                    remaining_tasks.add(task_id)
                    continue

            if remaining_tasks:
                TaskBLL.set_last_update(remaining_tasks,
                                        company_id,
                                        last_update=now)

        # Compensate for always adding chunk_size on success (last chunk is probably smaller)
        added = min(added, len(actions))

        return added, errors_in_bulk
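
For reference, a minimal scalar event of the shape add_events expects; the field names come from the handling above, the values are illustrative:

    event = {
        "type": "training_stats_scalar",  # must be one of EVENT_TYPES
        "task": "<task-id>",
        "metric": "loss",
        "variant": "total",
        "iter": 100,       # coerced to int above
        "value": 0.123,
        # "timestamp" and "worker" are filled in when missing;
        # "@timestamp" is always overwritten with the write time
    }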
Example 16
    def get_task_events(
        self,
        company_id,
        task_id,
        event_type=None,
        metric=None,
        variant=None,
        last_iter_count=None,
        sort=None,
        size=500,
        scroll_id=None,
    ):

        if scroll_id:
            with translate_errors_context(), TimingContext(
                    "es", "get_task_events"):
                es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
        else:
            task_ids = [task_id] if isinstance(task_id,
                                               six.string_types) else task_id
            if event_type is None:
                event_type = "*"

            es_index = EventMetrics.get_index_name(company_id, event_type)
            if not self.es.indices.exists(es_index):
                return TaskEventsResult()

            query = {"bool": defaultdict(list)}

            if metric or variant:
                must = query["bool"]["must"]
                if metric:
                    must.append({"term": {"metric": metric}})
                if variant:
                    must.append({"term": {"variant": variant}})

            if last_iter_count is None:
                must = query["bool"]["must"]
                must.append({"terms": {"task": task_ids}})
            else:
                should = query["bool"]["should"]
                for i, task_id in enumerate(task_ids):
                    last_iters = self.get_last_iters(es_index, task_id,
                                                     event_type,
                                                     last_iter_count)
                    if not last_iters:
                        continue
                    should.append({
                        "bool": {
                            "must": [
                                {
                                    "term": {
                                        "task": task_id
                                    }
                                },
                                {
                                    "terms": {
                                        "iter": last_iters
                                    }
                                },
                            ]
                        }
                    })
                if not should:
                    return TaskEventsResult()

            if sort is None:
                sort = [{"timestamp": {"order": "asc"}}]

            es_req = {"sort": sort, "size": min(size, 10000), "query": query}

            routing = ",".join(task_ids)

            with translate_errors_context(), TimingContext(
                    "es", "get_task_events"):
                es_res = self.es.search(
                    index=es_index,
                    body=es_req,
                    ignore=404,
                    routing=routing,
                    scroll="1h",
                )

        events = [
            doc["_source"] for doc in es_res.get("hits", {}).get("hits", [])
        ]
        next_scroll_id = es_res["_scroll_id"]
        total_events = es_res["hits"]["total"]

        return TaskEventsResult(events=events,
                                next_scroll_id=next_scroll_id,
                                total_events=total_events)
Example 17
    def get_task_plots(
        self,
        company_id: str,
        tasks: Sequence[str],
        last_iterations_per_plot: int = None,
        sort=None,
        size: int = 500,
        scroll_id: str = None,
    ):
        if scroll_id:
            with translate_errors_context(), TimingContext(
                    "es", "get_task_events"):
                es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
        else:
            event_type = "plot"
            es_index = EventMetrics.get_index_name(company_id, event_type)
            if not self.es.indices.exists(es_index):
                return TaskEventsResult()

            query = {"bool": defaultdict(list)}

            if last_iterations_per_plot is None:
                must = query["bool"]["must"]
                must.append({"terms": {"task": tasks}})
            else:
                should = query["bool"]["should"]
                for i, task_id in enumerate(tasks):
                    last_iters = self.get_last_iterations_per_event_metric_variant(
                        es_index, task_id, last_iterations_per_plot,
                        event_type)
                    if not last_iters:
                        continue

                    for metric, variant, iter in last_iters:
                        should.append({
                            "bool": {
                                "must": [
                                    {
                                        "term": {
                                            "task": task_id
                                        }
                                    },
                                    {
                                        "term": {
                                            "metric": metric
                                        }
                                    },
                                    {
                                        "term": {
                                            "variant": variant
                                        }
                                    },
                                    {
                                        "term": {
                                            "iter": iter
                                        }
                                    },
                                ]
                            }
                        })
                if not should:
                    return TaskEventsResult()

            if sort is None:
                sort = [{"timestamp": {"order": "asc"}}]

            es_req = {"sort": sort, "size": min(size, 10000), "query": query}

            routing = ",".join(tasks)

            with translate_errors_context(), TimingContext(
                    "es", "get_task_plots"):
                es_res = self.es.search(
                    index=es_index,
                    body=es_req,
                    ignore=404,
                    routing=routing,
                    scroll="1h",
                )

        events = [
            doc["_source"] for doc in es_res.get("hits", {}).get("hits", [])
        ]
        # scroll id may be missing when querying a totally empty DB
        next_scroll_id = es_res.get("_scroll_id")
        total_events = es_res["hits"]["total"]

        return TaskEventsResult(events=events,
                                next_scroll_id=next_scroll_id,
                                total_events=total_events)
Example 18
    def add_events(self,
                   company_id,
                   events,
                   worker,
                   allow_locked_tasks=False) -> Tuple[int, int, dict]:
        actions = []
        task_ids = set()
        task_iteration = defaultdict(lambda: 0)
        task_last_scalar_events = nested_dict(
            3, dict)  # task_id -> metric_hash -> variant_hash -> MetricEvent
        task_last_events = nested_dict(
            3, dict)  # task_id -> metric_hash -> event_type -> MetricEvent
        errors_per_type = defaultdict(int)
        valid_tasks = self._get_valid_tasks(
            company_id,
            task_ids={
                event["task"]
                for event in events if event.get("task") is not None
            },
            allow_locked_tasks=allow_locked_tasks,
        )
        for event in events:
            # remove spaces from event type
            event_type = event.get("type")
            if event_type is None:
                errors_per_type["Event must have a 'type' field"] += 1
                continue

            event_type = event_type.replace(" ", "_")
            if event_type not in EVENT_TYPES:
                errors_per_type[f"Invalid event type {event_type}"] += 1
                continue

            task_id = event.get("task")
            if task_id is None:
                errors_per_type["Event must have a 'task' field"] += 1
                continue

            if task_id not in valid_tasks:
                errors_per_type["Invalid task id"] += 1
                continue

            event["type"] = event_type

            # @timestamp indicates the time the event is written, not when it happened
            event["@timestamp"] = es_factory.get_es_timestamp_str()

            # for backward compatibility
            if "ts" in event:
                event["timestamp"] = event.pop("ts")

            # set timestamp and worker if not sent
            if "timestamp" not in event:
                event["timestamp"] = es_factory.get_timestamp_millis()

            if "worker" not in event:
                event["worker"] = worker

            # force iter to be a long int
            iter = event.get("iter")
            if iter is not None:
                iter = int(iter)
                event["iter"] = iter

            # older clients sent "values" to indicate an array; normalize to "value"
            if "values" in event:
                event["value"] = event["values"]
                del event["values"]

            event["metric"] = event.get("metric") or ""
            event["variant"] = event.get("variant") or ""

            index_name = EventMetrics.get_index_name(company_id, event_type)
            es_action = {
                "_op_type": "index",  # overwrite if exists with same ID
                "_index": index_name,
                "_type": "event",
                "_source": event,
            }

            # for "log" events, don't assing custom _id - whatever is sent, is written (not overwritten)
            if event_type != "log":
                es_action["_id"] = self._get_event_id(event)
            else:
                es_action["_id"] = dbutils.id()

            es_action["_routing"] = task_id
            task_ids.add(task_id)
            if (iter is not None and event.get("metric")
                    not in self._skip_iteration_for_metric):
                task_iteration[task_id] = max(iter, task_iteration[task_id])

            self._update_last_metric_events_for_task(
                last_events=task_last_events[task_id],
                event=event,
            )
            if event_type == EventType.metrics_scalar.value:
                self._update_last_scalar_events_for_task(
                    last_events=task_last_scalar_events[task_id], event=event)

            actions.append(es_action)

        added = 0
        if actions:
            chunk_size = 500
            with translate_errors_context(), TimingContext(
                    "es", "events_add_batch"):
                # TODO: replace it with helpers.parallel_bulk in the future once the parallel pool leak is fixed
                with closing(
                        helpers.streaming_bulk(
                            self.es,
                            actions,
                            chunk_size=chunk_size,
                            # thread_count=8,
                            refresh=True,
                        )) as it:
                    for success, info in it:
                        if success:
                            added += chunk_size
                        else:
                            errors_per_type[
                                "Error when indexing events batch"] += 1

                remaining_tasks = set()
                now = datetime.utcnow()
                for task_id in task_ids:
                    # Update related tasks. For performance reasons, we prefer to update
                    # all of them and not only those whose events were successful
                    updated = self._update_task(
                        company_id=company_id,
                        task_id=task_id,
                        now=now,
                        iter_max=task_iteration.get(task_id),
                        last_scalar_events=task_last_scalar_events.get(
                            task_id),
                        last_events=task_last_events.get(task_id),
                    )

                    if not updated:
                        remaining_tasks.add(task_id)
                        continue

                if remaining_tasks:
                    TaskBLL.set_last_update(remaining_tasks,
                                            company_id,
                                            last_update=now)

        # Compensate for always adding chunk_size on success (last chunk is probably smaller)
        added = min(added, len(actions))

        if not added:
            raise errors.bad_request.EventsNotAdded(**errors_per_type)

        errors_count = sum(errors_per_type.values())
        return added, errors_count, errors_per_type
Example 19
 def __init__(self, events_es=None):
     self.es = events_es or es_factory.connect("events")
     self._metrics = EventMetrics(self.es)
Example 20
    def get_task_events(
        self,
        company_id,
        task_id,
        event_type=None,
        metric=None,
        variant=None,
        last_iter_count=None,
        sort=None,
        size=500,
        scroll_id=None,
    ):
        if scroll_id == self.empty_scroll:
            # stay consistent with the other return paths of this method
            return TaskEventsResult(next_scroll_id=scroll_id)

        if scroll_id:
            with translate_errors_context(), TimingContext("es", "get_task_events"):
                es_res = self.es.scroll(scroll_id=scroll_id, scroll="1h")
        else:
            task_ids = [task_id] if isinstance(task_id, six.string_types) else task_id
            if event_type is None:
                event_type = "*"

            es_index = EventMetrics.get_index_name(company_id, event_type)
            if not self.es.indices.exists(es_index):
                return TaskEventsResult()

            must = []
            if metric:
                must.append({"term": {"metric": metric}})
            if variant:
                must.append({"term": {"variant": variant}})

            if last_iter_count is None:
                must.append({"terms": {"task": task_ids}})
            else:
                should = []
                for i, task_id in enumerate(task_ids):
                    last_iters = self.get_last_iters(
                        es_index, task_id, event_type, last_iter_count
                    )
                    if not last_iters:
                        continue
                    should.append(
                        {
                            "bool": {
                                "must": [
                                    {"term": {"task": task_id}},
                                    {"terms": {"iter": last_iters}},
                                ]
                            }
                        }
                    )
                if not should:
                    return TaskEventsResult()
                must.append({"bool": {"should": should}})

            if sort is None:
                sort = [{"timestamp": {"order": "asc"}}]

            es_req = {
                "sort": sort,
                "size": min(size, 10000),
                "query": {"bool": {"must": must}},
            }

            with translate_errors_context(), TimingContext("es", "get_task_events"):
                es_res = self.es.search(
                    index=es_index, body=es_req, ignore=404, scroll="1h",
                )

        events, total_events, next_scroll_id = self._get_events_from_es_res(es_res)
        return TaskEventsResult(
            events=events, next_scroll_id=next_scroll_id, total_events=total_events
        )
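
For a concrete sense of the request body this builds, here is the query produced by a call with metric="loss", last_iter_count=None, and a single task (values illustrative):

    es_req = {
        "sort": [{"timestamp": {"order": "asc"}}],
        "size": 500,
        "query": {
            "bool": {
                "must": [
                    {"term": {"metric": "loss"}},
                    {"terms": {"task": ["<task-id>"]}},
                ]
            }
        },
    }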