def test_join_query(self) -> None:
    ev = Entity("events", "ev")
    gm = Entity("groupedmessage", "gm")
    join = Join([Relationship(ev, "grouped", gm)])
    query = (
        Query("discover", join)
        .set_select([
            Column("group_id", ev),
            Column("status", gm),
            Function("avg", [Column("retention_days", ev)], "avg"),
        ])
        .set_groupby([Column("group_id", ev), Column("status", gm)])
        .set_where([
            Condition(Column("project_id", ev), Op.EQ, self.project_id),
            Condition(Column("project_id", gm), Op.EQ, self.project_id),
            Condition(Column("timestamp", ev), Op.GTE, self.base_time),
            Condition(Column("timestamp", ev), Op.LT, self.next_time),
        ])
    )

    response = self.post("/discover/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200
    assert data["data"] == []
def expected_query(match, select, extra_groupby, metric_name):
    function, column, alias = select
    return Query(
        dataset="metrics",
        match=Entity(match),
        select=[
            Function(
                OP_TO_SNUBA_FUNCTION[match][alias],
                [
                    Column("value"),
                    Function("equals", [Column("metric_id"), resolve_weak(metric_name)]),
                ],
                alias=f"{alias}({metric_name})",
            )
        ],
        groupby=[Column("tags[8]"), Column("tags[2]")] + extra_groupby,
        where=[
            Condition(Column("org_id"), Op.EQ, 1),
            Condition(Column("project_id"), Op.IN, [1]),
            Condition(Column("timestamp"), Op.GTE, datetime(2021, 5, 28, 0, tzinfo=pytz.utc)),
            Condition(Column("timestamp"), Op.LT, datetime(2021, 8, 26, 0, tzinfo=pytz.utc)),
            Condition(Column("tags[6]"), Op.IN, [10]),
            Condition(Column("metric_id"), Op.IN, [resolve_weak(metric_name)]),
        ],
        limit=Limit(MAX_POINTS),
        offset=Offset(0),
        granularity=Granularity(query_definition.rollup),
    )
def expected_query(match, select, extra_groupby):
    function, column, alias = select
    return Query(
        dataset="metrics",
        match=Entity(match),
        select=[Function(function, [Column(column)], alias)],
        groupby=[
            Column("metric_id"),
            Column("tags[8]"),
            Column("tags[2]"),
        ] + extra_groupby,
        where=[
            Condition(Column("org_id"), Op.EQ, 1),
            Condition(Column("project_id"), Op.IN, [1]),
            Condition(Column("metric_id"), Op.IN, [9, 11, 7]),
            Condition(Column("timestamp"), Op.GTE, datetime(2021, 5, 28, 0, tzinfo=pytz.utc)),
            Condition(Column("timestamp"), Op.LT, datetime(2021, 8, 26, 0, tzinfo=pytz.utc)),
            Condition(Column("tags[6]"), Op.IN, [10]),
        ],
        limit=Limit(MAX_POINTS),
        offset=Offset(0),
        granularity=Granularity(query_definition.rollup),
    )
def test_sub_query(self) -> None:
    inner_query = (
        Query("discover", Entity("discover_events"))
        .set_select([Function("count", [], "count")])
        .set_groupby([Column("project_id"), Column("tags[custom_tag]")])
        .set_where([
            Condition(Column("type"), Op.NEQ, "transaction"),
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
        ])
    )

    query = (
        Query("discover", inner_query)
        .set_select([Function("avg", [Column("count")], "avg_count")])
        .set_orderby([OrderBy(Function("avg", [Column("count")], "avg_count"), Direction.ASC)])
        .set_limit(1000)
    )

    response = self.post("/discover/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200, data
    assert data["data"] == [{"avg_count": 1.0}]
def run_metrics_query(
    *,
    entity_key: EntityKey,
    select: List[Column],
    where: List[Condition],
    groupby: List[Column],
    projects: Sequence[Project],
    org_id: int,
    referrer: str,
) -> Mapping[str, Any]:
    # Round timestamp to minute to get cache efficiency:
    now = datetime.now().replace(second=0, microsecond=0)

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(entity_key.value),
        select=select,
        groupby=groupby,
        where=[
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.IN, [p.id for p in projects]),
            Condition(Column(TS_COL_QUERY), Op.GTE, now - timedelta(hours=24)),
            Condition(Column(TS_COL_QUERY), Op.LT, now),
        ]
        + where,
        granularity=Granularity(GRANULARITY),
    )
    result = raw_snql_query(query, referrer, use_cache=True)
    return result["data"]
def _get_data(entity_key: EntityKey, metric_name: str) -> Tuple[int, int]:
    total = 0
    crashed = 0
    metric_id = try_get_string_index(org_id, metric_name)
    if metric_id is not None:
        where = conditions + [
            Condition(Column("metric_id"), Op.EQ, metric_id),
            Condition(Column("timestamp"), Op.LT, end),
        ]
        data = raw_snql_query(
            Query(
                dataset=Dataset.Metrics.value,
                match=Entity(entity_key.value),
                select=[Column("value")],
                where=where,
                groupby=[Column(status_key)],
            ),
            referrer="release_health.metrics.crash-free-breakdown.session",
        )["data"]
        for row in data:
            if row[status_key] == status_init:
                total = int(row["value"])
            elif row[status_key] == status_crashed:
                crashed = int(row["value"])
    return total, crashed
def _build_totals_and_series_queries(
    entity, select, where, groupby, orderby, limit, offset, rollup, intervals_len
):
    totals_query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(entity),
        groupby=groupby,
        select=select,
        where=where,
        limit=Limit(limit or MAX_POINTS),
        offset=Offset(offset or 0),
        granularity=Granularity(rollup),
        orderby=orderby,
    )
    series_query = totals_query.set_groupby(
        (totals_query.groupby or []) + [Column(TS_COL_GROUP)]
    )

    # In a series query, we also need to factor in the len of the intervals array
    series_limit = MAX_POINTS
    if limit:
        series_limit = limit * intervals_len
    series_query = series_query.set_limit(series_limit)

    return {"totals": totals_query, "series": series_query}
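# A minimal sketch (not part of the original module) of how the totals/series
# pair built above might be consumed: run each query separately and keep the
# results side by side. The argument values and the referrer string are
# assumptions for illustration only.
queries = _build_totals_and_series_queries(
    entity="metrics_counters",
    select=[Function("sum", [Column("value")], "value")],
    where=[Condition(Column("org_id"), Op.EQ, 1)],
    groupby=[Column("metric_id")],
    orderby=None,
    limit=100,
    offset=0,
    rollup=3600,
    intervals_len=24,
)
results = {
    key: raw_snql_query(query, referrer="example.totals_and_series", use_cache=False)["data"]
    for key, query in queries.items()
}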
def monitor_release_adoption(**kwargs):
    metrics.incr("sentry.tasks.monitor_release_adoption.start", sample_rate=1.0)
    # 1. Query snuba for all project ids that have sessions.
    with metrics.timer(
        "sentry.tasks.monitor_release_adoption.aggregate_projects.loop", sample_rate=1.0
    ):
        aggregated_projects = defaultdict(list)
        start_time = time.time()
        offset = 0
        while (time.time() - start_time) < MAX_SECONDS:
            query = (
                Query(
                    dataset="sessions",
                    match=Entity("org_sessions"),
                    select=[
                        Column("org_id"),
                        Column("project_id"),
                    ],
                    groupby=[Column("org_id"), Column("project_id")],
                    where=[
                        Condition(
                            Column("started"), Op.GTE, datetime.utcnow() - timedelta(hours=6)
                        ),
                        Condition(Column("started"), Op.LT, datetime.utcnow()),
                    ],
                    granularity=Granularity(3600),
                    orderby=[
                        OrderBy(Column("org_id"), Direction.ASC),
                        OrderBy(Column("project_id"), Direction.ASC),
                    ],
                )
                .set_limit(CHUNK_SIZE + 1)
                .set_offset(offset)
            )
            data = snuba.raw_snql_query(query, referrer="tasks.monitor_release_adoption")["data"]
            count = len(data)
            more_results = count > CHUNK_SIZE
            offset += CHUNK_SIZE

            if more_results:
                data = data[:-1]

            for row in data:
                aggregated_projects[row["org_id"]].append(row["project_id"])

            if not more_results:
                break
        else:
            logger.info(
                "monitor_release_adoption.loop_timeout",
                sample_rate=1.0,
                extra={"offset": offset},
            )

    with metrics.timer(
        "sentry.tasks.monitor_release_adoption.process_projects_with_sessions", sample_rate=1.0
    ):
        for org_id in aggregated_projects:
            process_projects_with_sessions.delay(org_id, aggregated_projects[org_id])
def test_invalid_time_conditions(self) -> None:
    query = (
        Query("events", Entity("events"))
        .set_select([Function("count", [], "count")])
        .set_where([
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.next_time),
            Condition(Column("timestamp"), Op.LT, self.base_time),
        ])
    )

    response = self.post("/events/snql", data=query.snuba())
    resp = json.loads(response.data)
    assert response.status_code == 400, resp
def test_escape_edge_cases(self) -> None:
    query = (
        Query("events", Entity("events"))
        .set_select([Function("count", [], "times_seen")])
        .set_where([
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
            Condition(Column("environment"), Op.EQ, "\\' \n \\n \\"),
        ])
    )

    response = self.post("/events/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200, data
def get_changed_project_release_model_adoptions(
    self,
    project_ids: Sequence[ProjectId],
) -> Sequence[ProjectRelease]:
    now = datetime.now(pytz.utc)
    start = now - timedelta(days=3)

    projects_ids = list(project_ids)

    if len(projects_ids) == 0:
        return []

    org_id = self._get_org_id(project_ids)
    release_column_name = tag_key(org_id, "release")

    query_cols = [Column("project_id"), Column(release_column_name)]
    group_by = query_cols

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
    ]

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=query_cols,
        where=where_clause,
        groupby=group_by,
    )
    result = raw_snql_query(
        query,
        referrer="release_health.metrics.get_changed_project_release_model_adoptions",
        use_cache=False,
    )

    def extract_row_info(row: Mapping[str, Union[OrganizationId, str]]) -> ProjectRelease:
        return row.get("project_id"), reverse_tag_value(
            org_id, row.get(release_column_name)
        )  # type: ignore

    return [extract_row_info(row) for row in result["data"]]
def test_build_snuba_query_orderby(mock_now, mock_now2, mock_indexer):
    mock_indexer.resolve = MockIndexer().resolve
    query_params = MultiValueDict({
        "query": ["release:staging"],  # weird release, but we need a string existing in the mock indexer
        "groupBy": ["session.status", "environment"],
        "field": [
            "sum(sentry.sessions.session)",
        ],
        "orderBy": ["-sum(sentry.sessions.session)"],
        "limit": [3],
    })
    query_definition = QueryDefinition(query_params)

    snuba_queries = SnubaQueryBuilder([PseudoProject(1, 1)], query_definition).get_snuba_queries()

    counter_queries = snuba_queries.pop("metrics_counters")
    assert not snuba_queries
    assert counter_queries["series"] is None  # No series because of orderBy

    assert counter_queries["totals"] == Query(
        dataset="metrics",
        match=Entity("metrics_counters"),
        select=[Function("sum", [Column("value")], "value")],
        groupby=[
            Column("metric_id"),
            Column("tags[8]"),
            Column("tags[2]"),
        ],
        where=[
            Condition(Column("org_id"), Op.EQ, 1),
            Condition(Column("project_id"), Op.IN, [1]),
            Condition(Column("metric_id"), Op.IN, [9]),
            Condition(Column("timestamp"), Op.GTE, datetime(2021, 5, 28, 0, tzinfo=pytz.utc)),
            Condition(Column("timestamp"), Op.LT, datetime(2021, 8, 26, 0, tzinfo=pytz.utc)),
            Condition(Column("tags[6]", entity=None), Op.IN, [10]),
        ],
        orderby=[OrderBy(Column("value"), Direction.DESC)],
        limit=Limit(3),
        offset=Offset(0),
        granularity=Granularity(query_definition.rollup),
    )
def _get_snuba_query(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_id: int,
    columns: Sequence[str],
    series: bool,
    extra_conditions: List[Condition],
    remove_groupby: Set[Column],
) -> Query:
    """Build the snuba query"""
    conditions = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, query.filter_keys["project_id"]),
        Condition(Column("metric_id"), Op.EQ, metric_id),
        Condition(Column(TS_COL_QUERY), Op.GTE, query.start),
        Condition(Column(TS_COL_QUERY), Op.LT, query.end),
    ]
    conditions += _get_filter_conditions(org_id, query.conditions)
    conditions += extra_conditions

    groupby_tags = [field for field in query.raw_groupby if field != "project"]

    tag_keys = {field: _resolve(field) for field in groupby_tags}
    groupby = {
        field: Column(f"tags[{tag_id}]")
        for field, tag_id in tag_keys.items()
        if tag_id is not None  # exclude unresolved keys from groupby
    }

    if "project" in query.raw_groupby:
        groupby["project"] = Column("project_id")

    full_groupby = set(groupby.values()) - remove_groupby
    if series:
        full_groupby.add(Column(TS_COL_GROUP))

    return Query(
        dataset=Dataset.Metrics.value,
        match=Entity(entity_key.value),
        select=[Column(column) for column in columns],
        groupby=list(full_groupby),
        where=conditions,
        granularity=Granularity(query.rollup),
    )
def test_attribution_tags(self) -> None:
    query = (
        Query("events", Entity("events"))
        .set_select([Function("count", [], "count")])
        .set_where([
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
        ])
        .set_team("sns")
        .set_feature("test")
    )

    response = self.post("/events/snql", data=query.snuba())
    resp = json.loads(response.data)
    assert response.status_code == 200, resp

    metric_calls = get_recorded_metric_calls("increment", "snuba.attribution.log")
    assert metric_calls is not None
    assert len(metric_calls) == 1
    assert metric_calls[0].value > 0
    assert metric_calls[0].tags["app_id"] == "default"
def check_releases_have_health_data(
    self,
    organization_id: OrganizationId,
    project_ids: Sequence[ProjectId],
    release_versions: Sequence[ReleaseName],
    start: datetime,
    end: datetime,
) -> Set[ReleaseName]:
    release_column_name = tag_key(organization_id, "release")
    releases_ids = [
        release_id
        for release_id in [
            try_get_string_index(organization_id, release) for release in release_versions
        ]
        if release_id is not None
    ]

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=[Column(release_column_name)],
        where=[
            Condition(Column("org_id"), Op.EQ, organization_id),
            Condition(Column("project_id"), Op.IN, project_ids),
            Condition(Column("metric_id"), Op.EQ, metric_id(organization_id, "session")),
            Condition(Column(release_column_name), Op.IN, releases_ids),
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, end),
        ],
        groupby=[Column(release_column_name)],
    )

    result = raw_snql_query(
        query,
        referrer="release_health.metrics.check_releases_have_health_data",
        use_cache=False,
    )

    def extract_row_info(row: Mapping[str, Union[OrganizationId, str]]) -> ReleaseName:
        return reverse_tag_value(organization_id, row.get(release_column_name))  # type: ignore

    return {extract_row_info(row) for row in result["data"]}
def _get_crash_free_rate_data(
    org_id: int,
    project_ids: Sequence[int],
    start: datetime,
    end: datetime,
    rollup: int,
) -> Dict[int, Dict[str, float]]:
    data: Dict[int, Dict[str, float]] = {}

    session_status = tag_key(org_id, "session.status")

    count_query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsCounters.value),
        select=[Column("value")],
        where=[
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.IN, project_ids),
            Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
            Condition(Column("timestamp"), Op.GTE, start),
            Condition(Column("timestamp"), Op.LT, end),
        ],
        groupby=[
            Column("project_id"),
            Column(session_status),
        ],
        granularity=Granularity(rollup),
    )

    count_data = raw_snql_query(
        count_query, referrer="release_health.metrics.get_crash_free_data", use_cache=False
    )["data"]

    for row in count_data:
        project_data = data.setdefault(row["project_id"], {})
        tag_value = reverse_tag_value(org_id, row[session_status])
        project_data[tag_value] = row["value"]

    return data
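# Hypothetical follow-up (not part of the original module): a minimal sketch of
# how the per-project counts returned above could be turned into a crash-free
# percentage, assuming the "init" and "crashed" session.status tag values.
def _compute_crash_free_rate(project_data: Dict[str, float]) -> Optional[float]:
    total = project_data.get("init", 0)
    crashed = project_data.get("crashed", 0)
    if total == 0:
        # No sessions at all: the rate is undefined rather than 100%.
        return None
    return 100 * (1 - crashed / total)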
def _count_users(total: bool, referrer: str) -> Dict[Any, int]:
    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsSets.value),
        select=[Column("value")],
        where=_get_common_where(total)
        + [
            Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "user")),
        ],
        groupby=_get_common_groupby(total),
    )

    return _convert_results(
        raw_snql_query(
            query,
            referrer=referrer,
            use_cache=False,
        )["data"],
        total=total,
    )
def test_tags_in_groupby(self) -> None:
    query = (
        Query("events", Entity("events"))
        .set_select([
            Function("count", [], "times_seen"),
            Function("min", [Column("timestamp")], "first_seen"),
            Function("max", [Column("timestamp")], "last_seen"),
        ])
        .set_groupby([Column("tags[k8s-app]")])
        .set_where([
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
            Condition(Column("tags[k8s-app]"), Op.NEQ, ""),
            Condition(Column("type"), Op.NEQ, "transaction"),
        ])
        .set_orderby([
            OrderBy(
                Function("max", [Column("timestamp")], "last_seen"),
                Direction.DESC,
            )
        ])
        .set_limit(1000)
    )

    response = self.post("/events/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200, data
def test_sessions_query(self) -> None:
    query = (
        Query("sessions", Entity("sessions"))
        .set_select([Column("project_id"), Column("release")])
        .set_groupby([Column("project_id"), Column("release")])
        .set_where([
            Condition(Column("project_id"), Op.IN, [self.project_id]),
            Condition(Column("org_id"), Op.EQ, self.org_id),
            Condition(
                Column("started"),
                Op.GTE,
                datetime(2021, 1, 1, 17, 5, 59, 554860),
            ),
            Condition(Column("started"), Op.LT, datetime(2022, 1, 1, 17, 6, 0, 554981)),
        ])
        .set_orderby([OrderBy(Column("sessions"), Direction.DESC)])
        .set_limit(100)
    )

    response = self.post("/sessions/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200
    assert data["data"] == []
def test_arrayjoin(self) -> None:
    query = (
        Query("events", Entity("events"))
        .set_select([
            Function("count", [], "times_seen"),
            Function("min", [Column("timestamp")], "first_seen"),
            Function("max", [Column("timestamp")], "last_seen"),
        ])
        .set_groupby([Column("exception_frames.filename")])
        .set_array_join([Column("exception_frames.filename")])
        .set_where([
            Condition(Column("exception_frames.filename"), Op.LIKE, "%.java"),
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
        ])
        .set_orderby([
            OrderBy(
                Function("max", [Column("timestamp")], "last_seen"),
                Direction.DESC,
            )
        ])
        .set_limit(1000)
    )

    response = self.post("/events/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200, data
    assert len(data["data"]) == 6
def test_simple_query(self) -> None:
    query = (
        Query("discover", Entity("discover_events"))
        .set_select([Function("count", [], "count")])
        .set_groupby([Column("project_id"), Column("tags[custom_tag]")])
        .set_where([
            Condition(Column("type"), Op.NEQ, "transaction"),
            Condition(Column("project_id"), Op.EQ, self.project_id),
            Condition(Column("timestamp"), Op.GTE, self.base_time),
            Condition(Column("timestamp"), Op.LT, self.next_time),
        ])
        .set_orderby([OrderBy(Function("count", [], "count"), Direction.ASC)])
        .set_limit(1000)
        .set_consistent(True)
        .set_debug(True)
    )

    response = self.post("/discover/snql", data=query.snuba())
    data = json.loads(response.data)
    assert response.status_code == 200, data
    assert data["stats"]["consistent"]
    assert data["data"] == [{
        "count": 1,
        "tags[custom_tag]": "custom_value",
        "project_id": self.project_id,
    }]
def _build_queries_for_entity(self, query_definition, entity, fields, where, groupby):
    totals_query = Query(
        dataset="metrics",
        match=Entity(entity),
        groupby=groupby,
        select=list(
            map(
                Column,
                {_OP_TO_FIELD[entity][op] for op, _ in fields},
            )
        ),
        where=where,
        limit=Limit(MAX_POINTS),
        offset=Offset(0),
        granularity=Granularity(query_definition.rollup),
    )
    series_query = totals_query.set_groupby(
        (totals_query.groupby or []) + [Column(TS_COL_GROUP)]
    )

    return {
        "totals": totals_query,
        "series": series_query,
    }
def _build_queries_for_entity(self, query_definition, entity, fields, where, groupby):
    totals_query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(entity),
        groupby=groupby,
        select=list(self._build_select(entity, fields)),
        where=where,
        limit=Limit(query_definition.limit or MAX_POINTS),
        offset=Offset(0),
        granularity=Granularity(query_definition.rollup),
        orderby=self._build_orderby(query_definition, entity),
    )

    if totals_query.orderby is None:
        series_query = totals_query.set_groupby(
            (totals_query.groupby or []) + [Column(TS_COL_GROUP)]
        )
    else:
        series_query = None

    return {
        "totals": totals_query,
        "series": series_query,
    }
def test_suspect_spans_lambdas(self) -> None:
    query = (
        Query("discover", Entity("discover_transactions"))
        .set_select([
            Column("spans.op"),
            Column("spans.group"),
            Function(
                "arrayReduce",
                [
                    "sumIf",
                    Column("spans.exclusive_time_32"),
                    Function(
                        "arrayMap",
                        [
                            Lambda(
                                ["x", "y"],
                                Function(
                                    "if",
                                    [
                                        Function(
                                            "equals",
                                            [
                                                Function(
                                                    "and",
                                                    [
                                                        Function("equals", [Identifier("x"), "db"]),
                                                        Function(
                                                            "equals",
                                                            [Identifier("y"), "05029609156d8133"],
                                                        ),
                                                    ],
                                                ),
                                                1,
                                            ],
                                        ),
                                        1,
                                        0,
                                    ],
                                ),
                            ),
                            Column("spans.op"),
                            Column("spans.group"),
                        ],
                    ),
                ],
                "array_spans_exclusive_time",
            ),
        ])
        .set_where([
            Condition(Column("transaction_name"), Op.EQ, "/api/do_things"),
            Condition(Function("has", [Column("spans.op"), "db"]), Op.EQ, 1),
            Condition(
                Function("has", [Column("spans.group"), "05029609156d8133"]),
                Op.EQ,
                1,
            ),
            Condition(Column("duration"), Op.LT, 900000.0),
            Condition(Column("finish_ts"), Op.GTE, self.base_time),
            Condition(Column("finish_ts"), Op.LT, self.next_time),
            Condition(Column("project_id"), Op.IN, (self.project_id,)),
        ])
        .set_orderby([OrderBy(Column("array_spans_exclusive_time"), Direction.DESC)])
        .set_limit(10)
    )

    response = self.post("/discover/snql", data=query.snuba())
    resp = json.loads(response.data)
    assert response.status_code == 200, resp
    data = resp["data"]
    assert len(data) == 1
    assert data[0]["array_spans_exclusive_time"] > 0
def get_oldest_health_data_for_releases(
    self,
    project_releases: Sequence[ProjectRelease],
) -> Mapping[ProjectRelease, str]:
    now = datetime.now(pytz.utc)
    start = now - timedelta(days=90)

    project_ids: List[ProjectId] = [x[0] for x in project_releases]
    org_id = self._get_org_id(project_ids)
    release_column_name = tag_key(org_id, "release")
    releases = [x[1] for x in project_releases]
    releases_ids = [
        release_id
        for release_id in [try_get_string_index(org_id, release) for release in releases]
        if release_id is not None
    ]

    query_cols = [
        Column("project_id"),
        Column(release_column_name),
        Function("min", [Column("bucketed_time")], "oldest"),
    ]
    group_by = [
        Column("project_id"),
        Column(release_column_name),
    ]

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
        Condition(Column(release_column_name), Op.IN, releases_ids),
    ]

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity("metrics_counters"),
        select=query_cols,
        where=where_clause,
        groupby=group_by,
        granularity=Granularity(3600),
    )
    rows = raw_snql_query(
        query,
        referrer="release_health.metrics.get_oldest_health_data_for_releases",
        use_cache=False,
    )["data"]

    result = {}

    for row in rows:
        result[
            row["project_id"], reverse_tag_value(org_id, row[release_column_name])
        ] = row["oldest"]

    return result
    Limit,
    LimitBy,
    Offset,
    Op,
    OrderBy,
    Query,
)
from snuba_sdk.query_validation import InvalidMatchError
from snuba_sdk.query_visitors import InvalidQueryError

NOW = datetime(2021, 1, 2, 3, 4, 5, 6, timezone.utc)
tests = [
    pytest.param(
        Query(
            dataset="discover",
            match=Entity("events"),
            select=[Column("event_id")],
            groupby=None,
            where=[Condition(Column("timestamp"), Op.GT, NOW)],
            limit=Limit(10),
            offset=Offset(1),
            granularity=Granularity(3600),
        ),
        id="basic query",
    ),
    pytest.param(
        Query(
            dataset="discover",
            match=Entity("events", "ev", 0.2),
            select=[
                Column("title"),
def sum_sessions_and_releases(org_id, project_ids):
    # Takes a single org id and a list of project ids.
    # Returns counts of releases and sessions across all environments and passed
    # project_ids for the last 6 hours.
    start_time = time.time()
    offset = 0
    totals = defaultdict(dict)
    with metrics.timer(
        "sentry.tasks.monitor_release_adoption.process_projects_with_sessions.loop"
    ):
        while (time.time() - start_time) < MAX_SECONDS:
            with metrics.timer(
                "sentry.tasks.monitor_release_adoption.process_projects_with_sessions.query"
            ):
                query = (
                    Query(
                        dataset="sessions",
                        match=Entity("sessions"),
                        select=[
                            Column("sessions"),
                        ],
                        groupby=[
                            Column("org_id"),
                            Column("project_id"),
                            Column("release"),
                            Column("environment"),
                        ],
                        where=[
                            Condition(Column("started"), Op.GTE, datetime.utcnow() - timedelta(hours=6)),
                            Condition(Column("started"), Op.LT, datetime.utcnow()),
                            Condition(Column("org_id"), Op.EQ, org_id),
                            Condition(Column("project_id"), Op.IN, project_ids),
                        ],
                        granularity=Granularity(21600),
                        orderby=[
                            OrderBy(Column("org_id"), Direction.ASC),
                            OrderBy(Column("project_id"), Direction.ASC),
                        ],
                    )
                    .set_limit(CHUNK_SIZE + 1)
                    .set_offset(offset)
                )

                data = snuba.raw_snql_query(
                    query, referrer="tasks.process_projects_with_sessions.session_count"
                )["data"]
                count = len(data)
                more_results = count > CHUNK_SIZE
                offset += CHUNK_SIZE

                if more_results:
                    data = data[:-1]

                for row in data:
                    row_totals = totals[row["project_id"]].setdefault(
                        row["environment"],
                        {"total_sessions": 0, "releases": defaultdict(int)},
                    )
                    row_totals["total_sessions"] += row["sessions"]
                    row_totals["releases"][row["release"]] += row["sessions"]

            if not more_results:
                break
        else:
            logger.info(
                "process_projects_with_sessions.loop_timeout",
                extra={"org_id": org_id, "project_ids": project_ids},
            )

    return totals
def check_has_health_data(
    self, projects_list: Sequence[ProjectOrRelease]
) -> Set[ProjectOrRelease]:
    now = datetime.now(pytz.utc)
    start = now - timedelta(days=3)

    projects_list = list(projects_list)

    if len(projects_list) == 0:
        return set()

    includes_releases = isinstance(projects_list[0], tuple)

    if includes_releases:
        project_ids: List[ProjectId] = [x[0] for x in projects_list]  # type: ignore
    else:
        project_ids = projects_list  # type: ignore

    org_id = self._get_org_id(project_ids)

    where_clause = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, project_ids),
        Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
        Condition(Column("timestamp"), Op.GTE, start),
        Condition(Column("timestamp"), Op.LT, now),
    ]

    if includes_releases:
        releases = [x[1] for x in projects_list]  # type: ignore
        release_column_name = tag_key(org_id, "release")
        releases_ids = get_tag_values_list(org_id, releases)
        where_clause.append(Condition(Column(release_column_name), Op.IN, releases_ids))
        column_names = ["project_id", release_column_name]
    else:
        column_names = ["project_id"]

    def extract_row_info_func(
        include_releases: bool,
    ) -> Callable[[Mapping[str, Union[int, str]]], ProjectOrRelease]:
        def f(row: Mapping[str, Union[int, str]]) -> ProjectOrRelease:
            if include_releases:
                return row["project_id"], reverse_tag_value(
                    org_id, row.get(release_column_name)
                )  # type: ignore
            else:
                return row["project_id"]  # type: ignore

        return f

    extract_row_info = extract_row_info_func(includes_releases)

    query_cols = [Column(column_name) for column_name in column_names]
    group_by_clause = query_cols

    query = Query(
        dataset=Dataset.Metrics.value,
        match=Entity(EntityKey.MetricsCounters.value),
        select=query_cols,
        where=where_clause,
        groupby=group_by_clause,
    )

    result = raw_snql_query(
        query,
        referrer="release_health.metrics.check_has_health_data",
        use_cache=False,
    )

    return {extract_row_info(row) for row in result["data"]}
def get_release_sessions_time_bounds(
    self,
    project_id: ProjectId,
    release: ReleaseName,
    org_id: OrganizationId,
    environments: Optional[Sequence[EnvironmentName]] = None,
) -> ReleaseSessionsTimeBounds:
    select: List[SelectableExpression] = [
        Function("min", [Column("timestamp")], "min"),
        Function("max", [Column("timestamp")], "max"),
    ]

    try:
        where: List[Union[BooleanCondition, Condition]] = [
            Condition(Column("org_id"), Op.EQ, org_id),
            Condition(Column("project_id"), Op.EQ, project_id),
            Condition(Column(tag_key(org_id, "release")), Op.EQ, tag_value(org_id, release)),
            Condition(Column("timestamp"), Op.GTE, datetime.min),
            Condition(Column("timestamp"), Op.LT, datetime.now(pytz.utc)),
        ]

        if environments is not None:
            env_filter = get_tag_values_list(org_id, environments)
            if not env_filter:
                raise MetricIndexNotFound()

            where.append(Condition(Column(tag_key(org_id, "environment")), Op.IN, env_filter))
    except MetricIndexNotFound:
        # Some filter condition can't be constructed and therefore can't be
        # satisfied.
        #
        # Ignore return type because of https://github.com/python/mypy/issues/8533
        return {"sessions_lower_bound": None, "sessions_upper_bound": None}  # type: ignore

    # XXX(markus): We know that this combination of queries is not fully
    # equivalent to the sessions-table based backend. Example:
    #
    # 1. Session sid=x is started with timestamp started=n
    # 2. Same sid=x is updated with new payload with timestamp started=n - 1
    #
    # Old sessions backend would return [n - 1 ; n - 1] as range.
    # New metrics backend would return [n ; n - 1] as range.
    #
    # We don't yet know if this case is relevant. Session's started
    # timestamp shouldn't really change as session status is updated
    # though.

    try:
        # Take care of initial values for session.started by querying the
        # init counter. This should take care of most cases on its own.
        init_sessions_query = Query(
            dataset=Dataset.Metrics.value,
            match=Entity(EntityKey.MetricsCounters.value),
            select=select,
            where=where
            + [
                Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session")),
                Condition(Column(tag_key(org_id, "session.status")), Op.EQ, tag_value(org_id, "init")),
            ],
        )
        rows = raw_snql_query(
            init_sessions_query,
            referrer="release_health.metrics.get_release_sessions_time_bounds.init_sessions",
            use_cache=False,
        )["data"]
    except MetricIndexNotFound:
        rows = []

    try:
        # Take care of potential timestamp updates by looking at the metric
        # for session duration, which is emitted once a session is closed ("terminal state")
        #
        # There is a testcase checked in that tests specifically for a
        # session update that lowers session.started. We don't know if that
        # testcase matters particularly.
        terminal_sessions_query = Query(
            dataset=Dataset.Metrics.value,
            match=Entity(EntityKey.MetricsDistributions.value),
            select=select,
            where=where
            + [
                Condition(Column("metric_id"), Op.EQ, metric_id(org_id, "session.duration")),
            ],
        )
        rows.extend(
            raw_snql_query(
                terminal_sessions_query,
                referrer="release_health.metrics.get_release_sessions_time_bounds.terminal_sessions",
                use_cache=False,
            )["data"]
        )
    except MetricIndexNotFound:
        pass

    # This check is added because if there are no sessions found, then the
    # aggregations query returns both the sessions_lower_bound and the
    # sessions_upper_bound as the `0` timestamp, and we do not want that
    # behaviour by default.
    #
    # P.S. To avoid confusion, the `0` timestamp, which is '1970-01-01 00:00:00',
    # is rendered as '0000-00-00 00:00:00' in the clickhouse shell.
    formatted_unix_start_time = datetime.utcfromtimestamp(0).strftime("%Y-%m-%dT%H:%M:%S+00:00")

    lower_bound: Optional[str] = None
    upper_bound: Optional[str] = None

    for row in rows:
        if set(row.values()) == {formatted_unix_start_time}:
            continue
        if lower_bound is None or row["min"] < lower_bound:
            lower_bound = row["min"]
        if upper_bound is None or row["max"] > upper_bound:
            upper_bound = row["max"]

    if lower_bound is None or upper_bound is None:
        return {"sessions_lower_bound": None, "sessions_upper_bound": None}  # type: ignore

    def iso_format_snuba_datetime(date: str) -> str:
        return datetime.strptime(date, "%Y-%m-%dT%H:%M:%S+00:00").isoformat()[:19] + "Z"

    return {  # type: ignore
        "sessions_lower_bound": iso_format_snuba_datetime(lower_bound),
        "sessions_upper_bound": iso_format_snuba_datetime(upper_bound),
    }
def _get_snuba_query(
    org_id: int,
    query: QueryDefinition,
    entity_key: EntityKey,
    metric_id: int,
    columns: List[SelectableExpression],
    series: bool,
    limit_state: _LimitState,
    extra_conditions: List[Condition],
) -> Optional[Query]:
    """Build the snuba query.

    Return None if the results from the initial totals query were empty.
    """
    conditions = [
        Condition(Column("org_id"), Op.EQ, org_id),
        Condition(Column("project_id"), Op.IN, query.filter_keys["project_id"]),
        Condition(Column("metric_id"), Op.EQ, metric_id),
        Condition(Column(TS_COL_QUERY), Op.GTE, query.start),
        Condition(Column(TS_COL_QUERY), Op.LT, query.end),
    ]
    conditions += _get_filter_conditions(org_id, query.conditions)
    conditions += extra_conditions

    groupby = {}
    for field in query.raw_groupby:
        if field == "session.status":
            # This will be handled by conditional aggregates
            continue

        if field == "project":
            groupby["project"] = Column("project_id")
            continue

        try:
            groupby[field] = Column(resolve_tag_key(field))
        except MetricIndexNotFound:
            # exclude unresolved keys from groupby
            pass

    full_groupby = list(set(groupby.values()))

    if series:
        full_groupby.append(Column(TS_COL_GROUP))

    query_args = dict(
        dataset=Dataset.Metrics.value,
        match=Entity(entity_key.value),
        select=columns,
        groupby=full_groupby,
        where=conditions,
        granularity=Granularity(query.rollup),
    )

    # In case of group by, either set a limit or use the groups from the
    # first query to limit the results:
    if query.raw_groupby:
        if not limit_state.initialized:
            # Set limit and order by to be consistent with sessions_v2
            max_groups = SNUBA_LIMIT // len(get_timestamps(query))
            query_args["limit"] = Limit(max_groups)
            query_args["orderby"] = [OrderBy(columns[0], Direction.DESC)]
        else:
            if limit_state.limiting_conditions is None:
                # Initial query returned no results, no need to run any more queries
                return None

            query_args["where"] += limit_state.limiting_conditions
            query_args["limit"] = Limit(SNUBA_LIMIT)

    return Query(**query_args)