def test_bulk_load(self) -> None:
    row = GroupedMessageRow.from_bulk(
        {
            "project_id": "2",
            "id": "10",
            "status": "0",
            "last_seen": "2019-06-28 17:57:32+00",
            "first_seen": "2019-06-28 06:40:17+00",
            "active_at": "2019-06-28 06:40:17+00",
            "first_release_id": "26",
        }
    )
    write_processed_messages(self.storage, [InsertBatch([row.to_clickhouse()])])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupedmessage_local;")
    )
    assert ret[0] == (
        0,  # offset
        0,  # deleted
        2,  # project_id
        10,  # id
        0,  # status
        datetime(2019, 6, 28, 17, 57, 32),  # last_seen
        datetime(2019, 6, 28, 6, 40, 17),  # first_seen
        datetime(2019, 6, 28, 6, 40, 17),  # active_at
        26,  # first_release_id
    )
def test_bulk_load(self) -> None:
    row = GroupAssigneeRow.from_bulk(
        {
            "project_id": "2",
            "group_id": "1359",
            "date_added": "2019-09-19 00:17:55+00",
            "user_id": "1",
            "team_id": "",
        }
    )
    write_processed_messages(
        self.storage, [InsertBatch([row.to_clickhouse()], None)]
    )
    ret = (
        self.storage.get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupassignee_local;")
        .results
    )
    assert ret[0] == (
        0,  # offset
        0,  # deleted
        2,  # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),  # date_added
        1,  # user_id
        None,  # team_id
    )
def test_messages(self) -> None:
    processor = GroupedMessageProcessor("sentry_groupedmessage")
    metadata = KafkaMessageMetadata(
        offset=42, partition=0, timestamp=datetime(1970, 1, 1)
    )

    ret = processor.process_message(self.INSERT_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])
    write_processed_messages(self.storage, [ret])
    ret = (
        get_cluster(StorageSetKey.EVENTS)
        .get_query_connection(ClickhouseClientSettings.INSERT)
        .execute("SELECT * FROM groupedmessage_local;")
    )
    assert ret[0] == (
        42,  # offset
        0,  # deleted
        2,  # project_id
        74,  # id
        0,  # status
        datetime(2019, 6, 19, 6, 46, 28),  # last_seen
        datetime(2019, 6, 19, 6, 45, 32),  # first_seen
        datetime(2019, 6, 19, 6, 45, 32),  # active_at
        None,  # first_release_id
    )

    ret = processor.process_message(self.UPDATE_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])

    ret = processor.process_message(self.DELETE_MSG, metadata)
    assert ret == InsertBatch([self.DELETED])
def generate_outcomes(
    self,
    org_id: int,
    project_id: int,
    num_outcomes: int,
    outcome: int,
    time_since_base: timedelta,
) -> None:
    outcomes = []
    for _ in range(num_outcomes):
        processed = (
            self.storage.get_table_writer()
            .get_stream_loader()
            .get_processor()
            .process_message(
                {
                    "project_id": project_id,
                    "event_id": uuid.uuid4().hex,
                    "timestamp": (self.base_time + time_since_base).strftime(
                        "%Y-%m-%dT%H:%M:%S.%fZ"
                    ),
                    "org_id": org_id,
                    "reason": None,
                    "key_id": 1,
                    "outcome": outcome,
                },
                KafkaMessageMetadata(0, 0, self.base_time),
            )
        )
        if processed:
            outcomes.append(processed)

    write_processed_messages(self.storage, outcomes)
def test_messages(self) -> None:
    processor = GroupAssigneeProcessor("sentry_groupasignee")
    metadata = KafkaMessageMetadata(
        offset=42, partition=0, timestamp=datetime(1970, 1, 1)
    )

    ret = processor.process_message(self.INSERT_MSG, metadata)
    assert ret == InsertBatch([self.PROCESSED])
    write_processed_messages(self.storage, [ret])
    ret = (
        self.storage.get_cluster()
        .get_query_connection(ClickhouseClientSettings.QUERY)
        .execute("SELECT * FROM groupassignee_local;")
    )
    assert ret[0] == (
        42,  # offset
        0,  # deleted
        2,  # project_id
        1359,  # group_id
        datetime(2019, 9, 19, 0, 17, 55),  # date_added
        1,  # user_id
        None,  # team_id
    )

    ret = processor.process_message(self.UPDATE_MSG_NO_KEY_CHANGE, metadata)
    assert ret == InsertBatch([self.PROCESSED])

    # Tests an update with a key change, which becomes two inserts:
    # one deletion and the insertion of the new row.
    ret = processor.process_message(self.UPDATE_MSG_WITH_KEY_CHANGE, metadata)
    assert ret == InsertBatch([self.DELETED, self.PROCESSED_UPDATE])

    ret = processor.process_message(self.DELETE_MSG, metadata)
    assert ret == InsertBatch([self.DELETED])
def generate_uniform_distributions(self) -> None:
    events = []
    processor = (
        self.storage.get_table_writer().get_stream_loader().get_processor()
    )
    value_array = list(range(self.d_range_min, self.d_range_max))

    for n in range(self.seconds):
        for p in self.project_ids:
            msg = {
                "org_id": self.org_id,
                "project_id": p,
                "type": METRICS_DISTRIBUTIONS_TYPE,
                "value": value_array,
                "timestamp": self.base_time.timestamp() + n,
                "tags": self.default_tags,
                "metric_id": self.metric_id,
                "retention_days": RETENTION_DAYS,
            }
            processed = processor.process_message(
                msg,
                KafkaMessageMetadata(0, 0, self.base_time),
            )
            if processed:
                events.append(processed)

    write_processed_messages(self.storage, events)
def generate_sets(self) -> None:
    events = []
    processor = (
        self.storage.get_table_writer().get_stream_loader().get_processor()
    )

    for n in range(self.seconds):
        for p in self.project_ids:
            msg = {
                "org_id": self.org_id,
                "project_id": p,
                "type": METRICS_SET_TYPE,
                "value": [n % self.unique_set_values],
                "timestamp": self.base_time.timestamp() + n,
                "tags": self.default_tags,
                "metric_id": self.metric_id,
                "retention_days": RETENTION_DAYS,
            }
            processed = processor.process_message(
                msg,
                KafkaMessageMetadata(0, 0, self.base_time),
            )
            if processed:
                events.append(processed)

    write_processed_messages(self.storage, events)
def generate_session_events(self) -> None:
    processor = (
        self.storage.get_table_writer().get_stream_loader().get_processor()
    )
    meta = KafkaMessageMetadata(
        offset=1, partition=2, timestamp=datetime(1970, 1, 1)
    )
    template = {
        "session_id": "00000000-0000-0000-0000-000000000000",
        "distinct_id": "b3ef3211-58a4-4b36-a9a1-5a55df0d9aaf",
        "duration": None,
        "environment": "production",
        "org_id": 1,
        "project_id": 2,
        "release": "[email protected]",
        "retention_days": settings.DEFAULT_RETENTION_DAYS,
        "seq": 0,
        "errors": 0,
        "received": datetime.utcnow().timestamp(),
        "started": self.started.timestamp(),
    }
    events = [
        processor.process_message(
            {
                **template,
                "status": "exited",
                "duration": 1947.49,
                "session_id": "8333339f-5675-4f89-a9a0-1c935255ab58",
            },
            meta,
        ),
        processor.process_message(
            {**template, "status": "exited", "quantity": 5},
            meta,
        ),
        processor.process_message(
            {**template, "status": "errored", "errors": 1, "quantity": 2},
            meta,
        ),
        processor.process_message(
            {
                **template,
                "distinct_id": "b3ef3211-58a4-4b36-a9a1-5a55df0d9aaf",
                "status": "errored",
                "errors": 1,
                "quantity": 2,
            },
            meta,
        ),
    ]
    write_processed_messages(self.storage, events)
def generate_session_events(self, org_id: int, project_id: int) -> None:
    processor = (
        self.storage.get_table_writer().get_stream_loader().get_processor()
    )
    meta = KafkaMessageMetadata(
        offset=1, partition=2, timestamp=datetime(1970, 1, 1)
    )
    distinct_id = uuid4().hex
    template = {
        "session_id": uuid4().hex,
        "distinct_id": distinct_id,
        "duration": None,
        "environment": "production",
        "org_id": org_id,
        "project_id": project_id,
        "release": "[email protected]",
        "retention_days": settings.DEFAULT_RETENTION_DAYS,
        "seq": 0,
        "errors": 0,
        "received": datetime.utcnow().timestamp(),
        "started": self.started.timestamp(),
    }
    events = [
        processor.process_message(
            {
                **template,
                "status": "exited",
                "duration": 1947.49,
                "session_id": uuid4().hex,
                "started": (self.started + timedelta(minutes=13)).timestamp(),
            },
            meta,
        ),
        processor.process_message(
            {**template, "status": "exited", "quantity": 5},
            meta,
        ),
        processor.process_message(
            {**template, "status": "errored", "errors": 1, "quantity": 2},
            meta,
        ),
        processor.process_message(
            {
                **template,
                "distinct_id": distinct_id,
                "status": "errored",
                "errors": 1,
                "quantity": 2,
                "started": (self.started + timedelta(minutes=24)).timestamp(),
            },
            meta,
        ),
    ]
    filtered = [e for e in events if e]
    write_processed_messages(self.storage, filtered)
def generate_outcomes(
    self,
    org_id: int,
    project_id: int,
    num_outcomes: int,
    outcome: int,
    time_since_base: timedelta,
    category: Optional[int],
    quantity: Optional[int] = None,
) -> None:
    outcomes = []
    for _ in range(num_outcomes):
        message = {
            "project_id": project_id,
            "event_id": uuid.uuid4().hex,
            "timestamp": (self.base_time + time_since_base).strftime(
                "%Y-%m-%dT%H:%M:%S.%fZ"
            ),
            "org_id": org_id,
            "reason": None,
            "key_id": 1,
            "outcome": outcome,
            "category": category,
            "quantity": quantity,
        }
        if message["category"] is None:
            del message["category"]  # for testing None category case
        if message["quantity"] is None:
            del message["quantity"]  # for testing None quantity case

        processed = (
            self.storage.get_table_writer()
            .get_stream_loader()
            .get_processor()
            .process_message(
                message,
                KafkaMessageMetadata(0, 0, self.base_time),
            )
        )
        if processed:
            outcomes.append(processed)

    write_processed_messages(self.storage, outcomes)
def test_midnight_error_case(
    self,
    current_time: MagicMock,
    storage_key: StorageKey,
    create_event_row_for_date: Callable[[datetime, Optional[int]], InsertBatch],
) -> None:
    """
    This test simulates a failure case that happened in production: when the
    script ran, it attempted to delete a part whose last day was still within
    the retention period. The script was using datetimes not aligned to
    midnight, so it dropped a part on that same day even though the part fell
    outside the window only because of the script's extra time-of-day offset.
    """

    def to_monday(d: datetime) -> datetime:
        rounded = d - timedelta(days=d.weekday())
        return datetime(rounded.year, rounded.month, rounded.day)

    storage = get_writable_storage(storage_key)
    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.CLEANUP
    )
    table = storage.get_table_writer().get_schema().get_local_table_name()
    database = storage.get_cluster().get_database()

    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert parts == []

    # Pick a time a few minutes after midnight
    base = datetime(2022, 1, 29, 0, 4, 37)
    current_time.return_value = base

    # Insert an event that is outside retention, but whose last day is just
    # inside retention. Note that without rounding the base time to midnight,
    # base - retention > last_day(timestamp).
    timestamp = datetime(2021, 10, 25)
    write_processed_messages(storage, [create_event_row_for_date(timestamp, 90)])

    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [
        (to_monday(timestamp), 90)
    ]

    stale = cleanup.filter_stale_partitions(parts)
    assert stale == []
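# A minimal sketch of the midnight-alignment idea described in the docstring
# above, using only stdlib datetime. The helper name is hypothetical and not
# part of the cleanup module; it only illustrates why truncating the current
# time avoids the failure this test reproduces.
def _align_to_midnight(d: datetime) -> datetime:
    # Drop the time-of-day component so the retention cutoff is measured in
    # whole days rather than from the moment the script happened to start.
    return datetime(d.year, d.month, d.day)


# Example: with base = 2022-01-29 00:04:37 and 90-day retention,
# base - timedelta(days=90) lands at 2021-10-31 00:04:37, just past the last
# day (2021-10-31) of the weekly part keyed by Monday 2021-10-25; aligning
# base to midnight first keeps that part inside the retention window.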
def generate_counters(self) -> None:
    events = []
    for n in range(self.seconds):
        for p in self.project_ids:
            processed = (
                self.storage.get_table_writer()
                .get_stream_loader()
                .get_processor()
                .process_message(
                    {
                        "org_id": self.org_id,
                        "project_id": p,
                        "unit": "ms",
                        "type": METRICS_COUNTERS_TYPE,
                        "value": 1.0,
                        "timestamp": self.base_time.timestamp() + n,
                        "tags": self.default_tags,
                        "metric_id": self.metric_id,
                        "retention_days": RETENTION_DAYS,
                    },
                    KafkaMessageMetadata(0, 0, self.base_time),
                )
            )
            if processed:
                events.append(processed)

    write_processed_messages(self.storage, events)
def test_optimize(
    self,
    storage_key: StorageKey,
    create_event_row_for_date: Callable[[datetime], InsertBatch],
) -> None:
    storage = get_writable_storage(storage_key)
    cluster = storage.get_cluster()
    clickhouse = cluster.get_query_connection(ClickhouseClientSettings.OPTIMIZE)
    table = storage.get_table_writer().get_schema().get_local_table_name()
    database = cluster.get_database()

    # no data, 0 partitions to optimize
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert parts == []

    base = datetime(1999, 12, 26)  # a sunday
    base_monday = base - timedelta(days=base.weekday())

    # 1 event, 0 unoptimized parts
    write_processed_messages(storage, [create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert parts == []

    # 2 events in the same part, 1 unoptimized part
    write_processed_messages(storage, [create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [(base_monday, 90)]

    # 3 events in the same part, 1 unoptimized part
    write_processed_messages(storage, [create_event_row_for_date(base)])
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [(base_monday, 90)]

    # 3 events in one part, 2 in another, 2 unoptimized parts
    a_month_earlier = base_monday - timedelta(days=31)
    a_month_earlier_monday = a_month_earlier - timedelta(
        days=a_month_earlier.weekday()
    )
    write_processed_messages(
        storage, [create_event_row_for_date(a_month_earlier_monday)]
    )
    write_processed_messages(
        storage, [create_event_row_for_date(a_month_earlier_monday)]
    )
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [
        (base_monday, 90),
        (a_month_earlier_monday, 90),
    ]

    # respects before (base is properly excluded)
    assert [
        (p.date, p.retention_days)
        for p in list(
            optimize.get_partitions_to_optimize(
                clickhouse, storage, database, table, before=base
            )
        )
    ] == [(a_month_earlier_monday, 90)]

    optimize.optimize_partitions(clickhouse, database, table, parts)

    # all parts should be optimized
    parts = optimize.get_partitions_to_optimize(clickhouse, storage, database, table)
    assert parts == []
def test(
    self,
    storage_key: StorageKey,
    create_event_row_for_date: Callable[[datetime, Optional[int]], InsertBatch],
) -> None:
    def to_monday(d: datetime) -> datetime:
        return d - timedelta(days=d.weekday())

    base = datetime(1999, 12, 26)  # a sunday

    storage = get_writable_storage(storage_key)
    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.CLEANUP
    )
    table = storage.get_table_writer().get_schema().get_table_name()
    database = storage.get_cluster().get_database()

    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert parts == []

    # base, 90 retention
    write_processed_messages(storage, [create_event_row_for_date(base, None)])
    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [(to_monday(base), 90)]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == []

    # -3 weeks, 90 retention
    three_weeks_ago = base - timedelta(days=7 * 3)
    write_processed_messages(
        storage, [create_event_row_for_date(three_weeks_ago, None)]
    )
    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [
        (to_monday(three_weeks_ago), 90),
        (to_monday(base), 90),
    ]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == []

    # -13 weeks, 90 retention
    thirteen_weeks_ago = base - timedelta(days=7 * 13)
    write_processed_messages(
        storage, [create_event_row_for_date(thirteen_weeks_ago, None)]
    )
    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert [(p.date, p.retention_days) for p in parts] == [
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(three_weeks_ago), 90),
        (to_monday(base), 90),
    ]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert [(p.date, p.retention_days) for p in stale] == [
        (to_monday(thirteen_weeks_ago), 90)
    ]

    # -1 week, 30 retention
    one_week_ago = base - timedelta(days=7)
    write_processed_messages(storage, [create_event_row_for_date(one_week_ago, 30)])
    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert {(p.date, p.retention_days) for p in parts} == {
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    }
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert [(p.date, p.retention_days) for p in stale] == [
        (to_monday(thirteen_weeks_ago), 90)
    ]

    # -5 weeks, 30 retention
    five_weeks_ago = base - timedelta(days=7 * 5)
    write_processed_messages(
        storage, [create_event_row_for_date(five_weeks_ago, 30)]
    )
    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert {(p.date, p.retention_days) for p in parts} == {
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(five_weeks_ago), 30),
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    }
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert {(p.date, p.retention_days) for p in stale} == {
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(five_weeks_ago), 30),
    }

    cleanup.drop_partitions(clickhouse, database, table, stale, dry_run=False)

    parts = cleanup.get_active_partitions(clickhouse, storage, database, table)
    assert {(p.date, p.retention_days) for p in parts} == {
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    }
def generate_fizzbuzz_events(self) -> None:
    """
    Generate a deterministic set of events across a time range.
    """
    events = []
    for tick in range(self.minutes):
        tock = tick + 1
        for p in self.project_ids:
            # project N sends an event every Nth minute
            if tock % p == 0:
                trace_id = "7400045b25c443b885914600aa83ad04"
                span_id = "8841662216cc598b"
                processed = (
                    self.storage.get_table_writer()
                    .get_stream_loader()
                    .get_processor()
                    .process_message(
                        (
                            2,
                            "insert",
                            {
                                "project_id": p,
                                "event_id": uuid.uuid4().hex,
                                "deleted": 0,
                                "datetime": (
                                    self.base_time + timedelta(minutes=tick)
                                ).isoformat(),
                                "platform": self.platforms[
                                    (tock * p) % len(self.platforms)
                                ],
                                "retention_days": settings.DEFAULT_RETENTION_DAYS,
                                "data": {
                                    # Project N sends every Nth (mod len(hashes)) hash (and platform)
                                    "received": calendar.timegm(
                                        (
                                            self.base_time + timedelta(minutes=tick)
                                        ).timetuple()
                                    ),
                                    "type": "transaction",
                                    "transaction": "/api/do_things",
                                    "start_timestamp": datetime.timestamp(
                                        self.base_time + timedelta(minutes=tick)
                                    ),
                                    "timestamp": datetime.timestamp(
                                        self.base_time
                                        + timedelta(minutes=tick, seconds=1)
                                    ),
                                    "tags": {
                                        # Sentry
                                        "environment": self.environments[
                                            (tock * p) % len(self.environments)
                                        ],
                                        "sentry:release": str(tick),
                                        "sentry:dist": "dist1",
                                        # User
                                        "foo": "baz",
                                        "foo.bar": "qux",
                                        "os_name": "linux",
                                    },
                                    "user": {
                                        "email": "*****@*****.**",
                                        "ip_address": "8.8.8.8",
                                    },
                                    "contexts": {
                                        "trace": {
                                            "trace_id": trace_id,
                                            "span_id": span_id,
                                            "op": "http",
                                            "status": "0",
                                        },
                                    },
                                    "measurements": {
                                        "lcp": {"value": 32.129},
                                        "lcp.elementSize": {"value": 4242},
                                    },
                                    "breakdowns": {
                                        "span_ops": {
                                            "ops.db": {"value": 62.512},
                                            "ops.http": {"value": 109.774},
                                            "total.time": {"value": 172.286},
                                        }
                                    },
                                    "spans": [
                                        {
                                            "op": "db",
                                            "trace_id": trace_id,
                                            "span_id": span_id + "1",
                                            "parent_span_id": None,
                                            "same_process_as_parent": True,
                                            "description": "SELECT * FROM users",
                                            "data": {},
                                            "timestamp": calendar.timegm(
                                                (
                                                    self.base_time
                                                    + timedelta(minutes=tick)
                                                ).timetuple()
                                            ),
                                        }
                                    ],
                                },
                            },
                        ),
                        KafkaMessageMetadata(0, 0, self.base_time),
                    )
                )
                if processed:
                    events.append(processed)
    write_processed_messages(self.storage, events)
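# A quick illustration of the "fizzbuzz" cadence used above, assuming a
# hypothetical project_ids = (1, 2, 3) and a four-minute window: project N
# emits an event on every minute where (tick + 1) % N == 0.
for tick in range(4):
    tock = tick + 1
    emitting = [p for p in (1, 2, 3) if tock % p == 0]
    print(tick, emitting)  # 0 [1]; 1 [1, 2]; 2 [1, 3]; 3 [1, 2]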
def test(self) -> None:
    def to_monday(d: datetime) -> datetime:
        return d - timedelta(days=d.weekday())

    base = datetime(1999, 12, 26)  # a sunday

    storage = get_writable_storage(StorageKey.EVENTS)
    clickhouse = storage.get_cluster().get_query_connection(
        ClickhouseClientSettings.CLEANUP
    )
    table = storage.get_table_writer().get_schema().get_table_name()
    database = storage.get_cluster().get_database()

    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == []

    # base, 90 retention
    write_processed_messages(storage, [self.create_event_row_for_date(base)])
    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [(to_monday(base), 90)]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == []

    # -3 weeks, 90 retention
    three_weeks_ago = base - timedelta(days=7 * 3)
    write_processed_messages(
        storage, [self.create_event_row_for_date(three_weeks_ago)]
    )
    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [(to_monday(three_weeks_ago), 90), (to_monday(base), 90)]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == []

    # -13 weeks, 90 retention
    thirteen_weeks_ago = base - timedelta(days=7 * 13)
    write_processed_messages(
        storage, [self.create_event_row_for_date(thirteen_weeks_ago)]
    )
    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(three_weeks_ago), 90),
        (to_monday(base), 90),
    ]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == [(to_monday(thirteen_weeks_ago), 90)]

    # -1 week, 30 retention
    one_week_ago = base - timedelta(days=7)
    write_processed_messages(
        storage, [self.create_event_row_for_date(one_week_ago, 30)]
    )
    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    ]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == [(to_monday(thirteen_weeks_ago), 90)]

    # -5 weeks, 30 retention
    five_weeks_ago = base - timedelta(days=7 * 5)
    write_processed_messages(
        storage, [self.create_event_row_for_date(five_weeks_ago, 30)]
    )
    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(five_weeks_ago), 30),
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    ]
    stale = cleanup.filter_stale_partitions(parts, as_of=base)
    assert stale == [
        (to_monday(thirteen_weeks_ago), 90),
        (to_monday(five_weeks_ago), 30),
    ]

    cleanup.drop_partitions(clickhouse, database, table, stale, dry_run=False)

    parts = cleanup.get_active_partitions(clickhouse, database, table)
    assert parts == [
        (to_monday(three_weeks_ago), 90),
        (to_monday(one_week_ago), 30),
        (to_monday(base), 90),
    ]