def test_extract_required(self):
    now = datetime.utcnow()
    event = {
        "event_id": "1" * 32,
        "project_id": 100,
        "group_id": 10,
        "datetime": now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
    }
    output = {}

    extract_base(output, event)
    output["retention_days"] = enforce_retention(
        event,
        datetime.strptime(event["datetime"], settings.PAYLOAD_DATETIME_FORMAT),
    )
    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_required(output, event)

    assert output == {
        "event_id": "11111111111111111111111111111111",
        "project_id": 100,
        "group_id": 10,
        "timestamp": now,
        "retention_days": settings.DEFAULT_RETENTION_DAYS,
    }

def test_extract_common_search_message(self):
    now = datetime.utcnow().replace(microsecond=0)
    event = {
        'primary_hash': 'a' * 32,
        'message': 'the message',
        'platform': 'the_platform',
        'search_message': 'the search message',
    }
    data = {
        'received': int(calendar.timegm(now.timetuple())),
    }
    output = {}
    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_common(output, event, data)
    assert output['search_message'] == 'the search message'

    # with optional short message
    now = datetime.utcnow().replace(microsecond=0)
    event = {
        'primary_hash': 'a' * 32,
        'message': 'the message',
        'platform': 'the_platform',
        'search_message': 'the search message',
    }
    data = {
        'received': int(calendar.timegm(now.timetuple())),
        'message': 'the short message',
    }
    output = {}
    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_common(output, event, data)
    assert output['search_message'] == 'the search message'
    assert output['message'] == 'the short message'

def test_extract_common(self):
    now = datetime.utcnow().replace(microsecond=0)
    event = {
        "primary_hash": "a" * 32,
        "message": "the message",
        "platform": "the_platform",
        "data": {
            "received": int(calendar.timegm(now.timetuple())),
            "culprit": "the culprit",
            "type": "error",
            "version": 6,
            "title": "FooError",
            "location": "bar.py",
            "modules": OrderedDict([("foo", "1.0"), ("bar", "2.0"), ("baz", None)]),
        },
    }
    output = {}
    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_common(output, event, self.metadata)

    assert output == {
        "platform": u"the_platform",
        "primary_hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
        "received": now,
        "culprit": "the culprit",
        "type": "error",
        "version": "6",
        "modules.name": [u"foo", u"bar", u"baz"],
        "modules.version": [u"1.0", u"2.0", u""],
        "title": "FooError",
        "location": "bar.py",
    }

def write(*, dataset: Dataset) -> RespTuple:
    from snuba.processor import InsertBatch

    rows: MutableSequence[WriterTableRow] = []
    offset_base = int(round(time.time() * 1000))
    for index, message in enumerate(json.loads(http_request.data)):
        offset = offset_base + index
        processed_message = (
            enforce_table_writer(dataset)
            .get_stream_loader()
            .get_processor()
            .process_message(
                message,
                KafkaMessageMetadata(
                    offset=offset, partition=0, timestamp=datetime.utcnow()
                ),
            )
        )
        if processed_message:
            assert isinstance(processed_message, InsertBatch)
            rows.extend(processed_message.rows)

    BatchWriterEncoderWrapper(
        enforce_table_writer(dataset).get_batch_writer(metrics),
        JSONRowEncoder(),
    ).write(rows)

    return ("ok", 200, {"Content-Type": "text/plain"})

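# A minimal sketch of driving the write endpoint above over HTTP. The route
# and port are hypothetical (not confirmed by this code); the payload is a
# JSON array of messages in whatever format the dataset's processor expects,
# e.g. the versioned tuples used by the event tests in this file.
import json

import requests  # assumed available; any HTTP client would do

payload = json.dumps([
    (2, "insert", {"event_id": "1" * 32, "project_id": 100}, {}),  # illustrative only
])
response = requests.post(
    "http://localhost:1218/tests/events/insert",  # hypothetical route
    data=payload,
)
assert response.status_code == 200 and response.text == "ok"
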
def test_invalid_format(self) -> None:
    with pytest.raises(InvalidMessageVersion):
        enforce_table_writer(
            self.dataset
        ).get_stream_loader().get_processor().process_message(
            (-1, "insert", self.event), self.metadata,
        )

def bulk_load(dataset, dest_table, source, log_level):
    import sentry_sdk

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(
        level=getattr(logging, log_level.upper()),
        format='%(asctime)s %(message)s',
    )
    logger = logging.getLogger('snuba.load-snapshot')
    logger.info("Start bulk load process for dataset %s, from source %s", dataset, source)
    dataset = get_dataset(dataset)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)

def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for dataset %s, from source %s",
        dataset_name,
        source,
    )

    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)

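# A minimal sketch of calling the keyword-only bulk_load above. The dataset
# and destination table names are borrowed from test_gzip_load later in this
# file; the snapshot path is purely hypothetical.
bulk_load(
    dataset_name="groupedmessage",
    dest_table="groupedmessage_local",
    source="/var/lib/snuba/snapshots/latest",  # hypothetical path
    log_level="info",
)
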
def test_extract_tags_empty_string(self):
    # verify our text field extraction doesn't coerce '' to None
    tags = {
        "environment": "",
    }
    output = {}

    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_promoted_tags(output, tags)

    assert output["environment"] == u""

def test_simple_version_0(self):
    processed = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().process_message(
        (0, 'insert', self.event)
    )

    for field in ('event_id', 'project_id', 'message', 'platform'):
        assert processed.data[0][field] == self.event[field]

def generate_events(self):
    events = []
    for tick in range(self.minutes):
        # project N sends an event every Nth minute
        events.append(
            enforce_table_writer(
                self.dataset
            ).get_stream_loader().get_processor().process_insert({
                "project_id": self.project_id,
                "event_id": uuid.uuid4().hex,
                "deleted": 0,
                "datetime": (
                    self.base_time + timedelta(minutes=tick)
                ).strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
                "message": "a message",
                "platform": self.platforms[tick % len(self.platforms)],
                "primary_hash": uuid.uuid4().hex,
                "group_id": tick,
                "retention_days": settings.DEFAULT_RETENTION_DAYS,
                "data": {
                    "received": calendar.timegm(
                        (self.base_time + timedelta(minutes=tick)).timetuple()
                    ),
                },
            })
        )
    self.write_processed_records(events)

def test_error_handling(self):
    try:
        enforce_table_writer(self.dataset).get_writer(table_name="invalid").write([{"x": "y"}])
    except ClickHouseError as error:
        assert error.code == 60
        assert error.type == 'DB::Exception'
    else:
        assert False, "expected error"

    try:
        enforce_table_writer(self.dataset).get_writer().write([{"timestamp": "invalid"}])
    except ClickHouseError as error:
        assert error.code == 41
        assert error.type == 'DB::Exception'
    else:
        assert False, "expected error"

def __process_insert_event(self, event: InsertEvent) -> Optional[ProcessedMessage]:
    return (
        enforce_table_writer(self.dataset)
        .get_stream_loader()
        .get_processor()
        .process_message((2, "insert", event, {}), self.metadata)
    )

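# For reference, the versioned message tuples that process_message handles, as
# exercised across the snippets in this file (a summary, not an exhaustive
# protocol spec):
#
#   (0, "insert", event)           # version 0 insert, see test_simple_version_0
#   (1, "insert", event, {})       # version 1 insert, equivalent to version 0
#   (2, "insert", event, {})       # version 2 insert, as used above
#   (2, "start_merge", {...})      # version 2 replacement, see test_v2_start_merge
#   (2, "end_delete_tag", {...})   # version 2 replacement, see test_v2_end_delete_tag
#
# An unknown version such as -1 raises InvalidMessageVersion (test_invalid_format).
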
def _process_message_impl(
    self,
    value: Mapping[str, Any],
    metadata: KafkaMessageMetadata,
) -> Optional[ProcessedMessage]:
    processor = enforce_table_writer(self.__dataset).get_stream_loader().get_processor()
    return processor.process_message(value, metadata)

def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime

    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import logger, run_optimize

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0, minute=0, second=0, microsecond=0)
    clickhouse = ClickhousePool(clickhouse_host, clickhouse_port, send_receive_timeout=timeout)
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" % (num_dropped, clickhouse_host))

def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    replacement_topic = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_replacement_topic_spec()
    test_worker = ConsumerWorker(
        self.dataset, producer, replacement_topic.topic_name, self.metrics
    )

    test_worker.flush_batch([
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('1', {'project_id': 1})],
        ),
        ProcessedMessage(
            action=ProcessorAction.REPLACE,
            data=[('2', {'project_id': 2})],
        ),
    ])

    assert [(m._topic, m._key, m._value) for m in producer.messages] == \
        [('event-replacements', b'1', b'{"project_id": 1}'),
         ('event-replacements', b'2', b'{"project_id": 2}')]

def generate_outcomes(
    self,
    org_id: int,
    project_id: int,
    num_outcomes: int,
    outcome: int,
    time_since_base: timedelta,
) -> None:
    outcomes = []
    for _ in range(num_outcomes):
        processed = (
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_processor()
            .process_message(
                {
                    "project_id": project_id,
                    "event_id": uuid.uuid4().hex,
                    "timestamp": (self.base_time + time_since_base).strftime(
                        "%Y-%m-%dT%H:%M:%S.%fZ"
                    ),
                    "org_id": org_id,
                    "reason": None,
                    "key_id": 1,
                    "outcome": outcome,
                },
                None,
            )
        )
        outcomes.extend(processed.data)

    self.write_processed_events(outcomes)

def flush_batch(self, batch: Sequence[Replacement]) -> None:
    for replacement in batch:
        query_args = {
            **replacement.query_args,
            'dist_read_table_name': self.dataset.get_dataset_schemas()
                .get_read_schema().get_data_source().format_from(),
            'dist_write_table_name': enforce_table_writer(
                self.dataset
            ).get_schema().get_table_name(),
        }
        count = self.clickhouse.execute_robust(
            replacement.count_query_template % query_args
        )[0][0]
        if count == 0:
            continue

        # query_time_flags == (type, project_id, [...data...])
        flag_type, project_id = replacement.query_time_flags[:2]
        if flag_type == NEEDS_FINAL:
            set_project_needs_final(project_id)
        elif flag_type == EXCLUDE_GROUPS:
            group_ids = replacement.query_time_flags[2]
            set_project_exclude_groups(project_id, group_ids)

        t = time.time()
        query = replacement.insert_query_template % query_args
        logger.debug("Executing replace query: %s" % query)
        self.clickhouse.execute_robust(query)
        duration = int((time.time() - t) * 1000)

        logger.info("Replacing %s rows took %sms" % (count, duration))
        self.metrics.timing('replacements.count', count)
        self.metrics.timing('replacements.duration', duration)

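# A sketch of the Replacement shape that flush_batch above consumes, inferred
# purely from its attribute accesses; the actual class definition may differ.
from typing import Any, Mapping, NamedTuple, Tuple


class Replacement(NamedTuple):
    count_query_template: str          # SELECT counting affected rows, %-formatted
    insert_query_template: str         # INSERT that performs the replacement
    query_args: Mapping[str, Any]      # merged with the table names at flush time
    query_time_flags: Tuple[Any, ...]  # (NEEDS_FINAL | EXCLUDE_GROUPS, project_id, ...)
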
def test_gzip_load() -> None:
    content = gzip.compress(DATA.encode("utf-8"))

    dataset = get_dataset("groupedmessage")
    metrics = DummyMetricsBackend(strict=True)
    writer = enforce_table_writer(dataset).get_bulk_writer(
        metrics,
        "gzip",
        [
            "project_id",
            "id",
            "status",
            "last_seen",
            "first_seen",
            "active_at",
            "first_release_id",
        ],
        options=None,
        table_name="groupedmessage_local",
    )
    writer.write([content])

    cluster = dataset.get_default_entity().get_all_storages()[0].get_cluster()
    reader = cluster.get_reader()
    ret = reader.execute(FakeQuery([]))
    assert ret["data"][0] == {"count()": 2}

def test_simple_version_1(self):
    processor = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor()

    assert processor.process_message(
        (0, "insert", copy.deepcopy(self.event))
    ) == processor.process_message(
        (1, "insert", copy.deepcopy(self.event), {})
    )

def test_simple_version_0(self):
    processed = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().process_message(
        (0, "insert", self.event)
    )

    for field in ("event_id", "project_id", "message", "platform"):
        assert processed.data[0][field] == self.event[field]

def test_v2_start_merge(self):
    project_id = 1
    message = (2, "start_merge", {"project_id": project_id})

    processor = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor()
    assert processor.process_message(message) == ProcessedMessage(
        action=ProcessorAction.REPLACE,
        data=[(str(project_id), message)],
    )

def test_produce_replacement_messages(self):
    producer = FakeConfluentKafkaProducer()
    test_worker = ConsumerWorker(
        self.dataset,
        producer=producer,
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    test_worker.flush_batch(
        [
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("1", {"project_id": 1})],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[("2", {"project_id": 2})],
            ),
        ]
    )

    assert [(m._topic, m._key, m._value) for m in producer.messages] == [
        ("event-replacements", b"1", b'{"project_id": 1}'),
        ("event-replacements", b"2", b'{"project_id": 2}'),
    ]

def test_skip_too_old(self):
    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    event = self.event
    old_timestamp = datetime.utcnow() - timedelta(days=300)
    old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
    event["datetime"] = old_timestamp_str
    event["data"]["datetime"] = old_timestamp_str
    event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 1),
        42,
        KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
        datetime.now(),
    )

    assert test_worker.process_message(message) is None

def test_offsets(self):
    event = self.event

    message: Message[KafkaPayload] = Message(
        Partition(Topic("events"), 456),
        123,
        KafkaPayload(
            None, json.dumps((0, "insert", event)).encode("utf-8")
        ),  # event doesn't really matter
        datetime.now(),
    )

    test_worker = ConsumerWorker(
        self.dataset,
        producer=FakeConfluentKafkaProducer(),
        replacements_topic=Topic(
            enforce_table_writer(self.dataset)
            .get_stream_loader()
            .get_replacement_topic_spec()
            .topic_name
        ),
        metrics=self.metrics,
    )

    batch = [test_worker.process_message(message)]
    test_worker.flush_batch(batch)

    assert self.clickhouse.execute(
        "SELECT project_id, event_id, offset, partition FROM %s" % self.table
    ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]

def test_extract_sdk(self):
    sdk = {
        'integrations': ['logback'],
        'name': 'sentry-java',
        'version': '1.6.1-d1e3a',
    }
    output = {}

    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_sdk(output, sdk)

    assert output == {
        'sdk_name': u'sentry-java',
        'sdk_version': u'1.6.1-d1e3a',
        'sdk_integrations': [u'logback'],
    }

def test_extract_geo(self):
    geo = {
        "country_code": "US",
        "city": "San Francisco",
        "region": "CA",
    }
    output = {}

    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_geo(output, geo)

    assert output == {
        "geo_country_code": "US",
        "geo_city": "San Francisco",
        "geo_region": "CA",
    }

def test_v2_end_delete_tag(self):
    project_id = 1
    message = (2, "end_delete_tag", {"project_id": project_id})

    processor = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor()
    assert processor.process_message(message, self.metadata) == ReplacementBatch(
        str(project_id), [message]
    )

def test_extract_sdk(self):
    sdk = {
        "integrations": ["logback"],
        "name": "sentry-java",
        "version": "1.6.1-d1e3a",
    }
    output = {}

    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_sdk(output, sdk)

    assert output == {
        "sdk_name": u"sentry-java",
        "sdk_version": u"1.6.1-d1e3a",
        "sdk_integrations": [u"logback"],
    }

def test_extract_geo(self):
    geo = {
        'country_code': 'US',
        'city': 'San Francisco',
        'region': 'CA',
    }
    output = {}

    enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().extract_geo(output, geo)

    assert output == {
        'geo_country_code': 'US',
        'geo_city': 'San Francisco',
        'geo_region': 'CA',
    }

def generate_event(self):
    self.dataset = get_dataset("events")
    event = get_event()
    event["project_id"] = self.project_id
    event = enforce_table_writer(
        self.dataset
    ).get_stream_loader().get_processor().process_insert(event)
    self.write_processed_records([event])