Example #1
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset, producer,
                                     replacement_topic.topic_name,
                                     self.metrics)

        test_worker.flush_batch([
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[('1', {
                    'project_id': 1
                })],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[('2', {
                    'project_id': 2
                })],
            ),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == \
            [('event-replacements', b'1', b'{"project_id": 1}'), ('event-replacements', b'2', b'{"project_id": 2}')]
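
The assertions above only touch the fake producer through its messages list, reading each recorded message's _topic, _key, and _value. A minimal stand-in with that shape might look like the sketch below; the class names, the produce signature, and the flush behaviour are assumptions, not the project's actual test helper.

    class FakeProducedMessage:
        """Holds one produced message so tests can assert on topic, key, and value."""

        def __init__(self, topic, key, value):
            self._topic = topic
            self._key = key
            self._value = value


    class FakeProducerSketch:
        """Hypothetical in-memory producer: records messages instead of sending them to Kafka."""

        def __init__(self):
            self.messages = []

        def produce(self, topic, value, key=None, **kwargs):
            # Same argument order as confluent_kafka.Producer.produce(topic, value, key).
            self.messages.append(FakeProducedMessage(topic, key, value))

        def flush(self, timeout=None):
            # Nothing is buffered outside self.messages, so there is never anything left to flush.
            return 0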
Example #2
    def test_offsets(self):
        event = self.event

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 456),
            123,
            KafkaPayload(
                None, json.dumps((0, "insert", event)).encode("utf-8")
            ),  # event doesn't really matter
            datetime.now(),
        )

        test_worker = ConsumerWorker(
            self.dataset,
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" % self.table
        ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
Example #3
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(
            self.dataset,
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event["datetime"] = old_timestamp_str
        event["data"]["datetime"] = old_timestamp_str
        event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 1),
            42,
            KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
            datetime.now(),
        )

        assert test_worker.process_message(message) is None
Example #4
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        test_worker = ConsumerWorker(
            self.dataset,
            producer=producer,
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )

        test_worker.flush_batch(
            [
                ProcessedMessage(
                    action=ProcessorAction.REPLACE, data=[("1", {"project_id": 1})],
                ),
                ProcessedMessage(
                    action=ProcessorAction.REPLACE, data=[("2", {"project_id": 2})],
                ),
            ]
        )

        assert [(m._topic, m._key, m._value) for m in producer.messages] == [
            ("event-replacements", b"1", b'{"project_id": 1}'),
            ("event-replacements", b"2", b'{"project_id": 2}'),
        ]
Example #5
    def test_produce_replacement_messages(self):
        topic = 'topic'
        producer = FakeKafkaProducer()
        test_worker = ConsumerWorker(self.clickhouse, self.table, producer, topic)

        test_worker.flush_batch([
            (processor.REPLACE, ('1', {'project_id': 1})),
            (processor.REPLACE, ('2', {'project_id': 2})),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == \
            [('topic', b'1', b'{"project_id": 1}'), ('topic', b'2', b'{"project_id": 2}')]
Example #6
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(self.clickhouse, self.table, FakeKafkaProducer(), 'topic')

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event['datetime'] = old_timestamp_str
        event['data']['datetime'] = old_timestamp_str
        event['data']['received'] = int(calendar.timegm(old_timestamp.timetuple()))

        class FakeMessage(object):
            def value(self):
                return json.dumps((0, 'insert', event))

        assert test_worker.process_message(FakeMessage()) is None
Example #7
    def test_produce_replacement_messages(self):
        producer = FakeKafkaProducer()
        test_worker = ConsumerWorker(
            self.dataset, producer,
            self.dataset.get_default_replacement_topic())

        test_worker.flush_batch([
            (self.dataset.get_processor().REPLACE, ('1', {
                'project_id': 1
            })),
            (self.dataset.get_processor().REPLACE, ('2', {
                'project_id': 2
            })),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == \
            [('event-replacements', b'1', b'{"project_id": 1}'), ('event-replacements', b'2', b'{"project_id": 2}')]
Example #8
def run(events_file, clickhouse, table_name, repeat=1,
        profile_process=False, profile_write=False):
    from snuba.clickhouse import get_table_definition, get_test_engine
    from snuba.consumer import ConsumerWorker

    clickhouse.execute(
        get_table_definition(
            name=table_name,
            engine=get_test_engine(),
        )
    )

    consumer = ConsumerWorker(
        clickhouse=clickhouse,
        dist_table_name=table_name,
        producer=None,
        replacements_topic=None,
    )

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))
    processed = []

    def process():
        with settings_override({'DISCARD_OLD_EVENTS': False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        cProfile.runctx('process()', globals(), locals(), sort='cumulative')
    else:
        process()
    time_write = time.time()
    if profile_write:
        cProfile.runctx('write()', globals(), locals(), sort='cumulative')
    else:
        write()
    time_finish = time.time()

    format_time = lambda t: ("%.2f" % t).rjust(10, ' ')

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % six.text_type(num_events).rjust(10, ' '))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))
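
For reference, the benchmark above could be driven roughly as follows; the events file path, table name, and repeat count are illustrative assumptions, and the ClickhousePool keyword arguments mirror the ones used in the consumer command in Example #10 below.

    from snuba.clickhouse import ClickhousePool

    clickhouse = ClickhousePool(host="localhost", port=9000)
    run(
        events_file="events.json",  # assumed: file of raw events in the format get_messages() expects
        clickhouse=clickhouse,
        table_name="perf_test_table",
        repeat=10,
        profile_process=True,
    )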
Example #9
    def build_base_consumer(self) -> BatchingConsumer:
        """
        Builds the consumer with a ConsumerWorker.
        """
        return self.__build_consumer(
            ConsumerWorker(self.dataset,
                           producer=self.producer,
                           replacements_topic=self.replacements_topic,
                           metrics=self.metrics))
Example #10
def consumer(raw_events_topic, replacements_topic, commit_log_topic, consumer_group,
             bootstrap_server, clickhouse_server, distributed_table_name, max_batch_size, max_batch_time_ms,
             auto_offset_reset, queued_max_messages_kbytes, queued_min_messages, log_level,
             dogstatsd_host, dogstatsd_port):

    import sentry_sdk
    from snuba import util
    from snuba.clickhouse import ClickhousePool
    from batching_kafka_consumer import BatchingKafkaConsumer
    from snuba.consumer import ConsumerWorker

    sentry_sdk.init(dsn=settings.SENTRY_DSN)

    logging.basicConfig(level=getattr(logging, log_level.upper()), format='%(asctime)s %(message)s')
    metrics = util.create_metrics(
        dogstatsd_host, dogstatsd_port, 'snuba.consumer', tags=["group:%s" % consumer_group]
    )

    clickhouse = ClickhousePool(
        host=clickhouse_server.split(':')[0],
        port=int(clickhouse_server.split(':')[1]),
        client_settings={
            'load_balancing': 'in_order',
            'insert_distributed_sync': True,
        },
        metrics=metrics
    )

    producer = Producer({
        'bootstrap.servers': ','.join(bootstrap_server),
        'partitioner': 'consistent',
        'message.max.bytes': 50000000,  # 50MB, default is 1MB
    })

    consumer = BatchingKafkaConsumer(
        raw_events_topic,
        worker=ConsumerWorker(
            clickhouse, distributed_table_name,
            producer=producer, replacements_topic=replacements_topic, metrics=metrics
        ),
        max_batch_size=max_batch_size,
        max_batch_time=max_batch_time_ms,
        metrics=metrics,
        bootstrap_servers=bootstrap_server,
        group_id=consumer_group,
        producer=producer,
        commit_log_topic=commit_log_topic,
        auto_offset_reset=auto_offset_reset,
    )

    def handler(signum, frame):
        consumer.signal_shutdown()

    signal.signal(signal.SIGINT, handler)

    consumer.run()
Example #11
    def test_offsets(self):
        event = self.event

        class FakeMessage(object):
            def value(self):
                # event doesn't really matter
                return json.dumps((0, 'insert', event))

            def offset(self):
                return 123

            def partition(self):
                return 456

        test_worker = ConsumerWorker(self.clickhouse, self.table, FakeKafkaProducer(), 'topic')
        batch = [test_worker.process_message(FakeMessage())]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" % self.table
        ) == [(self.event['project_id'], self.event['event_id'], 123, 456)]
Example #12
    def eventstream(dataset_name):
        dataset = get_dataset(dataset_name)
        ensure_table_exists(dataset)
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message = KafkaMessage(
            TopicPartition('topic', 0),
            0,
            http_request.data,
        )

        type_ = record[1]
        metrics = DummyMetricsBackend()
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(dataset,
                                    producer=None,
                                    replacements_topic=None,
                                    metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, dataset, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #13
    def eventstream(*, dataset: Dataset):
        record = json.loads(http_request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        message: Message[KafkaPayload] = Message(
            Partition(Topic("topic"), 0),
            0,
            KafkaPayload(None, http_request.data, []),
            datetime.now(),
        )

        type_ = record[1]

        storage = dataset.get_writable_storage()
        assert storage is not None

        if type_ == "insert":
            from snuba.consumer import ConsumerWorker

            worker = ConsumerWorker(storage, metrics=metrics)
        else:
            from snuba.replacer import ReplacerWorker

            worker = ReplacerWorker(storage, metrics=metrics)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ("ok", 200, {"Content-Type": "text/plain"})
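
The raw request body this endpoint forwards to the worker is a JSON-encoded [version, type, payload] triple, with the version required to be 2 by the check above. A hedged sketch of building such a body (the event fields are placeholders, not a complete event):

    import json

    body = json.dumps(
        [
            2,         # protocol version; anything else raises "Unsupported protocol version"
            "insert",  # "insert" selects ConsumerWorker, any other type falls through to ReplacerWorker
            {"project_id": 1, "event_id": "00000000000000000000000000000000"},  # placeholder payload
        ]
    ).encode("utf-8")
    # `body` is what http_request.data would contain when the endpoint is invoked.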
Example #14
    def __build_batching_strategy_factory(
        self,
    ) -> BatchProcessingStrategyFactory[KafkaPayload]:
        return BatchProcessingStrategyFactory(
            worker=ConsumerWorker(
                storage=self.storage,
                producer=self.producer,
                replacements_topic=self.replacements_topic,
                metrics=self.metrics,
            ),
            max_batch_size=self.max_batch_size,
            max_batch_time=self.max_batch_time_ms,
            metrics=self.metrics,
        )
Example #15
    def test_offsets(self):
        event = self.event

        message = KafkaMessage(
            TopicPartition('events', 456),
            123,
            json.dumps((0, 'insert',
                        event)).encode('utf-8')  # event doesn't really matter
        )

        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" %
            self.table) == [(self.event['project_id'], self.event['event_id'],
                             123, 456)]
Example #16
    def test_skip_too_old(self):
        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset,
                                     FakeConfluentKafkaProducer(),
                                     replacement_topic.topic_name,
                                     self.metrics)

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event['datetime'] = old_timestamp_str
        event['data']['datetime'] = old_timestamp_str
        event['data']['received'] = int(
            calendar.timegm(old_timestamp.timetuple()))

        message = KafkaMessage(
            TopicPartition('events', 1),
            42,
            json.dumps((0, 'insert', event)).encode('utf-8'),
        )

        assert test_worker.process_message(message) is None
Example #17
    def build_base_consumer(self) -> BatchingConsumer[KafkaPayload]:
        """
        Builds the consumer with a ConsumerWorker.
        """
        return self.__build_consumer(
            ConsumerWorker(
                self.dataset,
                producer=self.producer,
                replacements_topic=self.replacements_topic,
                metrics=self.metrics,
                rapidjson_deserialize=self.__rapidjson_deserialize,
                rapidjson_serialize=self.__rapidjson_serialize,
            )
        )
Example #18
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(
            self.dataset, FakeKafkaProducer(),
            self.dataset.get_default_replacement_topic())

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event['datetime'] = old_timestamp_str
        event['data']['datetime'] = old_timestamp_str
        event['data']['received'] = int(
            calendar.timegm(old_timestamp.timetuple()))

        class FakeMessage(object):
            def value(self):
                return json.dumps((0, 'insert', event))

            def partition(self):
                return 1

            def offset(self):
                return 42

        assert test_worker.process_message(FakeMessage()) is None
Example #19
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        test_worker = ConsumerWorker(
            self.dataset.get_writable_storage(),
            producer=producer,
            replacements_topic=Topic(
                enforce_table_writer(self.dataset).get_stream_loader().
                get_replacement_topic_spec().topic_name),
            metrics=self.metrics,
        )

        test_worker.flush_batch([
            ReplacementBatch("1", [{
                "project_id": 1
            }]),
            ReplacementBatch("2", [{
                "project_id": 2
            }]),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == [
            ("event-replacements", b"1", b'{"project_id":1}'),
            ("event-replacements", b"2", b'{"project_id":2}'),
        ]
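
Note that this example expects compact JSON values (b'{"project_id":1}') where the earlier examples asserted b'{"project_id": 1}' with a space; that difference is what a compact serializer such as rapidjson, or json.dumps with tight separators, produces. A quick illustration (only an assumption about how the compact form arises, since the serializer used by flush_batch is not shown here):

    import json

    # The default separators put a space after the colon ...
    assert json.dumps({"project_id": 1}) == '{"project_id": 1}'
    # ... while compact separators produce the byte-for-byte form asserted above.
    assert json.dumps({"project_id": 1}, separators=(",", ":")) == '{"project_id":1}'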
Example #20
    def eventstream():
        record = json.loads(request.data)

        version = record[0]
        if version != 2:
            raise RuntimeError("Unsupported protocol version: %s" % record)

        class Message(object):
            def __init__(self, value):
                self._value = value

            def value(self):
                return self._value

            def partition(self):
                return None

            def offset(self):
                return None

        message = Message(request.data)

        type_ = record[1]
        if type_ == 'insert':
            from snuba.consumer import ConsumerWorker
            worker = ConsumerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE, producer=None, replacements_topic=None)
        else:
            from snuba.replacer import ReplacerWorker
            worker = ReplacerWorker(clickhouse_rw, settings.CLICKHOUSE_TABLE)

        processed = worker.process_message(message)
        if processed is not None:
            batch = [processed]
            worker.flush_batch(batch)

        return ('ok', 200, {'Content-Type': 'text/plain'})
Example #21
def run(events_file, dataset, repeat=1, profile_process=False, profile_write=False):
    """
    Measures the write performance of a dataset
    """

    from snuba.consumer import ConsumerWorker

    for storage in dataset.get_all_storages():
        for statement in storage.get_schemas().get_create_statements():
            clickhouse_rw.execute(statement.statement)

    writable_storage = dataset.get_writable_storage()

    consumer = ConsumerWorker(writable_storage, metrics=DummyMetricsBackend())

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))
    processed = []

    def process():
        with settings_override({"DISCARD_OLD_EVENTS": False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".process.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("process()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        process()
    time_write = time.time()
    if profile_write:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + ".write.",
            suffix=".pstats",
            delete=False,
        ).name
        cProfile.runctx("write()", globals(), locals(), filename=filename)
        logger.info("Profile Data: %s", filename)
    else:
        write()
    time_finish = time.time()

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % str(num_events).rjust(10, " "))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))
Example #22
def run(events_file, dataset, repeat=1,
        profile_process=False, profile_write=False):
    """
    Measures the write performance of a dataset
    """

    from snuba.consumer import ConsumerWorker
    from snuba.clickhouse.native import ClickhousePool

    for statement in dataset.get_dataset_schemas().get_create_statements():
        ClickhousePool().execute(statement)

    consumer = ConsumerWorker(
        dataset=dataset,
        producer=None,
        replacements_topic=None,
    )

    messages = get_messages(events_file)
    messages = chain(*([messages] * repeat))
    processed = []

    def process():
        with settings_override({'DISCARD_OLD_EVENTS': False}):
            for message in messages:
                result = consumer.process_message(message)
                if result is not None:
                    processed.append(result)

    def write():
        consumer.flush_batch(processed)

    time_start = time.time()
    if profile_process:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + '.process.',
            suffix='.pstats',
            delete=False,
        ).name
        cProfile.runctx('process()', globals(), locals(), filename=filename)
        logger.info('Profile Data: %s', filename)
    else:
        process()
    time_write = time.time()
    if profile_write:
        filename = tempfile.NamedTemporaryFile(
            prefix=os.path.basename(events_file) + '.write.',
            suffix='.pstats',
            delete=False,
        ).name
        cProfile.runctx('write()', globals(), locals(), filename=filename)
        logger.info('Profile Data: %s', filename)
    else:
        write()
    time_finish = time.time()

    format_time = lambda t: ("%.2f" % t).rjust(10, ' ')

    time_to_process = (time_write - time_start) * 1000
    time_to_write = (time_finish - time_write) * 1000
    time_total = (time_finish - time_start) * 1000
    num_events = len(processed)

    logger.info("Number of events: %s" % str(num_events).rjust(10, ' '))
    logger.info("Total:            %sms" % format_time(time_total))
    logger.info("Total process:    %sms" % format_time(time_to_process))
    logger.info("Total write:      %sms" % format_time(time_to_write))
    logger.info("Process event:    %sms/ea" % format_time(time_to_process / num_events))
    logger.info("Write event:      %sms/ea" % format_time(time_to_write / num_events))