Code Example #1
 def test_extract_required(self):
     now = datetime.utcnow()
     event = {
         "event_id": "1" * 32,
         "project_id": 100,
         "group_id": 10,
         "datetime": now.strftime("%Y-%m-%dT%H:%M:%S.%fZ"),
     }
     output = {}
     extract_base(output, event)
     output["retention_days"] = enforce_retention(
         event,
         datetime.strptime(event["datetime"],
                           settings.PAYLOAD_DATETIME_FORMAT),
     )
     enforce_table_writer(
         self.dataset).get_stream_loader().get_processor().extract_required(
             output, event)
     assert output == {
         "event_id": "11111111111111111111111111111111",
         "project_id": 100,
         "group_id": 10,
         "timestamp": now,
         "retention_days": settings.DEFAULT_RETENTION_DAYS,
     }
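
Most of the examples on this page reach a message processor through the same accessor chain. The sketch below spells that chain out once; it is only a sketch, and the import paths are assumptions, while the calls themselves (get_dataset, enforce_table_writer, get_stream_loader, get_processor) all appear verbatim in the snippets on this page.

# Minimal sketch of the accessor chain shared by most examples here.
# Import paths are assumptions; the calls are taken from the snippets above/below.
from snuba.datasets.factory import get_dataset            # assumed import path
from snuba.datasets.dataset import enforce_table_writer   # assumed import path

dataset = get_dataset("events")
processor = (
    enforce_table_writer(dataset)   # presumably raises if the dataset is not writable
    .get_stream_loader()            # stream (Kafka) configuration for the dataset
    .get_processor()                # processor whose methods the tests here exercise
)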
Code Example #2
    def test_extract_common_search_message(self):
        now = datetime.utcnow().replace(microsecond=0)
        event = {
            'primary_hash': 'a' * 32,
            'message': 'the message',
            'platform': 'the_platform',
            'search_message': 'the search message',
        }
        data = {
            'received': int(calendar.timegm(now.timetuple())),
        }
        output = {}
        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_common(
                output, event, data)
        assert output['search_message'] == 'the search message'

        # with optional short message
        now = datetime.utcnow().replace(microsecond=0)
        event = {
            'primary_hash': 'a' * 32,
            'message': 'the message',
            'platform': 'the_platform',
            'search_message': 'the search message',
        }
        data = {
            'received': int(calendar.timegm(now.timetuple())),
            'message': 'the short message',
        }
        output = {}
        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_common(
                output, event, data)
        assert output['search_message'] == 'the search message'
        assert output['message'] == 'the short message'
Code Example #3
File: test_events_processor.py  Project: alexef/snuba
    def test_extract_common(self):
        now = datetime.utcnow().replace(microsecond=0)
        event = {
            "primary_hash": "a" * 32,
            "message": "the message",
            "platform": "the_platform",
            "data": {
                "received": int(calendar.timegm(now.timetuple())),
                "culprit": "the culprit",
                "type": "error",
                "version": 6,
                "title": "FooError",
                "location": "bar.py",
                "modules": OrderedDict([("foo", "1.0"), ("bar", "2.0"), ("baz", None)]),
            },
        }
        output = {}

        enforce_table_writer(
            self.dataset
        ).get_stream_loader().get_processor().extract_common(
            output, event, self.metadata
        )
        assert output == {
            "platform": u"the_platform",
            "primary_hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa",
            "received": now,
            "culprit": "the culprit",
            "type": "error",
            "version": "6",
            "modules.name": [u"foo", u"bar", u"baz"],
            "modules.version": [u"1.0", u"2.0", u""],
            "title": "FooError",
            "location": "bar.py",
        }
Code Example #4
File: views.py  Project: chhetripradeep/snuba
    def write(*, dataset: Dataset) -> RespTuple:
        from snuba.processor import InsertBatch

        rows: MutableSequence[WriterTableRow] = []
        offset_base = int(round(time.time() * 1000))
        for index, message in enumerate(json.loads(http_request.data)):
            offset = offset_base + index
            processed_message = (
                enforce_table_writer(dataset)
                .get_stream_loader()
                .get_processor()
                .process_message(
                    message,
                    KafkaMessageMetadata(
                        offset=offset, partition=0, timestamp=datetime.utcnow()
                    ),
                )
            )
            if processed_message:
                assert isinstance(processed_message, InsertBatch)
                rows.extend(processed_message.rows)

        BatchWriterEncoderWrapper(
            enforce_table_writer(dataset).get_batch_writer(metrics), JSONRowEncoder(),
        ).write(rows)

        return ("ok", 200, {"Content-Type": "text/plain"})
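
Depending on the snuba version shown, process_message() returns different result types: the older tests on this page inspect ProcessedMessage objects carrying an action and data, while newer code such as this view and Code Example #27 receives InsertBatch or ReplacementBatch instances, or None when a message is skipped. Below is a hedged sketch of dispatching on the newer result types; the class layout is inferred only from these snippets, not from a definitive API reference.

# Hedged sketch based on Code Examples #4, #23 and #27. The InsertBatch import
# appears in Code Example #4; the ReplacementBatch import path is assumed.
from snuba.processor import InsertBatch, ReplacementBatch

def collect_rows(processor, message, metadata):
    result = processor.process_message(message, metadata)
    if result is None:
        return []               # message skipped, e.g. too old (Code Example #23)
    if isinstance(result, InsertBatch):
        return result.rows      # rows destined for the batch writer, as in this view
    if isinstance(result, ReplacementBatch):
        return []               # replacements are produced to a topic, not written here
    raise TypeError(f"unexpected processor result: {result!r}")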
Code Example #5
 def test_invalid_format(self) -> None:
     with pytest.raises(InvalidMessageVersion):
         enforce_table_writer(self.dataset).get_stream_loader(
         ).get_processor().process_message(
             (-1, "insert", self.event),
             self.metadata,
         )
Code Example #6
def bulk_load(dataset, dest_table, source, log_level):
    import sentry_sdk

    sentry_sdk.init(dsn=settings.SENTRY_DSN)
    logging.basicConfig(level=getattr(logging, log_level.upper()),
                        format='%(asctime)s %(message)s')

    logger = logging.getLogger('snuba.load-snapshot')
    logger.info("Start bulk load process for dataset %s, from source %s",
                dataset, source)
    dataset = get_dataset(dataset)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(
        snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
Code Example #7
def bulk_load(
    *,
    dataset_name: Optional[str],
    dest_table: Optional[str],
    source: Optional[str],
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for dataset %s, from source %s",
                dataset_name, source)
    dataset = get_dataset(dataset_name)

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = enforce_table_writer(dataset).get_bulk_loader(
        snapshot_source, dest_table)
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        enforce_table_writer(dataset).get_bulk_writer(table_name=dest_table),
        settings.BULK_CLICKHOUSE_BUFFER,
    )

    loader.load(writer)
Code Example #8
    def test_extract_tags_empty_string(self):
        # verify our text field extraction doesn't coerce '' to None
        tags = {
            "environment": "",
        }
        output = {}

        enforce_table_writer(self.dataset).get_stream_loader().get_processor(
        ).extract_promoted_tags(output, tags)

        assert output["environment"] == u""
Code Example #9
    def test_simple_version_0(self):
        processed = enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().process_message(
                (0, 'insert', self.event))

        for field in ('event_id', 'project_id', 'message', 'platform'):
            assert processed.data[0][field] == self.event[field]
Code Example #10
 def generate_events(self):
     events = []
     for tick in range(self.minutes):
         # project N sends an event every Nth minute
         events.append(
             enforce_table_writer(self.dataset)
             .get_stream_loader()
             .get_processor()
             .process_insert({
                 "project_id": self.project_id,
                 "event_id": uuid.uuid4().hex,
                 "deleted": 0,
                 "datetime": (self.base_time + timedelta(minutes=tick)).strftime(
                     "%Y-%m-%dT%H:%M:%S.%fZ"
                 ),
                 "message": "a message",
                 "platform": self.platforms[tick % len(self.platforms)],
                 "primary_hash": uuid.uuid4().hex,
                 "group_id": tick,
                 "retention_days": settings.DEFAULT_RETENTION_DAYS,
                 "data": {
                     "received": calendar.timegm(
                         (self.base_time + timedelta(minutes=tick)).timetuple()
                     ),
                 },
             })
         )
     self.write_processed_records(events)
Code Example #11
File: test_writer.py  Project: denisgolius/snuba
    def test_error_handling(self):
        try:
            enforce_table_writer(self.dataset).get_writer(table_name="invalid").write([{"x": "y"}])
        except ClickHouseError as error:
            assert error.code == 60
            assert error.type == 'DB::Exception'
        else:
            assert False, "expected error"

        try:
            enforce_table_writer(self.dataset).get_writer().write([{"timestamp": "invalid"}])
        except ClickHouseError as error:
            assert error.code == 41
            assert error.type == 'DB::Exception'
        else:
            assert False, "expected error"
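
The same checks can be written with pytest.raises, the idiom the newer tests on this page use (Code Example #5). A hedged sketch follows, assuming import pytest at module level and the same test class as above; for reference, ClickHouse error code 60 is UNKNOWN_TABLE and 41 is CANNOT_PARSE_DATETIME.

    # Sketch: the assertions above rewritten with pytest.raises (cf. Code Example #5).
    # ClickHouse error code 60 = UNKNOWN_TABLE, 41 = CANNOT_PARSE_DATETIME.
    def test_error_handling_with_raises(self):
        with pytest.raises(ClickHouseError) as excinfo:
            enforce_table_writer(self.dataset).get_writer(table_name="invalid").write([{"x": "y"}])
        assert excinfo.value.code == 60
        assert excinfo.value.type == 'DB::Exception'

        with pytest.raises(ClickHouseError) as excinfo:
            enforce_table_writer(self.dataset).get_writer().write([{"timestamp": "invalid"}])
        assert excinfo.value.code == 41
        assert excinfo.value.type == 'DB::Exception'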
Code Example #12
File: test_events_processor.py  Project: alexef/snuba
 def __process_insert_event(self, event: InsertEvent) -> Optional[ProcessedMessage]:
     return (
         enforce_table_writer(self.dataset)
         .get_stream_loader()
         .get_processor()
         .process_message((2, "insert", event, {}), self.metadata)
     )
Code Example #13
File: consumer.py  Project: denisgolius/snuba
 def _process_message_impl(
     self,
     value: Mapping[str, Any],
     metadata: KafkaMessageMetadata,
 ) -> Optional[ProcessedMessage]:
     processor = enforce_table_writer(self.__dataset).get_stream_loader().get_processor()
     return processor.process_message(value, metadata)
Code Example #14
def optimize(
    *,
    clickhouse_host: str,
    clickhouse_port: int,
    database: str,
    dataset_name: str,
    timeout: int,
    log_level: Optional[str] = None,
) -> None:
    from datetime import datetime
    from snuba.clickhouse.native import ClickhousePool
    from snuba.optimize import run_optimize, logger

    setup_logging(log_level)

    dataset = get_dataset(dataset_name)
    table = enforce_table_writer(dataset).get_schema().get_local_table_name()

    today = datetime.utcnow().replace(hour=0,
                                      minute=0,
                                      second=0,
                                      microsecond=0)
    clickhouse = ClickhousePool(clickhouse_host,
                                clickhouse_port,
                                send_receive_timeout=timeout)
    num_dropped = run_optimize(clickhouse, database, table, before=today)
    logger.info("Optimized %s partitions on %s" %
                (num_dropped, clickhouse_host))
Code Example #15
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        replacement_topic = enforce_table_writer(
            self.dataset).get_stream_loader().get_replacement_topic_spec()
        test_worker = ConsumerWorker(self.dataset, producer,
                                     replacement_topic.topic_name,
                                     self.metrics)

        test_worker.flush_batch([
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[('1', {'project_id': 1})],
            ),
            ProcessedMessage(
                action=ProcessorAction.REPLACE,
                data=[('2', {'project_id': 2})],
            ),
        ])

        assert [(m._topic, m._key, m._value) for m in producer.messages] == [
            ('event-replacements', b'1', b'{"project_id": 1}'),
            ('event-replacements', b'2', b'{"project_id": 2}'),
        ]
Code Example #16
    def generate_outcomes(
        self,
        org_id: int,
        project_id: int,
        num_outcomes: int,
        outcome: int,
        time_since_base: timedelta,
    ) -> None:
        outcomes = []
        for _ in range(num_outcomes):
            processed = (
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_processor()
                .process_message(
                    {
                        "project_id": project_id,
                        "event_id": uuid.uuid4().hex,
                        "timestamp": (self.base_time + time_since_base).strftime(
                            "%Y-%m-%dT%H:%M:%S.%fZ"
                        ),
                        "org_id": org_id,
                        "reason": None,
                        "key_id": 1,
                        "outcome": outcome,
                    },
                    None,
                )
            )

            outcomes.extend(processed.data)

        self.write_processed_events(outcomes)
Code Example #17
File: replacer.py  Project: Appva/snuba
    def flush_batch(self, batch: Sequence[Replacement]) -> None:
        for replacement in batch:
            query_args = {
                **replacement.query_args,
                'dist_read_table_name': (
                    self.dataset.get_dataset_schemas()
                    .get_read_schema()
                    .get_data_source()
                    .format_from()
                ),
                'dist_write_table_name': (
                    enforce_table_writer(self.dataset).get_schema().get_table_name()
                ),
            }
            count = self.clickhouse.execute_robust(
                replacement.count_query_template % query_args)[0][0]
            if count == 0:
                continue

            # query_time_flags == (type, project_id, [...data...])
            flag_type, project_id = replacement.query_time_flags[:2]
            if flag_type == NEEDS_FINAL:
                set_project_needs_final(project_id)
            elif flag_type == EXCLUDE_GROUPS:
                group_ids = replacement.query_time_flags[2]
                set_project_exclude_groups(project_id, group_ids)

            t = time.time()
            query = replacement.insert_query_template % query_args
            logger.debug("Executing replace query: %s" % query)
            self.clickhouse.execute_robust(query)
            duration = int((time.time() - t) * 1000)
            logger.info("Replacing %s rows took %sms" % (count, duration))
            self.metrics.timing('replacements.count', count)
            self.metrics.timing('replacements.duration', duration)
Code Example #18
File: test_writer.py  Project: pombredanne/snuba
def test_gzip_load() -> None:
    content = gzip.compress(DATA.encode("utf-8"))

    dataset = get_dataset("groupedmessage")
    metrics = DummyMetricsBackend(strict=True)
    writer = enforce_table_writer(dataset).get_bulk_writer(
        metrics,
        "gzip",
        [
            "project_id",
            "id",
            "status",
            "last_seen",
            "first_seen",
            "active_at",
            "first_release_id",
        ],
        options=None,
        table_name="groupedmessage_local",
    )

    writer.write([content])

    cluster = dataset.get_default_entity().get_all_storages()[0].get_cluster()
    reader = cluster.get_reader()

    ret = reader.execute(FakeQuery([]))
    assert ret["data"][0] == {"count()": 2}
Code Example #19
 def test_simple_version_1(self):
     processor = (
         enforce_table_writer(self.dataset).get_stream_loader().get_processor()
     )
     assert processor.process_message(
         (0, "insert", copy.deepcopy(self.event))
     ) == processor.process_message(
         (1, "insert", copy.deepcopy(self.event), {})
     )
Code Example #20
    def test_simple_version_0(self):
        processed = (enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().process_message(
                (0, "insert", self.event)))

        for field in ("event_id", "project_id", "message", "platform"):
            assert processed.data[0][field] == self.event[field]
Code Example #21
 def test_v2_start_merge(self):
     project_id = 1
     message = (2, "start_merge", {"project_id": project_id})
     processor = (enforce_table_writer(
         self.dataset).get_stream_loader().get_processor())
     assert processor.process_message(message) == ProcessedMessage(
         action=ProcessorAction.REPLACE, data=[(str(project_id), message)])
Code Example #22
    def test_produce_replacement_messages(self):
        producer = FakeConfluentKafkaProducer()
        test_worker = ConsumerWorker(
            self.dataset,
            producer=producer,
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )

        test_worker.flush_batch(
            [
                ProcessedMessage(
                    action=ProcessorAction.REPLACE, data=[("1", {"project_id": 1})],
                ),
                ProcessedMessage(
                    action=ProcessorAction.REPLACE, data=[("2", {"project_id": 2})],
                ),
            ]
        )

        assert [(m._topic, m._key, m._value) for m in producer.messages] == [
            ("event-replacements", b"1", b'{"project_id": 1}'),
            ("event-replacements", b"2", b'{"project_id": 2}'),
        ]
Code Example #23
    def test_skip_too_old(self):
        test_worker = ConsumerWorker(
            self.dataset,
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )

        event = self.event
        old_timestamp = datetime.utcnow() - timedelta(days=300)
        old_timestamp_str = old_timestamp.strftime("%Y-%m-%dT%H:%M:%S.%fZ")
        event["datetime"] = old_timestamp_str
        event["data"]["datetime"] = old_timestamp_str
        event["data"]["received"] = int(calendar.timegm(old_timestamp.timetuple()))

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 1),
            42,
            KafkaPayload(None, json.dumps((0, "insert", event)).encode("utf-8")),
            datetime.now(),
        )

        assert test_worker.process_message(message) is None
Code Example #24
    def test_offsets(self):
        event = self.event

        message: Message[KafkaPayload] = Message(
            Partition(Topic("events"), 456),
            123,
            KafkaPayload(
                None, json.dumps((0, "insert", event)).encode("utf-8")
            ),  # event doesn't really matter
            datetime.now(),
        )

        test_worker = ConsumerWorker(
            self.dataset,
            producer=FakeConfluentKafkaProducer(),
            replacements_topic=Topic(
                enforce_table_writer(self.dataset)
                .get_stream_loader()
                .get_replacement_topic_spec()
                .topic_name
            ),
            metrics=self.metrics,
        )
        batch = [test_worker.process_message(message)]
        test_worker.flush_batch(batch)

        assert self.clickhouse.execute(
            "SELECT project_id, event_id, offset, partition FROM %s" % self.table
        ) == [(self.event["project_id"], self.event["event_id"], 123, 456)]
Code Example #25
    def test_extract_sdk(self):
        sdk = {
            'integrations': ['logback'],
            'name': 'sentry-java',
            'version': '1.6.1-d1e3a'
        }
        output = {}

        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_sdk(
                output, sdk)

        assert output == {
            'sdk_name': u'sentry-java',
            'sdk_version': u'1.6.1-d1e3a',
            'sdk_integrations': [u'logback'],
        }
Code Example #26
    def test_extract_geo(self):
        geo = {
            "country_code": "US",
            "city": "San Francisco",
            "region": "CA",
        }
        output = {}

        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_geo(
                output, geo)

        assert output == {
            "geo_country_code": "US",
            "geo_city": "San Francisco",
            "geo_region": "CA",
        }
Code Example #27
 def test_v2_end_delete_tag(self):
     project_id = 1
     message = (2, "end_delete_tag", {"project_id": project_id})
     processor = (enforce_table_writer(
         self.dataset).get_stream_loader().get_processor())
     assert processor.process_message(message, self.metadata) == ReplacementBatch(
         str(project_id), [message]
     )
Code Example #28
    def test_extract_sdk(self):
        sdk = {
            "integrations": ["logback"],
            "name": "sentry-java",
            "version": "1.6.1-d1e3a",
        }
        output = {}

        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_sdk(
                output, sdk)

        assert output == {
            "sdk_name": u"sentry-java",
            "sdk_version": u"1.6.1-d1e3a",
            "sdk_integrations": [u"logback"],
        }
Code Example #29
    def test_extract_geo(self):
        geo = {
            'country_code': 'US',
            'city': 'San Francisco',
            'region': 'CA',
        }
        output = {}

        enforce_table_writer(
            self.dataset).get_stream_loader().get_processor().extract_geo(
                output, geo)

        assert output == {
            'geo_country_code': 'US',
            'geo_city': 'San Francisco',
            'geo_region': 'CA',
        }
Code Example #30
 def generate_event(self):
     self.dataset = get_dataset("events")
     event = get_event()
     event["project_id"] = self.project_id
     event = (enforce_table_writer(
         self.dataset).get_stream_loader().get_processor().process_insert(
             event))
     self.write_processed_records([event])