Example #1
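Builds five processed transaction rows (removing the measurements payload that older versions of the table did not have) and writes them in one batch through a BatchWriterEncoderWrapper around the transactions storage's batch writer.
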
def generate_transactions() -> None:
    from datetime import datetime

    table_writer = get_writable_storage(
        StorageKey.TRANSACTIONS).get_table_writer()

    rows = []

    for i in range(5):
        raw_transaction = get_raw_transaction()
        # Older versions of this table did not have measurements
        del raw_transaction["data"]["measurements"]

        processed = (
            table_writer.get_stream_loader().get_processor().process_message(
                (2, "insert", raw_transaction),
                KafkaMessageMetadata(0, 0, datetime.utcnow()),
            ))
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(
            strict=True)),
        JSONRowEncoder(),
    ).write(rows)
Example #2
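An HTTP write endpoint: each JSON message in the request body is run through the dataset's stream processor, and the resulting insert rows are written as a single JSON-encoded batch.
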
    def write(*, dataset: Dataset) -> RespTuple:
        from snuba.processor import InsertBatch

        rows: MutableSequence[WriterTableRow] = []
        offset_base = int(round(time.time() * 1000))
        for index, message in enumerate(json.loads(http_request.data)):
            offset = offset_base + index
            processed_message = (
                enforce_table_writer(dataset)
                .get_stream_loader()
                .get_processor()
                .process_message(
                    message,
                    KafkaMessageMetadata(
                        offset=offset, partition=0, timestamp=datetime.utcnow()
                    ),
                )
            )
            if processed_message:
                assert isinstance(processed_message, InsertBatch)
                rows.extend(processed_message.rows)

        BatchWriterEncoderWrapper(
            enforce_table_writer(dataset).get_batch_writer(metrics), JSONRowEncoder(),
        ).write(rows)

        return ("ok", 200, {"Content-Type": "text/plain"})
Example #3
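A helper that collects the rows from a sequence of InsertBatch messages and writes them to a writable storage via a JSON-encoding batch writer.
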
def write_processed_messages(storage: WritableStorage,
                             messages: Sequence[ProcessedMessage]) -> None:
    rows: MutableSequence[WriterTableRow] = []
    for message in messages:
        assert isinstance(message, InsertBatch)
        rows.extend(message.rows)

    BatchWriterEncoderWrapper(
        storage.get_table_writer().get_batch_writer(
            metrics=DummyMetricsBackend(strict=True)),
        JSONRowEncoder(),
    ).write(rows)
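
For reference, a minimal usage sketch of calling a helper like write_processed_messages from a test. The import paths and the InsertBatch constructor shown here are assumptions (they differ between Snuba versions); the storage lookup mirrors Example #1.

# Minimal usage sketch, not taken from the snippet above. Assumptions:
# the import paths and the InsertBatch(rows=...) signature vary between
# Snuba versions, and real rows must match the destination table's
# schema (the single-field dict below is only a placeholder).
from snuba.datasets.storages import StorageKey
from snuba.datasets.storages.factory import get_writable_storage
from snuba.processor import InsertBatch

storage = get_writable_storage(StorageKey.TRANSACTIONS)
write_processed_messages(
    storage, [InsertBatch(rows=[{"event_id": "00000000000000000000000000000000"}])]
)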
Example #4
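A bulk-load command: it reads a PostgresSnapshot, builds a bulk loader from the storage's table writer, and streams the rows into the destination table through a BufferedWriterWrapper.
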
def bulk_load(
    *,
    storage_name: str,
    dest_table: str,
    source: str,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info("Start bulk load process for storage %s, from source %s",
                storage_name, source)

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT,
        path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        dest_table,
        storage.get_row_processor(),
    )
    # TODO: see whether we need to pass options to the writer
    writer = BufferedWriterWrapper(
        table_writer.get_batch_writer(
            environment.metrics,
            table_name=dest_table,
            chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
        ),
        settings.BULK_CLICKHOUSE_BUFFER,
        JSONRowEncoder(),
    )

    loader.load(writer)
Example #5
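A constructor that wires a BatchWriterEncoderWrapper around the storage's batch writer, passing ClickHouse insert settings (load_balancing, insert_distributed_sync) along with the metrics backend.
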
    def __init__(
        self,
        storage: WritableTableStorage,
        metrics: MetricsBackend,
        producer: Optional[ConfluentKafkaProducer] = None,
        replacements_topic: Optional[Topic] = None,
    ) -> None:
        self.__storage = storage
        self.producer = producer
        self.replacements_topic = replacements_topic
        self.metrics = metrics
        table_writer = storage.get_table_writer()
        self.__writer = BatchWriterEncoderWrapper(
            table_writer.get_batch_writer(metrics, {
                "load_balancing": "in_order",
                "insert_distributed_sync": 1
            }),
            JSONRowEncoder(),
        )

        self.__processor: MessageProcessor
        self.__pre_filter = table_writer.get_stream_loader().get_pre_filter()
Example #6
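Shutdown handling for the insert and replacement batch writers (join shares one timeout budget between them), followed by a module-level process_message helper that decodes a Kafka payload and converts an InsertBatch into a JSONRowInsertBatch.
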
        if self.__replacement_batch_writer is not None:
            self.__replacement_batch_writer.terminate()

    def join(self, timeout: Optional[float] = None) -> None:
        start = time.time()
        self.__insert_batch_writer.join(timeout)

        if self.__replacement_batch_writer is not None:
            if timeout is not None:
                timeout = max(timeout - (time.time() - start), 0)

            self.__replacement_batch_writer.join(timeout)


json_row_encoder = JSONRowEncoder()


def process_message(
    processor: MessageProcessor, message: Message[KafkaPayload]
) -> Union[None, JSONRowInsertBatch, ReplacementBatch]:
    result = processor.process_message(
        rapidjson.loads(message.payload.value),
        KafkaMessageMetadata(
            message.offset, message.partition.index, message.timestamp
        ),
    )

    if isinstance(result, InsertBatch):
        return JSONRowInsertBatch(
            [json_row_encoder.encode(row) for row in result.rows],
        )
    # The snippet ends here in the source; presumably any other result
    # (None or a ReplacementBatch) is returned unchanged.
    else:
        return result
Example #7
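An extended variant of the bulk-load command in Example #4: it adds optional progress reporting and a pre-processed path that uses get_bulk_writer instead of the buffered batch writer.
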
def bulk_load(
    *,
    storage_name: str,
    dest_table: Optional[str],
    source: str,
    ignore_existing_data: bool,
    pre_processed: bool,
    show_progress: bool,
    log_level: Optional[str] = None,
) -> None:
    setup_logging(log_level)
    setup_sentry()

    logger = logging.getLogger("snuba.load-snapshot")
    logger.info(
        "Start bulk load process for storage %s, from source %s", storage_name, source
    )

    storage = get_cdc_storage(StorageKey(storage_name))
    table_writer = storage.get_table_writer()

    # TODO: Have a more abstract way to load sources if/when we support more than one.
    snapshot_source = PostgresSnapshot.load(
        product=settings.SNAPSHOT_LOAD_PRODUCT, path=source,
    )

    loader = table_writer.get_bulk_loader(
        snapshot_source,
        storage.get_postgres_table(),
        storage.get_row_processor(),
        dest_table,
    )
    # TODO: see whether we need to pass options to the writer

    def progress_callback(bar: progressbar.ProgressBar, progress: int) -> None:
        bar.update(progress)

    if show_progress:
        progress = progressbar.ProgressBar(
            max_value=snapshot_source.get_table_file_size(storage.get_postgres_table())
        )
        progress_func: Optional[ProgressCallback] = partial(progress_callback, progress)
    else:
        progress_func = None

    table_descriptor = snapshot_source.get_descriptor().get_table(
        storage.get_postgres_table()
    )
    if pre_processed:
        writer = table_writer.get_bulk_writer(
            metrics=environment.metrics,
            encoding="gzip" if table_descriptor.zip else None,
            column_names=[c.name for c in table_descriptor.columns or []],
            table_name=dest_table,
        )
        loader.load_preprocessed(
            writer, ignore_existing_data, progress_callback=progress_func
        )
    else:
        buffer_writer = BufferedWriterWrapper(
            table_writer.get_batch_writer(
                environment.metrics,
                table_name=dest_table,
                chunk_size=settings.BULK_CLICKHOUSE_BUFFER,
            ),
            settings.BULK_CLICKHOUSE_BUFFER,
            JSONRowEncoder(),
        )
        loader.load(
            buffer_writer, ignore_existing_data, progress_callback=progress_func
        )
Example #8
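Generates a deterministic set of transaction events across a time range, processes each one through the stream loader's processor, and writes all resulting rows in a single batch.
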
def generate_transactions(count: int) -> None:
    """
    Generate a deterministic set of events across a time range.
    """
    import calendar
    import pytz
    import uuid
    from datetime import datetime, timedelta

    table_writer = get_writable_storage(
        StorageKey.TRANSACTIONS).get_table_writer()

    rows = []

    base_time = datetime.utcnow().replace(
        minute=0, second=0, microsecond=0,
        tzinfo=pytz.utc) - timedelta(minutes=count)

    for tick in range(count):

        trace_id = "7400045b25c443b885914600aa83ad04"
        span_id = "8841662216cc598b"
        processed = (
            table_writer.get_stream_loader().get_processor().process_message(
                (
                    2,
                    "insert",
                    {
                        "project_id": 1,
                        "event_id": uuid.uuid4().hex,
                        "deleted": 0,
                        "datetime": (base_time + timedelta(minutes=tick)).isoformat(),
                        "platform": "javascript",
                        "data": {
                            # Project N sends every Nth (mod len(hashes)) hash (and platform)
                            "received": calendar.timegm(
                                (base_time + timedelta(minutes=tick)).timetuple()
                            ),
                            "type": "transaction",
                            "transaction": f"/api/do_things/{count}",
                            # XXX(dcramer): would be nice to document why these have to be naive
                            "start_timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick)).replace(tzinfo=None)
                            ),
                            "timestamp": datetime.timestamp(
                                (base_time + timedelta(minutes=tick, seconds=1)).replace(
                                    tzinfo=None
                                )
                            ),
                            "contexts": {
                                "trace": {
                                    "trace_id": trace_id,
                                    "span_id": span_id,
                                    "op": "http",
                                    "status": "0",
                                },
                            },
                            "request": {
                                "url": "http://127.0.0.1:/query",
                                "headers": [
                                    ["Accept-Encoding", "identity"],
                                    ["Content-Length", "398"],
                                    ["Host", "127.0.0.1:"],
                                    ["Referer", "tagstore.something"],
                                    ["Trace", "8fa73032d-1"],
                                ],
                                "data": "",
                                "method": "POST",
                                "env": {"SERVER_PORT": "1010", "SERVER_NAME": "snuba"},
                            },
                            "spans": [
                                {
                                    "op": "db",
                                    "trace_id": trace_id,
                                    "span_id": span_id + "1",
                                    "parent_span_id": None,
                                    "same_process_as_parent": True,
                                    "description": "SELECT * FROM users",
                                    "data": {},
                                    "timestamp": calendar.timegm(
                                        (base_time + timedelta(minutes=tick)).timetuple()
                                    ),
                                }
                            ],
                        },
                    },
                ),
                KafkaMessageMetadata(0, 0, base_time),
            )
        )
        rows.extend(processed.rows)

    BatchWriterEncoderWrapper(
        table_writer.get_batch_writer(metrics=DummyMetricsBackend(
            strict=True)),
        JSONRowEncoder(),
    ).write(rows)
Example #9
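A small test helper that writes already-processed rows through the dataset's table writer, using a strict DummyMetricsBackend.
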
    def write_rows(self, rows: Sequence[WriterTableRow]) -> None:
        BatchWriterEncoderWrapper(
            enforce_table_writer(self.dataset).get_batch_writer(
                metrics=DummyMetricsBackend(strict=True)
            ),
            JSONRowEncoder(),
        ).write(rows)