Example 1
def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = EventsSubscription(data_dict={})
    subscription_data = SubscriptionData(
        project_id=1,
        query="MATCH (events) SELECT count() AS count",
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(get_dataset("events"), timestamp,
                                              None, Timer("timer"))
    result: Result = {
        "meta": [{
            "type": "UInt64",
            "name": "count"
        }],
        "data": [{
            "count": 1
        }],
    }

    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                EntityKey.EVENTS,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier)
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == EntityKey.EVENTS.value
Example 2
def test_subscription_task_result_encoder() -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    subscription_data = LegacySubscriptionData(
        project_id=1,
        conditions=[],
        aggregations=[["count()", "", "count"]],
        time_window=timedelta(minutes=1),
        resolution=timedelta(minutes=1),
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(get_dataset("events"), timestamp,
                                              None, Timer("timer"))
    result: Result = {
        "meta": [{
            "type": "UInt64",
            "name": "count"
        }],
        "data": [{
            "count": 1
        }],
    }

    task_result = SubscriptionTaskResult(
        ScheduledTask(
            timestamp,
            Subscription(
                SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                subscription_data,
            ),
        ),
        (request, result),
    )

    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 2
    payload = data["payload"]

    assert payload["subscription_id"] == str(task_result.task.task.identifier)
    assert payload["request"] == request.body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
Example 3
def test_metrics_subscription_task_result_encoder(
        subscription_cls: Type[EntitySubscription], aggregate: str,
        entity_key: EntityKey) -> None:
    codec = SubscriptionTaskResultEncoder()

    timestamp = datetime.now()

    entity_subscription = subscription_cls(data_dict={"organization": 1})
    subscription_data = SubscriptionData(
        project_id=1,
        query=(f"""
            MATCH ({entity_key.value}) SELECT {aggregate}(value) AS value BY project_id, tags[3]
            WHERE org_id = 1 AND project_id IN array(1) AND metric_id = 7 AND tags[3] IN array(1,2)
            """),
        time_window_sec=60,
        resolution_sec=60,
        entity_subscription=entity_subscription,
    )

    # XXX: This seems way too coupled to the dataset.
    request = subscription_data.build_request(get_dataset("metrics"),
                                              timestamp, None, Timer("timer"))
    result: Result = {
        "data": [
            {
                "project_id": 1,
                "tags[3]": 13,
                "value": 8
            },
            {
                "project_id": 1,
                "tags[3]": 4,
                "value": 46
            },
        ],
        "meta": [
            {
                "name": "project_id",
                "type": "UInt64"
            },
            {
                "name": "tags[3]",
                "type": "UInt64"
            },
            {
                "name": "value",
                "type": "Float64"
            },
        ],
    }
    task_result = SubscriptionTaskResult(
        ScheduledSubscriptionTask(
            timestamp,
            SubscriptionWithMetadata(
                entity_key,
                Subscription(
                    SubscriptionIdentifier(PartitionId(1), uuid.uuid1()),
                    subscription_data,
                ),
                5,
            ),
        ),
        (request, result),
    )
    message = codec.encode(task_result)
    data = json.loads(message.value.decode("utf-8"))
    assert data["version"] == 3
    payload = data["payload"]

    assert payload["subscription_id"] == str(
        task_result.task.task.subscription.identifier)
    assert payload["request"] == request.original_body
    assert payload["result"] == result
    assert payload["timestamp"] == task_result.task.timestamp.isoformat()
    assert payload["entity"] == entity_key.value
Example 4
def subscriptions(
    *,
    dataset_name: str,
    topic: Optional[str],
    partitions: Optional[int],
    commit_log_topic: Optional[str],
    commit_log_groups: Sequence[str],
    consumer_group: str,
    auto_offset_reset: str,
    bootstrap_servers: Sequence[str],
    max_batch_size: int,
    max_batch_time_ms: int,
    max_query_workers: Optional[int],
    schedule_ttl: int,
    result_topic: Optional[str],
    log_level: Optional[str],
    delay_seconds: Optional[int],
) -> None:
    """Evaluates subscribed queries for a dataset."""

    setup_logging(log_level)
    setup_sentry()

    dataset = get_dataset(dataset_name)

    storage = dataset.get_default_entity().get_writable_storage()
    assert (
        storage is not None
    ), f"Dataset {dataset_name} does not have a writable storage by default."

    loader = enforce_table_writer(dataset).get_stream_loader()
    commit_log_topic_spec = loader.get_commit_log_topic_spec()
    assert commit_log_topic_spec is not None

    result_topic_spec = loader.get_subscription_result_topic_spec()
    assert result_topic_spec is not None

    metrics = MetricsWrapper(
        environment.metrics,
        "subscriptions",
        tags={
            "group": consumer_group,
            "dataset": dataset_name
        },
    )

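    # The SynchronizedConsumer only advances past offsets that the configured
    # commit_log_groups have committed to the commit log topic, and TickConsumer
    # turns the consumed offsets into time intervals ("ticks") that drive
    # subscription scheduling.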
    consumer = TickConsumer(
        SynchronizedConsumer(
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    loader.get_default_topic_spec().topic,
                    consumer_group,
                    auto_offset_reset=auto_offset_reset,
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            KafkaConsumer(
                build_kafka_consumer_configuration(
                    commit_log_topic_spec.topic,
                    f"subscriptions-commit-log-{uuid.uuid1().hex}",
                    auto_offset_reset="earliest",
                    bootstrap_servers=bootstrap_servers,
                ),
            ),
            Topic(commit_log_topic)
            if commit_log_topic is not None
            else Topic(commit_log_topic_spec.topic_name),
            set(commit_log_groups),
        ),
        time_shift=(
            timedelta(seconds=delay_seconds * -1)
            if delay_seconds is not None
            else None
        ),
    )

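    # Each SubscriptionTaskResult is encoded into a KafkaPayload by
    # SubscriptionTaskResultEncoder before it is produced to the result topic.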
    producer = ProducerEncodingWrapper(
        KafkaProducer(
            build_kafka_producer_configuration(
                loader.get_default_topic_spec().topic,
                bootstrap_servers=bootstrap_servers,
                override_params={
                    "partitioner": "consistent",
                    "message.max.bytes": 50000000,  # 50MB, default is 1MB
                },
            )),
        SubscriptionTaskResultEncoder(),
    )

    executor = ThreadPoolExecutor(max_workers=max_query_workers)
    logger.debug("Starting %r with %s workers...", executor,
                 getattr(executor, "_max_workers", 0))
    metrics.gauge("executor.workers", getattr(executor, "_max_workers", 0))

    with closing(consumer), executor, closing(producer):
        from arroyo import configure_metrics

        configure_metrics(StreamMetricsAdapter(metrics))
        batching_consumer = StreamProcessor(
            consumer,
            (Topic(topic) if topic is not None else Topic(
                loader.get_default_topic_spec().topic_name)),
            BatchProcessingStrategyFactory(
                SubscriptionWorker(
                    dataset,
                    executor,
                    {
                        index: SubscriptionScheduler(
                            RedisSubscriptionDataStore(
                                redis_client, dataset, PartitionId(index)
                            ),
                            PartitionId(index),
                            cache_ttl=timedelta(seconds=schedule_ttl),
                            metrics=metrics,
                        )
                        for index in range(
                            partitions
                            if partitions is not None
                            else loader.get_default_topic_spec().partitions_number
                        )
                    },
                    producer,
                    Topic(result_topic)
                    if result_topic is not None
                    else Topic(result_topic_spec.topic_name),
                    metrics,
                ),
                max_batch_size,
                max_batch_time_ms,
            ),
        )

        def handler(signum: int, frame: Optional[Any]) -> None:
            batching_consumer.signal_shutdown()

        signal.signal(signal.SIGINT, handler)
        signal.signal(signal.SIGTERM, handler)

        batching_consumer.run()
Example 5
class ProduceResult(ProcessingStrategy[SubscriptionTaskResult]):
    """
    Gets a SubscriptionTaskResult from the ExecuteQuery processor, encodes and produces
    the results to the subscription result topic then commits offsets.
    """
    def __init__(
        self,
        producer: Producer[KafkaPayload],
        result_topic: str,
        commit: Callable[[Mapping[Partition, Position]], None],
    ):
        self.__producer = producer
        self.__result_topic = Topic(result_topic)
        self.__commit = commit
        self.__commit_data: MutableMapping[Partition, Position] = {}

        # Time we last called commit
        self.__last_committed: Optional[float] = None

        self.__encoder = SubscriptionTaskResultEncoder()

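        # Messages handed to the producer, paired with the future for their
        # delivery; drained in poll() and join() once each future completes.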
        self.__queue: Deque[
            Tuple[Message[SubscriptionTaskResult], Future[Message[KafkaPayload]]]
        ] = deque()

        self.__max_buffer_size = 10000
        self.__closed = False

    def __throttled_commit(self, force: bool = False) -> None:
        # Commits all offsets and resets self.__commit_data at most
        # every COMMIT_FREQUENCY_SEC. If force=True is passed, the
        # commit frequency is ignored and we immediately commit.

        now = time.time()

        if (
            self.__last_committed is None
            or now - self.__last_committed >= COMMIT_FREQUENCY_SEC
            or force
        ):
            if self.__commit_data:
                self.__commit(self.__commit_data)
                self.__last_committed = now
                self.__commit_data = {}

    def poll(self) -> None:
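        # Pop completed produce futures off the head of the queue, stage the
        # next offset for each partition, then commit on the throttled schedule.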
        while self.__queue:
            message, future = self.__queue[0]

            if not future.done():
                break

            self.__queue.popleft()

            self.__commit_data[message.partition] = Position(
                message.next_offset, message.timestamp)
        self.__throttled_commit()

    def submit(self, message: Message[SubscriptionTaskResult]) -> None:
        assert not self.__closed

        if len(self.__queue) >= self.__max_buffer_size:
            raise MessageRejected

        encoded = self.__encoder.encode(message.payload)
        self.__queue.append(
            (message, self.__producer.produce(self.__result_topic, encoded)))

    def close(self) -> None:
        self.__closed = True

    def terminate(self) -> None:
        self.__closed = True

    def join(self, timeout: Optional[float] = None) -> None:
        start = time.time()

        # Commit all pending offsets
        self.__throttled_commit(force=True)

        while self.__queue:
            remaining = (
                timeout - (time.time() - start) if timeout is not None else None
            )
            if remaining is not None and remaining <= 0:
                logger.warning(
                    f"Timed out with {len(self.__queue)} futures in queue")
                break

            message, future = self.__queue.popleft()

            future.result(remaining)

            # Commit the offset following this message, matching poll() above.
            self.__commit(
                {message.partition: Position(message.next_offset, message.timestamp)}
            )
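
The tests in Examples 1 to 3 pin down the JSON layout that SubscriptionTaskResultEncoder writes. The snippet below is a minimal consumer-side sketch of decoding such a message; the function name decode_subscription_result is illustrative rather than part of the codebase, and only the fields asserted by the tests above are assumed.

import json
from typing import Any, Mapping


def decode_subscription_result(raw: bytes) -> Mapping[str, Any]:
    # Parse the JSON document produced by SubscriptionTaskResultEncoder.
    data = json.loads(raw.decode("utf-8"))
    version = data["version"]
    payload = data["payload"]
    # Fields asserted by the tests above. Version 3 payloads also carry the
    # entity key; version 2 payloads do not.
    decoded = {
        "version": version,
        "subscription_id": payload["subscription_id"],
        "request": payload["request"],
        "result": payload["result"],
        "timestamp": payload["timestamp"],
    }
    if version >= 3:
        decoded["entity"] = payload["entity"]
    return decoded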