Example #1
def test_delayed_iter():
    # delayed_iter should hold each item back until `delay` further items
    # have been consumed from the source; a delay of None buffers everything.
    events = []

    def maker(n):
        for i in range(n):
            events.append(("add", i))
            yield i

    for i in delayed_iter(maker(4), 2):
        events.append(("remove", i))

    assert events == [
        ("add", 0),
        ("add", 1),
        ("add", 2),
        ("remove", 0),
        ("add", 3),
        ("remove", 1),
        ("remove", 2),
        ("remove", 3),
    ]

    events.clear()
    for i in delayed_iter(maker(2), None):
        events.append(("remove", i))

    assert events == [
        ("add", 0),
        ("add", 1),
        ("remove", 0),
        ("remove", 1),
    ]
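
The test above pins down the contract of `delayed_iter`: each item is held back until `delay` further items have been consumed from the source, and a delay of None buffers the entire input. For reference, here is a minimal sketch that satisfies this test (the signature is inferred from the call sites; the actual library implementation may differ):

from collections import deque
from typing import Iterable, Iterator, Optional, TypeVar

T = TypeVar("T")

def delayed_iter(iterable: Iterable[T], delay: Optional[int]) -> Iterator[T]:
    # Hold each item in a FIFO buffer until `delay` further items have been
    # pulled from the source; a delay of None keeps everything buffered
    # until the source is exhausted.
    buffer: deque = deque()
    for item in iterable:
        buffer.append(item)
        if delay is not None and len(buffer) > delay:
            yield buffer.popleft()
    # Drain whatever is still buffered once the source runs dry.
    while buffer:
        yield buffer.popleft()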
Example #2
    def _join_events_by_job_id(
            self, events: Iterable[Union[ReadEvent,
                                         QueryEvent]]) -> Iterable[ReadEvent]:
        # If cache eviction is enabled, we only store the most recently used query events,
        # which are used when resolving job information within the read events.
        query_jobs: MutableMapping[str, QueryEvent]
        if self.config.query_log_delay:
            query_jobs = cachetools.LRUCache(maxsize=5 *
                                             self.config.query_log_delay)
        else:
            query_jobs = {}

        def event_processor(
            events: Iterable[Union[ReadEvent,
                                   QueryEvent]]) -> Iterable[ReadEvent]:
            for event in events:
                if isinstance(event, QueryEvent):
                    if event.jobName:
                        query_jobs[event.jobName] = event
                else:
                    yield event

        # TRICKY: To account for the possibility that the query event arrives after
        # the read event in the audit logs, we wait for at least `query_log_delay`
        # additional events to be processed before attempting to resolve BigQuery
        # job information from the logs. If `query_log_delay` is None, it gets treated
        # as an unlimited delay, which prioritizes correctness at the expense of memory usage.
        original_read_events = event_processor(events)
        delayed_read_events = delayed_iter(original_read_events,
                                           self.config.query_log_delay)

        num_joined: int = 0
        for event in delayed_read_events:
            if (event.timestamp < self.config.start_time
                    or event.timestamp >= self.config.end_time
                    or not self._is_table_allowed(event.resource)):
                continue

            if event.jobName:
                if event.jobName in query_jobs:
                    # Join the query log event into the table read log event.
                    num_joined += 1
                    event.query = query_jobs[event.jobName].query

                    # TODO also join into the query itself for column references
                else:
                    self.report.report_warning(
                        str(event.resource),
                        f"Failed to match table read event {event.jobName} with job; try increasing `query_log_delay` or `max_query_duration`",
                    )
            yield event

        logger.info(
            f"Number of read events joined with query events: {num_joined}")
Example #3
    def _join_events_by_job_id(
            self, events: Iterable[Union[ReadEvent,
                                         QueryEvent]]) -> Iterable[ReadEvent]:
        # If cache eviction is enabled, we only store the most recently used query events,
        # which are used when resolving job information within the read events.
        query_jobs: MutableMapping[str, QueryEvent]
        if self.config.query_log_delay:
            query_jobs = cachetools.LRUCache(maxsize=5 *
                                             self.config.query_log_delay)
        else:
            query_jobs = {}

        def event_processor(
            events: Iterable[Union[ReadEvent,
                                   QueryEvent]]) -> Iterable[ReadEvent]:
            for event in events:
                if isinstance(event, QueryEvent):
                    query_jobs[event.jobName] = event
                else:
                    yield event

        # TRICKY: To account for the possibility that the query event arrives after
        # the read event in the audit logs, we wait for at least `query_log_delay`
        # additional events to be processed before attempting to resolve BigQuery
        # job information from the logs. If `query_log_delay` is None, it gets treated
        # as an unlimited delay, which prioritizes correctness at the expense of memory usage.
        original_read_events = event_processor(events)
        delayed_read_events = delayed_iter(original_read_events,
                                           self.config.query_log_delay)

        for event in delayed_read_events:
            if event.jobName:
                if event.jobName in query_jobs:
                    # Join the query log event into the table read log event.
                    event.query = query_jobs[event.jobName].query

                    # TODO also join into the query itself for column references
                else:
                    self.report.report_warning(
                        "<general>",
                        "failed to match table read event with job; try increasing `query_log_delay`",
                    )

            yield event
Example #4
    def _join_events_by_job_id(
        self, events: Iterable[Union[ReadEvent, QueryEvent]]
    ) -> Iterable[ReadEvent]:
        # We only store the most recently used query events, which are used when
        # resolving job information within the read events.
        query_jobs: cachetools.LRUCache[str, QueryEvent] = cachetools.LRUCache(
            maxsize=2 * self.config.query_log_delay
        )

        def event_processor(
            events: Iterable[Union[ReadEvent, QueryEvent]]
        ) -> Iterable[ReadEvent]:
            for event in events:
                if isinstance(event, QueryEvent):
                    query_jobs[event.jobName] = event
                else:
                    yield event

        # TRICKY: To account for the possibility that the query event arrives after
        # the read event in the audit logs, we wait for at least `query_log_delay`
        # additional events to be processed before attempting to resolve BigQuery
        # job information from the logs.
        original_read_events = event_processor(events)
        delayed_read_events = delayed_iter(
            original_read_events, self.config.query_log_delay
        )

        for event in delayed_read_events:
            if event.jobName:
                if event.jobName in query_jobs:
                    # Join the query log event into the table read log event.
                    event.query = query_jobs[event.jobName].query

                    # TODO also join into the query itself for column references
                else:
                    self.report.report_warning(
                        "<general>",
                        "failed to match table read event with job; try increasing `query_log_delay`",
                    )

            yield event
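
To see why the delay is needed at all, consider a toy reproduction of the join in which the query event arrives after the corresponding read event, as the TRICKY comment describes. The `ReadEvent` and `QueryEvent` classes below are hypothetical stand-ins for the real DataHub types, and `delayed_iter` is the sketch from Example #1:

from dataclasses import dataclass
from typing import Optional

@dataclass
class ReadEvent:  # stand-in for the real DataHub type
    jobName: str
    query: Optional[str] = None

@dataclass
class QueryEvent:  # stand-in for the real DataHub type
    jobName: str
    query: str

query_jobs = {}

def event_processor(events):
    for event in events:
        if isinstance(event, QueryEvent):
            query_jobs[event.jobName] = event
        else:
            yield event

# The query event for "job-a" arrives *after* its read event. With a delay
# of 1, the read event is held back until the query event has been
# processed, so the lookup below succeeds; with a delay of 0 the read event
# would be yielded first and the lookup would fail.
events = [ReadEvent("job-a"), QueryEvent("job-a", "SELECT 1")]
for read in delayed_iter(event_processor(events), 1):
    read.query = query_jobs[read.jobName].query

assert query_jobs["job-a"].query == "SELECT 1"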