Example #1
    def yield_logs_for_export(
        self,
        start_datetime,
        end_datetime,
        repository_id=None,
        namespace_id=None,
        max_query_time=None,
    ):
        # Just for testing: simulate a query timeout whenever a maximum
        # query time is specified.
        if max_query_time is not None:
            raise LogsIterationTimeout()

        logs = []
        for log_and_repo in self._filter_logs(start_datetime, end_datetime):
            if repository_id and (
                not log_and_repo.repository or log_and_repo.repository.id != repository_id
            ):
                continue

            if namespace_id:
                if log_and_repo.log.account_username is None:
                    continue

                namespace = model.user.get_namespace_user(log_and_repo.log.account_username)
                if namespace.id != namespace_id:
                    continue

            logs.append(log_and_repo.log)

        # All matching logs are yielded as a single batch.
        yield logs
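
The method is a generator, so a caller streams batches rather than materializing the full export up front. A minimal consumption sketch; `logs_model`, `process`, and the 24-hour range are hypothetical stand-ins, not from the source:

    from datetime import datetime, timedelta

    start = datetime.utcnow() - timedelta(days=1)
    end = datetime.utcnow()

    try:
        # logs_model is a hypothetical instance exposing yield_logs_for_export.
        for batch in logs_model.yield_logs_for_export(start, end, namespace_id=42):
            for log in batch:
                process(log)  # hypothetical per-log export handler
    except LogsIterationTimeout:
        # The export exceeded its time budget; e.g. retry with a narrower range.
        pass
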
Example #2
    def yield_logs_for_export(
        self,
        start_datetime,
        end_datetime,
        repository_id=None,
        namespace_id=None,
        max_query_time=None,
    ):
        # Fall back to a 300-second budget when no max query time is given.
        max_query_time = (
            max_query_time.total_seconds() if max_query_time is not None else 300
        )
        search = self._base_query_date_range(
            start_datetime, end_datetime, None, repository_id, namespace_id, None
        )

        def raise_on_timeout(batch_generator):
            start = time()
            for batch in batch_generator:
                elapsed = time() - start
                if elapsed > max_query_time:
                    logger.error(
                        "Retrieval of logs `%s/%s` timed out with time of `%s`",
                        namespace_id,
                        repository_id,
                        elapsed,
                    )
                    raise LogsIterationTimeout()

                yield batch
                start = time()

        def read_batch(scroll):
            batch = []
            for log in scroll:
                batch.append(log)
                if len(batch) == DEFAULT_RESULT_WINDOW:
                    yield _for_elasticsearch_logs(batch,
                                                  repository_id=repository_id,
                                                  namespace_id=namespace_id)
                    batch = []

            if batch:
                yield _for_elasticsearch_logs(batch,
                                              repository_id=repository_id,
                                              namespace_id=namespace_id)

        search = search.params(size=DEFAULT_RESULT_WINDOW,
                               request_timeout=max_query_time)

        try:
            with CloseForLongOperation(config.app_config):
                for batch in raise_on_timeout(read_batch(search.scan())):
                    yield batch
        except ConnectionTimeout:
            raise LogsIterationTimeout()
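
The `read_batch` helper groups results from the Elasticsearch scroll into fixed-size chunks, flushing a final partial chunk at the end. The same chunking pattern in isolation, as a self-contained sketch where `BATCH_SIZE` stands in for `DEFAULT_RESULT_WINDOW`:

    BATCH_SIZE = 3  # stands in for DEFAULT_RESULT_WINDOW

    def chunked(iterable, size=BATCH_SIZE):
        # Accumulate items and emit full batches; flush any remainder at the end.
        batch = []
        for item in iterable:
            batch.append(item)
            if len(batch) == size:
                yield batch
                batch = []
        if batch:
            yield batch

    assert list(chunked(range(7))) == [[0, 1, 2], [3, 4, 5], [6]]
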
Example #3
        def raise_on_timeout(batch_generator):
            start = time()
            for batch in batch_generator:
                elapsed = time() - start
                if elapsed > max_query_time:
                    logger.error(
                        'Retrieval of logs `%s/%s` timed out with time of `%s`',
                        namespace_id, repository_id, elapsed)
                    raise LogsIterationTimeout()

                yield batch
                start = time()
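
A runnable sketch of this timeout-wrapper pattern on its own; the toy producer and the 0.1-second budget are illustrative assumptions, not from the source:

    from time import sleep, time

    class LogsIterationTimeout(Exception):
        pass

    MAX_QUERY_TIME = 0.1  # seconds; an illustrative budget

    def raise_on_timeout(batch_generator, max_query_time=MAX_QUERY_TIME):
        start = time()
        for batch in batch_generator:
            if time() - start > max_query_time:
                raise LogsIterationTimeout()
            yield batch
            start = time()  # reset the clock once the batch is handed off

    def slow_batches():
        yield ["fast batch"]
        sleep(0.2)  # exceeds the budget, so the wrapper raises
        yield ["slow batch"]

    gen = raise_on_timeout(slow_batches())
    print(next(gen))  # ['fast batch']
    try:
        next(gen)
    except LogsIterationTimeout:
        print("timed out")

Because `start` is reset only after each `yield`, the budget bounds the time to produce each batch, including any time the consumer spends before requesting the next one, rather than the whole export.
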
Example #4
    def yield_logs_for_export(
        self,
        start_datetime,
        end_datetime,
        repository_id=None,
        namespace_id=None,
        max_query_time=None,
    ):
        assert namespace_id is None or isinstance(namespace_id, int)
        assert repository_id is None or isinstance(repository_id, int)

        # Using an adjusting scale, start downloading log rows in batches, starting at
        # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
        # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
        # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
        batch_start_time = datetime.utcnow()

        current_start_datetime = start_datetime
        current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

        while current_start_datetime < end_datetime:
            # Verify we haven't been working for too long.
            work_elapsed = datetime.utcnow() - batch_start_time
            if max_query_time is not None and work_elapsed > max_query_time:
                logger.error(
                    "Retrieval of logs `%s/%s` timed out with time of `%s`",
                    namespace_id,
                    repository_id,
                    work_elapsed,
                )
                raise LogsIterationTimeout()

            current_end_datetime = current_start_datetime + current_batch_size
            current_end_datetime = min(current_end_datetime, end_datetime)

            # Load the next set of logs.
            def load_logs():
                logger.debug(
                    "Retrieving logs over range %s -> %s with namespace %s and repository %s",
                    current_start_datetime,
                    current_end_datetime,
                    namespace_id,
                    repository_id,
                )

                logs_query = model.log.get_logs_query(
                    namespace_id=namespace_id,
                    repository=repository_id,
                    start_time=current_start_datetime,
                    end_time=current_end_datetime,
                )
                logs = list(logs_query)
                for log in logs:
                    assert isinstance(log, BaseModel)
                    if namespace_id is not None:
                        assert log.account_id == namespace_id, "Expected %s, Found: %s" % (
                            namespace_id,
                            log.account_id,
                        )

                    if repository_id is not None:
                        assert log.repository_id == repository_id

                logs = [Log.for_logentry(log) for log in logs]
                return logs

            logs, elapsed = _run_and_time(load_logs)
            if max_query_time is not None and elapsed > max_query_time:
                logger.error(
                    "Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`",
                    namespace_id,
                    repository_id,
                    current_start_datetime,
                    current_end_datetime,
                    elapsed,
                )
                raise LogsIterationTimeout()

            yield logs

            # Move forward.
            current_start_datetime = current_end_datetime

            # Increase the batch size if necessary.
            if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
                seconds = min(MAXIMUM_RANGE_SIZE,
                              current_batch_size.total_seconds() * 2)
                current_batch_size = timedelta(seconds=seconds)
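
The adaptive range-doubling strategy can be shown in isolation. A self-contained sketch; the constants and the `fetch` callable are hypothetical stand-ins for the module's real values and query function:

    from datetime import datetime, timedelta

    # Hypothetical constants; the originals are defined elsewhere in the module.
    MINIMUM_RANGE_SIZE = 60  # seconds
    MAXIMUM_RANGE_SIZE = 3600  # seconds
    EXPECTED_ITERATION_LOG_COUNT = 500

    def yield_ranges(start, end, fetch):
        # Walk [start, end) in windows, doubling the window size while
        # batches come back smaller than the target count.
        window = timedelta(seconds=MINIMUM_RANGE_SIZE)
        current = start
        while current < end:
            range_end = min(current + window, end)
            logs = fetch(current, range_end)
            yield logs
            current = range_end
            if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
                seconds = min(MAXIMUM_RANGE_SIZE, window.total_seconds() * 2)
                window = timedelta(seconds=seconds)

    # A fetch stub that always returns an empty page, to show the windows grow.
    ranges = yield_ranges(datetime(2024, 1, 1), datetime(2024, 1, 1, 2), lambda s, e: [])
    print(sum(1 for _ in ranges))  # number of windows needed to cover two hours
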