def yield_logs_for_export(
    self,
    start_datetime,
    end_datetime,
    repository_id=None,
    namespace_id=None,
    max_query_time=None,
):
    """Testing variant: yields a single list of every matching log entry.

    Passing any non-None ``max_query_time`` immediately raises
    ``LogsIterationTimeout`` so callers can exercise the timeout path.
    """
    # Just for testing: simulate a query timeout on demand.
    if max_query_time is not None:
        raise LogsIterationTimeout()

    def _matches(entry):
        # Repository filter: drop entries missing a repository or belonging
        # to a different one.
        if repository_id and (not entry.repository or entry.repository.id != repository_id):
            return False

        # Namespace filter: resolve the account username to its namespace
        # user and compare ids; entries without a username never match.
        if namespace_id:
            username = entry.log.account_username
            if username is None:
                return False
            if model.user.get_namespace_user(username).id != namespace_id:
                return False

        return True

    yield [
        entry.log
        for entry in self._filter_logs(start_datetime, end_datetime)
        if _matches(entry)
    ]
def yield_logs_for_export(
    self,
    start_datetime,
    end_datetime,
    repository_id=None,
    namespace_id=None,
    max_query_time=None,
):
    """Yields batches of logs from Elasticsearch for the given date range.

    Batches are of size DEFAULT_RESULT_WINDOW, produced by scrolling the
    underlying search. Raises LogsIterationTimeout if producing any single
    batch exceeds max_query_time, or if the Elasticsearch connection times out.
    """
    # Normalize the timeout to seconds; default to 300s when not supplied.
    max_query_time = max_query_time.total_seconds() if max_query_time is not None else 300
    search = self._base_query_date_range(
        start_datetime, end_datetime, None, repository_id, namespace_id, None
    )

    def raise_on_timeout(batch_generator):
        # Wraps the batch generator, raising LogsIterationTimeout if pulling
        # any single batch takes longer than max_query_time. The timer is
        # restarted after each yielded batch, so the limit applies per batch,
        # not to the whole export.
        start = time()
        for batch in batch_generator:
            elapsed = time() - start
            if elapsed > max_query_time:
                logger.error(
                    "Retrieval of logs `%s/%s` timed out with time of `%s`",
                    namespace_id,
                    repository_id,
                    elapsed,
                )
                raise LogsIterationTimeout()
            yield batch
            start = time()

    def read_batch(scroll):
        # Accumulates scanned hits into fixed-size batches and converts each
        # batch via _for_elasticsearch_logs before yielding; a final partial
        # batch is flushed after the scroll is exhausted.
        batch = []
        for log in scroll:
            batch.append(log)
            if len(batch) == DEFAULT_RESULT_WINDOW:
                yield _for_elasticsearch_logs(
                    batch, repository_id=repository_id, namespace_id=namespace_id
                )
                batch = []

        if batch:
            yield _for_elasticsearch_logs(
                batch, repository_id=repository_id, namespace_id=namespace_id
            )

    # Also apply the timeout at the transport level so a single slow request
    # cannot hang the scroll indefinitely.
    search = search.params(size=DEFAULT_RESULT_WINDOW, request_timeout=max_query_time)

    try:
        # CloseForLongOperation releases resources (presumably DB connections;
        # confirm against its definition) for the duration of the scroll.
        with CloseForLongOperation(config.app_config):
            for batch in raise_on_timeout(read_batch(search.scan())):
                yield batch
    except ConnectionTimeout:
        raise LogsIterationTimeout()
def raise_on_timeout(batch_generator):
    # Guard generator: passes batches through unchanged, but aborts with
    # LogsIterationTimeout if producing any single batch from the wrapped
    # generator takes longer than max_query_time seconds. The window is
    # re-opened after each successful yield, so the limit is per batch.
    window_opened = time()
    for current_batch in batch_generator:
        duration = time() - window_opened
        if duration > max_query_time:
            logger.error(
                'Retrieval of logs `%s/%s` timed out with time of `%s`',
                namespace_id,
                repository_id,
                duration,
            )
            raise LogsIterationTimeout()
        yield current_batch
        window_opened = time()
def yield_logs_for_export(
    self,
    start_datetime,
    end_datetime,
    repository_id=None,
    namespace_id=None,
    max_query_time=None,
):
    """Yields batches of logs from the database for the given date range.

    Works through the range in adaptively-sized time slices, yielding one
    list of Log tuples per slice. Raises LogsIterationTimeout if either the
    total work time or a single slice's query exceeds max_query_time.
    """
    assert namespace_id is None or isinstance(namespace_id, int)
    assert repository_id is None or isinstance(repository_id, int)

    # Using an adjusting scale, start downloading log rows in batches, starting at
    # MINIMUM_RANGE_SIZE and doubling until we've reached EXPECTED_ITERATION_LOG_COUNT or
    # the lookup range has reached MAXIMUM_RANGE_SIZE. If at any point this operation takes
    # longer than the MAXIMUM_WORK_PERIOD_SECONDS, terminate the batch operation as timed out.
    batch_start_time = datetime.utcnow()

    current_start_datetime = start_datetime
    current_batch_size = timedelta(seconds=MINIMUM_RANGE_SIZE)

    while current_start_datetime < end_datetime:
        # Verify we haven't been working for too long.
        work_elapsed = datetime.utcnow() - batch_start_time
        if max_query_time is not None and work_elapsed > max_query_time:
            logger.error(
                "Retrieval of logs `%s/%s` timed out with time of `%s`",
                namespace_id,
                repository_id,
                work_elapsed,
            )
            raise LogsIterationTimeout()

        # Clamp the slice's end so the final slice never runs past the
        # requested range.
        current_end_datetime = current_start_datetime + current_batch_size
        current_end_datetime = min(current_end_datetime, end_datetime)

        # Load the next set of logs.
        def load_logs():
            # Queries the current time slice and converts rows to Log tuples,
            # sanity-checking that the rows match the requested filters.
            logger.debug(
                "Retrieving logs over range %s -> %s with namespace %s and repository %s",
                current_start_datetime,
                current_end_datetime,
                namespace_id,
                repository_id,
            )

            logs_query = model.log.get_logs_query(
                namespace_id=namespace_id,
                repository=repository_id,
                start_time=current_start_datetime,
                end_time=current_end_datetime,
            )
            logs = list(logs_query)
            for log in logs:
                assert isinstance(log, BaseModel)
                if namespace_id is not None:
                    assert log.account_id == namespace_id, "Expected %s, Found: %s" % (
                        namespace_id,
                        log.account.id,
                    )

                if repository_id is not None:
                    assert log.repository_id == repository_id

            logs = [Log.for_logentry(log) for log in logs]
            return logs

        # Also time the single-slice query itself against max_query_time.
        logs, elapsed = _run_and_time(load_logs)
        if max_query_time is not None and elapsed > max_query_time:
            logger.error(
                "Retrieval of logs for export `%s/%s` with range `%s-%s` timed out at `%s`",
                namespace_id,
                repository_id,
                current_start_datetime,
                current_end_datetime,
                elapsed,
            )
            raise LogsIterationTimeout()

        yield logs

        # Move forward.
        current_start_datetime = current_end_datetime

        # Increase the batch size if necessary.
        if len(logs) < EXPECTED_ITERATION_LOG_COUNT:
            # Sparse slice: double the window (capped at MAXIMUM_RANGE_SIZE)
            # so sparse ranges are covered in fewer queries.
            seconds = min(MAXIMUM_RANGE_SIZE, current_batch_size.total_seconds() * 2)
            current_batch_size = timedelta(seconds=seconds)