def _get_manifest_iterator(self, indexer_state, min_id, max_id):
    """Chain the three "needs indexing" manifest queries into one iterator.

    The queries are consumed in this order:

    1. manifests with no ``ManifestSecurityStatus`` row (left outer join,
       status id IS NULL),
    2. manifests whose status is ``IndexStatus.FAILED``,
    3. manifests whose ``indexer_hash`` differs from
       ``indexer_state.get("state", "")`` and whose status is not
       ``IndexStatus.MANIFEST_UNSUPPORTED``.

    Queries 2 and 3 additionally require ``last_indexed`` to be older than
    "now minus SECURITY_SCANNER_V4_REINDEX_THRESHOLD seconds".

    ``min_id`` and ``max_id`` are forwarded to ``yield_random_entries`` and
    also determine the batch size.
    """

    def cutoff():
        # Recomputed on every call so each query run uses the current time.
        now = datetime.utcnow()
        return now - timedelta(
            seconds=self.app.config.get("SECURITY_SCANNER_V4_REINDEX_THRESHOLD")
        )

    # TODO(alecmerdler): Filter out any `Manifests` that are still being uploaded
    def never_indexed():
        joined = Manifest.select().join(ManifestSecurityStatus, JOIN.LEFT_OUTER)
        return joined.where(ManifestSecurityStatus.id >> None)

    def failed_to_index():
        status = ManifestSecurityStatus
        return Manifest.select().join(status).where(
            status.index_status == IndexStatus.FAILED,
            status.last_indexed < cutoff(),
        )

    def indexed_with_other_hash():
        status = ManifestSecurityStatus
        return Manifest.select().join(status).where(
            status.index_status != IndexStatus.MANIFEST_UNSUPPORTED,
            status.indexer_hash != indexer_state.get("state", ""),
            status.last_indexed < cutoff(),
        )

    # 4^log10(total) gives us a scalable batch size into the billions.
    id_span = max(10, max_id - min_id)
    batch_size = int(4 ** log10(id_span))

    # TODO(alecmerdler): We want to index newer manifests first, while backfilling older manifests...
    sources = [
        yield_random_entries(query, Manifest.id, batch_size, max_id, min_id)
        for query in (never_indexed, failed_to_index, indexed_with_other_hash)
    ]
    return itertools.chain(*sources)
def repositories_to_mirror(self, start_token=None):
    """Return ``(iterator, next_token)`` over eligible repository mirrors.

    ``(None, None)`` is returned when there is no maximum mirror-config ID,
    no minimum ID, or the minimum is already past the maximum. Otherwise the
    iterator comes from ``yield_random_entries`` and the token is
    ``RepoMirrorToken(max_id + 1)``.
    """
    nothing_to_do = (None, None)

    # Resume from the token when given, else start from the lowest config ID.
    if start_token is None:
        min_id = get_min_id_for_repo_mirror_config()
    else:
        min_id = start_token.min_id

    # max_id is None when there are no mirror configs in the database.
    max_id = get_max_id_for_repo_mirror_config()
    if max_id is None or min_id is None or min_id > max_id:
        return nothing_to_do

    # 4^log10(total) gives us a scalable batch size into the billions.
    id_span = max(10, max_id - min_id)
    batch_size = int(4 ** log10(id_span))

    mirrors = yield_random_entries(
        lambda: get_eligible_mirrors(),
        RepoMirrorConfig.id,
        batch_size,
        max_id,
        min_id,
    )
    return (mirrors, RepoMirrorToken(max_id + 1))
def _run_counting(self):
    """Count yesterday's actions for repositories that lack a count.

    Repositories are drawn from ``yield_random_entries`` over the whole
    repository ID range. For each one, ``abt.set()`` is called when a count
    for yesterday already exists or when ``_count_repository_actions``
    returns a falsy value. Returns None; does nothing when the repository
    table has no min or max ID.
    """
    lowest_id = model.repository.get_min_id()
    highest_id = model.repository.get_max_id()
    if lowest_id is None or highest_id is None:
        return

    # 4^log10(total) gives us a scalable batch size into the billions.
    id_span = max(10, highest_id - lowest_id)
    batch_size = int(4 ** log10(id_span))

    repositories = yield_random_entries(
        lambda: database.Repository.select(),
        database.Repository.id,
        batch_size,
        highest_id,
        lowest_id,
    )

    yesterday = date.today() - timedelta(days=1)
    for repo, abt, _remaining in repositories:
        # Short-circuit keeps the original order: counting is only attempted
        # when no count exists yet.
        already_counted = model.repositoryactioncount.has_repository_action_count(
            repo, yesterday
        )
        if already_counted or not self._count_repository_actions(repo):
            # NOTE(review): presumably signals the iterator to abandon the
            # current batch - confirm against yield_random_entries.
            abt.set()
def candidates_to_scan(self, target_version, start_token=None):
    """Return ``(iterator, next_token)`` over images eligible for scanning.

    The starting ID comes from, in priority order: ``start_token.min_id``,
    the ``SECURITY_SCANNER_INDEXING_MIN_ID`` config value, then
    ``get_min_id_for_sec_scan(target_version)``. ``(None, None)`` is returned
    when either bound is missing or the range is empty; otherwise the token
    is ``ScanToken(max_id + 1)``.
    """
    # Find the minimum ID.
    if start_token is None:
        min_id = app.config.get("SECURITY_SCANNER_INDEXING_MIN_ID")
        if min_id is None:
            min_id = get_min_id_for_sec_scan(target_version)
    else:
        min_id = start_token.min_id

    # ID of the last image we can analyze; None if there are no images in
    # the database.
    max_id = get_max_id_for_sec_scan()
    if max_id is None or min_id is None or min_id > max_id:
        return (None, None)

    # 4^log10(total) gives us a scalable batch size into the billions.
    id_span = max(10, max_id - min_id)
    batch_size = int(4 ** log10(id_span))

    # TODO: Once we have a clean shared NamedTuple for Images, send that to the secscan analyzer
    # rather than the database Image itself.
    images = yield_random_entries(
        lambda: get_images_eligible_for_scan(target_version),
        get_image_pk_field(),
        batch_size,
        max_id,
        min_id,
    )
    return (images, ScanToken(max_id + 1))
def _backfill_manifests(self):
    """Fill in ``layers_compressed_size`` / ``config_media_type`` on manifests.

    Returns False when no manifest has a NULL ``layers_compressed_size``.
    Otherwise walks the unfilled manifests, parses each one and writes the
    computed values back, then returns True. A manifest that fails to parse
    is stored with size ``-1`` and a NULL config media type.
    """

    def unfilled_manifests():
        return Manifest.select().where(Manifest.layers_compressed_size >> None)

    # Cheap existence probe before setting up the iterator.
    try:
        unfilled_manifests().get()
    except Manifest.DoesNotExist:
        logger.debug("Manifest backfill worker has completed; skipping")
        return False

    entries = yield_random_entries(
        unfilled_manifests,
        Manifest.id,
        250,
        Manifest.select(fn.Max(Manifest.id)).scalar(),
        1,
    )

    for row, abt, _ in entries:
        if row.layers_compressed_size is not None:
            logger.debug("Another worker preempted this worker")
            abt.set()
            continue

        logger.debug("Setting layers compressed size for manifest %s", row.id)

        # Defaults used when the manifest cannot be parsed.
        size = -1
        media_type = None

        raw_bytes = Bytes.for_string_or_unicode(row.manifest_bytes)
        try:
            parsed = parse_manifest_from_bytes(
                raw_bytes, row.media_type.name, validate=False
            )
            size = parsed.layers_compressed_size
            if size is None:
                size = 0
            media_type = parsed.config_media_type or None
        except ManifestException as me:
            logger.warning(
                "Got exception when trying to parse manifest %s: %s", row.id, me
            )

        assert size is not None

        # The extra IS NULL condition makes the write a no-op if the row was
        # filled in since it was read.
        write = Manifest.update(
            layers_compressed_size=size,
            config_media_type=media_type,
        ).where(
            Manifest.id == row.id,
            Manifest.layers_compressed_size >> None,
        )
        if write.execute() != 1:
            logger.debug("Another worker preempted this worker")
            abt.set()

    return True
def _candidates_to_backfill(self):
    """Return an iterator over visible tags with no ``TagToRepositoryTag`` row.

    The candidate ID range is the min/max ``RepositoryTag.id`` after applying
    ``self._filter``; it is logged and then handed to
    ``yield_random_entries`` with a batch size of 1000.
    """

    def tags_without_mapping():
        unmapped = self._filter(RepositoryTag.select()).join(
            TagToRepositoryTag, JOIN.LEFT_OUTER
        )
        # `== False` is deliberate: it builds a peewee SQL expression.
        return unmapped.where(
            TagToRepositoryTag.id >> None, RepositoryTag.hidden == False
        )

    lowest_id = self._filter(RepositoryTag.select(fn.Min(RepositoryTag.id))).scalar()
    highest_id = self._filter(RepositoryTag.select(fn.Max(RepositoryTag.id))).scalar()
    logger.info("Found candidate range %s-%s", lowest_id, highest_id)

    return yield_random_entries(
        tags_without_mapping, RepositoryTag.id, 1000, highest_id, lowest_id
    )
def verify_placements():
    """Delete placement rows whose backing storage location has no data.

    Walks every ``ImageStorage`` row that is not uploading (each ID handled
    at most once), checks every recorded location with ``storage.exists`` and
    removes the ``ImageStoragePlacement`` row for any location where the
    layer path is missing.
    """
    seen_ids = set()
    storages = yield_random_entries(
        # `== False` is deliberate: it builds a peewee SQL expression.
        lambda: ImageStorage.select().where(ImageStorage.uploading == False),
        ImageStorage.id,
        1000,
        ImageStorage.select(fn.Max(ImageStorage.id)).scalar(),
        1,
    )

    for row, _abt, _remaining in storages:
        # The iterator may hand back the same row more than once.
        if row.id in seen_ids:
            continue
        seen_ids.add(row.id)

        logger.info("Checking placements for storage `%s`", row.uuid)
        try:
            with_locations = model.storage.get_storage_by_uuid(row.uuid)
        except model.InvalidImageException:
            logger.exception("Could not find storage `%s`", row.uuid)
            continue

        layer_path = model.storage.get_layer_path(row)
        locations = set(with_locations.locations)
        if locations:
            logger.info(
                "Checking locations `%s` for storage `%s`", locations, row.uuid
            )

        for location in locations:
            logger.info(
                "Checking location `%s` for storage `%s`", location, row.uuid
            )
            if storage.exists([location], layer_path):
                continue

            location_row = _get_location_row(location)
            logger.info(
                "Location `%s` is missing for storage `%s`; removing",
                location,
                row.uuid,
            )
            stale_placement = ImageStoragePlacement.delete().where(
                ImageStoragePlacement.storage == row,
                ImageStoragePlacement.location == location_row,
            )
            stale_placement.execute()
def _candidates_to_backfill(self):
    """Return an iterator over labels with no ``TagManifestLabelMap`` row.

    The lower bound is the smallest unmapped ``TagManifestLabel.id``; the
    upper bound is the largest ``TagManifestLabel.id`` overall. Both are
    handed to ``yield_random_entries`` with a batch size of 100.
    """

    def labels_without_mapping():
        joined = TagManifestLabel.select().join(TagManifestLabelMap, JOIN.LEFT_OUTER)
        return joined.where(TagManifestLabelMap.id >> None)

    lowest_unmapped_id = (
        TagManifestLabel.select(fn.Min(TagManifestLabel.id))
        .join(TagManifestLabelMap, JOIN.LEFT_OUTER)
        .where(TagManifestLabelMap.id >> None)
        .scalar()
    )
    highest_id = TagManifestLabel.select(fn.Max(TagManifestLabel.id)).scalar()

    return yield_random_entries(
        labels_without_mapping,
        TagManifestLabel.id,
        100,
        highest_id,
        lowest_unmapped_id,
    )
def _run_counting(self):
    """Count yesterday's actions for repositories still missing a count.

    Returns early (None) when the repository table has no min/max ID, or
    when the number of count entries found for yesterday is at least the
    number of repositories. Otherwise iterates the result of
    ``missing_counts_query(yesterday)`` via ``yield_random_entries`` and
    calls ``abt.set()`` for entries that already have a count or whose
    counting returns a falsy value.
    """
    yesterday = date.today() - timedelta(days=1)

    lowest_id = model.repository.get_min_id()
    highest_id = model.repository.get_max_id()
    if lowest_id is None or highest_id is None:
        return

    # Check for the number RAC entries vs number of repos. If they are the
    # same, nothing more to do.
    total_repos = model.repository.get_repository_count()
    counted_repos = model.repositoryactioncount.found_entry_count(yesterday)
    if counted_repos >= total_repos:
        logger.debug("All RAC entries found; nothing more to do")
        return

    # This gives us a scalable batch size into the millions.
    id_span = max(10, highest_id - lowest_id)
    batch_size = int(3 ** log10(id_span))

    repositories = yield_random_entries(
        lambda: model.repositoryactioncount.missing_counts_query(yesterday),
        database.Repository.id,
        batch_size,
        highest_id,
        lowest_id,
    )

    for repo, abt, _remaining in repositories:
        # Short-circuit keeps the original order: counting is only attempted
        # when no count exists yet.
        already_counted = model.repositoryactioncount.has_repository_action_count(
            repo, yesterday
        )
        if already_counted or not self._count_repository_actions(repo):
            abt.set()
def test_no_work():
    """An always-empty query must make ``yield_random_entries`` yield nothing."""
    entries = yield_random_entries(lambda: FakeQuery([]), FAKE_PK_FIELD, 1, 10)

    for _unexpected in entries:
        assert False, 'There should never be any actual work!'
def perform_indexing(self, start_token=None):
    """Index whitelisted manifests with the V4 security scanner.

    Returns ``ScanToken(max_id + 1)`` once every candidate has been
    processed. Returns None when the scanner state cannot be fetched, when
    the manifest ID range is empty, or as soon as an index request fails
    with ``APIRequestFailure``.

    Candidates are restricted to repositories whose namespace username is in
    ``SECURITY_SCANNER_V4_NAMESPACE_WHITELIST`` and come from three chained
    queries: never indexed, previously failed, and indexed with a different
    indexer hash. The last two also require ``last_indexed`` to be older than
    the configured reindex threshold.
    """
    whitelisted_namespaces = self.app.config.get(
        "SECURITY_SCANNER_V4_NAMESPACE_WHITELIST", []
    )

    try:
        indexer_state = self._secscan_api.state()
    except APIRequestFailure:
        return None

    if start_token is None:
        min_id = Manifest.select(fn.Min(Manifest.id)).scalar()
    else:
        min_id = start_token.min_id
    max_id = Manifest.select(fn.Max(Manifest.id)).scalar()

    if max_id is None or min_id is None or min_id > max_id:
        return None

    def whitelisted_manifests():
        # Join through Repository -> User to filter by namespace, then switch
        # the join context back to Manifest for the caller's next join.
        scoped = Manifest.select().join(Repository).join(User)
        scoped = scoped.where(User.username << whitelisted_namespaces)
        return scoped.switch(Manifest)

    def cutoff():
        # Recomputed on every call so each query run uses the current time.
        now = datetime.utcnow()
        return now - timedelta(
            seconds=self.app.config.get("SECURITY_SCANNER_V4_REINDEX_THRESHOLD")
        )

    # TODO(alecmerdler): Filter out any `Manifests` that are still being uploaded
    def never_indexed():
        joined = whitelisted_manifests().join(ManifestSecurityStatus, JOIN.LEFT_OUTER)
        return joined.where(ManifestSecurityStatus.id >> None)

    def failed_to_index():
        return whitelisted_manifests().join(ManifestSecurityStatus).where(
            ManifestSecurityStatus.index_status == IndexStatus.FAILED,
            ManifestSecurityStatus.last_indexed < cutoff(),
        )

    def indexed_with_other_hash():
        return whitelisted_manifests().join(ManifestSecurityStatus).where(
            ManifestSecurityStatus.indexer_hash != indexer_state.get("state", ""),
            ManifestSecurityStatus.last_indexed < cutoff(),
        )

    # 4^log10(total) gives us a scalable batch size into the billions.
    id_span = max(10, max_id - min_id)
    batch_size = int(4 ** log10(id_span))

    sources = [
        yield_random_entries(query, Manifest.id, batch_size, max_id, min_id)
        for query in (never_indexed, failed_to_index, indexed_with_other_hash)
    ]

    for candidate, _abt, _remaining in itertools.chain(*sources):
        manifest = ManifestDataType.for_manifest(candidate, None)
        layers = registry_model.list_manifest_layers(manifest, self.storage, True)

        logger.debug(
            "Indexing %s/%s@%s"
            % (
                candidate.repository.namespace_user,
                candidate.repository.name,
                manifest.digest,
            )
        )

        try:
            report, state = self._secscan_api.index(manifest, layers)
        except APIRequestFailure:
            logger.exception("Failed to perform indexing, security scanner API error")
            return None

        # Replace any existing status row with the fresh result in one
        # transaction.
        with db_transaction():
            previous_status = ManifestSecurityStatus.delete().where(
                ManifestSecurityStatus.manifest == candidate
            )
            previous_status.execute()
            ManifestSecurityStatus.create(
                manifest=candidate,
                repository=candidate.repository,
                error_json=report["err"],
                index_status=(
                    IndexStatus.FAILED
                    if report["state"] == IndexReportState.Index_Error
                    else IndexStatus.COMPLETED
                ),
                indexer_hash=state,
                indexer_version=IndexerVersion.V4,
                metadata_json={},
            )

    return ScanToken(max_id + 1)
def _get_manifest_iterator(
    self, indexer_state, min_id, max_id, batch_size=None, reindex_threshold=None
):
    """Chain the three "needs indexing" manifest queries into one iterator.

    Args:
        indexer_state: mapping; ``indexer_state.get("state", "")`` is the
            indexer hash that already-indexed manifests are compared against.
        min_id: lower ID bound, forwarded to ``yield_random_entries``.
        max_id: upper ID bound, forwarded to ``yield_random_entries``.
        batch_size: batch size for ``yield_random_entries``. When falsy it is
            derived as ``int(4 ** log10(max(10, max_id - min_id)))``.
        reindex_threshold: value ``last_indexed`` must be older than for a
            failed or stale manifest to be selected. When None, the module
            default ``DEFAULT_SECURITY_SCANNER_V4_REINDEX_THRESHOLD`` is used.

    Returns:
        An ``itertools.chain`` over, in order: manifests with no
        ``ManifestSecurityStatus`` row, manifests whose status is
        ``IndexStatus.FAILED``, and manifests indexed under a different
        indexer hash (excluding ``IndexStatus.MANIFEST_UNSUPPORTED``).
    """
    # Function-scope import: only needed to turn a numeric default into a
    # point in time.
    from datetime import datetime, timedelta

    def effective_threshold():
        # The original condition was
        #   last_indexed < reindex_threshold or DEFAULT_...
        # which Python parses as (last_indexed < reindex_threshold) or DEFAULT_...,
        # so the fallback never took part in the comparison. Resolve the
        # value first, then compare.
        if reindex_threshold is not None:
            return reindex_threshold
        default = DEFAULT_SECURITY_SCANNER_V4_REINDEX_THRESHOLD
        if isinstance(default, (int, float)):
            # NOTE(review): presumably the default is a number of seconds,
            # like the config key of the same name - confirm.
            return datetime.utcnow() - timedelta(seconds=default)
        return default

    # TODO(alecmerdler): Filter out any `Manifests` that are still being uploaded
    def not_indexed_query():
        return (
            Manifest.select(Manifest, ManifestSecurityStatus)
            .join(ManifestSecurityStatus, JOIN.LEFT_OUTER)
            .where(ManifestSecurityStatus.id >> None)
        )

    def index_error_query():
        return (
            Manifest.select(Manifest, ManifestSecurityStatus)
            .join(ManifestSecurityStatus)
            .where(
                ManifestSecurityStatus.index_status == IndexStatus.FAILED,
                ManifestSecurityStatus.last_indexed < effective_threshold(),
            )
        )

    def needs_reindexing_query(indexer_hash):
        return (
            Manifest.select(Manifest, ManifestSecurityStatus)
            .join(ManifestSecurityStatus)
            .where(
                ManifestSecurityStatus.index_status != IndexStatus.MANIFEST_UNSUPPORTED,
                ManifestSecurityStatus.indexer_hash != indexer_hash,
                ManifestSecurityStatus.last_indexed < effective_threshold(),
            )
        )

    # 4^log10(total) gives us a scalable batch size into the billions.
    if not batch_size:
        batch_size = int(4 ** log10(max(10, max_id - min_id)))

    return itertools.chain(
        yield_random_entries(
            not_indexed_query,
            Manifest.id,
            batch_size,
            max_id,
            min_id,
        ),
        yield_random_entries(
            index_error_query,
            Manifest.id,
            batch_size,
            max_id,
            min_id,
        ),
        yield_random_entries(
            lambda: needs_reindexing_query(indexer_state.get("state", "")),
            Manifest.id,
            batch_size,
            max_id,
            min_id,
        ),
    )