class RepositoryController:
    """
    Class for importing/syncing set of repositories into the DB.
    First, repomd from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing update are downloaded, parsed and imported.
    """

    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata
           are newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.info("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                 ", ".join(filter(None, repository_key)),
                                 str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                self.logger.warning("Download failed: %s (HTTP CODE %d)",
                                    urljoin(repo.repo_url, md_path), failed_items[local_path])
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert,
                                               "cert": repository.cert, "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex
        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'
        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        for repository in self.repositories:
            self.repo_store.import_repository(repository)

    def store(self):
        """Sync all queued repositories.

        Process repositories in batches due to disk space and memory usage.
        """
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.clean_repodata(failed_repos)

        self._read_repomds()

        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.info("%d repositories skipped.", len(to_skip))
        self.logger.info("Syncing %d repositories.", sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume lot of disk space)
        for batch in batches:
            failed = self._download_metadata(batch)
            if failed:
                self.logger.warning("%d metadata files failed to download.", len(failed))
                failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                self.clean_repodata(failed_repos)
                batch = [repo for repo in batch if repo not in failed_repos]
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)

        self.repo_store.cleanup_unused_data()
        self._clean_certificate_cache()
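

# --- Hypothetical usage sketch (not part of the original module) ---
# A minimal illustration of how the controller above might be driven, assuming a
# configured database and network access. The repository URL, content set label,
# basearch and releasever below are made-up example values.
def _example_repository_sync():
    controller = RepositoryController()
    # Re-queue everything already imported into the DB...
    controller.add_db_repositories()
    # ...and/or queue one new repository explicitly (example values).
    controller.add_repository("http://example.com/repo/", "example-content-set", "x86_64", "8")
    # Create/update repository records, then download, parse and import metadata.
    controller.import_repositories()
    controller.store()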


class RepositoryController:
    """
    Class for importing/syncing set of repositories into the DB.
    First, repomd from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing update are downloaded, parsed and imported.
    """

    def __init__(self):
        self.logger = SimpleLogger()
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.db_repositories = {}

    def _download_repomds(self):
        download_items = []
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml")
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_repomds(self, failed):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata
           are newer than metadata currently in DB.
        """
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            if repomd_path not in failed:
                repomd = RepoMD(repomd_path)
                # Was repository already synced before?
                if repository.repo_url in self.db_repositories:
                    db_revision = self.db_repositories[repository.repo_url]["revision"]
                else:
                    db_revision = None
                downloaded_revision = datetime.fromtimestamp(repomd.get_revision(), tz=timezone.utc)
                # Repository is synced for the first time or has newer revision
                if db_revision is None or downloaded_revision > db_revision:
                    repository.repomd = repomd
                else:
                    self.logger.log("Downloaded repo %s (%s) is not newer than repo in DB (%s)." %
                                    (repository.repo_url, str(downloaded_revision), str(db_revision)))
            else:
                self.logger.log("Download failed: %s (HTTP CODE %d)" %
                                (urljoin(repository.repo_url, REPOMD_PATH), failed[repomd_path]))

    def _download_metadata(self, batch):
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                self.downloader.add(DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location))
                ))
        self.downloader.run()

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def add_repository(self, repo_url):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url))

    def store(self):
        """Sync all queued repositories.

        Process repositories in batches due to disk space and memory usage.
        """
        self.logger.log("Checking %d repositories." % len(self.repositories))

        # Fetch current list of repositories from DB
        self.db_repositories = self.repo_store.list_repositories()

        # Download all repomd files first
        failed = self._download_repomds()
        self.logger.log("%d repomd.xml files failed to download." % len(failed))

        self._read_repomds(failed)

        # Filter all repositories without repomd attribute set (failed download, downloaded repomd is not newer)
        batches = BatchList()
        to_skip = []
        for repository in self.repositories:
            if repository.repomd:
                batches.add_item(repository)
            else:
                to_skip.append(repository)
        self.clean_repodata(to_skip)
        self.logger.log("%d repositories skipped." % len(to_skip))
        self.logger.log("Syncing %d repositories." % sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume lot of disk space)
        for batch in batches:
            self._download_metadata(batch)
            self._unpack_metadata(batch)
            for repository in batch:
                repository.load_metadata()
                self.repo_store.store(repository)
                repository.unload_metadata()
            self.clean_repodata(batch)


class RepositoryController:
    """
    Class for importing/syncing set of repositories into the DB.
    First, repomd from all repositories are downloaded and parsed.
    Second, primary and updateinfo repodata from repositories needing update are downloaded, parsed and imported.
    """

    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.repo_store = RepositoryStore()
        self.repositories = set()
        self.certs_tmp_directory = None
        self.certs_files = {}

    def _get_certs_tuple(self, name):
        if name in self.certs_files:
            return self.certs_files[name]["ca_cert"], self.certs_files[name]["cert"], self.certs_files[name]["key"]
        return None, None, None

    def _download_repomds(self):
        download_items = []
        certs_tmp_dict = {}
        for repository in self.repositories:
            repomd_url = urljoin(repository.repo_url, REPOMD_PATH)
            repository.tmp_directory = tempfile.mkdtemp(prefix="repo-")
            ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
            # Check certificate expiration date
            if repository.cert_name:
                certs_tmp_dict[repository.cert_name] = cert
            item = DownloadItem(
                source_url=repomd_url,
                target_path=os.path.join(repository.tmp_directory, "repomd.xml"),
                ca_cert=ca_cert,
                cert=cert,
                key=key
            )
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        for cert_name, cert in certs_tmp_dict.items():
            self._check_cert_expiration_date(cert_name, cert)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _check_cert_expiration_date(self, cert_name, cert):
        try:
            # Load certificate
            loaded_cert = crypto.load_certificate(crypto.FILETYPE_PEM, cert)
            # Get expiration date and parse it to datetime object
            valid_to_dt = datetime.strptime(loaded_cert.get_notAfter().decode("utf-8"), "%Y%m%d%H%M%SZ")
            expire_in_days_td = (valid_to_dt - datetime.utcnow()).days
            expire_tuple = (valid_to_dt, expire_in_days_td)
            if 30 >= expire_in_days_td > 0:
                self.logger.warning('Certificate %s will expire in %s days', cert_name, expire_in_days_td)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate will expire soon', expire_tuple)
                send_slack_notification(msg)
            elif expire_in_days_td <= 0:
                self.logger.warning('Certificate %s expired!', cert_name)
                msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate expired', expire_tuple)
                send_slack_notification(msg)
        except crypto.Error:
            self.logger.warning('Certificate not provided or incorrect: %s', cert_name if cert_name else 'None')
            msg = prepare_msg_for_slack(cert_name, 'Reposcan CDN certificate not provided or incorrect')
            send_slack_notification(msg)

    def _read_repomds(self):
        """Reads all downloaded repomd files. Checks if their download failed and checks if their metadata
           are newer than metadata currently in DB.
        """
        # Fetch current list of repositories from DB
        db_repositories = self.repo_store.list_repositories()
        for repository in self.repositories:
            repomd_path = os.path.join(repository.tmp_directory, "repomd.xml")
            repomd = RepoMD(repomd_path)
            # Was repository already synced before?
            repository_key = (repository.content_set, repository.basearch, repository.releasever)
            if repository_key in db_repositories:
                db_revision = db_repositories[repository_key]["revision"]
            else:
                db_revision = None
            downloaded_revision = repomd.get_revision()
            # Repository is synced for the first time or has newer revision
            if db_revision is None or downloaded_revision > db_revision:
                repository.repomd = repomd
            else:
                self.logger.debug("Downloaded repo %s (%s) is not newer than repo in DB (%s).",
                                  ", ".join(filter(None, repository_key)),
                                  str(downloaded_revision), str(db_revision))

    def _repo_download_failed(self, repo, failed_items):
        failed = False
        for md_path in list(repo.md_files.values()) + [REPOMD_PATH]:
            local_path = os.path.join(repo.tmp_directory, os.path.basename(md_path))
            if local_path in failed_items:
                failed = True
                # Download errors with no HTTP code are logged in downloader, deduplicate error msgs
                if failed_items[local_path] > 0:
                    self.logger.warning("Download failed: LABEL: %s URL: %s (HTTP CODE %d)", repo.content_set,
                                        urljoin(repo.repo_url, md_path), failed_items[local_path])
                    FAILED_REPO_WITH_HTTP_CODE.labels(failed_items[local_path]).inc()
        return failed

    def _download_metadata(self, batch):
        download_items = []
        for repository in batch:
            # primary_db has higher priority, use primary.xml if not found
            try:
                repository.md_files["primary_db"] = repository.repomd.get_metadata("primary_db")["location"]
            except RepoMDTypeNotFound:
                repository.md_files["primary"] = repository.repomd.get_metadata("primary")["location"]
            # updateinfo.xml may be missing completely
            try:
                repository.md_files["updateinfo"] = repository.repomd.get_metadata("updateinfo")["location"]
            except RepoMDTypeNotFound:
                pass
            try:
                repository.md_files["modules"] = repository.repomd.get_metadata("modules")["location"]
            except RepoMDTypeNotFound:
                pass

            # queue metadata files for download
            for md_location in repository.md_files.values():
                ca_cert, cert, key = self._get_certs_tuple(repository.cert_name)
                item = DownloadItem(
                    source_url=urljoin(repository.repo_url, md_location),
                    target_path=os.path.join(repository.tmp_directory, os.path.basename(md_location)),
                    ca_cert=ca_cert,
                    cert=cert,
                    key=key
                )
                download_items.append(item)
                self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _unpack_metadata(self, batch):
        for repository in batch:
            for md_type in repository.md_files:
                self.unpacker.add(os.path.join(repository.tmp_directory,
                                               os.path.basename(repository.md_files[md_type])))
                # FIXME: this should be done in different place?
                repository.md_files[md_type] = os.path.join(
                    repository.tmp_directory,
                    os.path.basename(repository.md_files[md_type])).rsplit(".", maxsplit=1)[0]
        self.unpacker.run()

    def clean_repodata(self, batch):
        """Clean downloaded repodata of all repositories in batch."""
        for repository in batch:
            if repository.tmp_directory:
                shutil.rmtree(repository.tmp_directory)
                repository.tmp_directory = None
            self.repositories.remove(repository)

    def _clean_certificate_cache(self):
        if self.certs_tmp_directory:
            shutil.rmtree(self.certs_tmp_directory)
            self.certs_tmp_directory = None
            self.certs_files = {}

    def add_db_repositories(self):
        """Queue all previously imported repositories."""
        repos = self.repo_store.list_repositories()
        for (content_set, basearch, releasever), repo_dict in repos.items():
            # Reference content_set_label -> content set id
            self.repo_store.content_set_to_db_id[content_set] = repo_dict["content_set_id"]
            self.repositories.add(Repository(repo_dict["url"], content_set, basearch, releasever,
                                             cert_name=repo_dict["cert_name"], ca_cert=repo_dict["ca_cert"],
                                             cert=repo_dict["cert"], key=repo_dict["key"]))

    def add_repository(self, repo_url, content_set, basearch, releasever,
                       cert_name=None, ca_cert=None, cert=None, key=None):
        """Queue repository to import/check updates."""
        repo_url = repo_url.strip()
        if not repo_url.endswith("/"):
            repo_url += "/"
        self.repositories.add(Repository(repo_url, content_set, basearch, releasever, cert_name=cert_name,
                                         ca_cert=ca_cert, cert=cert, key=key))

    def _write_certificate_cache(self):
        certs = {}
        for repository in self.repositories:
            if repository.cert_name:
                certs[repository.cert_name] = {"ca_cert": repository.ca_cert,
                                               "cert": repository.cert, "key": repository.key}
        if certs:
            self.certs_tmp_directory = tempfile.mkdtemp(prefix="certs-")
            for cert_name in certs:
                self.certs_files[cert_name] = {}
                for cert_type in ["ca_cert", "cert", "key"]:
                    # Cert is not None
                    if certs[cert_name][cert_type]:
                        cert_path = os.path.join(self.certs_tmp_directory, "%s.%s" % (cert_name, cert_type))
                        with open(cert_path, "w") as cert_file:
                            cert_file.write(certs[cert_name][cert_type])
                        self.certs_files[cert_name][cert_type] = cert_path
                    else:
                        self.certs_files[cert_name][cert_type] = None

    def _find_content_sets_by_regex(self, content_set_regex):
        if not content_set_regex.startswith('^'):
            content_set_regex = '^' + content_set_regex
        if not content_set_regex.endswith('$'):
            content_set_regex = content_set_regex + '$'
        return [content_set_label for content_set_label in self.repo_store.content_set_to_db_id
                if re.match(content_set_regex, content_set_label)]

    def delete_content_set(self, content_set_regex):
        """Deletes content sets described by given regex from DB."""
        for content_set_label in self._find_content_sets_by_regex(content_set_regex):
            self.logger.info("Deleting content set: %s", content_set_label)
            self.repo_store.delete_content_set(content_set_label)
        self.repo_store.cleanup_unused_data()

    def import_repositories(self):
        """Create or update repository records in the DB."""
        self.logger.info("Importing %d repositories.", len(self.repositories))
        failures = 0
        for repository in self.repositories:
            try:
                self.repo_store.import_repository(repository)
            except Exception:  # pylint: disable=broad-except
                failures += 1
        if failures > 0:
            self.logger.warning("Failed to import %d repositories.", failures)
            FAILED_IMPORT_REPO.inc(failures)

    def store(self):
        """Sync all queued repositories.

        Process repositories in batches due to disk space and memory usage.
        """
        self.logger.info("Checking %d repositories.", len(self.repositories))

        self._write_certificate_cache()

        # Download all repomd files first
        failed = self._download_repomds()
        if failed:
            FAILED_REPOMD.inc(len(failed))
            failed_repos = [repo for repo in self.repositories if self._repo_download_failed(repo, failed)]
            self.logger.warning("%d repomd.xml files failed to download.", len(failed))
            self.clean_repodata(failed_repos)

        self._read_repomds()

        # Filter all repositories without repomd attribute set (downloaded repomd is not newer)
        batches = BatchList()
        up_to_date = []

        def md_size(repomd, data_type):
            try:
                mdata = repomd.get_metadata(data_type)
                # open-size is not present for uncompressed files
                return int(mdata.get('size', 0)) + int(mdata.get('open-size', '0'))
            except RepoMDTypeNotFound:
                return 0

        for repository in self.repositories:
            if repository.repomd:
                repo_size = md_size(repository.repomd, 'primary_db')
                # If we use primary_db, we don't even download primary data xml
                if repo_size == 0:
                    repo_size += md_size(repository.repomd, 'primary')
                repo_size += md_size(repository.repomd, 'updateinfo')
                repo_size += md_size(repository.repomd, 'modules')
                batches.add_item(repository, repo_size)
            else:
                up_to_date.append(repository)

        self.clean_repodata(up_to_date)
        self.logger.info("%d repositories are up to date.", len(up_to_date))

        total_repositories = batches.get_total_items()
        completed_repositories = 0
        self.logger.info("%d repositories need to be synced.", total_repositories)

        # Download and process repositories in batches (unpacked metadata files can consume lot of disk space)
        try:
            for batch in batches:
                self.logger.info("Syncing a batch of %d repositories", len(batch))
                try:
                    failed = self._download_metadata(batch)
                    if failed:
                        self.logger.warning("%d metadata files failed to download.", len(failed))
                        failed_repos = [repo for repo in batch if self._repo_download_failed(repo, failed)]
                        self.clean_repodata(failed_repos)
                        batch = [repo for repo in batch if repo not in failed_repos]
                    self._unpack_metadata(batch)
                    for repository in batch:
                        repository.load_metadata()
                        completed_repositories += 1
                        self.logger.info("Syncing repository: %s [%s/%s]",
                                         ", ".join(filter(None, (repository.content_set, repository.basearch,
                                                                 repository.releasever))),
                                         completed_repositories, total_repositories)
                        self.repo_store.store(repository)
                        repository.unload_metadata()
                finally:
                    self.clean_repodata(batch)
        finally:
            self.repo_store.cleanup_unused_data()
            self._clean_certificate_cache()
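

# --- Assumed BatchList interface (illustrative sketch, not the project's implementation) ---
# BatchList is imported from elsewhere in the project; store() above only relies on the
# behaviour sketched here: add_item() groups items into batches (optionally weighted by a
# size hint), iterating yields one list of items per batch, and get_total_items() returns
# the overall item count. The size limit below is an arbitrary example value.
class _BatchListSketch:
    def __init__(self, max_batch_size=4 * 1024 ** 3):  # example limit: ~4 GiB of metadata per batch
        self.batches = [[]]
        self.current_size = 0
        self.max_batch_size = max_batch_size

    def add_item(self, item, size=0):
        # Start a new batch when the current one would exceed the size limit.
        if self.batches[-1] and self.current_size + size > self.max_batch_size:
            self.batches.append([])
            self.current_size = 0
        self.batches[-1].append(item)
        self.current_size += size

    def get_total_items(self):
        return sum(len(batch) for batch in self.batches)

    def __iter__(self):
        # Yield only non-empty batches, each as a plain list of items.
        return iter([batch for batch in self.batches if batch])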


class CveRepoController:
    """
    Controls import/sync of CVE lists into the DB.
    """

    def __init__(self):
        self.logger = get_logger(__name__)
        self.downloader = FileDownloader()
        self.unpacker = FileUnpacker()
        self.cverepo_store = CveRepoStore()
        self.repos = set()
        self.db_lastmodified = {}
        self.year_since = int(os.getenv('YEAR_SINCE', DEFAULT_YEAR_SINCE))

    def _download_meta(self):
        download_items = []
        for repo in self.repos:
            repo.tmp_directory = tempfile.mkdtemp(prefix="cverepo-")
            item = DownloadItem(source_url=repo.meta_url(),
                                target_path=repo.meta_tmp())
            # Save for future status code check
            download_items.append(item)
            self.downloader.add(item)
        self.downloader.run()
        # Return failed downloads
        return {item.target_path: item.status_code for item in download_items
                if item.status_code not in VALID_HTTP_CODES}

    def _read_meta(self, failed):
        """Reads downloaded meta files and checks for updates."""
        for repo in self.repos:
            meta_path = repo.meta_tmp()
            if meta_path not in failed:
                meta = CveMeta(meta_path)
                # already synced before?
                db_lastmodified = parse_datetime(self.db_lastmodified.get(repo.label, None))
                meta_lastmodified = parse_datetime(meta.get_lastmodified())
                # synced for the first time or has newer revision
                if (db_lastmodified is None
                        or meta_lastmodified is None
                        or meta_lastmodified > db_lastmodified):
                    repo.meta = meta
                else:
                    self.logger.info("Cve list '%s' has not been updated (since %s).",
                                     repo.label, str(db_lastmodified))
            else:
                FAILED_NIST.inc()
                self.logger.warning("Download failed: %s (HTTP CODE %d)",
                                    repo.meta_url(), failed[meta_path])

    def _download_json(self, batch):
        for repo in batch:
            self.downloader.add(DownloadItem(source_url=repo.json_url(),
                                             target_path=repo.json_tmpgz()))
        self.downloader.run()

    def _unpack_json(self, batch):
        for repo in batch:
            self.unpacker.add(repo.json_tmpgz())
        self.unpacker.run()

    def clean_repo(self, batch):
        """Clean downloaded files for given batch."""
        for repo in batch:
            if repo.tmp_directory:
                shutil.rmtree(repo.tmp_directory)
                repo.tmp_directory = None
            self.repos.remove(repo)

    def add_repos(self):
        """Generate urls for CVE lists to download."""
        # Fetch current list of repositories from DB
        self.db_lastmodified = self.cverepo_store.list_lastmodified()

        # CVE files for single years should be used only for initial load
        labels = [str(y) for y in range(self.year_since, int(time.strftime("%Y")) + 1)]
        for label in labels:
            if label not in self.db_lastmodified:
                self.repos.add(CveRepo(label))

        # always import incremental changes
        labels = ['recent', 'modified']
        for label in labels:
            self.repos.add(CveRepo(label))

    def store(self):
        """Sync all queued CVE lists.

        Runs in batches due to disk space and memory usage.
        """
        self.logger.info("Checking %d CVE lists.", len(self.repos))

        # Download all meta files first
        failed = self._download_meta()
        if failed:
            FAILED_NIST.inc()
            self.logger.warning("%d meta files failed to download.", len(failed))

        self._read_meta(failed)

        # filter out failed / unchanged lists
        batches = BatchList()
        to_skip = []
        for repo in self.repos:
            if repo.meta:
                batches.add_item(repo)
            else:
                to_skip.append(repo)
        self.clean_repo(to_skip)
        self.logger.info("%d CVE lists skipped.", len(to_skip))
        self.logger.info("Syncing %d CVE lists.", sum(len(l) for l in batches))

        # Download and process repositories in batches (unpacked metadata files can consume lot of disk space)
        for batch in batches:
            try:
                self._download_json(batch)
                self._unpack_json(batch)
                for repo in sorted(batch, key=lambda repo: repo.label):
                    repo.load_json()
                    self.cverepo_store.store(repo)
                    repo.unload_json()
            finally:
                self.clean_repo(batch)
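

# --- Hypothetical usage sketch (not part of the original module) ---
# How a CVE list sync might be triggered, assuming DB connectivity and access to the
# CVE feeds referenced by CveRepo; purely illustrative.
def _example_cve_sync():
    controller = CveRepoController()
    # Queue yearly feeds missing from the DB plus the 'recent' and 'modified' feeds...
    controller.add_repos()
    # ...then download, unpack and import them batch by batch.
    controller.store()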