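# Module-level imports reconstructed from the identifiers used below. This
# listing is an excerpt, so the distro_tracker import paths are assumptions
# based on the project's usual layout, not verbatim from the original file.
import itertools
import logging
import os
import re

from debian import deb822

from django.core.files import File
from django.db import models, transaction

from distro_tracker import vendor
from distro_tracker.accounts.models import UserEmail
from distro_tracker.core.models import (
    Architecture,
    BinaryPackage,
    BinaryPackageName,
    BinaryPackageRepositoryEntry,
    ContributorName,
    ExtractedSourceFile,
    PackageName,
    Repository,
    SourcePackage,
    SourcePackageDeps,
    SourcePackageName,
    SourcePackageRepositoryEntry,
)
from distro_tracker.core.tasks import (
    BaseTask,
    PackageUpdateTask,
    clear_all_events_on_exception,
)
from distro_tracker.core.tasks.mixins import ProcessSourcePackage
from distro_tracker.core.tasks.schedulers import IntervalScheduler
from distro_tracker.core.utils.packages import (
    AptCache,
    extract_information_from_packages_entry,
    extract_information_from_sources_entry,
)

logger = logging.getLogger(__name__)

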
class ExtractSourcePackageFiles(BaseTask):
    """
    A task which extracts some files from a new source package version.

    The extracted files are:

    - debian/changelog
    - debian/copyright
    - debian/rules
    - debian/control
    - debian/watch
    """
    DEPENDS_ON_EVENTS = (
        'new-source-package-version',
    )

    PRODUCES_EVENTS = (
        'source-files-extracted',
    )

    ALL_FILES_TO_EXTRACT = (
        'changelog',
        'copyright',
        'rules',
        'control',
        'watch',
    )

    def __init__(self, *args, **kwargs):
        super(ExtractSourcePackageFiles, self).__init__(*args, **kwargs)
        self.cache = None

    def extract_files(self, source_package, files_to_extract=None):
        """
        Extract files for just the given source package.

        :type source_package: :class:`SourcePackage
            <distro_tracker.core.models.SourcePackage>`
        :type files_to_extract: An iterable of file names which should be
            extracted
        """
        if self.cache is None:
            self.cache = AptCache()

        source_directory = self.cache.retrieve_source(
            source_package.source_package_name.name,
            source_package.version,
            debian_directory_only=True)
        debian_directory = os.path.join(source_directory, 'debian')

        if files_to_extract is None:
            files_to_extract = self.ALL_FILES_TO_EXTRACT

        for file_name in files_to_extract:
            file_path = os.path.join(debian_directory, file_name)
            if not os.path.exists(file_path):
                continue
            # Open in binary mode so the content is stored byte-for-byte.
            with open(file_path, 'rb') as f:
                extracted_file = File(f)
                ExtractedSourceFile.objects.create(
                    source_package=source_package,
                    extracted_file=extracted_file,
                    name=file_name)

    def _execute_initial(self):
        """
        When the task is directly run, instead of relying on events to know
        which packages' source files should be retrieved, the task scans all
        existing packages and extracts any missing source files for each of
        them.
        """
        # First remove all source files which are no longer to be included.
        qs = ExtractedSourceFile.objects.exclude(
            name__in=self.ALL_FILES_TO_EXTRACT)
        qs.delete()

        # Retrieve the packages along with their associated extracted files
        # in only two db queries. prefetch_related returns a new queryset,
        # so the result must be assigned back.
        source_packages = SourcePackage.objects.all().prefetch_related(
            'extracted_source_files')

        # Find the missing files for each package and extract only those
        for srcpkg in source_packages:
            extracted_files = [
                extracted_file.name
                for extracted_file in srcpkg.extracted_source_files.all()
            ]
            files_to_extract = [
                file_name
                for file_name in self.ALL_FILES_TO_EXTRACT
                if file_name not in extracted_files
            ]
            if files_to_extract:
                try:
                    self.extract_files(srcpkg, files_to_extract)
                except Exception:
                    logger.exception(
                        'Problem extracting source files for'
                        ' {pkg} version {ver}'.format(
                            pkg=srcpkg, ver=srcpkg.version))

    def execute(self):
        if self.is_initial_task():
            return self._execute_initial()

        # When the task is not the initial task, all the packages it should
        # process come from received events.
        new_version_pks = [
            event.arguments['pk']
            for event in self.get_all_events()
        ]
        source_packages = SourcePackage.objects.filter(pk__in=new_version_pks)
        source_packages = source_packages.select_related()

        for source_package in source_packages:
            try:
                self.extract_files(source_package)
            except Exception:
                logger.exception(
                    'Problem extracting source files for'
                    ' {pkg} version {ver}'.format(
                        pkg=source_package, ver=source_package.version))

        self.raise_event('source-files-extracted')

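# The 'new-source-package-version' events consumed by the task above are
# raised by UpdateRepositoriesTask below, with arguments of the form
# (example values):
#
#   {'name': 'foo', 'version': '1.0-1', 'pk': 42}
#
# execute() only uses the 'pk' key, to look the new versions up in bulk.

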
class UpdateRepositoriesTask(PackageUpdateTask):
    """
    Performs an update of repository information.

    New (source and binary) packages are created if necessary and old ones
    are deleted. An event is emitted for each situation, allowing other tasks
    to perform updates based on updated package information.
    """
    PRODUCES_EVENTS = (
        'new-source-package',
        'new-source-package-version',
        'new-source-package-in-repository',
        'new-source-package-version-in-repository',
        'new-binary-package',

        # Source package no longer found in any repository
        'lost-source-package',
        # Source package version no longer found in the given repository
        'lost-source-package-version-in-repository',
        # A particular version of a source package no longer found in any repo
        'lost-version-of-source-package',
        # Binary package name no longer used by any source package
        'lost-binary-package',
    )

    SOURCE_DEPENDENCY_TYPES = ('Build-Depends', 'Build-Depends-Indep')
    BINARY_DEPENDENCY_TYPES = ('Depends', 'Recommends', 'Suggests')

    def __init__(self, *args, **kwargs):
        super(UpdateRepositoriesTask, self).__init__(*args, **kwargs)
        self._all_packages = []
        self._all_repository_entries = []

    def _clear_processed_repository_entries(self):
        self._all_repository_entries = []

    def _add_processed_repository_entry(self, repository_entry):
        self._all_repository_entries.append(repository_entry.id)

    def _extract_information_from_sources_entry(self, src_pkg, stanza):
        entry = extract_information_from_sources_entry(stanza)

        # Convert the parsed data into corresponding model instances
        if 'architectures' in entry:
            # Map the list of architecture names to their objects.
            # Discards any unknown architectures.
            entry['architectures'] = Architecture.objects.filter(
                name__in=entry['architectures'])

        if 'binary_packages' in entry:
            # Map the list of binary package names to a list of existing
            # binary package name instances.
            binary_package_names = entry['binary_packages']
            existing_binaries_qs = BinaryPackageName.objects.filter(
                name__in=binary_package_names)
            existing_binaries_names = []
            binaries = []
            for binary in existing_binaries_qs:
                binaries.append(binary)
                existing_binaries_names.append(binary.name)
            for binary_name in binary_package_names:
                if binary_name not in existing_binaries_names:
                    binary_package_name, _ = PackageName.objects.get_or_create(
                        name=binary_name)
                    binary_package_name.binary = True
                    binary_package_name.save()
                    binary_package_name = BinaryPackageName.objects.get(
                        name=binary_name)
                    binaries.append(binary_package_name)
                    self.raise_event('new-binary-package', {
                        'name': binary_name,
                    })
            entry['binary_packages'] = binaries

        if 'maintainer' in entry:
            maintainer_email, _ = UserEmail.objects.get_or_create(
                email=entry['maintainer']['email'])
            maintainer = ContributorName.objects.get_or_create(
                contributor_email=maintainer_email,
                name=entry['maintainer'].get('name', ''))[0]
            entry['maintainer'] = maintainer

        if 'uploaders' in entry:
            uploader_emails = [
                uploader['email']
                for uploader in entry['uploaders']
            ]
            uploader_names = [
                uploader.get('name', '')
                for uploader in entry['uploaders']
            ]
            existing_contributor_emails_qs = UserEmail.objects.filter(
                email__in=uploader_emails)
            existing_contributor_emails = {
                contributor.email: contributor
                for contributor in existing_contributor_emails_qs
            }
            uploaders = []
            for email, name in zip(uploader_emails, uploader_names):
                if email not in existing_contributor_emails:
                    contributor_email, _ = UserEmail.objects.get_or_create(
                        email=email)
                    existing_contributor_emails[email] = contributor_email
                else:
                    contributor_email = existing_contributor_emails[email]
                uploaders.append(ContributorName.objects.get_or_create(
                    contributor_email=contributor_email,
                    name=name)[0]
                )
            entry['uploaders'] = uploaders

        return entry

    def _extract_information_from_packages_entry(self, bin_pkg, stanza):
        entry = extract_information_from_packages_entry(stanza)
        return entry

    def _update_sources_file(self, repository, sources_file):
        for stanza in deb822.Sources.iter_paragraphs(sources_file):
            allow, implemented = vendor.call('allow_package', stanza)
            if allow is not None and implemented and not allow:
                # The vendor-provided function indicates that the package
                # should not be included
                continue

            src_pkg_name, created = SourcePackageName.objects.get_or_create(
                name=stanza['package']
            )
            if created:
                self.raise_event('new-source-package', {
                    'name': src_pkg_name.name
                })

            src_pkg, created_new_version = SourcePackage.objects.get_or_create(
                source_package_name=src_pkg_name,
                version=stanza['version']
            )
            if created_new_version:
                self.raise_event('new-source-package-version', {
                    'name': src_pkg.name,
                    'version': src_pkg.version,
                    'pk': src_pkg.pk,
                })
                # Since it's a new version, extract package data from Sources
                entry = self._extract_information_from_sources_entry(
                    src_pkg, stanza)
                # Update the source package information based on the newly
                # extracted data.
                src_pkg.update(**entry)
                src_pkg.save()

            if not repository.has_source_package(src_pkg):
                # Does it have any version of the package?
                if not repository.has_source_package_name(src_pkg.name):
                    self.raise_event('new-source-package-in-repository', {
                        'name': src_pkg.name,
                        'repository': repository.name,
                    })

                # Add it to the repository
                kwargs = {
                    'priority': stanza.get('priority', ''),
                    'section': stanza.get('section', ''),
                }
                entry = repository.add_source_package(src_pkg, **kwargs)
                self.raise_event('new-source-package-version-in-repository', {
                    'name': src_pkg.name,
                    'version': src_pkg.version,
                    'repository': repository.name,
                })
            else:
                # We get the entry to mark that the package version is still
                # in the repository.
                entry = SourcePackageRepositoryEntry.objects.get(
                    repository=repository,
                    source_package=src_pkg
                )

            self._add_processed_repository_entry(entry)

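    # A Sources stanza supplies the fields consumed above; a minimal
    # hypothetical entry for illustration:
    #
    #   Package: foo
    #   Binary: libfoo1, foo-doc
    #   Version: 1.0-1
    #   Priority: optional
    #   Section: devel
    #   Maintainer: Jane Doe <jane@example.com>
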
    def get_source_for_binary(self, stanza):
        """
        :param stanza: a ``Packages`` file entry
        :returns: A ``(source_name, source_version)`` pair for the binary
            package described by the entry
        """
        source_name = (
            stanza['source']
            if 'source' in stanza else
            stanza['package'])
        # Extract the source version, if given in the Source field
        match = re.match(r'(.+) \((.+)\)', source_name)
        if match:
            source_name, source_version = match.group(1), match.group(2)
        else:
            source_version = stanza['version']

        return source_name, source_version

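    # Doctest-style sketch of the Source-field parsing above. The stanza
    # values are hypothetical (any dict-like mapping works) and `task` is an
    # assumed UpdateRepositoriesTask instance:
    #
    #   >>> task.get_source_for_binary(
    #   ...     {'package': 'libfoo1', 'version': '1.0-1+b1',
    #   ...      'source': 'foo (1.0-1)'})
    #   ('foo', '1.0-1')
    #   >>> task.get_source_for_binary(
    #   ...     {'package': 'foo', 'version': '1.0-1'})
    #   ('foo', '1.0-1')
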
    def _update_packages_file(self, repository, packages_file):
        for stanza in deb822.Packages.iter_paragraphs(packages_file):
            bin_pkg_name, created = BinaryPackageName.objects.get_or_create(
                name=stanza['package']
            )
            # Find the matching SourcePackage for the binary package
            source_name, source_version = self.get_source_for_binary(stanza)
            src_pkg, _ = SourcePackage.objects.get_or_create(
                source_package_name=SourcePackageName.objects.get_or_create(
                    name=source_name)[0],
                version=source_version)

            bin_pkg, created_new_version = BinaryPackage.objects.get_or_create(
                binary_package_name=bin_pkg_name,
                version=stanza['version'],
                source_package=src_pkg
            )
            if created_new_version:
                # Since it's a new version, extract package data from Packages
                entry = self._extract_information_from_packages_entry(
                    bin_pkg, stanza)
                # Update the binary package information based on the newly
                # extracted data.
                bin_pkg.update(**entry)
                bin_pkg.save()

            if not repository.has_binary_package(bin_pkg):
                # Add it to the repository
                architecture, _ = Architecture.objects.get_or_create(
                    name=stanza['architecture'])
                kwargs = {
                    'priority': stanza.get('priority', ''),
                    'section': stanza.get('section', ''),
                    'architecture': architecture,
                }
                entry = repository.add_binary_package(bin_pkg, **kwargs)
            else:
                # We get the entry to mark that the package version is still
                # in the repository.
                entry = BinaryPackageRepositoryEntry.objects.get(
                    repository=repository,
                    binary_package=bin_pkg)

            self._add_processed_repository_entry(entry)

    def _remove_query_set_if_count_zero(self, qs, count_field,
                                        event_generator=None):
        """
        Removes elements from the given query set if their count of the given
        ``count_field`` is ``0``.

        :param qs: Instances which should be deleted in case their count of
            the field ``count_field`` is 0.
        :type qs: :class:`QuerySet <django.db.models.query.QuerySet>`

        :param count_field: Each instance in ``qs`` that has a 0 count for
            the field with this name is deleted.
        :type count_field: string

        :param event_generator: A ``callable`` which returns a
            ``(name, arguments)`` pair describing the event which should be
            raised based on the model instance given to it as an argument.
        :type event_generator: ``callable``
        """
        qs = qs.annotate(count=models.Count(count_field))
        qs = qs.filter(count=0)
        if event_generator:
            for item in qs:
                self.raise_event(*event_generator(item))
        qs.delete()

    def _remove_obsolete_packages(self):
        self.log("Removing obsolete source packages")
        # Clean up package versions which no longer exist in any repository.
        self._remove_query_set_if_count_zero(
            SourcePackage.objects.all(),
            'repository',
            lambda source_package: (
                'lost-version-of-source-package', {
                    'name': source_package.name,
                    'version': source_package.version,
                }
            )
        )
        # Clean up names which no longer exist.
        self._remove_query_set_if_count_zero(
            SourcePackageName.objects.all(),
            'source_package_versions',
            lambda package: (
                'lost-source-package', {
                    'name': package.name,
                }
            )
        )
        # Clean up binary package names which are no longer used by any
        # source package.
        self._remove_query_set_if_count_zero(
            BinaryPackageName.objects.all(),
            'sourcepackage',
            lambda binary_package_name: (
                'lost-binary-package', {
                    'name': binary_package_name.name,
                }
            )
        )

    def _update_repository_entries(self, all_entries_qs,
                                   event_generator=None):
        """
        Removes all repository entries which are no longer found in the
        repository after the last update.
        If the ``event_generator`` argument is provided, an event returned by
        the function is raised for each removed entry.

        :param all_entries_qs: All currently existing entries which should be
            filtered to only contain the ones still found after the update.
        :type all_entries_qs:
            :class:`QuerySet <django.db.models.query.QuerySet>`
        :param event_generator: Takes a repository entry as a parameter and
            returns a two-tuple of ``(event_name, event_arguments)``. An
            event with the returned parameters is raised by the function for
            each removed entry.
        :type event_generator: callable
        """
        # Out of all entries in this repository, only those found in the
        # last update need to stay, so exclude them from the delete
        all_entries_qs = all_entries_qs.exclude(
            id__in=self._all_repository_entries)
        # Emit events for all packages that were removed from the repository
        if event_generator:
            for entry in all_entries_qs:
                self.raise_event(*event_generator(entry))
        all_entries_qs.delete()

        self._clear_processed_repository_entries()

    def extract_package_versions(self, file_name):
        """
        :param file_name: The name of the file from which package versions
            should be extracted.
        :type file_name: string
        :returns: A dict mapping package names to the list of versions found
            in the Deb822-formatted file.
        """
        with open(file_name, 'r') as packages_file:
            packages = {}
            for stanza in deb822.Deb822.iter_paragraphs(packages_file):
                package_name, version = stanza['package'], stanza['version']
                packages.setdefault(package_name, [])
                packages[package_name].append(version)

            return packages

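    # For a file whose stanzas list foo 1.0-1, foo 1.1-1 and bar 2.0-3, the
    # method above returns (hypothetical content):
    #
    #   {'foo': ['1.0-1', '1.1-1'], 'bar': ['2.0-3']}
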
    def _mark_file_not_processed(self, repository, file_name, entry_manager):
        """
        The given ``Sources`` or ``Packages`` file has not been changed in
        the last update. This method marks all package versions found in it
        as still existing in order to avoid deleting them.

        :param repository: The repository to which the file is associated
        :type repository:
            :class:`Repository <distro_tracker.core.models.Repository>`
        :param file_name: The name of the file whose packages should be saved
        :param entry_manager: The manager instance which handles the package
            entries.
        :type entry_manager: :class:`Manager <django.db.models.Manager>`
        """
        # Extract all package versions from the file
        packages = self.extract_package_versions(file_name)

        # Only issue one DB query to retrieve the entries for packages with
        # the given names
        repository_entries = \
            entry_manager.filter_by_package_name(packages.keys())
        repository_entries = repository_entries.filter(
            repository=repository)
        repository_entries = repository_entries.select_related()
        # For each of those entries, make sure to keep only the ones
        # corresponding to the version found in the sources file
        for entry in repository_entries:
            if entry.version in packages[entry.name]:
                self._add_processed_repository_entry(entry)

    def group_files_by_repository(self, cached_files):
        """
        :param cached_files: A list of ``(repository, file_name)`` pairs
        :returns: A dict mapping repositories to all file names found for
            that repository.
        """
        repository_files = {}
        for repository, file_name in cached_files:
            repository_files.setdefault(repository, [])
            repository_files[repository].append(file_name)

        return repository_files

    def update_sources_files(self, updated_sources):
        """
        Performs an update of tracked packages based on the updated Sources
        files.

        :param updated_sources: A list of ``(repository, sources_file_name)``
            pairs giving the Sources files which were updated and should be
            used to update the Distro Tracker tracked information too.
        """
        # Group all files by repository to which they belong
        repository_files = self.group_files_by_repository(updated_sources)

        for repository, sources_files in repository_files.items():
            with transaction.atomic():
                self.log("Processing Sources files of %s repository",
                         repository.shorthand)
                # First update package information based on updated files
                for sources_file in sources_files:
                    with open(sources_file) as sources_fd:
                        self._update_sources_file(repository, sources_fd)

                # Mark package versions found in un-updated files as still
                # existing
                all_sources = \
                    self.apt_cache.get_sources_files_for_repository(
                        repository)
                for sources_file in all_sources:
                    if sources_file not in sources_files:
                        self._mark_file_not_processed(
                            repository,
                            sources_file,
                            SourcePackageRepositoryEntry.objects)

                # When all the files for the repository are handled, update
                # which packages are still found in it.
                self._update_repository_entries(
                    SourcePackageRepositoryEntry.objects.filter(
                        repository=repository),
                    lambda entry: (
                        'lost-source-package-version-in-repository', {
                            'name': entry.source_package.name,
                            'version': entry.source_package.version,
                            'repository': entry.repository.name,
                        })
                )

        with transaction.atomic():
            # When all repositories are handled, update which packages are
            # still found in at least one repository.
            self._remove_obsolete_packages()

""" # Group all files by repository to which they belong repository_files = self.group_files_by_repository(updated_packages) for repository, packages_files in repository_files.items(): self.log("Processing Packages files of %s repository", repository.shorthand) # First update package information based on updated files for packages_file in packages_files: with open(packages_file) as packages_fd: self._update_packages_file(repository, packages_fd) # Mark package versions found in un-updated files as still existing all_sources = \ self.apt_cache.get_packages_files_for_repository(repository) for packages_file in all_sources: if packages_file not in packages_files: self._mark_file_not_processed( repository, packages_file, BinaryPackageRepositoryEntry.objects) # When all the files for the repository are handled, update # which packages are still found in it. self._update_repository_entries( BinaryPackageRepositoryEntry.objects.filter( repository=repository)) def _update_dependencies_for_source(self, stanza, dependency_types): """ Updates the dependencies for a source package based on the ones found in the given ``Packages`` or ``Sources`` stanza. :param source_name: The name of the source package for which the dependencies are updated. :param stanza: The ``Packages`` or ``Sources`` entry :param dependency_type: A list of dependency types which should be considered (e.g. Build-Depends, Recommends, etc.) :param source_to_binary_deps: The dictionary which should be updated with the new dependencies. Maps source names to a list of dicts each describing a dependency. """ binary_dependencies = [] for dependency_type in dependency_types: # The Deb822 instance is case sensitive when it comes to relations dependencies = stanza.relations.get(dependency_type.lower(), ()) for dependency in itertools.chain(*dependencies): binary_name = dependency['name'] binary_dependencies.append({ 'dependency_type': dependency_type, 'binary': binary_name, }) return binary_dependencies def _process_source_to_binary_deps(self, source_to_binary_deps, all_sources, bin_to_src, default_repository): dependency_instances = [] for source_name, dependencies in source_to_binary_deps.items(): if source_name not in all_sources: continue # All dependencies for the current source package. all_dependencies = {} for dependency in dependencies: binary_name = dependency['binary'] dependency_type = dependency.pop('dependency_type') if binary_name not in bin_to_src: continue for source_dependency in bin_to_src[binary_name]: if source_name == source_dependency: continue source_dependencies = \ all_dependencies.setdefault(source_dependency, {}) source_dependencies.setdefault(dependency_type, []) if dependency not in source_dependencies[dependency_type]: source_dependencies[dependency_type].append(dependency) # Create the dependency instances for the current source package. for dependency_name, details in all_dependencies.items(): if dependency_name in all_sources: build_dep = any(dependency_type in details for dependency_type in self.SOURCE_DEPENDENCY_TYPES) binary_dep = any(dependency_type in details for dependency_type in self.BINARY_DEPENDENCY_TYPES) dependency_instances.append( SourcePackageDeps( source=all_sources[source_name], dependency=all_sources[dependency_name], build_dep=build_dep, binary_dep=binary_dep, repository=default_repository, details=details)) return dependency_instances def update_dependencies(self): """ Updates source-to-source package dependencies stemming from build bependencies and their binary packages' dependencies. 
""" # Build the dependency mapping try: default_repository = Repository.objects.get(default=True) except Repository.DoesNotExist: self.log("No default repository, no dependencies created.", level=logging.WARNING) return self.log("Parsing files to discover dependencies") sources_files = self.apt_cache.get_sources_files_for_repository( default_repository) packages_files = self.apt_cache.get_packages_files_for_repository( default_repository) bin_to_src = {} source_to_binary_deps = {} # First builds a list of binary dependencies of all source packages # based on the Sources file. for sources_file in sources_files: with open(sources_file) as sources_fd: for stanza in deb822.Sources.iter_paragraphs(sources_fd): source_name = stanza['package'] for binary in itertools.chain(*stanza.relations['binary']): sources_set = bin_to_src.setdefault(binary['name'], set()) sources_set.add(source_name) dependencies = source_to_binary_deps.setdefault(source_name, []) dependencies.extend(self._update_dependencies_for_source( stanza, self.SOURCE_DEPENDENCY_TYPES)) # Then a list of binary dependencies based on the Packages file. for packages_file in packages_files: with open(packages_file) as packages_fd: for stanza in deb822.Packages.iter_paragraphs(packages_fd): binary_name = stanza['package'] source_name, source_version = \ self.get_source_for_binary(stanza) sources_set = bin_to_src.setdefault(binary_name, set()) sources_set.add(source_name) new_dependencies = self._update_dependencies_for_source( stanza, self.BINARY_DEPENDENCY_TYPES) for dependency in new_dependencies: dependency['source_binary'] = binary_name dependencies = source_to_binary_deps.setdefault(source_name, []) dependencies.extend(new_dependencies) # The binary packages are matched with their source packages and each # source to source dependency created. all_sources = { source.name: source for source in SourcePackageName.objects.all() } self.log("Creating in-memory SourcePackageDeps") # Keeps a list of SourcePackageDeps instances which are to be bulk # created in the end. dependency_instances = \ self._process_source_to_binary_deps(source_to_binary_deps, all_sources, bin_to_src, default_repository) # Create all the model instances in one transaction self.log("Committing SourcePackagesDeps to database") SourcePackageDeps.objects.all().delete() SourcePackageDeps.objects.bulk_create(dependency_instances) @clear_all_events_on_exception def execute(self): self.log("Updating apt's cache") self.apt_cache = AptCache() updated_sources, updated_packages = ( self.apt_cache.update_repositories(self.force_update) ) self.log("Updating data from Sources files") self.update_sources_files(updated_sources) self.log("Updating data from Packages files") self.update_packages_files(updated_packages) self.log("Updating dependencies") self.update_dependencies()
class ExtractSourcePackageFiles(BaseTask, ProcessSourcePackage):
    """
    A task which extracts some files from a new source package version.

    The extracted files are:

    - debian/changelog
    - debian/copyright
    - debian/rules
    - debian/control
    - debian/watch
    """

    class Scheduler(IntervalScheduler):
        interval = 3600

    ALL_FILES_TO_EXTRACT = (
        'changelog',
        'copyright',
        'rules',
        'control',
        'watch',
    )

    def items_extend_queryset(self, queryset):
        return queryset.prefetch_related('extracted_source_files')

    def extract_files(self, source_package, files_to_extract=None):
        """
        Extract files for just the given source package.

        :type source_package: :class:`SourcePackage
            <distro_tracker.core.models.SourcePackage>`
        :type files_to_extract: An iterable of file names which should be
            extracted
        """
        if not hasattr(self, 'cache'):
            self.cache = AptCache()

        source_directory = self.cache.retrieve_source(
            source_package.source_package_name.name,
            source_package.version,
            debian_directory_only=True)
        debian_directory = os.path.join(source_directory, 'debian')

        if files_to_extract is None:
            files_to_extract = self.ALL_FILES_TO_EXTRACT

        for file_name in files_to_extract:
            file_path = os.path.join(debian_directory, file_name)
            if not os.path.exists(file_path):
                continue
            with open(file_path, 'rb') as f:
                extracted_file = File(f)
                ExtractedSourceFile.objects.create(
                    source_package=source_package,
                    extracted_file=extracted_file,
                    name=file_name)

    def execute_main(self):
        # First remove all source files which are no longer to be included.
        qs = ExtractedSourceFile.objects.exclude(
            name__in=self.ALL_FILES_TO_EXTRACT)
        qs.delete()

        # Process pending items
        for srcpkg in self.items_to_process():
            # Save what has been processed when it takes long enough that we
            # had to extend the lock
            if self.extend_lock():
                self.save_data()

            extracted_files = [
                extracted_file.name
                for extracted_file in srcpkg.extracted_source_files.all()
            ]
            files_to_extract = [
                file_name
                for file_name in self.ALL_FILES_TO_EXTRACT
                if file_name not in extracted_files
            ]
            if files_to_extract:
                try:
                    self.extract_files(srcpkg, files_to_extract)
                    self.item_mark_processed(srcpkg)
                except Exception:
                    logger.exception(
                        'Problem extracting source files for %s version %s',
                        srcpkg, srcpkg.version)
            else:
                self.item_mark_processed(srcpkg)
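
# Minimal manual-run sketch for the task above. This assumes a configured
# Django environment and that, as in the newer tasks framework, execute() is
# the public entry point wrapping execute_main(); in production the
# framework drives the task through the IntervalScheduler declared on the
# class:
#
#   task = ExtractSourcePackageFiles()
#   task.execute()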