def extract_files(self, source_package, files_to_extract=None):
        """
        Extract files for just the given source package.

        The package's source is retrieved through the task's ``AptCache``
        (created on first use) and every requested file found in its
        ``debian`` directory is stored as an ``ExtractedSourceFile``.
        Requested files which do not exist in the package are skipped.

        :param source_package: The package whose files should be extracted.
        :type source_package: :class:`SourcePackage
            <distro_tracker.core.models.SourcePackage>`
        :param files_to_extract: File names, relative to the ``debian``
            directory, which should be extracted. Defaults to
            ``ALL_FILES_TO_EXTRACT``.
        :type files_to_extract: An iterable of file names
        """
        # ``getattr`` also covers instances on which ``cache`` was never set.
        if getattr(self, 'cache', None) is None:
            self.cache = AptCache()

        source_directory = self.cache.retrieve_source(
            source_package.source_package_name.name,
            source_package.version,
            debian_directory_only=True)
        debian_directory = os.path.join(source_directory, 'debian')

        if files_to_extract is None:
            files_to_extract = self.ALL_FILES_TO_EXTRACT

        for file_name in files_to_extract:
            file_path = os.path.join(debian_directory, file_name)
            if not os.path.exists(file_path):
                continue
            # Binary mode: the content is copied verbatim, so a file which
            # cannot be decoded as text does not break the extraction.
            with open(file_path, 'rb') as f:
                extracted_file = File(f)
                ExtractedSourceFile.objects.create(
                    source_package=source_package,
                    extracted_file=extracted_file,
                    name=file_name)
    def execute(self):
        """
        Refresh apt's cache, then update the data from the Sources files,
        the data from the Packages files and finally the dependencies.
        """
        self.log("Updating apt's cache")
        self.apt_cache = AptCache()
        updated_files = self.apt_cache.update_repositories(self.force_update)
        # The cache reports the updated Sources and Packages files as a pair.
        updated_sources, updated_packages = updated_files

        self.log("Updating data from Sources files")
        self.update_sources_files(updated_sources)

        self.log("Updating data from Packages files")
        self.update_packages_files(updated_packages)

        self.log("Updating dependencies")
        self.update_dependencies()
class ExtractSourcePackageFiles(BaseTask):
    """
    A task which extracts some files from a new source package version.
    The extracted files are:

    - debian/changelog
    - debian/copyright
    - debian/rules
    - debian/control
    - debian/watch
    """
    DEPENDS_ON_EVENTS = (
        'new-source-package-version',
    )

    PRODUCES_EVENTS = (
        'source-files-extracted',
    )

    # Names are relative to the package's ``debian`` directory.
    ALL_FILES_TO_EXTRACT = (
        'changelog',
        'copyright',
        'rules',
        'control',
        'watch',
    )

    def __init__(self, *args, **kwargs):
        super(ExtractSourcePackageFiles, self).__init__(*args, **kwargs)
        # The AptCache is only created when the first package is extracted.
        self.cache = None

    def extract_files(self, source_package, files_to_extract=None):
        """
        Extract files for just the given source package.

        Every requested file found in the package's ``debian`` directory is
        stored as an ``ExtractedSourceFile``; requested files which do not
        exist in the package are skipped.

        :param source_package: The package whose files should be extracted.
        :type source_package: :class:`SourcePackage
            <distro_tracker.core.models.SourcePackage>`
        :param files_to_extract: File names, relative to the ``debian``
            directory, which should be extracted. Defaults to
            :attr:`ALL_FILES_TO_EXTRACT`.
        :type files_to_extract: An iterable of file names
        """
        if self.cache is None:
            self.cache = AptCache()

        source_directory = self.cache.retrieve_source(
            source_package.source_package_name.name,
            source_package.version,
            debian_directory_only=True)
        debian_directory = os.path.join(source_directory, 'debian')

        if files_to_extract is None:
            files_to_extract = self.ALL_FILES_TO_EXTRACT

        for file_name in files_to_extract:
            file_path = os.path.join(debian_directory, file_name)
            if not os.path.exists(file_path):
                continue
            # Binary mode: the content is copied verbatim, so a file which
            # cannot be decoded as text does not break the extraction.
            with open(file_path, 'rb') as f:
                extracted_file = File(f)
                ExtractedSourceFile.objects.create(
                    source_package=source_package,
                    extracted_file=extracted_file,
                    name=file_name)

    def _execute_initial(self):
        """
        When the task is directly ran, instead of relying on events to know
        which packages' source files should be retrieved, the task scans all
        existing source packages and extracts any files which are still
        missing for each of them.
        """
        # First remove all source files which are no longer to be included.
        qs = ExtractedSourceFile.objects.exclude(
            name__in=self.ALL_FILES_TO_EXTRACT)
        qs.delete()

        # Retrieves the packages and all the associated files with each of them
        # in only two db queries. ``prefetch_related`` returns a new query set,
        # so its result has to be the one which is iterated.
        source_packages = SourcePackage.objects.all().prefetch_related(
            'extracted_source_files')

        # Find the difference of packages and extract only those for each
        # package
        for srcpkg in source_packages:
            extracted_files = set(
                extracted_file.name
                for extracted_file in srcpkg.extracted_source_files.all()
            )
            files_to_extract = [
                file_name
                for file_name in self.ALL_FILES_TO_EXTRACT
                if file_name not in extracted_files
            ]
            if files_to_extract:
                try:
                    self.extract_files(srcpkg, files_to_extract)
                except Exception:
                    # One broken package must not stop the others, but
                    # KeyboardInterrupt/SystemExit are left to propagate.
                    logger.exception(
                        'Problem extracting source files for'
                        ' {pkg} version {ver}'.format(
                            pkg=srcpkg, ver=srcpkg.version))

    def execute(self):
        """
        Extract the files of every source package announced by the received
        events and raise ``source-files-extracted`` afterwards. When the task
        is the initial task, all existing packages are scanned instead.
        """
        if self.is_initial_task():
            return self._execute_initial()

        # When the task is not the initial task, then all the packages it
        # should process should come from received events.
        new_version_pks = [
            event.arguments['pk']
            for event in self.get_all_events()
        ]
        source_packages = SourcePackage.objects.filter(pk__in=new_version_pks)
        source_packages = source_packages.select_related()

        for source_package in source_packages:
            try:
                self.extract_files(source_package)
            except Exception:
                # One broken package must not stop the others, but
                # KeyboardInterrupt/SystemExit are left to propagate.
                logger.exception(
                    'Problem extracting source files for'
                    ' {pkg} version {ver}'.format(
                        pkg=source_package, ver=source_package.version))

        self.raise_event('source-files-extracted')
class UpdateRepositoriesTask(PackageUpdateTask):
    """
    Performs an update of repository information.

    New (source and binary) packages are created if necessary and old ones are
    deleted. An event is emitted for each situation, allowing other tasks to
    perform updates based on updated package information.
    """
    PRODUCES_EVENTS = (
        'new-source-package',
        'new-source-package-version',
        'new-source-package-in-repository',
        'new-source-package-version-in-repository',

        'new-binary-package',

        # Source package no longer found in any repository
        'lost-source-package',
        # Source package version no longer found in the given repository
        'lost-source-package-version-in-repository',
        # A particular version of a source package no longer found in any repo
        'lost-version-of-source-package',
        # Binary package name no longer used by any source package
        'lost-binary-package',
    )

    SOURCE_DEPENDENCY_TYPES = ('Build-Depends', 'Build-Depends-Indep')
    BINARY_DEPENDENCY_TYPES = ('Depends', 'Recommends', 'Suggests')

    def __init__(self, *args, **kwargs):
        """
        Initialise the parent task and start with empty bookkeeping lists.
        """
        super(UpdateRepositoriesTask, self).__init__(*args, **kwargs)
        # Ids of the repository entries seen during the current update.
        self._all_repository_entries = list()
        self._all_packages = list()

    def _clear_processed_repository_entries(self):
        """
        Forget all repository entries recorded so far by starting over with
        a new, empty list.
        """
        self._all_repository_entries = list()

    def _add_processed_repository_entry(self, repository_entry):
        """
        Record the id of the given repository entry as processed.

        :param repository_entry: The entry whose ``id`` is remembered.
        """
        processed_ids = self._all_repository_entries
        processed_ids.append(repository_entry.id)

    def _extract_information_from_sources_entry(self, src_pkg, stanza):
        """
        Parse a ``Sources`` stanza and replace the parsed plain values by the
        matching model instances.

        Missing binary package names, email addresses and contributor names
        are created. A ``new-binary-package`` event is raised for each binary
        package name which had to be created.

        :param src_pkg: The source package the stanza describes (not used
            by this method).
        :param stanza: The ``Sources`` entry to parse.
        :returns: The parsed entry with model instances in place of names.
        """
        entry = extract_information_from_sources_entry(stanza)

        if 'architectures' in entry:
            # Unknown architecture names simply do not match and are dropped.
            entry['architectures'] = Architecture.objects.filter(
                name__in=entry['architectures'])

        if 'binary_packages' in entry:
            wanted_names = entry['binary_packages']
            binaries = list(
                BinaryPackageName.objects.filter(name__in=wanted_names))
            known_names = [binary.name for binary in binaries]
            for wanted_name in wanted_names:
                if wanted_name in known_names:
                    continue
                # Make sure the package name exists and is flagged as binary
                # before fetching it as a BinaryPackageName.
                package_name, _ = PackageName.objects.get_or_create(
                    name=wanted_name)
                package_name.binary = True
                package_name.save()
                binaries.append(
                    BinaryPackageName.objects.get(name=wanted_name))
                self.raise_event('new-binary-package', {
                    'name': wanted_name,
                })
            entry['binary_packages'] = binaries

        if 'maintainer' in entry:
            maintainer_email, _ = UserEmail.objects.get_or_create(
                email=entry['maintainer']['email'])
            maintainer, _ = ContributorName.objects.get_or_create(
                contributor_email=maintainer_email,
                name=entry['maintainer'].get('name', ''))
            entry['maintainer'] = maintainer

        if 'uploaders' in entry:
            addresses = [
                uploader['email'] for uploader in entry['uploaders']]
            display_names = [
                uploader.get('name', '') for uploader in entry['uploaders']]
            # One query for all already known addresses; the rest are created
            # one by one below.
            emails_by_address = {
                user_email.email: user_email
                for user_email in UserEmail.objects.filter(
                    email__in=addresses)
            }
            uploaders = []
            for address, display_name in zip(addresses, display_names):
                if address in emails_by_address:
                    contributor_email = emails_by_address[address]
                else:
                    contributor_email, _ = UserEmail.objects.get_or_create(
                        email=address)
                    emails_by_address[address] = contributor_email
                contributor, _ = ContributorName.objects.get_or_create(
                    contributor_email=contributor_email,
                    name=display_name)
                uploaders.append(contributor)

            entry['uploaders'] = uploaders

        return entry

    def _extract_information_from_packages_entry(self, bin_pkg, stanza):
        """
        Parse a ``Packages`` stanza.

        :param bin_pkg: The binary package the stanza describes (not used
            by this method).
        :param stanza: The ``Packages`` entry to parse.
        :returns: The result of ``extract_information_from_packages_entry``
            for the stanza.
        """
        return extract_information_from_packages_entry(stanza)

    def _update_sources_file(self, repository, sources_file):
        """
        Update the tracked source packages from one ``Sources`` file.

        For each stanza accepted by the vendor's ``allow_package`` hook the
        package name and version are created when missing, the version is
        added to the repository when it is not there yet and the repository
        entry is recorded as processed. Events are raised for each newly
        created name, version and repository membership.

        :param repository: The repository the file belongs to.
        :param sources_file: An open ``Sources`` file.
        """
        for stanza in deb822.Sources.iter_paragraphs(sources_file):
            allow, implemented = vendor.call('allow_package', stanza)
            # Only an implemented hook giving an explicit negative answer
            # excludes the package.
            excluded_by_vendor = (
                allow is not None and implemented and not allow)
            if excluded_by_vendor:
                continue

            src_pkg_name, is_new_name = (
                SourcePackageName.objects.get_or_create(
                    name=stanza['package']))
            if is_new_name:
                self.raise_event('new-source-package', {
                    'name': src_pkg_name.name
                })

            src_pkg, is_new_version = SourcePackage.objects.get_or_create(
                source_package_name=src_pkg_name,
                version=stanza['version'])
            if is_new_version:
                self.raise_event('new-source-package-version', {
                    'name': src_pkg.name,
                    'version': src_pkg.version,
                    'pk': src_pkg.pk,
                })
                # A new version has no data yet: fill it in from the stanza.
                extracted = self._extract_information_from_sources_entry(
                    src_pkg, stanza)
                src_pkg.update(**extracted)
                src_pkg.save()

            if repository.has_source_package(src_pkg):
                # Fetch the existing entry to mark that the package version
                # is still in the repository.
                entry = SourcePackageRepositoryEntry.objects.get(
                    repository=repository,
                    source_package=src_pkg)
            else:
                # Is this the first version of the package in the repository?
                if not repository.has_source_package_name(src_pkg.name):
                    self.raise_event('new-source-package-in-repository', {
                        'name': src_pkg.name,
                        'repository': repository.name,
                    })

                entry = repository.add_source_package(
                    src_pkg,
                    priority=stanza.get('priority', ''),
                    section=stanza.get('section', ''))
                self.raise_event('new-source-package-version-in-repository', {
                    'name': src_pkg.name,
                    'version': src_pkg.version,
                    'repository': repository.name,
                })

            self._add_processed_repository_entry(entry)

    def get_source_for_binary(self, stanza):
        """
        Determine which source package built the given binary package.

        The ``source`` field is used when present, otherwise the binary's own
        ``package`` name. A field of the form ``name (version)`` also gives
        the source version; otherwise the binary's ``version`` is used.

        :param stanza: a ``Packages`` file entry
        :returns: A ``(source_name, source_version)`` pair for the binary
            package described by the entry
        """
        if 'source' in stanza:
            source_field = stanza['source']
        else:
            source_field = stanza['package']

        versioned = re.match(r'(.+) \((.+)\)', source_field)
        if not versioned:
            # No explicit source version: it matches the binary's version.
            return source_field, stanza['version']

        return versioned.group(1), versioned.group(2)

    def _update_packages_file(self, repository, packages_file):
        """
        Update the tracked binary packages from one ``Packages`` file.

        For each stanza the binary package name, its source package and the
        binary package version are created when missing, the version is added
        to the repository when it is not there yet and the repository entry
        is recorded as processed.

        :param repository: The repository the file belongs to.
        :param packages_file: An open ``Packages`` file.
        """
        for stanza in deb822.Packages.iter_paragraphs(packages_file):
            bin_pkg_name, _ = BinaryPackageName.objects.get_or_create(
                name=stanza['package'])

            # Find (or create) the source package which built the binary.
            source_name, source_version = self.get_source_for_binary(stanza)
            src_pkg_name, _ = SourcePackageName.objects.get_or_create(
                name=source_name)
            src_pkg, _ = SourcePackage.objects.get_or_create(
                source_package_name=src_pkg_name,
                version=source_version)

            bin_pkg, is_new_version = BinaryPackage.objects.get_or_create(
                binary_package_name=bin_pkg_name,
                version=stanza['version'],
                source_package=src_pkg)
            if is_new_version:
                # A new version has no data yet: fill it in from the stanza.
                extracted = self._extract_information_from_packages_entry(
                    bin_pkg, stanza)
                bin_pkg.update(**extracted)
                bin_pkg.save()

            if repository.has_binary_package(bin_pkg):
                # Fetch the existing entry to mark that the package version
                # is still in the repository.
                entry = BinaryPackageRepositoryEntry.objects.get(
                    repository=repository,
                    binary_package=bin_pkg)
            else:
                architecture, _ = Architecture.objects.get_or_create(
                    name=stanza['architecture'])
                entry = repository.add_binary_package(
                    bin_pkg,
                    priority=stanza.get('priority', ''),
                    section=stanza.get('section', ''),
                    architecture=architecture)

            self._add_processed_repository_entry(entry)

    def _remove_query_set_if_count_zero(self, qs, count_field,
                                        event_generator=None):
        """
        Delete the instances of ``qs`` which have no related ``count_field``
        objects, optionally raising an event for each of them first.

        :param qs: Candidates for deletion.
        :type qs: :class:`QuerySet <django.db.models.query.QuerySet>`

        :param count_field: Name of the field to count; instances whose count
            is ``0`` are deleted.
        :type count_field: string

        :param event_generator: Given a model instance, returns the
            ``(name, arguments)`` pair of the event to raise for it.
        :type event_generator: ``callable``
        """
        unused = qs.annotate(count=models.Count(count_field)).filter(count=0)
        if event_generator:
            for instance in unused:
                event = event_generator(instance)
                self.raise_event(*event)
        unused.delete()

    def _remove_obsolete_packages(self):
        """
        Delete source package versions, source package names and binary
        package names which are no longer referenced, raising the matching
        ``lost-*`` event for each deleted instance.
        """
        def lost_version(source_package):
            return 'lost-version-of-source-package', {
                'name': source_package.name,
                'version': source_package.version,
            }

        def lost_source(package):
            return 'lost-source-package', {
                'name': package.name,
            }

        def lost_binary(binary_package_name):
            return 'lost-binary-package', {
                'name': binary_package_name.name,
            }

        self.log("Removing obsolete source packages")
        # Versions first: removing them may leave names without any version.
        self._remove_query_set_if_count_zero(
            SourcePackage.objects.all(), 'repository', lost_version)
        # Then the names which no longer have any version.
        self._remove_query_set_if_count_zero(
            SourcePackageName.objects.all(), 'source_package_versions',
            lost_source)
        # Finally the binary package names not used by any source package.
        self._remove_query_set_if_count_zero(
            BinaryPackageName.objects.all(), 'sourcepackage', lost_binary)

    def _update_repository_entries(self, all_entries_qs, event_generator=None):
        """
        Delete the repository entries which were not recorded as processed
        during the last update, then reset the list of processed entries.

        :param all_entries_qs: All currently existing entries; the ones
            recorded as processed are kept, the others are deleted.
        :type all_entries_qs:
            :class:`QuerySet <django.db.models.query.QuerySet>`
        :param event_generator: Takes a repository entry and returns a
            two-tuple of ``(event_name, event_arguments)``. When given, the
            returned event is raised for each entry about to be deleted.
        :type event_generator: callable
        """
        # Entries seen in the last update stay; everything else goes.
        stale_entries = all_entries_qs.exclude(
            id__in=self._all_repository_entries)
        if event_generator:
            for stale_entry in stale_entries:
                event = event_generator(stale_entry)
                self.raise_event(*event)
        stale_entries.delete()

        self._clear_processed_repository_entries()

    def extract_package_versions(self, file_name):
        """
        Collect the versions of every package listed in a Deb822 formatted
        file.

        :param file_name: The name of the file to read.
        :type file_name: string
        :returns: A dict mapping each package name to the list of its
            versions, in the order they appear in the file.
        """
        versions_by_package = {}
        with open(file_name, 'r') as packages_file:
            for stanza in deb822.Deb822.iter_paragraphs(packages_file):
                package_name, version = stanza['package'], stanza['version']
                versions_by_package.setdefault(
                    package_name, []).append(version)

        return versions_by_package

    def _mark_file_not_processed(self, repository, file_name, entry_manager):
        """
        Record as processed every repository entry whose package version is
        listed in the given ``Sources`` or ``Packages`` file.

        It is used for files which did not change in the last update, so
        that the package versions found in them are not deleted.

        :param repository: The repository to which the file is associated
        :type repository:
            :class:`Repository <distro_tracker.core.models.Repository>`
        :param file_name: The name of the file whose packages should be kept
        :param entry_manager: The manager which handles the package entries.
        :type entry_manager: :class:`Manager <django.db.models.Manager>`
        """
        versions_by_package = self.extract_package_versions(file_name)

        # A single query set covers all package names found in the file.
        candidates = (
            entry_manager
            .filter_by_package_name(versions_by_package.keys())
            .filter(repository=repository)
            .select_related()
        )
        for candidate in candidates:
            # Other versions of the same package are not kept by this file.
            listed_versions = versions_by_package[candidate.name]
            if candidate.version in listed_versions:
                self._add_processed_repository_entry(candidate)

    def group_files_by_repository(self, cached_files):
        """
        Group file names by the repository they belong to.

        :param cached_files: A list of ``(repository, file_name)`` pairs
        :returns: A dict mapping each repository to the list of its file
            names, in the order they were given.
        """
        files_by_repository = {}
        for repository, file_name in cached_files:
            files_by_repository.setdefault(repository, []).append(file_name)

        return files_by_repository

    def update_sources_files(self, updated_sources):
        """
        Performs an update of tracked packages based on the updated Sources
        files.

        Each repository is handled in its own transaction. Obsolete packages
        are removed in a final transaction once all repositories are done.

        :param updated_sources: A list of ``(repository, sources_file_name)``
            pairs giving the Sources files which were updated and should be
            used to update the Distro Tracker tracked information too.
        """
        def lost_version_in_repository(entry):
            return 'lost-source-package-version-in-repository', {
                'name': entry.source_package.name,
                'version': entry.source_package.version,
                'repository': entry.repository.name,
            }

        files_by_repository = self.group_files_by_repository(updated_sources)

        for repository, updated_files in files_by_repository.items():
            with transaction.atomic():
                self.log("Processing Sources files of %s repository",
                         repository.shorthand)
                # Updated files are parsed again.
                for updated_file in updated_files:
                    with open(updated_file) as sources_fd:
                        self._update_sources_file(repository, sources_fd)

                # Packages listed in the other, unchanged files still exist
                # and only have to be marked as such.
                repository_files = (
                    self.apt_cache.get_sources_files_for_repository(
                        repository))
                for known_file in repository_files:
                    if known_file in updated_files:
                        continue
                    self._mark_file_not_processed(
                        repository,
                        known_file,
                        SourcePackageRepositoryEntry.objects)

                # Everything which was not marked is gone from the repository.
                self._update_repository_entries(
                    SourcePackageRepositoryEntry.objects.filter(
                        repository=repository),
                    lost_version_in_repository)

        with transaction.atomic():
            # When all repositories are handled, update which packages are
            # still found in at least one repository.
            self._remove_obsolete_packages()

    def update_packages_files(self, updated_packages):
        """
        Performs an update of tracked packages based on the updated Packages
        files.

        :param updated_packages: A list of
            ``(repository, packages_file_name)`` pairs giving the Packages
            files which were updated and should be used to update the Distro
            Tracker tracked information too.
        """
        files_by_repository = self.group_files_by_repository(updated_packages)

        for repository, updated_files in files_by_repository.items():
            self.log("Processing Packages files of %s repository",
                     repository.shorthand)
            # Updated files are parsed again.
            for updated_file in updated_files:
                with open(updated_file) as packages_fd:
                    self._update_packages_file(repository, packages_fd)

            # Packages listed in the other, unchanged files still exist and
            # only have to be marked as such.
            repository_files = (
                self.apt_cache.get_packages_files_for_repository(repository))
            for known_file in repository_files:
                if known_file in updated_files:
                    continue
                self._mark_file_not_processed(
                    repository, known_file,
                    BinaryPackageRepositoryEntry.objects)

            # Everything which was not marked is gone from the repository.
            self._update_repository_entries(
                BinaryPackageRepositoryEntry.objects.filter(
                    repository=repository))

    def _update_dependencies_for_source(self,
                                        stanza,
                                        dependency_types):
        """
        List the binary packages which the given ``Packages`` or ``Sources``
        stanza depends on.

        :param stanza: The ``Packages`` or ``Sources`` entry
        :param dependency_types: The dependency types which should be
            considered (e.g. Build-Depends, Recommends, etc.)
        :returns: A list of dicts, one per dependency found, each with the
            keys ``dependency_type`` and ``binary``.
        """
        binary_dependencies = []
        for dependency_type in dependency_types:
            # The Deb822 instance is case sensitive when it comes to relations
            relation_groups = stanza.relations.get(
                dependency_type.lower(), ())
            # Each group lists alternatives; all of them are reported.
            binary_dependencies.extend(
                {
                    'dependency_type': dependency_type,
                    'binary': dependency['name'],
                }
                for dependency in itertools.chain.from_iterable(
                    relation_groups)
            )

        return binary_dependencies

    def _process_source_to_binary_deps(self, source_to_binary_deps, all_sources,
                                       bin_to_src, default_repository):
        """
        Turn per-source lists of binary dependencies into unsaved
        ``SourcePackageDeps`` instances.

        Sources missing from ``all_sources`` are ignored, on either side of
        a dependency, and a source never depends on itself. Note that the
        ``dependency_type`` key is removed from each processed dependency
        dict.

        :param source_to_binary_deps: Maps source names to a list of dicts
            each describing a dependency on a binary package.
        :param all_sources: Maps source names to the objects used as
            ``source`` and ``dependency`` of the created instances.
        :param bin_to_src: Maps binary names to the names of the sources
            providing them.
        :param default_repository: The repository set on each instance.
        :returns: The list of created ``SourcePackageDeps`` instances.
        """
        created = []
        for source_name, dependencies in source_to_binary_deps.items():
            if source_name not in all_sources:
                continue

            # Maps each source depended on to its details, grouped by type.
            details_by_source = {}
            for dependency in dependencies:
                binary_name = dependency['binary']
                dependency_type = dependency.pop('dependency_type')
                if binary_name not in bin_to_src:
                    continue

                for provider in bin_to_src[binary_name]:
                    if provider == source_name:
                        continue

                    same_type = details_by_source.setdefault(
                        provider, {}).setdefault(dependency_type, [])
                    if dependency not in same_type:
                        same_type.append(dependency)

            for dependency_name, details in details_by_source.items():
                if dependency_name not in all_sources:
                    continue
                is_build_dep = any(
                    kind in details for kind in self.SOURCE_DEPENDENCY_TYPES)
                is_binary_dep = any(
                    kind in details for kind in self.BINARY_DEPENDENCY_TYPES)
                created.append(
                    SourcePackageDeps(
                        source=all_sources[source_name],
                        dependency=all_sources[dependency_name],
                        build_dep=is_build_dep,
                        binary_dep=is_binary_dep,
                        repository=default_repository,
                        details=details))

        return created

    def update_dependencies(self):
        """
        Updates source-to-source package dependencies stemming from
        build dependencies and their binary packages' dependencies.

        Only the Sources and Packages files of the default repository are
        considered. When there is no default repository a warning is logged
        and nothing is changed. Otherwise all existing ``SourcePackageDeps``
        are replaced by the newly computed ones.
        """
        # Build the dependency mapping
        try:
            default_repository = Repository.objects.get(default=True)
        except Repository.DoesNotExist:
            self.log("No default repository, no dependencies created.",
                     level=logging.WARNING)
            return

        self.log("Parsing files to discover dependencies")
        sources_files = self.apt_cache.get_sources_files_for_repository(
            default_repository)
        packages_files = self.apt_cache.get_packages_files_for_repository(
            default_repository)

        # Maps binary package names to the set of source names building them.
        bin_to_src = {}
        # Maps source names to the list of their binary dependencies.
        source_to_binary_deps = {}

        # First builds a list of binary dependencies of all source packages
        # based on the Sources file.
        for sources_file in sources_files:
            with open(sources_file) as sources_fd:
                for stanza in deb822.Sources.iter_paragraphs(sources_fd):
                    source_name = stanza['package']

                    for binary in itertools.chain(*stanza.relations['binary']):
                        sources_set = bin_to_src.setdefault(binary['name'],
                                                            set())
                        sources_set.add(source_name)

                    dependencies = source_to_binary_deps.setdefault(source_name,
                                                                    [])
                    dependencies.extend(self._update_dependencies_for_source(
                        stanza,
                        self.SOURCE_DEPENDENCY_TYPES))

        # Then a list of binary dependencies based on the Packages file.
        for packages_file in packages_files:
            with open(packages_file) as packages_fd:
                for stanza in deb822.Packages.iter_paragraphs(packages_fd):
                    binary_name = stanza['package']
                    source_name, source_version = \
                        self.get_source_for_binary(stanza)

                    sources_set = bin_to_src.setdefault(binary_name, set())
                    sources_set.add(source_name)

                    new_dependencies = self._update_dependencies_for_source(
                        stanza,
                        self.BINARY_DEPENDENCY_TYPES)
                    # Remember which binary package declared the dependency.
                    for dependency in new_dependencies:
                        dependency['source_binary'] = binary_name
                    dependencies = source_to_binary_deps.setdefault(source_name,
                                                                    [])
                    dependencies.extend(new_dependencies)

        # The binary packages are matched with their source packages and each
        # source to source dependency created.
        all_sources = {
            source.name: source
            for source in SourcePackageName.objects.all()
        }

        self.log("Creating in-memory SourcePackageDeps")
        # Keeps a list of SourcePackageDeps instances which are to be bulk
        # created in the end.
        dependency_instances = \
            self._process_source_to_binary_deps(source_to_binary_deps,
                                                all_sources, bin_to_src,
                                                default_repository)

        # Replace all the model instances in one transaction, so that a
        # failed insert does not leave the table emptied by the delete.
        self.log("Committing SourcePackagesDeps to database")
        with transaction.atomic():
            SourcePackageDeps.objects.all().delete()
            SourcePackageDeps.objects.bulk_create(dependency_instances)

    @clear_all_events_on_exception
    def execute(self):
        """
        Refresh apt's cache, then update the data from the Sources files,
        the data from the Packages files and finally the dependencies.
        """
        self.log("Updating apt's cache")
        self.apt_cache = AptCache()
        updated_files = self.apt_cache.update_repositories(self.force_update)
        # The cache reports the updated Sources and Packages files as a pair.
        updated_sources, updated_packages = updated_files

        self.log("Updating data from Sources files")
        self.update_sources_files(updated_sources)

        self.log("Updating data from Packages files")
        self.update_packages_files(updated_packages)

        self.log("Updating dependencies")
        self.update_dependencies()
class ExtractSourcePackageFiles(BaseTask, ProcessSourcePackage):
    """
    Task storing a fixed set of files taken from the ``debian`` directory of
    source package versions.

    The files which get extracted are:

    - debian/changelog
    - debian/copyright
    - debian/rules
    - debian/control
    - debian/watch
    """

    class Scheduler(IntervalScheduler):
        # NOTE(review): presumably expressed in seconds (i.e. one hour) --
        # confirm against IntervalScheduler.
        interval = 3600

    # Names, relative to the ``debian`` directory, of the files to extract.
    ALL_FILES_TO_EXTRACT = (
        'changelog', 'copyright', 'rules', 'control', 'watch',
    )

    def items_extend_queryset(self, queryset):
        """
        Return ``queryset`` with the ``extracted_source_files`` relation
        prefetched.
        """
        prefetched = queryset.prefetch_related('extracted_source_files')
        return prefetched

    def extract_files(self, source_package, files_to_extract=None):
        """
        Extract files for just the given source package.

        An :class:`ExtractedSourceFile` is created for every requested file
        found in the ``debian`` directory of the retrieved source; requested
        files which do not exist there are skipped.

        :param source_package: The package whose files are to be extracted.
        :type source_package: :class:`SourcePackage
            <distro_tracker.core.models.SourcePackage>`
        :param files_to_extract: The names of the files to extract. When
            ``None``, every name in :attr:`ALL_FILES_TO_EXTRACT` is used.
        :type files_to_extract: An iterable of file names which should be
            extracted
        """
        wanted_names = files_to_extract
        if wanted_names is None:
            wanted_names = self.ALL_FILES_TO_EXTRACT

        # The apt cache is only instantiated once per task instance, the
        # first time it is needed.
        if not hasattr(self, 'cache'):
            self.cache = AptCache()

        package_name = source_package.source_package_name.name
        unpacked_root = self.cache.retrieve_source(
            package_name,
            source_package.version,
            debian_directory_only=True)
        debian_root = os.path.join(unpacked_root, 'debian')

        for wanted_name in wanted_names:
            candidate_path = os.path.join(debian_root, wanted_name)
            if os.path.exists(candidate_path):
                # Keep the file open while the model instance is created
                # from the wrapped file object.
                with open(candidate_path, 'rb') as stream:
                    ExtractedSourceFile.objects.create(
                        source_package=source_package,
                        extracted_file=File(stream),
                        name=wanted_name)

    def execute_main(self):
        """
        Delete the extracted files whose name is no longer listed in
        :attr:`ALL_FILES_TO_EXTRACT`, then extract the missing files of every
        pending source package.

        A package is marked as processed when it has nothing left to extract
        or when its extraction succeeded. When the extraction raises, the
        problem is logged and the package is not marked as processed.
        """
        # First remove all source files which are no longer to be included.
        ExtractedSourceFile.objects.exclude(
            name__in=self.ALL_FILES_TO_EXTRACT).delete()

        # Process pending items
        for srcpkg in self.items_to_process():
            # Whenever the lock had to be extended, the run has been long
            # enough to warrant saving what has been processed so far.
            if self.extend_lock():
                self.save_data()

            already_stored = [
                stored.name
                for stored in srcpkg.extracted_source_files.all()
            ]
            missing_names = [
                candidate
                for candidate in self.ALL_FILES_TO_EXTRACT
                if candidate not in already_stored
            ]

            if not missing_names:
                # Nothing left to extract for this package.
                self.item_mark_processed(srcpkg)
                continue

            try:
                self.extract_files(srcpkg, missing_names)
                self.item_mark_processed(srcpkg)
            except Exception:
                logger.exception(
                    'Problem extracting source files for %s version %s',
                    srcpkg, srcpkg.version)