Example #1
0
    def _push_packages(self, projects: Iterable[List[Package]]) -> None:
        """Persist changed projects and collect per-repository field statistics.

        Walks the stream of project changes, writes package updates and
        hash bookkeeping to the database, and finally records which
        package fields/link types each repository actually uses.
        """
        self._logger.log('updating projects')

        per_repo_stats: Dict[str, FieldStatistics] = defaultdict(FieldStatistics)
        stats = ProjectsChangeStatistics()

        last_logged_total = 0
        accumulator = ChangedProjectsAccumulator(self._database)

        hashes = iter_project_hashes(self._database)
        for change in iter_changed_projects(hashes, projects, stats):
            if isinstance(change, UpdatedProject):
                # guard against pathological projects flooding the database
                if len(change.packages) >= 20000:
                    raise RuntimeError(
                        'sanity check failed, more than 20k packages for a single project'
                    )

                fill_packageset_versions(change.packages)
                adapted = [adapt_package(pkg) for pkg in change.packages]
                self._database.add_packages(adapted)
                self._database.update_project_hash(
                    change.effname, change.hash_)

                for pkg in change.packages:
                    per_repo_stats[pkg.repo].add(pkg)

            elif isinstance(change, RemovedProject):
                self._database.remove_project_hash(change.effname)

            accumulator.add(change.effname)

            # emit a progress line at the start and then roughly every 10k projects
            if last_logged_total == 0 or stats.total - last_logged_total >= 10000:
                self._logger.log(f'  at "{change.effname}": {stats}')
                last_logged_total = stats.total

        accumulator.flush()
        self._logger.log(f'  done: {stats}')

        self._logger.log('updating field statistics')
        for repo_name, field_stats in per_repo_stats.items():
            self._database.update_repository_used_package_fields(
                repo_name,
                field_stats.get_used_fields(),
                field_stats.get_used_link_types())

        # This threshold was picked randomly
        self._enable_explicit_analyze = stats.change_fraction > 0.05
Example #2
0
    def _push_packages(self, projects: Iterable[List[Package]]) -> None:
        """Persist changed projects and collect per-repository field statistics.

        Also decides, from the observed change fraction, whether the
        subsequent database update may run in partial mode and whether an
        explicit ANALYZE pass should be enabled.
        """
        self._logger.log('updating projects')

        per_repo_stats: Dict[str, FieldStatistics] = defaultdict(FieldStatistics)
        stats = ProjectsChangeStatistics()

        last_logged_total = 0
        accumulator = ChangedProjectsAccumulator(self._database)

        hashes = iter_project_hashes(self._database)
        for change in iter_changed_projects(hashes, projects, stats):
            if isinstance(change, UpdatedProject):
                fill_packageset_versions(change.packages)
                self._database.add_packages(change.packages)
                self._database.update_project_hash(
                    change.effname, change.hash_)

                for pkg in change.packages:
                    per_repo_stats[pkg.repo].add(pkg)

            elif isinstance(change, RemovedProject):
                self._database.remove_project_hash(change.effname)

            accumulator.add(change.effname)

            # emit a progress line at the start and then roughly every 10k projects
            if last_logged_total == 0 or stats.total - last_logged_total >= 10000:
                self._logger.log(f'  at "{change.effname}": {stats}')
                last_logged_total = stats.total

        accumulator.flush()
        self._logger.log(f'  done: {stats}')

        self._logger.log('updating field statistics')
        for repo_name, field_stats in per_repo_stats.items():
            self._database.update_repository_used_package_fields(
                repo_name, field_stats.get_used_fields())

        # Fraction picked experimentally: at a change size of around 100k of 400k
        # projects, the time of a partial update of most binding tables approaches
        # or exceeds the full update time. In practice this rarely matters, as a
        # typical update is around 0.001 (0.1%); a few cases of > 0.01 (1%) occur
        # when new repositories are added, and it is 1 (100%) only when the
        # Package format changes or the database is filled for the first time.
        self._enable_partial_update = stats.change_fraction < 0.25

        # This threshold was picked randomly
        self._enable_explicit_analyze = stats.change_fraction > 0.05
Example #3
0
def update_repology(database: Database,
                    projects: Optional[Iterable[List[Package]]],
                    logger: Logger) -> None:
    """Run a full Repology database update cycle.

    When *projects* is provided, changed projects are diffed against stored
    hashes and their packages are pushed into the database; when it is None,
    the project-push phase is skipped and only the derived tables are
    recalculated. The sequence of database.update_* calls below is
    order-sensitive (e.g. packages are applied only after events/statistics
    that read the old package versions) and must not be reordered.

    Fix: corrected the 'updating repositry events' log message typo.
    """
    logger.log('starting the update')
    database.update_start()

    logger.log('updating projects')

    field_stats_per_repo: Dict[str,
                               FieldStatistics] = defaultdict(FieldStatistics)
    stats = ProjectsChangeStatistics()

    if projects is not None:
        prev_total = 0

        changed_projects = ChangedProjectsAccumulator(database)

        for change in iter_changed_projects(iter_project_hashes(database),
                                            projects, stats):
            if isinstance(change, UpdatedProject):
                update_project(database, change)

                for package in change.packages:
                    field_stats_per_repo[package.repo].add(package)

            elif isinstance(change, RemovedProject):
                remove_project(database, change)

            changed_projects.add(change.effname)

            # log progress at the start and roughly every 10k projects
            if stats.total - prev_total >= 10000 or prev_total == 0:
                logger.log(f'  at "{change.effname}": {stats}')
                prev_total = stats.total

        changed_projects.flush()
        logger.log(f'  done: {stats}')

    # Fraction picked experimentally: at a change size of around 100k of 400k
    # projects, the time of a partial update of most binding tables approaches
    # or exceeds the full update time. In practice this doesn't matter much, as
    # a general update is around 0.001 (0.1%), and a few cases of > 0.01 (1%)
    # occur when new repositories are added; otherwise it's 1 (100%) when the
    # Package format changes or when the database is filled for the first time.
    enable_partial = stats.change_fraction < 0.25

    # This was picked randomly
    enable_analyze = stats.change_fraction > 0.05

    logger.log(f'update mode is {"partial" if enable_partial else "full"}')
    logger.log(
        f'explicit analyze is {"enabled" if enable_analyze else "disabled"}')

    logger.log('updating field statistics')
    for repo, field_stats in field_stats_per_repo.items():
        database.update_repository_used_package_fields(
            repo, field_stats.get_used_fields())

    logger.log('preparing updated packages')
    database.update_prepare_packages()

    logger.log('updating projects (precreate)')
    database.update_precreate_projects()

    logger.log('updating maintainers (precreate)')
    database.update_precreate_maintainers()

    logger.log('updating tracks')
    database.update_tracks(enable_partial, enable_analyze)

    logger.log('updating track versions')
    database.update_track_versions(enable_partial, enable_analyze)

    logger.log('updating project releases')
    database.update_project_releases(enable_partial, enable_analyze)

    logger.log('updating project events')
    database.update_project_events()

    logger.log('updating maintainer events')
    database.update_maintainer_events()

    logger.log('updating repository events')
    database.update_repository_events()

    logger.log('updating projects turnover')
    database.update_projects_turnover()

    logger.log('updating links')
    database.update_links()

    logger.log('updating statistics (delta)')
    database.update_statistics_delta()

    # Note: before this, packages table still contains old versions of packages,
    # while new versions reside in incoming_packages temporary table
    logger.log('applying updated packages')
    database.update_apply_packages(enable_partial, enable_analyze)
    # Note: after this, packages table contain new versions of packages

    logger.log('updating metapackages')
    database.update_metapackages()

    logger.log('updating repositories')
    database.update_repositories()

    logger.log('updating maintainers')
    database.update_maintainers()

    logger.log('updating binding table repo_metapackages')
    database.update_binding_repo_metapackages(enable_partial, enable_analyze)

    logger.log('updating binding table category_metapackages')
    database.update_binding_category_metapackages(enable_partial,
                                                  enable_analyze)

    logger.log('updating binding table maintainer_metapackages')
    database.update_binding_maintainer_metapackages(enable_partial,
                                                    enable_analyze)

    logger.log('updating binding table maintainer_and_repo_metapackages')
    database.update_binding_maintainer_and_repo_metapackages(
        enable_partial, enable_analyze)

    logger.log('updating url relations (all)')
    database.update_url_relations_all(enable_partial, enable_analyze)

    logger.log('updating url relations (filtered)')
    database.update_url_relations_filtered(enable_partial, enable_analyze)

    logger.log('updating projects has_related flag')
    database.update_projects_has_related()

    logger.log('updating problems')
    database.update_problems(enable_partial, enable_analyze)

    logger.log('updating problem counts')
    database.update_repositories_problem_counts()

    logger.log('updating statistics (global)')
    database.update_statistics_global()

    logger.log('updating histories')
    database.update_histories()

    logger.log('finalizing the update')
    database.update_finish()