Example #1
def LinkProcessorWorker(queue, workerid, options, logger):
    database = Database(options.dsn, readonly=False)

    logger = logger.GetPrefixed('worker{}: '.format(workerid))

    logger.Log('Worker spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.Log('Worker exiting')
            return

        logger.Log('Processing {} urls ({}..{})'.format(
            len(pack), pack[0], pack[-1]))
        for result in GetLinkStatuses(pack,
                                      delay=options.delay,
                                      timeout=options.timeout):
            url, status, redirect, size, location = result
            database.UpdateLinkStatus(url=url,
                                      status=status,
                                      redirect=redirect,
                                      size=size,
                                      location=location)

        database.Commit()
        logger.Log('Done processing {} urls ({}..{})'.format(
            len(pack), pack[0], pack[-1]))
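The worker above drives GetLinkStatuses, which is not shown in these examples. Below is a minimal hypothetical sketch of a compatible generator, assuming only the (url, status, redirect, size, location) tuple shape consumed by the loop above; it is an illustration, not the project's actual implementation.

import time

import requests  # assumption: any HTTP client with a similar interface would do


def GetLinkStatusesSketch(urls, delay, timeout):
    # yield one (url, status, redirect, size, location) tuple per input url
    for url in urls:
        status = redirect = size = location = None
        try:
            response = requests.head(url, timeout=timeout, allow_redirects=False)
            status = response.status_code
            redirect = response.headers.get('Location')
            if 'Content-Length' in response.headers:
                size = int(response.headers['Content-Length'])
            location = response.url
        except requests.RequestException:
            status = -1  # hypothetical convention for transport-level failures
        yield url, status, redirect, size, location
        time.sleep(delay)  # simplified: the real checker throttles per host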
Example #2
def ProcessDatabase(options, logger, repoman, repositories_updated):
    logger.Log('connecting to database')

    db_logger = logger.GetIndented()

    database = Database(options.dsn, readonly=False)
    if options.initdb:
        db_logger.Log('(re)initializing database schema')
        database.CreateSchema()

    if options.database:
        db_logger.Log('clearing the database')
        database.Clear()

        package_queue = []
        num_pushed = 0

        def PackageProcessor(packageset):
            nonlocal package_queue, num_pushed
            FillPackagesetVersions(packageset)
            package_queue.extend(packageset)

            if len(package_queue) >= 1000:
                database.AddPackages(package_queue)
                num_pushed += len(package_queue)
                package_queue = []
                db_logger.Log('  pushed {} packages'.format(num_pushed))

        db_logger.Log('pushing packages to database')
        repoman.StreamDeserializeMulti(processor=PackageProcessor,
                                       reponames=options.reponames)

        # process what's left in the queue
        database.AddPackages(package_queue)

        if options.fetch and options.update and options.parse:
            db_logger.Log('recording repo updates')
            database.MarkRepositoriesUpdated(repositories_updated)
        else:
            db_logger.Log(
                'not recording repo updates, need --fetch --update --parse')

        db_logger.Log('updating views')
        database.UpdateViews()
        database.ExtractLinks()

        db_logger.Log('updating history')
        database.SnapshotRepositoriesHistory()

        db_logger.Log('committing changes')
        database.Commit()

    logger.Log('database processing complete')
Example #3
def get_db():
    # XXX: this is not really a persistent DB connection!
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(config['DSN'],
                                    readonly=False,
                                    autocommit=True)
    return flask.g.database
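As the XXX comment notes, the connection lives on flask.g and therefore only for the current application context. A minimal usage sketch, assuming a standard Flask view plus teardown hook; the route, the GetMetapackageNames query method and the close() call are illustrative assumptions, not part of the original example.

import flask

app = flask.Flask(__name__)


@app.route('/api/v1/metapackage-names')
def metapackage_names():
    database = get_db()  # created lazily, cached on flask.g for this context
    return flask.jsonify(database.GetMetapackageNames())  # hypothetical query method


@app.teardown_appcontext
def close_db(exception):
    # discard the per-context connection, if one was created
    database = flask.g.pop('database', None)
    if database is not None and hasattr(database, 'close'):
        database.close()  # assumption: the Database wrapper exposes close()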
Example #4
def LinkUpdatingWorker(queue, options, logger):
    database = Database(options.dsn, readonly=False)

    logger = logger.GetPrefixed('writer: ')

    logger.Log('Writer spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.Log('Writer exiting')
            return

        for url, status, redirect, size, location in pack:
            database.UpdateLinkStatus(url=url,
                                      status=status,
                                      redirect=redirect,
                                      size=size,
                                      location=location)

        database.Commit()
        logger.Log('Updated {} url(s) ({} .. {})'.format(
            len(pack), pack[0][0], pack[-1][0]))
Example #5
def get_db():
    # XXX: this is not really a persistent DB connection!
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(config['DSN'],
                                    _querymgr,
                                    readonly=False,
                                    autocommit=True,
                                    application_name='repology-app')
    return flask.g.database
Example #6
def iter_project_hashes(database: Database) -> Iterable[ProjectHash]:
    prev_effname = None

    batch_size = 1000

    while True:
        pack = database.get_project_hashes(prev_effname, batch_size)
        if not pack:
            return

        yield from pack
        prev_effname = pack[-1][0]
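iter_project_hashes pages through project hashes in fixed-size batches, using the last effname of each batch as the cursor for the next get_project_hashes call. A short consumption sketch; treating the first element of each ProjectHash as the project name (effname) is an assumption based on pack[-1][0] being used as the pagination key above.

def collect_project_names(database: Database) -> set:
    # stream over all project hashes without materializing them at once
    return {project_hash[0] for project_hash in iter_project_hashes(database)}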
Example #7
def LinkUpdatingWorker(queue, options, querymgr, logger):
    database = Database(options.dsn,
                        querymgr,
                        readonly=False,
                        application_name='repology-linkchecker/writer')

    logger = logger.get_prefixed('writer: ')

    logger.log('Writer spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.log('Writer exiting')
            return

        for url, status, redirect, size, location in pack:
            database.update_link_status(url=url,
                                        status=status,
                                        redirect=redirect,
                                        size=size,
                                        location=location)

        database.commit()
        logger.log('Updated {} url(s) ({} .. {})'.format(
            len(pack), pack[0][0], pack[-1][0]))
Example #8
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn,
                        querymgr,
                        readonly=True,
                        application_name='repology-benchmark')

    reference: Dict[str, float] = {}
    if options.load:
        try:
            with open(options.load, 'rb') as reffile:
                reference = pickle.load(reffile)
        except (OSError, pickle.PickleError):
            # no usable reference data; run without comparison
            pass

    results = []
    for num, (method, name, kwargs) in enumerate(queries):
        if not check_keywords(name, options.keywords):
            continue
        print('===> {}/{}: "{}"\n'.format(num + 1, len(queries), name),
              file=sys.stderr,
              end='')
        results.append(
            (name, run_single_query(database, method, kwargs, options)))
        sys.stderr.flush()

    for name, delta in results:
        change = ''
        if name in reference:
            if max(delta, reference[name]) / min(delta, reference[name]) < (
                    1 + options.epsilon):
                change = ' no change'
            elif delta > reference[name]:
                change = ' \033[0;91m{:.1f}% slower\033[0m'.format(
                    100.0 * delta / reference[name] - 100.0)
            else:
                change = ' \033[0;92m{:.1f}% faster\033[0m'.format(
                    100.0 * reference[name] / delta - 100.0)

            change += ' (was {:.2f}ms)'.format(reference[name] * 1000.0)

        print('{:>50s} {:.2f}ms{}'.format(name, delta * 1000.0, change),
              file=sys.stderr)

    if options.save:
        reference = {name: delta for name, delta in results}
        with open(options.save, 'wb') as reffile:
            pickle.dump(reference, reffile)

    return 0
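run_single_query is referenced above but not shown. A hypothetical timing helper with a compatible signature, assuming the queries table holds either Database method names or callables and that the result is the best wall-clock time in seconds; the 'repeats' option is illustrative only.

import time


def run_single_query_sketch(database, method, kwargs, options):
    # call the query several times and keep the fastest run
    target = getattr(database, method) if isinstance(method, str) else method
    best = None
    for _ in range(getattr(options, 'repeats', 3)):  # hypothetical repetition option
        start = time.perf_counter()
        target(**kwargs)
        elapsed = time.perf_counter() - start
        best = elapsed if best is None else min(best, elapsed)
    return best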
Example #9
def remove_project(database: Database, change: RemovedProject) -> None:
    database.remove_project_hash(change.effname)
Example #10
def update_project(database: Database, change: UpdatedProject) -> None:
    fill_packageset_versions(change.packages)

    database.add_packages(change.packages)

    database.update_project_hash(change.effname, change.hash)
Example #11
def ProcessDatabase(options, logger, repomgr, repoproc, repositories_updated,
                    reponames):
    logger.Log('connecting to database')

    db_logger = logger.GetIndented()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn,
                        querymgr,
                        readonly=False,
                        application_name='repology-update')
    if options.initdb:
        db_logger.Log('(re)initializing database schema')
        database.create_schema()

        db_logger.Log('committing changes')
        database.commit()

    if options.database:
        db_logger.Log('clearing the database')
        database.update_start()

        db_logger.Log('updating repository metadata')
        database.add_repositories(repomgr.GetMetadatas(reponames))

        package_queue = []
        num_pushed = 0
        start_time = timer()

        def PackageProcessor(packageset):
            nonlocal package_queue, num_pushed, start_time
            FillPackagesetVersions(packageset)
            package_queue.extend(packageset)

            if len(package_queue) >= 10000:
                database.add_packages(package_queue)
                num_pushed += len(package_queue)
                package_queue = []
                db_logger.Log(
                    '  pushed {} packages, {:.2f} packages/second'.format(
                        num_pushed, num_pushed / (timer() - start_time)))

        db_logger.Log('pushing packages to database')
        repoproc.StreamDeserializeMulti(processor=PackageProcessor,
                                        reponames=options.reponames)

        # process what's left in the queue
        database.add_packages(package_queue)

        if options.fetch and options.update and options.parse:
            db_logger.Log('recording repo updates')
            database.mark_repositories_updated(repositories_updated)
        else:
            db_logger.Log(
                'not recording repo updates, need --fetch --update --parse')

        db_logger.Log('updating views')
        database.update_finish()

        database.commit()

    if options.postupdate:
        db_logger.Log('performing database post-update actions')
        database.update_post()

        database.commit()

    logger.Log('database processing complete')
Example #12
def Main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn',
                        default=repology.config.DSN,
                        help='database connection params')
    parser.add_argument('--logfile',
                        help='path to log file (log to stderr by default)')

    parser.add_argument('--timeout',
                        type=float,
                        default=60.0,
                        help='timeout for link requests in seconds')
    parser.add_argument('--delay',
                        type=float,
                        default=3.0,
                        help='delay between requests to one host')
    parser.add_argument('--age',
                        type=int,
                        default=365,
                        help='min age for recheck in days')
    parser.add_argument('--packsize',
                        type=int,
                        default=128,
                        help='pack size for link processing')
    parser.add_argument(
        '--maxpacksize',
        type=int,
        help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs',
                        type=int,
                        default=1,
                        help='number of parallel jobs')

    parser.add_argument('--unchecked',
                        action='store_true',
                        help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked',
                        action='store_true',
                        help='only process old (already checked) links')
    parser.add_argument('--failed',
                        action='store_true',
                        help='only process links that were checked and failed')
    parser.add_argument('--succeeded',
                        action='store_true',
                        help='only process links that were checked and succeeded')
    parser.add_argument('--prefix',
                        help='only process links with specified prefix')
    options = parser.parse_args()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

    queue = multiprocessing.Queue(1)
    processpool = [
        multiprocessing.Process(target=LinkProcessorWorker,
                                args=(queue, i, options, logger))
        for i in range(options.jobs)
    ]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded)
        if not urls:
            logger.Log('  No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into the same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(after=urls[-1],
                                              prefix=match.group(1),
                                              recheck_age=options.age * 60 *
                                              60 * 24,
                                              unchecked_only=options.unchecked,
                                              checked_only=options.checked,
                                              failed_only=options.failed,
                                              succeeded_only=options.succeeded)

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log(
                'Skipping {} urls ({}..{}), exceeds max pack size'.format(
                    len(urls), urls[0], urls[-1]))
        else:
            queue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(
                len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    for process in processpool:
        queue.put(None)

    for process in processpool:
        process.join()

    logger.Log('Done')

    return 0
Example #13
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, application_name='repology-gensitemap')

    urls: List[str] = []
    if options.main:
        urls = ['/', '/news', '/statistics', '/about', '/api/v1', '/repositories/']

        urls.extend(('/maintainer/' + name for name in database.get_all_maintainer_names()))
        urls.extend(('/repository/' + name for name in database.get_all_repository_names()))
    elif options.metapackages:
        links_per_metapackage = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.get_all_metapackage_names_by_min_spread_count(num_repos)

            num_urls_total = len(urls) + num_metapackages * links_per_metapackage

            print('Threshold = {}, {} metapackages, {} total urls'.format(num_repos, num_metapackages, num_urls_total), file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print('  Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print('  Giving up, will truncate metapackage list', file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for name in database.get_all_metapackage_names_by_min_spread(num_repos, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')

        # fill the remaining space with less important packages
        for name in database.get_all_metapackage_names_by_spread(num_repos - 1, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)

    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) + '</loc><changefreq>daily</changefreq></url>')
    print('</urlset>')

    return 0
Example #14
def Main():
    options = ParseArguments()

    database = Database(options.dsn, readonly=True)

    urls = []
    if options.main:
        urls = [
            '/', '/news', '/statistics', '/about', '/api/v1', '/repositories/'
        ]

        urls.extend(
            map(lambda row: '/maintainer/' + row[0],
                database.Query('SELECT maintainer FROM maintainers')))
        urls.extend(
            map(lambda row: '/repository/' + row[0],
                database.Query('SELECT name FROM repositories')))
    elif options.metapackages:
        links_per_metapackage = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.Query(
                'SELECT count(DISTINCT effname) FROM metapackage_repocounts WHERE num_families >= %s',
                num_repos)[0][0]

            num_urls_total = len(
                urls) + num_metapackages * links_per_metapackage

            print('Threshold = {}, {} metapackages, {} total urls'.format(
                num_repos, num_metapackages, num_urls_total),
                  file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print('  Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print('  Giving up, will truncate metapackage list',
                      file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families >= %s LIMIT %s',
                num_repos,
            (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')

        # fill the remaining space with less important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families = %s LIMIT %s',
                num_repos - 1,
            (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)

    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) +
              '</loc><changefreq>daily</changefreq></url>')
    print('</urlset>')

    return 0
Example #15
def get_main_database_connection(self) -> Database:
    return Database(self.options.dsn,
                    self.get_query_manager(),
                    readonly=False,
                    application_name='repology-update')
Example #16
def update_repology(database: Database,
                    projects: Optional[Iterable[List[Package]]],
                    logger: Logger) -> None:
    logger.log('starting the update')
    database.update_start()

    logger.log('updating projects')

    field_stats_per_repo: Dict[str,
                               FieldStatistics] = defaultdict(FieldStatistics)
    stats = ProjectsChangeStatistics()

    if projects is not None:
        prev_total = 0

        changed_projects = ChangedProjectsAccumulator(database)

        for change in iter_changed_projects(iter_project_hashes(database),
                                            projects, stats):
            if isinstance(change, UpdatedProject):
                update_project(database, change)

                for package in change.packages:
                    field_stats_per_repo[package.repo].add(package)

            elif isinstance(change, RemovedProject):
                remove_project(database, change)

            changed_projects.add(change.effname)

            if stats.total - prev_total >= 10000 or prev_total == 0:
                logger.log(f'  at "{change.effname}": {stats}')
                prev_total = stats.total

        changed_projects.flush()
        logger.log(f'  done: {stats}')

    # Fraction picked experimentally: at a change size of around 100k out of 400k
    # projects, the time of a partial update of most binding tables approaches or
    # exceeds the full update time. In practice this doesn't matter much: a typical
    # update is around 0.001 (0.1%), a few cases of > 0.01 (1%) occur when new
    # repositories are added, and it is 1 (100%) only when the Package format changes
    # or when the database is filled for the first time.
    enable_partial = stats.change_fraction < 0.25

    # This was picked randomly
    enable_analyze = stats.change_fraction > 0.05

    logger.log(f'update mode is {"partial" if enable_partial else "full"}')
    logger.log(
        f'explicit analyze is {"enabled" if enable_analyze else "disabled"}')

    logger.log('updating field statistics')
    for repo, field_stats in field_stats_per_repo.items():
        database.update_repository_used_package_fields(
            repo, field_stats.get_used_fields())

    logger.log('preparing updated packages')
    database.update_prepare_packages()

    logger.log('updating projects (precreate)')
    database.update_precreate_projects()

    logger.log('updating maintainers (precreate)')
    database.update_precreate_maintainers()

    logger.log('updating tracks')
    database.update_tracks(enable_partial, enable_analyze)

    logger.log('updating track versions')
    database.update_track_versions(enable_partial, enable_analyze)

    logger.log('updating project releases')
    database.update_project_releases(enable_partial, enable_analyze)

    logger.log('updating project events')
    database.update_project_events()

    logger.log('updating maintainer events')
    database.update_maintainer_events()

    logger.log('updating repository events')
    database.update_repository_events()

    logger.log('updating projects turnover')
    database.update_projects_turnover()

    logger.log('updating links')
    database.update_links()

    logger.log('updating statistics (delta)')
    database.update_statistics_delta()

    # Note: before this, packages table still contains old versions of packages,
    # while new versions reside in incoming_packages temporary table
    logger.log('applying updated packages')
    database.update_apply_packages(enable_partial, enable_analyze)
    # Note: after this, packages table contain new versions of packages

    logger.log('updating metapackages')
    database.update_metapackages()

    logger.log('updating repositories')
    database.update_repositories()

    logger.log('updating maintainers')
    database.update_maintainers()

    logger.log('updating binding table repo_metapackages')
    database.update_binding_repo_metapackages(enable_partial, enable_analyze)

    logger.log('updating binding table category_metapackages')
    database.update_binding_category_metapackages(enable_partial,
                                                  enable_analyze)

    logger.log('updating binding table maintainer_metapackages')
    database.update_binding_maintainer_metapackages(enable_partial,
                                                    enable_analyze)

    logger.log('updating binding table maintainer_and_repo_metapackages')
    database.update_binding_maintainer_and_repo_metapackages(
        enable_partial, enable_analyze)

    logger.log('updating url relations (all)')
    database.update_url_relations_all(enable_partial, enable_analyze)

    logger.log('updating url relations (filtered)')
    database.update_url_relations_filtered(enable_partial, enable_analyze)

    logger.log('updating projects has_related flag')
    database.update_projects_has_related()

    logger.log('updating problems')
    database.update_problems(enable_partial, enable_analyze)

    logger.log('updating problem counts')
    database.update_repositories_problem_counts()

    logger.log('updating statistics (global)')
    database.update_statistics_global()

    logger.log('updating histories')
    database.update_histories()

    logger.log('finalizing the update')
    database.update_finish()
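The partial/full decision above hinges on stats.change_fraction. A hypothetical sketch of how such an accumulator could expose that fraction (not the project's actual ProjectsChangeStatistics class):

from dataclasses import dataclass


@dataclass
class ChangeStatisticsSketch:
    added: int = 0
    removed: int = 0
    changed: int = 0
    unchanged: int = 0

    @property
    def total(self) -> int:
        return self.added + self.removed + self.changed + self.unchanged

    @property
    def change_fraction(self) -> float:
        # fraction of projects touched by this update; treated as 1.0 (full)
        # when there is nothing to compare against, e.g. on the first fill
        if self.total == 0:
            return 1.0
        return (self.added + self.removed + self.changed) / self.total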
Example #17
def Main():
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker,
                                     args=(writequeue, options, logger))
    writer.start()

    processpool = [
        multiprocessing.Process(target=LinkProcessingWorker,
                                args=(readqueue, writequeue, i, options,
                                      logger)) for i in range(options.jobs)
    ]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded)
        if not urls:
            logger.Log('  No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into the same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(after=urls[-1],
                                              prefix=match.group(1),
                                              recheck_age=options.age * 60 *
                                              60 * 24,
                                              unchecked_only=options.unchecked,
                                              checked_only=options.checked,
                                              failed_only=options.failed,
                                              succeeded_only=options.succeeded)

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log(
                'Skipping {} urls ({}..{}), exceeds max pack size'.format(
                    len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(
                len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # close workers
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0
Example #18
def Main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-D',
                        '--dsn',
                        default=repology.config.DSN,
                        help='database connection params')

    parser.add_argument('-w',
                        '--www-home',
                        default=repology.config.REPOLOGY_HOME,
                        help='repology www home')
    parser.add_argument('-m',
                        '--max-urls',
                        default=50000,
                        help='max number of urls to generate')

    parser.add_argument('--main',
                        action='store_true',
                        help='generate main sitemap (static pages, maintainers, repositories)')
    parser.add_argument('--metapackages',
                        action='store_true',
                        help='generate metapackages sitemap')

    options = parser.parse_args()

    database = Database(options.dsn, readonly=True)

    urls = []
    if options.main:
        urls = [
            '/', '/news', '/statistics', '/about', '/api/v1', '/repositories/'
        ]

        urls.extend(
            map(lambda row: '/maintainer/' + row[0],
                database.Query('SELECT maintainer FROM maintainers')))
        urls.extend(
            map(lambda row: '/repository/' + row[0],
                database.Query('SELECT name FROM repositories')))
    elif options.metapackages:
        LINKS_PER_METAPACKAGE = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.Query(
                'SELECT count(DISTINCT effname) FROM metapackage_repocounts WHERE num_families >= %s',
                num_repos)[0][0]

            num_urls_total = len(
                urls) + num_metapackages * LINKS_PER_METAPACKAGE

            print('Threshold = {}, {} metapackages, {} total urls'.format(
                num_repos, num_metapackages, num_urls_total),
                  file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print('  Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print('  Giving up, will truncate metapackage list',
                      file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families >= %s LIMIT %s',
                num_repos,
            (options.max_urls - len(urls)) // LINKS_PER_METAPACKAGE):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')

        # fill the remaining space with less important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families = %s LIMIT %s',
                num_repos - 1,
            (options.max_urls - len(urls)) // LINKS_PER_METAPACKAGE):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)

    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')
    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) +
              '</loc><changefreq>daily</changefreq></url>')
    print('</urlset>')

    return 0
Example #19
def flush(self, db: Database, run_id: int) -> None:
    for lineno, (timestamp, formatted_message,
                 severity) in enumerate(self._lines, 1):
        db.add_log_line(run_id, lineno, timestamp,
                        _severity_to_sql(severity), formatted_message)
Example #20
def get_logging_database_connection(self) -> Database:
    return Database(self.options.dsn,
                    self.get_query_manager(),
                    readonly=False,
                    autocommit=True,
                    application_name='repology-update-logging')
Example #21
def get_db():
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(app.config['DSN'],
                                    readonly=False,
                                    autocommit=True)
    return flask.g.database