def LinkProcessorWorker(queue, workerid, options, logger):
    database = Database(options.dsn, readonly=False)
    logger = logger.GetPrefixed('worker{}: '.format(workerid))

    logger.Log('Worker spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.Log('Worker exiting')
            return

        logger.Log('Processing {} urls ({}..{})'.format(len(pack), pack[0], pack[-1]))

        for result in GetLinkStatuses(pack, delay=options.delay, timeout=options.timeout):
            url, status, redirect, size, location = result
            database.UpdateLinkStatus(url=url, status=status, redirect=redirect, size=size, location=location)

        database.Commit()
        logger.Log('Done processing {} urls ({}..{})'.format(len(pack), pack[0], pack[-1]))
def ProcessDatabase(options, logger, repoman, repositories_updated):
    logger.Log('connecting to database')

    db_logger = logger.GetIndented()
    database = Database(options.dsn, readonly=False)
    if options.initdb:
        db_logger.Log('(re)initializing database schema')
        database.CreateSchema()

    if options.database:
        db_logger.Log('clearing the database')
        database.Clear()

        package_queue = []
        num_pushed = 0

        def PackageProcessor(packageset):
            nonlocal package_queue, num_pushed
            FillPackagesetVersions(packageset)
            package_queue.extend(packageset)

            if len(package_queue) >= 1000:
                database.AddPackages(package_queue)
                num_pushed += len(package_queue)
                package_queue = []
                db_logger.Log(' pushed {} packages'.format(num_pushed))

        db_logger.Log('pushing packages to database')
        repoman.StreamDeserializeMulti(processor=PackageProcessor, reponames=options.reponames)

        # process what's left in the queue
        database.AddPackages(package_queue)

    if options.fetch and options.update and options.parse:
        db_logger.Log('recording repo updates')
        database.MarkRepositoriesUpdated(repositories_updated)
    else:
        db_logger.Log('not recording repo updates, need --fetch --update --parse')

    db_logger.Log('updating views')
    database.UpdateViews()
    database.ExtractLinks()

    db_logger.Log('updating history')
    database.SnapshotRepositoriesHistory()

    db_logger.Log('committing changes')
    database.Commit()

    logger.Log('database processing complete')
def get_db():
    # XXX: this is not really a persistent DB connection!
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(config['DSN'], readonly=False, autocommit=True)
    return flask.g.database
def LinkUpdatingWorker(queue, options, logger):
    database = Database(options.dsn, readonly=False)
    logger = logger.GetPrefixed('writer: ')

    logger.Log('Writer spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.Log('Writer exiting')
            return

        for url, status, redirect, size, location in pack:
            database.UpdateLinkStatus(url=url, status=status, redirect=redirect, size=size, location=location)

        database.Commit()
        logger.Log('Updated {} url(s) ({} .. {})'.format(len(pack), pack[0][0], pack[-1][0]))
def get_db():
    # XXX: this is not really a persistent DB connection!
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(config['DSN'], _querymgr, readonly=False, autocommit=True, application_name='repology-app')
    return flask.g.database
def iter_project_hashes(database: Database) -> Iterable[ProjectHash]:
    prev_effname = None
    batch_size = 1000

    while True:
        pack = database.get_project_hashes(prev_effname, batch_size)

        if not pack:
            return

        yield from pack
        prev_effname = pack[-1][0]
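The generator above pages through project hashes with keyset pagination: fetch fixed-size batches ordered by a key, remember the last key seen, and stop on an empty batch. A minimal, self-contained sketch of the same pattern follows; fetch_batch and _FAKE_ROWS are hypothetical stand-ins for database.get_project_hashes and its backing table, not part of the repology Database API.

from typing import Iterable, List, Optional, Tuple

# fake, already-sorted data standing in for the project_hashes table
_FAKE_ROWS: List[Tuple[str, int]] = [('proj{:04d}'.format(i), i) for i in range(2500)]

def fetch_batch(after: Optional[str], limit: int) -> List[Tuple[str, int]]:
    # stand-in for database.get_project_hashes(prev_effname, batch_size)
    rows = [row for row in _FAKE_ROWS if after is None or row[0] > after]
    return rows[:limit]

def iter_rows(batch_size: int = 1000) -> Iterable[Tuple[str, int]]:
    prev_key = None
    while True:
        pack = fetch_batch(prev_key, batch_size)
        if not pack:
            return
        yield from pack
        prev_key = pack[-1][0]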
def LinkUpdatingWorker(queue, options, querymgr, logger):
    database = Database(options.dsn, querymgr, readonly=False, application_name='repology-linkchecker/writer')
    logger = logger.get_prefixed('writer: ')

    logger.log('Writer spawned')

    while True:
        pack = queue.get()
        if pack is None:
            logger.log('Writer exiting')
            return

        for url, status, redirect, size, location in pack:
            database.update_link_status(url=url, status=status, redirect=redirect, size=size, location=location)

        database.commit()
        logger.log('Updated {} url(s) ({} .. {})'.format(len(pack), pack[0][0], pack[-1][0]))
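Both writer variants above consume packs from a multiprocessing queue and exit on a None sentinel. A minimal, self-contained sketch of that producer/consumer shutdown pattern using only the standard library; echo_worker and its payload are illustrative names, not repology code.

import multiprocessing

def echo_worker(queue: 'multiprocessing.Queue') -> None:
    while True:
        pack = queue.get()
        if pack is None:  # sentinel: no more work, exit cleanly
            return
        for item in pack:
            print('processed', item)

if __name__ == '__main__':
    queue: 'multiprocessing.Queue' = multiprocessing.Queue(1)
    workers = [multiprocessing.Process(target=echo_worker, args=(queue,)) for _ in range(2)]
    for worker in workers:
        worker.start()

    queue.put(['a', 'b'])
    queue.put(['c'])

    for _ in workers:
        queue.put(None)  # one sentinel per worker
    for worker in workers:
        worker.join()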
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, application_name='repology-benchmark')

    reference: Dict[str, float] = {}
    if options.load:
        try:
            with open(options.load, 'rb') as reffile:
                reference = pickle.load(reffile)
        except (OSError, pickle.PickleError):
            # no usable reference timings; proceed without comparison
            pass

    results = []

    for num, (method, name, kwargs) in enumerate(queries):
        if not check_keywords(name, options.keywords):
            continue
        print('===> {}/{}: "{}"\n'.format(num + 1, len(queries), name), file=sys.stderr, end='')
        results.append((name, run_single_query(database, method, kwargs, options)))
        sys.stderr.flush()

    for name, delta in results:
        change = ''
        if name in reference:
            if max(delta, reference[name]) / min(delta, reference[name]) < (1 + options.epsilon):
                change = ' no change'
            elif delta > reference[name]:
                change = ' \033[0;91m{:.1f}% slower\033[0m'.format(100.0 * delta / reference[name] - 100.0)
            else:
                change = ' \033[0;92m{:.1f}% faster\033[0m'.format(100.0 * reference[name] / delta - 100.0)
            change += ' (was {:.2f}ms)'.format(reference[name] * 1000.0)

        print('{:>50s} {:.2f}ms{}'.format(name, delta * 1000.0, change), file=sys.stderr)

    if options.save:
        reference = {name: delta for name, delta in results}
        with open(options.save, 'wb') as reffile:
            pickle.dump(reference, reffile)

    return 0
def remove_project(database: Database, change: RemovedProject) -> None:
    database.remove_project_hash(change.effname)
def update_project(database: Database, change: UpdatedProject) -> None:
    fill_packageset_versions(change.packages)
    database.add_packages(change.packages)
    database.update_project_hash(change.effname, change.hash)
def ProcessDatabase(options, logger, repomgr, repoproc, repositories_updated, reponames):
    logger.Log('connecting to database')

    db_logger = logger.GetIndented()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=False, application_name='repology-update')

    if options.initdb:
        db_logger.Log('(re)initializing database schema')
        database.create_schema()
        db_logger.Log('committing changes')
        database.commit()

    if options.database:
        db_logger.Log('clearing the database')
        database.update_start()

        db_logger.Log('updating repository metadata')
        database.add_repositories(repomgr.GetMetadatas(reponames))

        package_queue = []
        num_pushed = 0
        start_time = timer()

        def PackageProcessor(packageset):
            nonlocal package_queue, num_pushed, start_time
            FillPackagesetVersions(packageset)
            package_queue.extend(packageset)

            if len(package_queue) >= 10000:
                database.add_packages(package_queue)
                num_pushed += len(package_queue)
                package_queue = []
                db_logger.Log(' pushed {} packages, {:.2f} packages/second'.format(num_pushed, num_pushed / (timer() - start_time)))

        db_logger.Log('pushing packages to database')
        repoproc.StreamDeserializeMulti(processor=PackageProcessor, reponames=options.reponames)

        # process what's left in the queue
        database.add_packages(package_queue)

        if options.fetch and options.update and options.parse:
            db_logger.Log('recording repo updates')
            database.mark_repositories_updated(repositories_updated)
        else:
            db_logger.Log('not recording repo updates, need --fetch --update --parse')

        db_logger.Log('updating views')
        database.update_finish()

        database.commit()

    if options.postupdate:
        db_logger.Log('performing database post-update actions')
        database.update_post()
        database.commit()

    logger.Log('database processing complete')
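PackageProcessor above uses an accumulate-and-flush pattern: buffer incoming packages, push a batch once a threshold is reached, and push whatever remains at the end of the stream. A minimal, self-contained sketch of that pattern; flush_batch and process_stream are hypothetical stand-ins for database.add_packages and the deserialization callback, not repology code.

from typing import List

def flush_batch(batch: List[str]) -> None:
    # stand-in for database.add_packages(package_queue)
    print('flushing {} items'.format(len(batch)))

def process_stream(items: List[str], threshold: int = 3) -> None:
    buffer: List[str] = []
    for item in items:
        buffer.append(item)
        if len(buffer) >= threshold:
            flush_batch(buffer)
            buffer = []
    # flush what's left in the buffer
    if buffer:
        flush_batch(buffer)

process_stream(['a', 'b', 'c', 'd', 'e'])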
def Main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn', default=repology.config.DSN, help='database connection params')
    parser.add_argument('--logfile', help='path to log file (log to stderr by default)')

    parser.add_argument('--timeout', type=float, default=60.0, help='timeout for link requests in seconds')
    parser.add_argument('--delay', type=float, default=3.0, help='delay between requests to one host')
    parser.add_argument('--age', type=int, default=365, help='min age for recheck in days')
    parser.add_argument('--packsize', type=int, default=128, help='pack size for link processing')
    parser.add_argument('--maxpacksize', type=int, help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs', type=int, default=1, help='number of parallel jobs')

    parser.add_argument('--unchecked', action='store_true', help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked', action='store_true', help='only process old (already checked) links')
    parser.add_argument('--failed', action='store_true', help='only process links that were checked and failed')
    parser.add_argument('--succeeded', action='store_true', help='only process links that were checked and succeeded')

    parser.add_argument('--prefix', help='only process links with specified prefix')

    options = parser.parse_args()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()

    database = Database(options.dsn, readonly=True, autocommit=True)

    queue = multiprocessing.Queue(1)
    processpool = [multiprocessing.Process(target=LinkProcessorWorker, args=(queue, i, options, logger)) for i in range(options.jobs)]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,
            # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )

        if not urls:
            logger.Log(' No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into a same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            queue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')
    for process in processpool:
        queue.put(None)
    for process in processpool:
        process.join()

    logger.Log('Done')

    return 0
def main() -> int:
    options = parse_arguments()

    querymgr = QueryManager(options.sql_dir)
    database = Database(options.dsn, querymgr, readonly=True, application_name='repology-gensitemap')

    urls: List[str] = []

    if options.main:
        urls = ['/', '/news', '/statistics', '/about', '/api/v1', '/repositories/']
        urls.extend(('/maintainer/' + name for name in database.get_all_maintainer_names()))
        urls.extend(('/repository/' + name for name in database.get_all_repository_names()))
    elif options.metapackages:
        links_per_metapackage = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.get_all_metapackage_names_by_min_spread_count(num_repos)

            num_urls_total = len(urls) + num_metapackages * links_per_metapackage

            print('Threshold = {}, {} metapackages, {} total urls'.format(num_repos, num_metapackages, num_urls_total), file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print(' Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print(' Giving up, will truncate metapackage list', file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for name in database.get_all_metapackage_names_by_min_spread(num_repos, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')

        # fill the remaining space with less important packages
        for name in database.get_all_metapackage_names_by_spread(num_repos - 1, (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/project/' + name + '/versions')
            urls.append('/project/' + name + '/packages')
            urls.append('/project/' + name + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)
    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) + '</loc><changefreq>daily</changefreq></url>')

    print('</urlset>')

    return 0
def Main():
    options = ParseArguments()

    database = Database(options.dsn, readonly=True)

    urls = []

    if options.main:
        urls = ['/', '/news', '/statistics', '/about', '/api/v1', '/repositories/']
        urls.extend(map(lambda row: '/maintainer/' + row[0], database.Query('SELECT maintainer FROM maintainers')))
        urls.extend(map(lambda row: '/repository/' + row[0], database.Query('SELECT name FROM repositories')))
    elif options.metapackages:
        links_per_metapackage = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.Query(
                'SELECT count(DISTINCT effname) FROM metapackage_repocounts WHERE num_families >= %s',
                num_repos
            )[0][0]

            num_urls_total = len(urls) + num_metapackages * links_per_metapackage

            print('Threshold = {}, {} metapackages, {} total urls'.format(num_repos, num_metapackages, num_urls_total), file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print(' Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print(' Giving up, will truncate metapackage list', file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families >= %s LIMIT %s',
                num_repos,
                (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')

        # fill the remaining space with less important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families = %s LIMIT %s',
                num_repos - 1,
                (options.max_urls - len(urls)) // links_per_metapackage):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)
    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) + '</loc><changefreq>daily</changefreq></url>')

    print('</urlset>')

    return 0
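The sitemap generators above emit XML by printing pre-escaped strings. As an alternative sketch (not the code used above), the same urlset structure can be built with the standard library's xml.etree.ElementTree, which handles escaping and the declaration itself; www_home and urls here are placeholder values.

import sys
import xml.etree.ElementTree as ET

www_home = 'https://example.org'  # placeholder for options.www_home
urls = ['/', '/news', '/project/zsh/versions']  # placeholder url list

urlset = ET.Element('urlset', xmlns='http://www.sitemaps.org/schemas/sitemap/0.9')
for url in urls:
    entry = ET.SubElement(urlset, 'url')
    ET.SubElement(entry, 'loc').text = www_home + url
    ET.SubElement(entry, 'changefreq').text = 'daily'

# serialize to stdout with an XML declaration; escaping is handled by ElementTree
ET.ElementTree(urlset).write(sys.stdout, encoding='unicode', xml_declaration=True)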
def get_main_database_connection(self) -> Database:
    return Database(self.options.dsn, self.get_query_manager(), readonly=False, application_name='repology-update')
def update_repology(database: Database, projects: Optional[Iterable[List[Package]]], logger: Logger) -> None:
    logger.log('starting the update')
    database.update_start()

    logger.log('updating projects')

    field_stats_per_repo: Dict[str, FieldStatistics] = defaultdict(FieldStatistics)
    stats = ProjectsChangeStatistics()

    if projects is not None:
        prev_total = 0

        changed_projects = ChangedProjectsAccumulator(database)

        for change in iter_changed_projects(iter_project_hashes(database), projects, stats):
            if isinstance(change, UpdatedProject):
                update_project(database, change)

                for package in change.packages:
                    field_stats_per_repo[package.repo].add(package)
            elif isinstance(change, RemovedProject):
                remove_project(database, change)

            changed_projects.add(change.effname)

            if stats.total - prev_total >= 10000 or prev_total == 0:
                logger.log(f' at "{change.effname}": {stats}')
                prev_total = stats.total

        changed_projects.flush()

        logger.log(f' done: {stats}')

    # Fraction picked experimentally: at a change size of around 100k out of 400k projects,
    # the time of a partial update of most binding tables approaches or exceeds full update
    # time. In fact this doesn't matter much, as a general update is around 0.001 (0.1%),
    # and the few cases of > 0.01 (1%) are when new repositories are added; otherwise it's
    # 1 (100%) when the Package format changes or when the database is filled for the first time.
    enable_partial = stats.change_fraction < 0.25

    # This was picked randomly
    enable_analyze = stats.change_fraction > 0.05

    logger.log(f'update mode is {"partial" if enable_partial else "full"}')
    logger.log(f'explicit analyze is {"enabled" if enable_analyze else "disabled"}')

    logger.log('updating field statistics')
    for repo, field_stats in field_stats_per_repo.items():
        database.update_repository_used_package_fields(repo, field_stats.get_used_fields())

    logger.log('preparing updated packages')
    database.update_prepare_packages()

    logger.log('updating projects (precreate)')
    database.update_precreate_projects()

    logger.log('updating maintainers (precreate)')
    database.update_precreate_maintainers()

    logger.log('updating tracks')
    database.update_tracks(enable_partial, enable_analyze)

    logger.log('updating track versions')
    database.update_track_versions(enable_partial, enable_analyze)

    logger.log('updating project releases')
    database.update_project_releases(enable_partial, enable_analyze)

    logger.log('updating project events')
    database.update_project_events()

    logger.log('updating maintainer events')
    database.update_maintainer_events()

    logger.log('updating repository events')
    database.update_repository_events()

    logger.log('updating projects turnover')
    database.update_projects_turnover()

    logger.log('updating links')
    database.update_links()

    logger.log('updating statistics (delta)')
    database.update_statistics_delta()

    # Note: before this, the packages table still contains old versions of packages,
    # while new versions reside in the incoming_packages temporary table
    logger.log('applying updated packages')
    database.update_apply_packages(enable_partial, enable_analyze)
    # Note: after this, the packages table contains new versions of packages

    logger.log('updating metapackages')
    database.update_metapackages()

    logger.log('updating repositories')
    database.update_repositories()

    logger.log('updating maintainers')
    database.update_maintainers()

    logger.log('updating binding table repo_metapackages')
    database.update_binding_repo_metapackages(enable_partial, enable_analyze)

    logger.log('updating binding table category_metapackages')
    database.update_binding_category_metapackages(enable_partial, enable_analyze)

    logger.log('updating binding table maintainer_metapackages')
    database.update_binding_maintainer_metapackages(enable_partial, enable_analyze)

    logger.log('updating binding table maintainer_and_repo_metapackages')
    database.update_binding_maintainer_and_repo_metapackages(enable_partial, enable_analyze)

    logger.log('updating url relations (all)')
    database.update_url_relations_all(enable_partial, enable_analyze)

    logger.log('updating url relations (filtered)')
    database.update_url_relations_filtered(enable_partial, enable_analyze)

    logger.log('updating projects has_related flag')
    database.update_projects_has_related()

    logger.log('updating problems')
    database.update_problems(enable_partial, enable_analyze)

    logger.log('updating problem counts')
    database.update_repositories_problem_counts()

    logger.log('updating statistics (global)')
    database.update_statistics_global()

    logger.log('updating histories')
    database.update_histories()

    logger.log('finalizing the update')
    database.update_finish()
def Main():
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()

    database = Database(options.dsn, readonly=True, autocommit=True)

    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker, args=(writequeue, options, logger))
    writer.start()

    processpool = [multiprocessing.Process(target=LinkProcessingWorker, args=(readqueue, writequeue, i, options, logger)) for i in range(options.jobs)]
    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,
            # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )

        if not urls:
            logger.Log(' No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into a same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # close workers
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0
def Main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('-D', '--dsn', default=repology.config.DSN, help='database connection params')
    parser.add_argument('-w', '--www-home', default=repology.config.REPOLOGY_HOME, help='repology www home')
    parser.add_argument('-m', '--max-urls', type=int, default=50000, help='max number of urls to generate')

    parser.add_argument('--main', action='store_true', help='generate maintainers sitemap')
    parser.add_argument('--metapackages', action='store_true', help='generate metapackages sitemap')

    options = parser.parse_args()

    database = Database(options.dsn, readonly=True)

    urls = []

    if options.main:
        urls = ['/', '/news', '/statistics', '/about', '/api/v1', '/repositories/']
        urls.extend(map(lambda row: '/maintainer/' + row[0], database.Query('SELECT maintainer FROM maintainers')))
        urls.extend(map(lambda row: '/repository/' + row[0], database.Query('SELECT name FROM repositories')))
    elif options.metapackages:
        LINKS_PER_METAPACKAGE = 3

        print('Guessing threshold for important metapackages', file=sys.stderr)

        num_repos = 1
        while True:
            num_metapackages = database.Query(
                'SELECT count(DISTINCT effname) FROM metapackage_repocounts WHERE num_families >= %s',
                num_repos
            )[0][0]

            num_urls_total = len(urls) + num_metapackages * LINKS_PER_METAPACKAGE

            print('Threshold = {}, {} metapackages, {} total urls'.format(num_repos, num_metapackages, num_urls_total), file=sys.stderr)

            if num_urls_total <= options.max_urls:
                print(' Looks good', file=sys.stderr)
                break

            if num_repos > 20:
                print(' Giving up, will truncate metapackage list', file=sys.stderr)
                break

            num_repos += 1

        # get most important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families >= %s LIMIT %s',
                num_repos,
                (options.max_urls - len(urls)) // LINKS_PER_METAPACKAGE):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')

        # fill the remaining space with less important packages
        for row in database.Query(
                'SELECT DISTINCT effname FROM metapackage_repocounts WHERE num_families = %s LIMIT %s',
                num_repos - 1,
                (options.max_urls - len(urls)) // LINKS_PER_METAPACKAGE):
            urls.append('/metapackage/' + row[0] + '/versions')
            urls.append('/metapackage/' + row[0] + '/packages')
            urls.append('/metapackage/' + row[0] + '/information')
    else:
        print('Please specify output mode', file=sys.stderr)

    shuffle(urls)

    # write XML
    print('Writing XML', file=sys.stderr)
    print('<?xml version="1.0" encoding="UTF-8"?>')
    print('<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">')

    for url in urls:
        print('<url><loc>' + html.escape(options.www_home + url) + '</loc><changefreq>daily</changefreq></url>')

    print('</urlset>')

    return 0
def flush(self, db: Database, run_id: int) -> None:
    for lineno, (timestamp, formatted_message, severity) in enumerate(self._lines, 1):
        db.add_log_line(run_id, lineno, timestamp, _severity_to_sql(severity), formatted_message)
def get_logging_database_connection(self) -> Database:
    return Database(self.options.dsn, self.get_query_manager(), readonly=False, autocommit=True, application_name='repology-update-logging')
def get_db():
    if not hasattr(flask.g, 'database'):
        flask.g.database = Database(app.config['DSN'], readonly=False, autocommit=True)
    return flask.g.database
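The get_db() helpers above note that the connection is not really persistent: it lives on flask.g for the duration of one application context. The usual Flask complement is a teardown handler that closes whatever was stored there. A minimal sketch, not repology's actual code, assuming the Database wrapper used in the snippets above exposes a close() method (which the snippets do not confirm).

import flask

app = flask.Flask(__name__)

def get_db():
    if 'database' not in flask.g:
        # Database here refers to the wrapper used throughout the snippets above
        flask.g.database = Database(app.config['DSN'], readonly=False, autocommit=True)
    return flask.g.database

@app.teardown_appcontext
def close_db(exception=None):
    database = flask.g.pop('database', None)
    if database is not None:
        database.close()  # assumed method; depends on the Database wrapper's API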