def Main():
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker, args=(writequeue, options, logger))
    writer.start()

    processpool = [
        multiprocessing.Process(target=LinkProcessingWorker, args=(readqueue, writequeue, i, options, logger))
        for i in range(options.jobs)
    ]

    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )

        if not urls:
            logger.Log(' No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into a same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # close workers
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0
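
# The master above communicates with its children purely through queue items:
# each item put on readqueue is a list of urls, and one None per worker acts as
# a shutdown sentinel; results are expected to flow to the writer via
# writequeue, which is likewise closed with a None. A minimal sketch of a
# worker consuming that protocol, assuming the same argument order used when
# spawning LinkProcessingWorker above; the check_link() helper and the
# (url, status) result tuples are illustrative assumptions, not the actual
# implementation.
def _sketch_link_processing_worker(readqueue, writequeue, worker_id, options, logger):
    logger = logger.GetPrefixed('worker {}: '.format(worker_id))
    while True:
        urls = readqueue.get()            # blocks until the master enqueues a pack
        if urls is None:                  # shutdown sentinel from the master
            break
        for url in urls:
            status = check_link(url, timeout=options.timeout)  # hypothetical helper
            writequeue.put((url, status))                      # hand the result to the writer
        logger.Log('Processed {} urls'.format(len(urls)))
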
def Main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn', default=repology.config.DSN, help='database connection params')
    parser.add_argument('--logfile', help='path to log file (log to stderr by default)')
    parser.add_argument('--timeout', type=float, default=60.0, help='timeout for link requests in seconds')
    parser.add_argument('--delay', type=float, default=3.0, help='delay between requests to one host')
    parser.add_argument('--age', type=int, default=365, help='min age for recheck in days')
    parser.add_argument('--packsize', type=int, default=128, help='pack size for link processing')
    parser.add_argument('--maxpacksize', type=int, help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs', type=int, default=1, help='number of parallel jobs')
    parser.add_argument('--unchecked', action='store_true', help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked', action='store_true', help='only process old (already checked) links')
    parser.add_argument('--failed', action='store_true', help='only process links that were checked and failed')
    parser.add_argument('--succeeded', action='store_true', help='only process links that were checked and succeeded')
    parser.add_argument('--prefix', help='only process links with specified prefix')
    options = parser.parse_args()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

    queue = multiprocessing.Queue(1)

    processpool = [
        multiprocessing.Process(target=LinkProcessorWorker, args=(queue, i, options, logger))
        for i in range(options.jobs)
    ]

    for process in processpool:
        process.start()

    # base logger already passed to workers, may append prefix here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no limit by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded
        )

        if not urls:
            logger.Log(' No more urls to process')
            break

        # Get another pack of urls with the last hostname to ensure
        # that all urls for one hostname get into a same large pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded
            )

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log('Skipping {} urls ({}..{}), exceeds max pack size'.format(len(urls), urls[0], urls[-1]))
        else:
            queue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    for process in processpool:
        queue.put(None)
    for process in processpool:
        process.join()

    logger.Log('Done')

    return 0
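
# In both variants above, re.match('([a-z]+://[^/]+/)', ...) extracts the
# scheme-plus-host prefix of the last url in a pack, so that the follow-up
# GetLinksForCheck() call can pull the remaining urls of that host into the
# same pack (presumably so a single worker can honour the per-host --delay).
# A standalone illustration of that extraction; the url and helper name are
# arbitrary examples, not data or code from the project:
import re

def _host_prefix(url):
    # Returns e.g. 'https://example.org/' for 'https://example.org/foo/bar',
    # or None if the url does not look like scheme://host/...
    match = re.match('([a-z]+://[^/]+/)', url)
    return match.group(1) if match else None

assert _host_prefix('https://example.org/foo/bar') == 'https://example.org/'
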