Example 1

# stdlib imports used by this snippet; ParseArguments, FileLogger, StderrLogger,
# Database and the worker functions are defined elsewhere in the project code
import multiprocessing
import re


def Main():
    options = ParseArguments()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

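    # bounded queues (maxsize=10): put() blocks when a queue is full, which
    # keeps the master from running arbitrarily far ahead of the workers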
    readqueue = multiprocessing.Queue(10)
    writequeue = multiprocessing.Queue(10)

    writer = multiprocessing.Process(target=LinkUpdatingWorker,
                                     args=(writequeue, options, logger))
    writer.start()

    processpool = [
        multiprocessing.Process(target=LinkProcessingWorker,
                                args=(readqueue, writequeue, i, options,
                                      logger)) for i in range(options.jobs)
    ]
    for process in processpool:
        process.start()

    # the base logger has already been passed to the workers, so it is safe
    # to append a prefix for the master's own messages here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no prefix restriction by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded)
        if not urls:
            logger.Log('  No more urls to process')
            break

        # Fetch the rest of the urls for the last hostname (no limit this time)
        # so that all urls for one hostname end up in the same, possibly
        # oversized, pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded)

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log(
                'Skipping {} urls ({}..{}), exceeds max pack size'.format(
                    len(urls), urls[0], urls[-1]))
        else:
            readqueue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(
                len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

    # shut down workers: one None sentinel per worker process
    for process in processpool:
        readqueue.put(None)
    for process in processpool:
        process.join()

    # close writer
    writequeue.put(None)
    writer.join()

    logger.Log('Done')

    return 0
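
The worker functions referenced above (LinkProcessingWorker and
LinkUpdatingWorker) are not part of this snippet. A minimal sketch of the
sentinel protocol they need to follow, with a hypothetical CheckUrl() helper
standing in for the real link check and the database write left as a
placeholder, might look like this:

def LinkProcessingWorker(readqueue, writequeue, worker_id, options, logger):
    # consume packs of urls until the None shutdown sentinel arrives
    logger = logger.GetPrefixed('worker {}: '.format(worker_id))
    while True:
        urls = readqueue.get()
        if urls is None:
            break
        # CheckUrl() is hypothetical; it stands for the actual link check
        results = [(url, CheckUrl(url, options)) for url in urls]
        writequeue.put(results)
        logger.Log('processed {} urls'.format(len(urls)))


def LinkUpdatingWorker(writequeue, options, logger):
    # single writer: drain check results until its own None sentinel arrives
    while True:
        results = writequeue.get()
        if results is None:
            break
        for url, status in results:
            pass  # write the check result back to the database (project-specific)
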
Example 2

# stdlib/project imports used by this snippet; FileLogger, StderrLogger,
# Database and LinkProcessorWorker are defined elsewhere in the project code
import argparse
import multiprocessing
import re

import repology.config


def Main():
    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--dsn',
                        default=repology.config.DSN,
                        help='database connection params')
    parser.add_argument('--logfile',
                        help='path to log file (log to stderr by default)')

    parser.add_argument('--timeout',
                        type=float,
                        default=60.0,
                        help='timeout for link requests in seconds')
    parser.add_argument('--delay',
                        type=float,
                        default=3.0,
                        help='delay between requests to one host')
    parser.add_argument('--age',
                        type=int,
                        default=365,
                        help='min age for recheck in days')
    parser.add_argument('--packsize',
                        type=int,
                        default=128,
                        help='pack size for link processing')
    parser.add_argument(
        '--maxpacksize',
        type=int,
        help='max pack size for link processing (useful to skip large hosts)')
    parser.add_argument('--jobs',
                        type=int,
                        default=1,
                        help='number of parallel jobs')

    parser.add_argument('--unchecked',
                        action='store_true',
                        help='only process unchecked (newly discovered) links')
    parser.add_argument('--checked',
                        action='store_true',
                        help='only process old (already checked) links')
    parser.add_argument('--failed',
                        action='store_true',
                        help='only process links that were checked and failed')
    parser.add_argument('--succeeded',
                        action='store_true',
                        help='only process links that were checked and succeeded')
    parser.add_argument('--prefix',
                        help='only process links with specified prefix')
    options = parser.parse_args()

    logger = FileLogger(options.logfile) if options.logfile else StderrLogger()
    database = Database(options.dsn, readonly=True, autocommit=True)

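    # maxsize=1 keeps the master at most one prepared pack ahead of the workers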
    queue = multiprocessing.Queue(1)
    processpool = [
        multiprocessing.Process(target=LinkProcessorWorker,
                                args=(queue, i, options, logger))
        for i in range(options.jobs)
    ]
    for process in processpool:
        process.start()

    # the base logger has already been passed to the workers, so it is safe
    # to append a prefix for the master's own messages here
    logger = logger.GetPrefixed('master: ')

    prev_url = None
    while True:
        # Get pack of links
        logger.Log('Requesting pack of urls')
        urls = database.GetLinksForCheck(
            after=prev_url,
            prefix=options.prefix,  # no prefix restriction by default
            limit=options.packsize,
            recheck_age=options.age * 60 * 60 * 24,
            unchecked_only=options.unchecked,
            checked_only=options.checked,
            failed_only=options.failed,
            succeeded_only=options.succeeded)
        if not urls:
            logger.Log('  No more urls to process')
            break

        # Fetch the rest of the urls for the last hostname (no limit this time)
        # so that all urls for one hostname end up in the same, possibly
        # oversized, pack
        match = re.match('([a-z]+://[^/]+/)', urls[-1])
        if match:
            urls += database.GetLinksForCheck(
                after=urls[-1],
                prefix=match.group(1),
                recheck_age=options.age * 60 * 60 * 24,
                unchecked_only=options.unchecked,
                checked_only=options.checked,
                failed_only=options.failed,
                succeeded_only=options.succeeded)

        # Process
        if options.maxpacksize and len(urls) > options.maxpacksize:
            logger.Log(
                'Skipping {} urls ({}..{}), exceeds max pack size'.format(
                    len(urls), urls[0], urls[-1]))
        else:
            queue.put(urls)
            logger.Log('Enqueued {} urls ({}..{})'.format(
                len(urls), urls[0], urls[-1]))

        prev_url = urls[-1]

    logger.Log('Waiting for child processes to exit')

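    # one None sentinel per worker; each worker exits after consuming one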
    for process in processpool:
        queue.put(None)

    for process in processpool:
        process.join()

    logger.Log('Done')

    return 0
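
The host-grouping step in both examples relies on a simple regular expression
that extracts the scheme-plus-hostname prefix from the last url of a pack; a
quick illustration of what it captures:

import re

match = re.match('([a-z]+://[^/]+/)', 'https://example.com/some/page')
print(match.group(1))  # prints 'https://example.com/'

Requesting a follow-up pack restricted to that prefix (and without a limit)
keeps all urls of one host in the same pack, presumably so that a single
worker can apply the per-host request delay (--delay) to all of them.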