Beispiel #1
0
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Parses the command line, configures logging, builds a
    `brozzler.worker.BrozzlerWorker` on top of a rethinkdb-backed frontier
    and service registry, installs signal handlers, then runs the worker in
    a dedicated thread and blocks until that thread finishes.

    Args:
        argv: full argument vector, with the program name at index 0; when
            falsy (`None` or empty), `sys.argv` is used instead
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument('-e',
                            '--chrome-exe',
                            dest='chrome_exe',
                            default=suggest_default_chrome_exe(),
                            help='executable to use to invoke chrome')
    # the default is the string '1'; the value is converted with int() when
    # the worker is constructed below
    arg_parser.add_argument(
        '-n',
        '--max-browsers',
        dest='max_browsers',
        default='1',
        help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument('--proxy',
                            dest='proxy',
                            default=None,
                            help='http proxy')
    arg_parser.add_argument(
        '--warcprox-auto',
        dest='warcprox_auto',
        action='store_true',
        help=('when needed, choose an available instance of warcprox from '
              'the rethinkdb service registry'))
    # the --skip-* switches are accepted but hidden from --help output
    arg_parser.add_argument('--skip-extract-outlinks',
                            dest='skip_extract_outlinks',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-visit-hashtags',
                            dest='skip_visit_hashtags',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-youtube-dl',
                            dest='skip_youtube_dl',
                            action='store_true',
                            help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        '''
        SIGQUIT handler: logs the current stack of every thread.

        SIGQUIT is ignored while the dump is being produced and this
        handler is reinstalled afterwards, whether or not the dump
        succeeded.
        '''
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                # a frame may belong to a thread that threading.enumerate()
                # does not list; use .get() so that case reaches the
                # placeholder branch instead of raising KeyError and
                # aborting the whole dump
                thread = threads.get(ident)
                if thread is not None:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info('dumping state (caught signal %s)\n%s',
                         signum, '\n'.join(state_strs))
        except BaseException as e:
            # best effort: a failed dump must never take the worker down
            logging.error('exception dumping state: %s', e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
        proxy=args.proxy,
        warcprox_auto=args.warcprox_auto,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    # SIGQUIT dumps thread stacks; SIGTERM and SIGINT ask the worker to stop
    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    # run the worker in its own thread and wait here for it to finish
    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')
Beispiel #2
0
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Parses the command line, configures logging, calls
    `brozzler.chrome.check_version` on the chosen chrome executable, builds
    a `brozzler.worker.BrozzlerWorker` on top of a rethinkdb-backed frontier
    and service registry, installs signal handlers, then runs the worker in
    a dedicated thread and blocks until that thread finishes.

    Args:
        argv: full argument vector, with the program name at index 0; when
            falsy (`None` or empty), `sys.argv` is used instead
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    # the default is the string '1'; the value is converted with int() when
    # the worker is constructed below
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--warcprox-auto', dest='warcprox_auto', action='store_true',
            help=(
                'when needed, choose an available instance of warcprox from '
                'the rethinkdb service registry'))
    # the --skip-* switches are accepted but hidden from --help output
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl',
            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)
    brozzler.chrome.check_version(args.chrome_exe)

    def dump_state(signum, frame):
        '''
        SIGQUIT handler: logs the current stack of every thread.

        SIGQUIT is ignored while the dump is being produced and this
        handler is reinstalled afterwards, whether or not the dump
        succeeded.
        '''
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                # a frame may belong to a thread that threading.enumerate()
                # does not list; use .get() so that case reaches the
                # placeholder branch instead of raising KeyError and
                # aborting the whole dump
                thread = threads.get(ident)
                if thread is not None:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info(
                    'dumping state (caught signal %s)\n%s',
                    signum, '\n'.join(state_strs))
        except BaseException as e:
            # best effort: a failed dump must never take the worker down
            logging.error('exception dumping state: %s', e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe, proxy=args.proxy,
            warcprox_auto=args.warcprox_auto,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    # SIGQUIT dumps thread stacks; SIGTERM and SIGINT ask the worker to stop
    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    # run the worker in its own thread and wait here for it to finish
    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')