Code example #1
0
File: cli.py  Project: Cloudxtreme/brozzler
def brozzler_ensure_tables():
    '''
    Create any missing rethinkdb tables.

    Brozzler (brozzler-worker, brozzler-new-job, etc) normally creates the
    tables it needs on demand at startup, but when several instances start
    at the same time that can leave duplicate broken tables behind. Running
    this utility as an early step while spinning up a cluster avoids the
    race.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(parser)
    _add_common_options(parser)
    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    servers = args.rethinkdb_servers.split(',')
    rethinker = rethinkstuff.Rethinker(servers, args.rethinkdb_db)

    # instantiating these ensures their tables exist:
    rethinkstuff.ServiceRegistry(rethinker)         # services table
    brozzler.frontier.RethinkDbFrontier(rethinker)  # sites, pages, jobs tables
Code example #2
0
File: cli.py  Project: mouse-reeve/brozzler
def brozzler_new_job():
    '''
    Command line entry point that queues a new brozzler job.

    Reads a yaml brozzler job configuration file and creates job, site, and
    page records in rethinkdb, which brozzler-workers will look at and start
    crawling.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-job - queue new job with brozzler',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    _add_rethinkdb_options(parser)
    _add_common_options(parser)
    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    try:
        brozzler.job.new_job_file(frontier, args.job_conf_file)
    except brozzler.job.InvalidJobConf as e:
        # report yaml validation errors on stderr and exit nonzero
        print('brozzler-new-job: invalid job file:', args.job_conf_file, file=sys.stderr)
        print('  ' + yaml.dump(e.errors).rstrip().replace('\n', '\n  '), file=sys.stderr)
        sys.exit(1)
Code example #3
0
 def _init_brozzler_worker(self, args):
     '''Build a BrozzlerWorker wired to rethinkdb and the local warcprox.'''
     rethinker = rethinkstuff.Rethinker(
             args.rethinkdb_servers.split(","), args.rethinkdb_db)
     # proxy through the warcprox instance this controller manages
     host, port = self.warcprox_controller.proxy.server_address
     return brozzler.worker.BrozzlerWorker(
             brozzler.RethinkDbFrontier(rethinker),
             rethinkstuff.ServiceRegistry(rethinker),
             max_browsers=args.max_browsers,
             chrome_exe=args.chrome_exe,
             proxy='%s:%s' % (host, port),
             enable_warcprox_features=True)
Code example #4
0
def ensure_rethinkdb_tables():
    '''
    Create any missing rethinkdb tables.

    Warcprox normally creates the tables it needs on demand at startup, but
    when multiple instances start at the same time that can produce duplicate
    broken tables. Running this utility as an early step while spinning up a
    cluster avoids the race.
    '''
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        '--rethinkdb-servers', dest='rethinkdb_servers', default='localhost',
        help='rethinkdb servers e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
    parser.add_argument(
        '--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
        help='rethinkdb database name')
    parser.add_argument(
        '-q', '--quiet', dest='log_level', action='store_const',
        default=logging.INFO, const=logging.WARN)
    parser.add_argument(
        '-v', '--verbose', dest='log_level', action='store_const',
        default=logging.INFO, const=logging.DEBUG)
    args = parser.parse_args(args=sys.argv[1:])

    logging.basicConfig(
        stream=sys.stdout, level=args.log_level,
        format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    rethinker = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(','), args.rethinkdb_db)

    # instantiating these ensures their tables exist:
    rethinkstuff.ServiceRegistry(rethinker)       # services table
    warcprox.stats.RethinkStatsDb(rethinker)      # stats table
    warcprox.bigtable.RethinkCaptures(rethinker)  # captures table
Code example #5
0
File: test_cluster.py  Project: mouse-reeve/brozzler
def test_brozzle_site(httpd):
    '''
    End-to-end test: queue a site, wait for brozzler to crawl it, then check
    the rethinkdb captures table and playback through pywb.

    Requires the full cluster to be running (rethinkdb, a brozzler-worker,
    warcprox on port 8000, pywb on port 8880); the ``httpd`` fixture serves
    the pages to crawl.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly; poll for up to 5 minutes
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/' % httpd.server_port,
            'http://localhost:%s/file1.txt' % httpd.server_port }

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb serves back the original payload
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    # use a context manager so the fixture file handle is closed promptly
    # (the original leaked the open file object)
    with open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb') as f:
        expected_payload = f.read()
    assert requests.get(wb_url).content == expected_payload
Code example #6
0
File: cli.py  Project: Cloudxtreme/brozzler
def brozzler_new_site():
    '''
    Command line entry point that registers a single new site to brozzle.

    Takes a seed url and creates the corresponding site and page records in
    rethinkdb, which brozzler-workers will look at and start crawling.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(parser)
    _add_proxy_options(parser)
    parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    _add_common_options(parser)
    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # both options are optional strings on the command line
    time_limit = int(args.time_limit) if args.time_limit else None
    warcprox_meta = json.loads(args.warcprox_meta) if args.warcprox_meta else None
    site = brozzler.Site(
            seed=args.seed, proxy=args.proxy, time_limit=time_limit,
            ignore_robots=args.ignore_robots,
            enable_warcprox_features=args.enable_warcprox_features,
            warcprox_meta=warcprox_meta)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    brozzler.new_site(brozzler.RethinkDbFrontier(rethinker), site)
Code example #7
0
File: test_cluster.py  Project: mouse-reeve/brozzler
def test_services_up():
    '''Check that the expected services are up and running.'''
    # check that rethinkdb is listening and looks sane
    r = rethinkstuff.Rethinker(db='rethinkdb')  # built-in db
    assert len(r.table_list().run()) > 10

    # warcprox (8000), pywb (8880), brozzler webconsole (8881); a failed
    # connect raises an exception, which fails the test
    for port in (8000, 8880, 8881):
        with socket.socket() as s:
            s.connect(('localhost', port))
Code example #8
0
 def r(self):
     '''Lazily create and cache the rethinkstuff.Rethinker connection.'''
     if not hasattr(self, '_r'):
         self._r = rethinkstuff.Rethinker(self.servers, self.db)
     return self._r
Code example #9
0
File: cli.py  Project: Cloudxtreme/brozzler
def brozzler_worker():
    '''
    Main entry point for brozzler: gets sites and pages to brozzle from
    rethinkdb and brozzles them until shutdown is requested or the worker
    thread dies.

    Signal handling: SIGTERM/SIGINT request a clean shutdown; SIGQUIT queues
    a dump of all thread stacks, which the main loop logs.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(__file__),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    def sigterm(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
    def sigint(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')

    # do not print in signal handler to avoid RuntimeError: reentrant call
    state_dump_msgs = []
    def queue_state_dump(signum, frame):
        # temporarily ignore SIGQUIT so a second signal can't re-enter us
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                # use .get: a frame may belong to a thread that has already
                # exited and is no longer in threading.enumerate(); the
                # original threads[ident] raised KeyError there, making the
                # else branch unreachable and aborting the whole dump
                thread = threads.get(ident)
                if thread:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            state_dump_msgs.append(
                    'dumping state (caught signal %s)\n%s' % (
                        signum, '\n'.join(state_strs)))
        except BaseException as e:
            state_dump_msgs.append('exception dumping state: %s' % e)
        finally:
            # reinstall the handler for the next SIGQUIT
            signal.signal(signal.SIGQUIT, queue_state_dump)

    signal.signal(signal.SIGQUIT, queue_state_dump)
    signal.signal(signal.SIGTERM, sigterm)
    signal.signal(signal.SIGINT, sigint)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    service_registry = rethinkstuff.ServiceRegistry(r)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe)

    worker.start()
    try:
        while worker.is_alive():
            # log any queued SIGQUIT state dumps here in the main thread,
            # where logging is safe
            while state_dump_msgs:
                # logging.warning: logging.warn is a deprecated alias
                logging.warning(state_dump_msgs.pop(0))
            time.sleep(0.5)
        logging.critical('worker thread has died, shutting down')
    except brozzler.ShutdownRequested:
        pass
    finally:
        worker.shutdown_now()

    logging.info('brozzler-worker is all done, exiting')
Code example #10
0
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Wires together dedup, stats, an optional kafka capture feed, optional
    playback, the warc writer thread, and the service registry, choosing
    rethinkdb-backed or file-backed implementations depending on args.
    '''
    options = warcprox.Options(**vars(args))

    # fail fast if the configured digest algorithm is unknown to hashlib
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    # listeners are handed to the warc writer thread below
    listeners = []
    if args.rethinkdb_servers:
        r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","),
                                   args.rethinkdb_db)
        if args.rethinkdb_big_table:
            # "big table" mode: the rethinkdb captures table also serves as
            # the dedup index
            captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
            dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db,
                                                              options=options)
            listeners.append(captures_db)
        else:
            dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
            listeners.append(dedup_db)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
        listeners.append(dedup_db)

    if args.rethinkdb_servers:
        # reuses the rethinker `r` created above (same condition as there)
        stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    if args.kafka_broker_list:
        kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(
            args.kafka_broker_list, args.kafka_capture_feed_topic)
        listeners.append(kafka_capture_feed)

    # queue between the proxy (producer) and the warc writer thread (consumer)
    recorded_url_q = queue.Queue(maxsize=args.queue_size)

    # CA name truncated to 64 chars — presumably the x509 commonName length
    # limit; confirm against certauth
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert,
                                                args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca,
                                         recorded_url_q=recorded_url_q,
                                         stats_db=stats_db,
                                         options=options)

    # playback proxy is optional; enabled by --playback-port
    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
            args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
            server_address=(args.address, args.playback_port),
            ca=ca,
            playback_index_db=playback_index_db,
            warcs_dir=args.directory,
            options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    warc_writer_thread = warcprox.writerthread.WarcWriterThread(
        recorded_url_q=recorded_url_q,
        writer_pool=writer_pool,
        dedup_db=dedup_db,
        listeners=listeners,
        options=options)

    if args.rethinkdb_servers:
        svcreg = rethinkstuff.ServiceRegistry(r)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
        proxy,
        warc_writer_thread,
        playback_proxy,
        service_registry=svcreg,
        options=options)

    return controller
Code example #11
0
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn
gunicorn_error_logger = logging.getLogger('gunicorn.error')
app.logger.handlers.extend(gunicorn_error_logger.handlers)
app.logger.setLevel(logging.INFO)

# configure with environment variables
SETTINGS = {
    'RETHINKDB_SERVERS':
    os.environ.get('RETHINKDB_SERVERS', 'localhost').split(','),
    'RETHINKDB_DB':
    os.environ.get('RETHINKDB_DB', 'brozzler'),
    'WAYBACK_BASEURL':
    os.environ.get('WAYBACK_BASEURL', 'http://*****:*****@app.route("/api/sites/<site_id>/queued_count")
@app.route("/api/site/<site_id>/queued_count")
def queued_count(site_id):
    count = r.table("pages").between([site_id, 0, False, r.minval],
                                     [site_id, 0, False, r.maxval],