def brozzler_ensure_tables():
    '''
    Pre-creates the rethinkdb tables brozzler uses, if they are missing.

    The brozzler commands (brozzler-worker, brozzler-new-job, etc) create
    their tables on demand at startup, but concurrent startups can race and
    leave duplicate broken tables behind. Running this utility as an early
    step when spinning up a cluster avoids that race.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    # instantiating these creates their tables as a side effect
    rethinkstuff.ServiceRegistry(rethinker)          # services table
    brozzler.frontier.RethinkDbFrontier(rethinker)   # sites, pages, jobs tables
def brozzler_new_job():
    '''
    Command line entry point that queues a new brozzler job.

    Reads a yaml job configuration file and creates the corresponding job,
    site, and page records in rethinkdb, where brozzler-workers will pick
    them up and start crawling. Exits with status 1 if the job file fails
    validation.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-job - queue new job with brozzler',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            'job_conf_file', metavar='JOB_CONF_FILE',
            help='brozzler job configuration file in yaml')
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    try:
        brozzler.job.new_job_file(frontier, args.job_conf_file)
    except brozzler.job.InvalidJobConf as e:
        # report the validation errors, indented, then bail
        print('brozzler-new-job: invalid job file:', args.job_conf_file,
              file=sys.stderr)
        print(' ' + yaml.dump(e.errors).rstrip().replace('\n', '\n '),
              file=sys.stderr)
        sys.exit(1)
def _init_brozzler_worker(self, args):
    '''
    Builds a BrozzlerWorker wired to rethinkdb and to this instance's
    warcprox proxy, configured from the parsed command line `args`.
    '''
    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    # proxy address tuple is (host, port); warcprox features are always on
    # in this combined setup
    return brozzler.worker.BrozzlerWorker(
            brozzler.RethinkDbFrontier(rethinker),
            rethinkstuff.ServiceRegistry(rethinker),
            max_browsers=args.max_browsers,
            chrome_exe=args.chrome_exe,
            proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
            enable_warcprox_features=True)
def ensure_rethinkdb_tables():
    '''
    Pre-creates the rethinkdb tables warcprox uses, if they are missing.

    Warcprox normally creates its tables on demand at startup, but multiple
    instances starting at the same time can race and produce duplicate
    broken tables, so run this early when spinning up a cluster.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
            '--rethinkdb-servers', dest='rethinkdb_servers',
            default='localhost',
            help='rethinkdb servers e.g. db0.foo.org,db0.foo.org:38015,db1.foo.org')
    parser.add_argument(
            '--rethinkdb-db', dest='rethinkdb_db', default='warcprox',
            help='rethinkdb database name')
    parser.add_argument(
            '-q', '--quiet', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.WARN)
    parser.add_argument(
            '-v', '--verbose', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.DEBUG)
    args = parser.parse_args(args=sys.argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level,
            format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                    '(%(filename)s:%(lineno)d) %(message)s'))

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    # instantiating these creates their tables as a side effect
    rethinkstuff.ServiceRegistry(rethinker)      # services table
    warcprox.stats.RethinkStatsDb(rethinker)     # stats table
    warcprox.bigtable.RethinkCaptures(rethinker) # captures table
def test_brozzle_site(httpd):
    '''
    End-to-end crawl test: registers a two-page site, waits for brozzler to
    finish it, then verifies the captures table and pywb playback.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    rethinker = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly; poll up to five minutes
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got exactly the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    # take a look at the captures table (ignoring HEAD requests)
    captures = rethinker.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb serves back the captured payload
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def brozzler_new_site():
    '''
    Command line entry point that registers a single new site to brozzle.

    Takes a seed url, creates site and page records in rethinkdb, and
    brozzler-workers will pick them up and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta', help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    _add_common_options(arg_parser)
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # optional args arrive as strings; convert before building the site
    time_limit = int(args.time_limit) if args.time_limit else None
    warcprox_meta = json.loads(args.warcprox_meta) if args.warcprox_meta else None
    site = brozzler.Site(
            seed=args.seed, proxy=args.proxy, time_limit=time_limit,
            ignore_robots=args.ignore_robots,
            enable_warcprox_features=args.enable_warcprox_features,
            warcprox_meta=warcprox_meta)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)
def test_services_up():
    '''Check that the expected services are up and running.'''
    # check that rethinkdb is listening and looks sane
    rethinker = rethinkstuff.Rethinker(db='rethinkdb')  # built-in db
    assert len(rethinker.table_list().run()) > 10

    # check that warcprox (8000), pywb (8880), and the brozzler webconsole
    # (8881) are listening; a failed connect raises and the test fails
    for port in (8000, 8880, 8881):
        with socket.socket() as s:
            s.connect(('localhost', port))
def r(self):
    # lazily create the rethinkdb connection manager on first access and
    # memoize it on the instance for subsequent calls
    if not hasattr(self, '_r'):
        self._r = rethinkstuff.Rethinker(self.servers, self.db)
    return self._r
def brozzler_worker():
    '''
    Main entrypoint for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Installs signal handlers: SIGTERM/SIGINT request a clean shutdown, and
    SIGQUIT queues a dump of all thread stacks (printed from the main loop).
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(__file__),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    _add_common_options(arg_parser)
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    def sigterm(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
    def sigint(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')

    # do not print in signal handler to avoid RuntimeError: reentrant call;
    # messages are queued here and logged from the main loop below
    state_dump_msgs = []
    def queue_state_dump(signum, frame):
        # ignore further SIGQUITs while a dump is in progress
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            state_dump_msgs.append(
                    'dumping state (caught signal %s)\n%s' % (
                        signum, '\n'.join(state_strs)))
        except BaseException as e:
            state_dump_msgs.append('exception dumping state: %s' % e)
        finally:
            # re-install the handler for the next SIGQUIT
            signal.signal(signal.SIGQUIT, queue_state_dump)

    signal.signal(signal.SIGQUIT, queue_state_dump)
    signal.signal(signal.SIGTERM, sigterm)
    signal.signal(signal.SIGINT, sigint)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    service_registry = rethinkstuff.ServiceRegistry(r)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry,
            max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe)

    worker.start()
    try:
        while worker.is_alive():
            while state_dump_msgs:
                # logging.warning, not the deprecated logging.warn alias
                logging.warning(state_dump_msgs.pop(0))
            time.sleep(0.5)
        logging.critical('worker thread has died, shutting down')
    except brozzler.ShutdownRequested:
        pass
    finally:
        worker.shutdown_now()

    logging.info('brozzler-worker is all done, exiting')
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).
    '''
    options = warcprox.Options(**vars(args))

    # fail fast on an unknown digest algorithm before starting anything
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    # listeners collect everything that should observe recorded urls
    listeners = []

    # dedup backend: rethinkdb (big captures table or dedup table),
    # disabled, or a local dedup db file
    if args.rethinkdb_servers:
        # NOTE: `r` is reused by the stats and service-registry branches below
        r = rethinkstuff.Rethinker(args.rethinkdb_servers.split(","),
                args.rethinkdb_db)
        if args.rethinkdb_big_table:
            captures_db = warcprox.bigtable.RethinkCaptures(r, options=options)
            dedup_db = warcprox.bigtable.RethinkCapturesDedup(captures_db,
                    options=options)
            listeners.append(captures_db)
        else:
            dedup_db = warcprox.dedup.RethinkDedupDb(r, options=options)
            listeners.append(dedup_db)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
        listeners.append(dedup_db)

    # stats backend: rethinkdb, disabled, or a local stats db file
    if args.rethinkdb_servers:
        stats_db = warcprox.stats.RethinkStatsDb(r, options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    # optional kafka feed of capture records
    if args.kafka_broker_list:
        kafka_capture_feed = warcprox.kafkafeed.CaptureFeed(
                args.kafka_broker_list, args.kafka_capture_feed_topic)
        listeners.append(kafka_capture_feed)

    # queue between the proxy (producer) and the warc writer thread (consumer)
    recorded_url_q = queue.Queue(maxsize=args.queue_size)

    # certificate authority for on-the-fly https interception certs;
    # CA common name is capped at 64 chars
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert, args.certs_dir,
            ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca, recorded_url_q=recorded_url_q,
            stats_db=stats_db, options=options)

    # optional playback proxy sharing the same CA and warcs directory
    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                server_address=(args.address, args.playback_port), ca=ca,
                playback_index_db=playback_index_db, warcs_dir=args.directory,
                options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    warc_writer_thread = warcprox.writerthread.WarcWriterThread(
            recorded_url_q=recorded_url_q, writer_pool=writer_pool,
            dedup_db=dedup_db, listeners=listeners, options=options)

    # register in the cluster service registry only when rethinkdb is in use
    if args.rethinkdb_servers:
        svcreg = rethinkstuff.ServiceRegistry(r)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_thread, playback_proxy,
            service_registry=svcreg, options=options)

    return controller
# http://stackoverflow.com/questions/26578733/why-is-flask-application-not-creating-any-logs-when-hosted-by-gunicorn gunicorn_error_logger = logging.getLogger('gunicorn.error') app.logger.handlers.extend(gunicorn_error_logger.handlers) app.logger.setLevel(logging.INFO) # configure with environment variables SETTINGS = { 'RETHINKDB_SERVERS': os.environ.get('RETHINKDB_SERVERS', 'localhost').split(','), 'RETHINKDB_DB': os.environ.get('RETHINKDB_DB', 'brozzler'), 'WAYBACK_BASEURL': os.environ.get('WAYBACK_BASEURL', 'http://*****:*****@app.route("/api/sites/<site_id>/queued_count") @app.route("/api/site/<site_id>/queued_count") def queued_count(site_id): count = r.table("pages").between([site_id, 0, False, r.minval], [site_id, 0, False, r.maxval],