def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
    '''
    TroughClient constructor

    Args:
        rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
            trough configuration database
        promotion_interval: if specified, `TroughClient` will spawn a thread
            that "promotes" (pushes to hdfs) "dirty" trough segments
            (segments that have received writes) periodically, sleeping for
            `promotion_interval` seconds between cycles (default None)
    '''
    parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
    self.rr = doublethink.Rethinker(
            servers=parsed.hosts, db=parsed.database)
    self.svcreg = doublethink.ServiceRegistry(self.rr)
    self._write_url_cache = {}
    self._read_url_cache = {}
    self._dirty_segments = set()
    self._dirty_segments_lock = threading.RLock()

    self.promotion_interval = promotion_interval
    self._promoter_thread = None
    if promotion_interval:
        self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter')
        self._promoter_thread.setDaemon(True)
        self._promoter_thread.start()
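A usage sketch for the constructor above; the class name TroughClient comes from the docstring, while the hostname and interval here are placeholders:

# hypothetical url; with promotion_interval set, a daemon thread promotes
# dirty segments to hdfs every 10 minutes
client = TroughClient(
        'rethinkdb://db0.example.org:28015/trough_configuration',
        promotion_interval=600)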
def __init__(self):
    self.rethinker = doublethink.Rethinker(
            db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    self.registry = trough.sync.HostRegistry(
            rethinker=self.rethinker, services=self.services)
    trough.sync.init(self.rethinker)
def brozzler_ensure_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Brozzler
    (brozzler-worker, brozzler-new-job, etc) normally creates the tables it
    needs on demand at startup, but if multiple instances are starting up at
    the same time, you can end up with duplicate broken tables. So it's a
    good idea to use this utility at an early step when spinning up a
    cluster.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)

    # services table
    doublethink.ServiceRegistry(rr)

    # sites, pages, jobs tables
    brozzler.frontier.RethinkDbFrontier(rr)
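An invocation sketch for the utility above, passing argv explicitly instead of relying on sys.argv. The flag names are assumed to be the ones add_rethinkdb_options registers (the dest names rethinkdb_servers and rethinkdb_db appear in a later snippet); the hostname is a placeholder:

# hypothetical argv; argv[0] is only used for the program name in help output
brozzler_ensure_tables([
    'brozzler-ensure-tables',
    '--rethinkdb-servers=db0.example.org:28015',
    '--rethinkdb-db=brozzler',
])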
def test_unique_service(rr):
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.unique_service('example-role') == None

    # this raises an exception: no ttl.
    with pytest.raises(Exception) as excinfo:
        svcreg.unique_service('example-role', candidate={})

    svc01 = {
        "role": "example-role",
        "ttl": 1.2,
        "node": "test01.example.com",
        "foo": "bar",
    }
    svc02 = {
        "role": "example-role",
        "ttl": 1.2,
        "node": "test02.example.com",
        "baz": "quux",
    }

    # register svc01. output should be svc01.
    output = svcreg.unique_service('example-role', candidate=svc01)
    assert output['node'] == svc01['node']

    # try to register svc02. output should still be svc01.
    output = svcreg.unique_service('example-role', candidate=svc02)
    assert output['node'] == svc01['node']

    time.sleep(0.2)

    output1 = svcreg.unique_service('example-role', candidate=svc01)
    assert output1['last_heartbeat'] > output1['first_heartbeat']
    output2 = svcreg.unique_service('example-role', candidate=svc02)
    assert output1['last_heartbeat'] == output2['last_heartbeat']

    time.sleep(0.2)

    output3 = svcreg.unique_service('example-role', candidate=svc01)
    assert output3['last_heartbeat'] > output1['last_heartbeat']

    svcreg.unregister('example-role')
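Outside of tests, unique_service is typically used to claim or look up a singleton role; a minimal sketch, assuming `rr` is a doublethink.Rethinker and 'my-unique-role' is a hypothetical role name. The candidate must carry a ttl, and the call returns whichever service currently holds the role:

import socket
import doublethink

svcreg = doublethink.ServiceRegistry(rr)
candidate = {
    'role': 'my-unique-role',       # hypothetical role name
    'ttl': 60,                      # required; seconds before entry expires
    'node': socket.gethostname(),   # extra fields are stored alongside
}
current = svcreg.unique_service('my-unique-role', candidate=candidate)
if current and current['node'] == candidate['node']:
    pass  # this process holds the unique role until its ttl lapses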
def test_proxy_for_write_segment(self, requests):
    def post(*args, **kwargs):
        response = mock.Mock()
        response.headers = {"Content-Type": "application/json"}
        response.iter_content = lambda: (b"test", b"output")
        response.status_code = 200
        response.__enter__ = lambda *args, **kwargs: response
        response.__exit__ = lambda *args, **kwargs: None
        return response
    requests.post = post
    consul = mock.Mock()
    registry = mock.Mock()
    rethinker = doublethink.Rethinker(
            db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
    services = doublethink.ServiceRegistry(rethinker)
    segment = trough.sync.Segment(
            segment_id="TEST", rethinker=rethinker, services=services,
            registry=registry, size=0)
    output = self.server.proxy_for_write_host(
            'localhost', segment, "SELECT * FROM mock;",
            start_response=lambda *args, **kwargs: None)
    self.assertEqual(list(output), [b"test", b"output"])
def test_leader_election(rr):
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.leader('example-role') == None

    # this raises an exception: no heartbeat_interval.
    with pytest.raises(Exception) as excinfo:
        svcreg.leader('example-role', default={})

    svc01 = {
        "role": "example-role",
        "load": 0.0,
        "heartbeat_interval": 0.4,
        "node": "test01.example.com",
    }
    svc02 = {
        "role": "example-role",
        "load": 0.0,
        "heartbeat_interval": 0.4,
        "node": "test02.example.com",
    }

    # register svc01. output should be svc01.
    output = svcreg.leader('example-role', default=svc01)
    assert output['node'] == svc01['node']

    # try to register svc02. output should still be svc01.
    output = svcreg.leader('example-role', default=svc02)
    assert output['node'] == svc01['node']

    svcreg.unregister('example-role')
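A sketch of the election loop this test implies, assuming each node periodically calls leader() with itself as the default; the first registrant wins and keeps the role while it keeps heartbeating. The role name and iteration count are placeholders:

import socket
import time
import doublethink

svcreg = doublethink.ServiceRegistry(rr)   # rr: a doublethink.Rethinker
me = {
    'role': 'example-leader-role',   # hypothetical role name
    'load': 0.0,
    'heartbeat_interval': 30,        # required for leader()
    'node': socket.gethostname(),
}
for _ in range(10):                  # in practice, loop for process lifetime
    leader = svcreg.leader('example-leader-role', default=me)
    if leader['node'] == me['node']:
        pass                         # we are the leader; do leader-only work
    time.sleep(me['heartbeat_interval'])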
def setUp(self):
    self.rethinker = doublethink.Rethinker(
            db=random_db, servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    self.registry = sync.HostRegistry(
            rethinker=self.rethinker, services=self.services)
    self.snakebite_client = mock.Mock()
    self.rethinker.table("services").delete().run()
def setUp(self):
    self.rethinker = doublethink.Rethinker(
            db=random_db, servers=settings['RETHINKDB_HOSTS'])
    self.services = doublethink.ServiceRegistry(self.rethinker)
    sync.init(self.rethinker)
    self.rethinker.table("services").delete().run()
    self.rethinker.table("lock").delete().run()
    self.rethinker.table("assignment").delete().run()
def service_registry(options):
    if options.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        return doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        return None
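For reference, a sketch of the pieces parse_rethinkdb_url yields for the rethinkdb:// scheme used throughout these snippets. The attribute names (hosts, database, table) are taken from the surrounding code; the exact host normalization is an assumption:

import doublethink

parsed = doublethink.parse_rethinkdb_url(
        'rethinkdb://db0.example.org,db1.example.org:38015/my_db/my_table')
# assumed results, matching how the snippets consume them:
# parsed.hosts    -> list of server addresses (the two hosts above)
# parsed.database -> 'my_db'
# parsed.table    -> 'my_table'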
def _init_brozzler_worker(self, args):
    rr = doublethink.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, chrome_exe=args.chrome_exe,
            proxy='%s:%s' % self.warcprox_controller.proxy.server_address,
            max_browsers=args.max_browsers)
    return worker
def _do_write(self, query):
    # if self._write_url is not yet set, send a provision query to the
    # sync master and store the resulting write url in self._write_url;
    # then send the query to that server and expect an OK response
    rethinker = doublethink.Rethinker(
            db="trough_configuration", servers=self.rethinkdb)
    services = doublethink.ServiceRegistry(rethinker)
    master_node = services.unique_service('trough-sync-master')
    logging.info('master_node=%r', master_node)
    if not master_node:
        raise Exception(
                'no healthy trough-sync-master in service registry')

    if not self._write_url:
        buffer = BytesIO()
        c = pycurl.Curl()
        c.setopt(c.URL, master_node.get('url'))
        c.setopt(c.POSTFIELDS, self.database)
        if self.proxy:
            c.setopt(pycurl.PROXY, self.proxy)
            c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
            c.setopt(pycurl.PROXYTYPE, self.proxy_type)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
        c.close()
        self._write_url = buffer.getvalue()
        logging.info('self._write_url=%r', self._write_url)

    buffer = BytesIO()
    c = pycurl.Curl()
    c.setopt(c.URL, self._write_url)
    c.setopt(c.POSTFIELDS, query)
    if self.proxy:
        c.setopt(pycurl.PROXY, self.proxy)
        c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
        c.setopt(pycurl.PROXYTYPE, self.proxy_type)
    c.setopt(c.WRITEDATA, buffer)
    c.perform()
    c.close()
    response = buffer.getvalue()
    if response.strip() != b'OK':
        raise Exception(
                'Trough Query Failed: Database: %r Response: %r Query: %.200r'
                % (self.database, response, query))
    self._last_results = None
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this kind of error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()

    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000, 'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000, 'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000

    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh()

    # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
            'http://localhost:%s/site1/' % httpd.server_port,
            'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        expected_payload = open(os.path.join(
                os.path.dirname(__file__), 'htdocs', 'site1',
                'file1.txt'), 'rb').read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}
def test_service_registry(rr):
    svcreg = doublethink.ServiceRegistry(rr)

    # missing required fields
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"role": "foo", "ttl": 1.0})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": 1.0, "load": 1})

    # invalid ttl (we accept anything for load and role)
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": -1, "role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": "strang", "role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": [], "role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": [1], "role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": {}, "role": "foo", "load": 1})
    with pytest.raises(Exception) as excinfo:
        svcreg.heartbeat({"ttl": {1: 2}, "role": "foo", "load": 1})

    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert "id" in svc0
    assert "id" in svc1
    assert svc0["id"] != svc1["id"]
    assert svc0["host"] == socket.gethostname()
    assert svc1["host"] == socket.gethostname()
    assert "pid" in svc0
    assert "pid" in svc1
    assert svc0["pid"] == os.getpid()
    assert svc1["pid"] == os.getpid()
    assert "first_heartbeat" in svc0
    assert "first_heartbeat" in svc1
    assert "last_heartbeat" in svc0
    assert "last_heartbeat" in svc1

    time.sleep(0.2)

    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_services("no-such-role") == []
    # svc0 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 50.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc1 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 200.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc0 has less load again
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.7)
    assert svcreg.available_service("no-such-role") == None
    # now it's been too long since the last heartbeat from svc0
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 1
    assert len(svcreg.available_services()) == 1

    svcreg.unregister(svc1["id"])
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2
    svcreg.unregister(svc0["id"])
    svcreg.unregister(svc1["id"])

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc2 = {
        "role": "another-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc3 = {
        "role": "yet-another-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    svc2 = svcreg.heartbeat(svc2)
    svc3 = svcreg.heartbeat(svc3)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 4
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '-n', '--max-browsers', dest='max_browsers', default='1',
            help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--warcprox-auto', dest='warcprox_auto', action='store_true',
            help=('when needed, choose an available instance of warcprox '
                  'from the rethinkdb service registry'))
    arg_parser.add_argument(
            '--skip-extract-outlinks', dest='skip_extract_outlinks',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-visit-hashtags', dest='skip_visit_hashtags',
            action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
            '--skip-youtube-dl', dest='skip_youtube_dl',
            action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info(
                    'dumping state (caught signal %s)\n%s'
                    % (signum, '\n'.join(state_strs)))
        except BaseException as e:
            logging.error('exception dumping state: %s' % e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry,
            max_browsers=int(args.max_browsers),
            chrome_exe=args.chrome_exe,
            proxy=args.proxy,
            warcprox_auto=args.warcprox_auto,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).
    '''
    options = warcprox.Options(**vars(args))

    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        exit(1)

    listeners = []

    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(
            args.cacert, args.certs_dir, ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(
            ca=ca, recorded_url_q=recorded_url_q, stats_db=stats_db,
            options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
                args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
                ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(warcprox.crawl_log.CrawlLogger(
                args.crawl_log_dir, options=options))

    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads ** 0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(
            name='WarcWriterThread%03d' % i,
            recorded_url_q=recorded_url_q,
            writer_pool=writer_pool,
            dedup_db=dedup_db,
            listeners=listeners,
            options=options)
        for i in range(num_writer_threads)]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
            proxy, warc_writer_threads, playback_proxy,
            service_registry=svcreg, options=options)

    return controller
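The plugin loop above resolves a dotted qualname to a class at runtime; the same pattern, demonstrated standalone with a real stdlib qualname standing in for a plugin class:

import importlib

qualname = 'collections.OrderedDict'   # stand-in for a plugin qualname
module_name, class_name = qualname.rsplit('.', 1)
module_ = importlib.import_module(module_name)   # imports 'collections'
class_ = getattr(module_, class_name)            # fetches OrderedDict
instance = class_()                              # instantiates it, like class_() above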
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple
    instances are starting up at the same time, you can end up with duplicate
    broken tables. So it's a good idea to use this utility at an early step
    when spinning up a cluster.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
            '--rethinkdb-stats-url', dest='rethinkdb_stats_url',
            help=('rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
                  'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
            '--rethinkdb-dedup-url', dest='rethinkdb_dedup_url',
            help=('rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
                  'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
            '--rethinkdb-big-table-url', dest='rethinkdb_big_table_url',
            help=('rethinkdb big table url (table will be populated with '
                  'various capture information and is suitable for use as '
                  'index for playback), e.g. rethinkdb://db0.foo.org,'
                  'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
            '--rethinkdb-trough-db-url', dest='rethinkdb_trough_db_url',
            help=('🐷 url pointing to trough configuration rethinkdb database, '
                  'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
                  '/trough_configuration'))
    arg_parser.add_argument(
            '--rethinkdb-services-url', dest='rethinkdb_services_url',
            help=('rethinkdb service registry table url; if provided, '
                  'warcprox will create and heartbeat an entry for itself'))
    arg_parser.add_argument(
            '-q', '--quiet', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.WARN)
    arg_parser.add_argument(
            '-v', '--verbose', dest='log_level', action='store_const',
            default=logging.INFO, const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
            stream=sys.stdout, level=args.log_level,
            format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                    '(%(filename)s:%(lineno)d) %(message)s'))

    options = warcprox.Options(**vars(args))

    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
                options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        logging.warn(
                'trough is responsible for creating most of the rethinkdb '
                'tables that it uses')
        did_something = True
    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
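An invocation sketch for the utility above, passing argv directly; the urls are placeholders, but the flag names are the ones the parser defines:

ensure_rethinkdb_tables([
    'warcprox-ensure-rethinkdb-tables',   # argv[0] only feeds the prog name
    '--rethinkdb-stats-url=rethinkdb://db0.example.org/warcprox/stats',
    '--rethinkdb-dedup-url=rethinkdb://db0.example.org/warcprox/dedup',
])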
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0
    warcprox1 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
            target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
            target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warn('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
        site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while (not site.status.startswith('FINISHED')
               and time.time() - start < 120):
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
def service_registry():
    global _svc_reg
    if not _svc_reg:
        _svc_reg = doublethink.ServiceRegistry(rr)
    return _svc_reg
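Because the registry is cached in the module-level `_svc_reg` (with `rr` also assumed to be a module-level doublethink.Rethinker), repeated calls share one instance:

reg1 = service_registry()
reg2 = service_registry()
assert reg1 is reg2   # lazily created once, then reused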
def test_service_registry(rr):
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert "id" in svc0
    assert "id" in svc1
    assert svc0["id"] != svc1["id"]
    assert svc0["host"] == socket.gethostname()
    assert svc1["host"] == socket.gethostname()
    assert "pid" in svc0
    assert "pid" in svc1
    assert svc0["pid"] == os.getpid()
    assert svc1["pid"] == os.getpid()
    assert "first_heartbeat" in svc0
    assert "first_heartbeat" in svc1
    assert "last_heartbeat" in svc0
    assert "last_heartbeat" in svc1

    time.sleep(0.2)

    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_services("no-such-role") == []
    # svc0 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 50.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc1 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 200.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc0 has less load again
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.7)
    assert svcreg.available_service("no-such-role") == None
    # now it's been too long since the last heartbeat from svc0
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 1
    assert len(svcreg.available_services()) == 1

    svcreg.unregister(svc1["id"])
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2
    svcreg.unregister(svc0["id"])
    svcreg.unregister(svc1["id"])

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc2 = {
        "role": "another-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc3 = {
        "role": "yet-another-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    svc2 = svcreg.heartbeat(svc2)
    svc3 = svcreg.heartbeat(svc3)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 4
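A sketch of the heartbeat lifecycle the tests above exercise, assuming `rr` is a doublethink.Rethinker; a service stays available only while it keeps heartbeating within its heartbeat_interval, and consumers pick the least-loaded healthy instance. The role name and load value are placeholders:

import doublethink

svcreg = doublethink.ServiceRegistry(rr)
status = {
    'role': 'crawler',          # hypothetical role name
    'load': 0.0,
    'heartbeat_interval': 30,
}
status = svcreg.heartbeat(status)   # first call fills in id, host, pid, ...
# on each subsequent cycle, update load and heartbeat again:
status['load'] = 0.5                # stand-in for a real load metric
status = svcreg.heartbeat(status)
# consumers look up the least-loaded healthy instance of the role:
svc = svcreg.available_service('crawler')
svcreg.unregister(status['id'])     # clean shutdown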