Beispiel #1
0
    def __init__(self, rethinkdb_trough_db_url, promotion_interval=None):
        '''
        TroughClient constructor

        Args:
            rethinkdb_trough_db_url: url with schema rethinkdb:// pointing to
                trough configuration database
            promotion_interval: if specified, `TroughClient` will spawn a
                daemon thread that "promotes" (pushes to hdfs) "dirty" trough
                segments (segments that have received writes) periodically,
                sleeping for `promotion_interval` seconds between cycles
                (default None, in which case no thread is started)
        '''
        parsed = doublethink.parse_rethinkdb_url(rethinkdb_trough_db_url)
        self.rr = doublethink.Rethinker(servers=parsed.hosts,
                                        db=parsed.database)
        self.svcreg = doublethink.ServiceRegistry(self.rr)
        self._write_url_cache = {}
        self._read_url_cache = {}
        self._dirty_segments = set()
        # NOTE(review): presumably guards `_dirty_segments` against concurrent
        # access from the promoter thread -- confirm against `_promotrix`
        self._dirty_segments_lock = threading.RLock()

        self.promotion_interval = promotion_interval
        self._promoter_thread = None
        if promotion_interval:
            self._promoter_thread = threading.Thread(
                target=self._promotrix, name='TroughClient-promoter')
            # `Thread.setDaemon()` is deprecated; the `daemon` attribute is
            # the supported spelling and must be set before `start()`
            self._promoter_thread.daemon = True
            self._promoter_thread.start()
Beispiel #2
0
 def __init__(self):
     '''
     Builds a `doublethink.Rethinker` for the "trough_configuration"
     database (servers taken from `settings['RETHINKDB_HOSTS']`), wraps it
     in a service registry and a `trough.sync.HostRegistry`, and finally
     calls `trough.sync.init()` with the rethinker.
     '''
     rr = doublethink.Rethinker(
         db="trough_configuration", servers=settings['RETHINKDB_HOSTS'])
     self.rethinker = rr
     svcreg = doublethink.ServiceRegistry(rr)
     self.services = svcreg
     self.registry = trough.sync.HostRegistry(
         rethinker=rr, services=svcreg)
     trough.sync.init(rr)
Beispiel #3
0
def brozzler_ensure_tables(argv=None):
    '''
    Command line entry point that makes sure brozzler's rethinkdb tables
    exist.

    Brozzler programs (brozzler-worker, brozzler-new-job, etc) normally create
    whatever tables they need when they start. When several instances start
    simultaneously that can leave duplicate, broken tables behind, so running
    this utility once, early, while bringing up a cluster avoids the problem.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to `sys.argv` when empty or None
    '''
    if not argv:
        argv = sys.argv
    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(parser)
    add_common_options(parser, argv)

    parsed_args = parser.parse_args(args=argv[1:])
    configure_logging(parsed_args)

    rr = rethinker(parsed_args)

    # both objects are constructed purely for their side effects and then
    # discarded: the registry covers the services table ...
    doublethink.ServiceRegistry(rr)

    # ... and the frontier covers the sites, pages and jobs tables
    brozzler.frontier.RethinkDbFrontier(rr)
Beispiel #4
0
def test_unique_service(rr):
    '''
    Exercises `ServiceRegistry.unique_service()` for a single role.

    Checks that: no service is returned before anything is registered, a
    candidate without a ttl is rejected, the first registered candidate keeps
    the role when a second candidate competes for it, and `last_heartbeat`
    only advances when the current holder (svc01) calls in again.

    Args:
        rr: rethinker fixture bound to a test database
    '''
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.unique_service('example-role') is None
    # this raises an exception: no ttl.
    with pytest.raises(Exception):
        svcreg.unique_service('example-role', candidate={})
    svc01 = {
        "role": "example-role",
        "ttl": 1.2,
        "node": "test01.example.com",
        "foo": "bar",
    }
    svc02 = {
        "role": "example-role",
        "ttl": 1.2,
        "node": "test02.example.com",
        "baz": "quux",
    }
    # register svc01. output should be svc01.
    output = svcreg.unique_service('example-role', candidate=svc01)
    assert output['node'] == svc01['node']
    # try to register svc02. Output should still be svc01.
    output = svcreg.unique_service('example-role', candidate=svc02)
    assert output['node'] == svc01['node']
    time.sleep(0.2)
    # the holder calling in again moves last_heartbeat forward ...
    output1 = svcreg.unique_service('example-role', candidate=svc01)
    assert output1['last_heartbeat'] > output1['first_heartbeat']
    # ... while a competing candidate leaves the stored heartbeat untouched
    output2 = svcreg.unique_service('example-role', candidate=svc02)
    assert output1['last_heartbeat'] == output2['last_heartbeat']
    time.sleep(0.2)
    output3 = svcreg.unique_service('example-role', candidate=svc01)
    assert output3['last_heartbeat'] > output1['last_heartbeat']
    svcreg.unregister('example-role')
Beispiel #5
0
    def test_proxy_for_write_segment(self, requests):
        '''
        Checks that `proxy_for_write_host()` relays the body chunks of the
        upstream response unchanged.

        `requests.post` is replaced with a stub whose response yields the
        chunks b"test" and b"output"; the test asserts that exactly those
        chunks come back out of the server.

        Args:
            requests: stand-in for the `requests` module (presumably injected
                by a patch decorator outside this block)
        '''
        def post(*args, **kwargs):
            # canned 200 response that can also be used as a context manager
            response = mock.Mock()
            response.headers = {"Content-Type": "application/json"}
            response.iter_content = lambda: (b"test", b"output")
            response.status_code = 200
            response.__enter__ = lambda *args, **kwargs: response
            response.__exit__ = lambda *args, **kwargs: None
            return response

        requests.post = post
        registry = mock.Mock()
        rethinker = doublethink.Rethinker(db="trough_configuration",
                                          servers=settings['RETHINKDB_HOSTS'])
        services = doublethink.ServiceRegistry(rethinker)
        segment = trough.sync.Segment(segment_id="TEST",
                                      rethinker=rethinker,
                                      services=services,
                                      registry=registry,
                                      size=0)
        output = self.server.proxy_for_write_host(
            'localhost',
            segment,
            "SELECT * FROM mock;",
            start_response=lambda *args, **kwargs: None)
        self.assertEqual(list(output), [b"test", b"output"])
Beispiel #6
0
def test_leader_election(rr):
    '''
    Exercises `ServiceRegistry.leader()` for a single role.

    Checks that: there is no leader before anything is registered, a default
    without a heartbeat_interval is rejected, and the first registered default
    stays leader when a second one is offered.

    Args:
        rr: rethinker fixture bound to a test database
    '''
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.leader('example-role') is None
    # this raises an exception: no heartbeat_interval.
    with pytest.raises(Exception):
        svcreg.leader('example-role', default={})
    svc01 = {
        "role": "example-role",
        "load": 0.0,
        "heartbeat_interval": 0.4,
        "node": "test01.example.com"
    }
    svc02 = {
        "role": "example-role",
        "load": 0.0,
        "heartbeat_interval": 0.4,
        "node": "test02.example.com"
    }
    # register svc01. output should be svc01.
    output = svcreg.leader('example-role', default=svc01)
    assert output['node'] == svc01['node']
    # try to register svc02. Output should still be svc01.
    output = svcreg.leader('example-role', default=svc02)
    assert output['node'] == svc01['node']
    svcreg.unregister('example-role')
Beispiel #7
0
 def setUp(self):
     '''
     Per-test fixture: creates a rethinker for `random_db`, a service
     registry and host registry on top of it, a mock snakebite client, and
     empties the "services" table.
     '''
     rr = doublethink.Rethinker(
         db=random_db, servers=settings['RETHINKDB_HOSTS'])
     self.rethinker = rr
     svcreg = doublethink.ServiceRegistry(rr)
     self.services = svcreg
     self.registry = sync.HostRegistry(rethinker=rr, services=svcreg)
     self.snakebite_client = mock.Mock()
     # drop any service records left over from an earlier test
     rr.table("services").delete().run()
Beispiel #8
0
 def setUp(self):
     '''
     Per-test fixture: creates a rethinker and service registry for
     `random_db`, runs `sync.init()`, then empties the "services", "lock"
     and "assignment" tables.
     '''
     rr = doublethink.Rethinker(
         db=random_db, servers=settings['RETHINKDB_HOSTS'])
     self.rethinker = rr
     self.services = doublethink.ServiceRegistry(rr)
     sync.init(rr)
     # clear the tables in the same order on every run
     for table_name in ("services", "lock", "assignment"):
         rr.table(table_name).delete().run()
Beispiel #9
0
 def service_registry(options):
     '''
     Returns a `doublethink.ServiceRegistry` for the database and table named
     by `options.rethinkdb_services_url`, or None when that option is unset
     or empty.
     '''
     services_url = options.rethinkdb_services_url
     if not services_url:
         return None
     parsed = doublethink.parse_rethinkdb_url(services_url)
     rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
     return doublethink.ServiceRegistry(rr, table=parsed.table)
Beispiel #10
0
 def _init_brozzler_worker(self, args):
     '''
     Creates the `BrozzlerWorker` for this instance.

     The worker shares one rethinker (servers from the comma-separated
     `args.rethinkdb_servers`, database `args.rethinkdb_db`) between its
     frontier and service registry, and is pointed at the proxy address of
     `self.warcprox_controller`.
     '''
     servers = args.rethinkdb_servers.split(",")
     rr = doublethink.Rethinker(servers, args.rethinkdb_db)
     frontier = brozzler.RethinkDbFrontier(rr)
     svcreg = doublethink.ServiceRegistry(rr)
     worker_options = {
         'chrome_exe': args.chrome_exe,
         # server_address is formatted as "host:port"
         'proxy': '%s:%s' % self.warcprox_controller.proxy.server_address,
         'max_browsers': args.max_browsers,
     }
     return brozzler.worker.BrozzlerWorker(
             frontier, svcreg, **worker_options)
Beispiel #11
0
 def _do_write(self, query):
     '''
     Sends a write query to trough.

     Looks up the 'trough-sync-master' service in the service registry. If
     `self._write_url` is not set yet, first POSTs `self.database` to the
     master's url (the provision request) and stores the response body as
     `self._write_url`. Then POSTs `query` to `self._write_url`.

     Args:
         query: the query to send as the POST body

     Raises:
         Exception: if no trough-sync-master is found in the service
             registry, or if the response to the query is not 'OK'
     '''
     def post(url, payload):
         # POST `payload` to `url` (through the configured proxy, if any) and
         # return the raw response body; the curl handle is closed even when
         # the transfer raises
         buffer = BytesIO()
         c = pycurl.Curl()
         try:
             c.setopt(c.URL, url)
             c.setopt(c.POSTFIELDS, payload)
             if self.proxy:
                 c.setopt(pycurl.PROXY, self.proxy)
                 c.setopt(pycurl.PROXYPORT, int(self.proxy_port))
                 c.setopt(pycurl.PROXYTYPE, self.proxy_type)
             c.setopt(c.WRITEDATA, buffer)
             c.perform()
         finally:
             c.close()
         return buffer.getvalue()

     rethinker = doublethink.Rethinker(db="trough_configuration",
                                       servers=self.rethinkdb)
     services = doublethink.ServiceRegistry(rethinker)
     master_node = services.unique_service('trough-sync-master')
     logging.info('master_node=%r', master_node)
     if not master_node:
         raise Exception(
             'no healthy trough-sync-master in service registry')
     if not self._write_url:
         # provision: the master answers with the url to send writes to
         self._write_url = post(master_node.get('url'), self.database)
         logging.info('self._write_url=%r', self._write_url)
     response = post(self._write_url, query)
     if response.strip() != b'OK':
         raise Exception(
             'Trough Query Failed: Database: %r Response: %r Query: %.200r'
             % (self.database, response, query))
     self._last_results = None
Beispiel #12
0
def test_choose_warcprox():
    '''
    Checks which warcprox instance `BrozzlerWorker._choose_warcprox()` picks.

    With no registered services the result is None. With five registered
    instances, of which host1:8000, host2:8000 and host2:8001 already have
    active sites, the expected pick is host3:8000; once host3:8000 also has
    an active site, the expected pick is host4:8000 (the instance registered
    with load 1).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    def register_warcprox(host, port, load):
        # add one warcprox record to the services table
        rr.table('services').insert({
            'role': 'warcprox',
            'first_heartbeat': doublethink.utcnow(),
            'last_heartbeat': doublethink.utcnow(),
            'host': host, 'port': port,
            'load': load, 'ttl': 60}).run()

    def add_active_site(proxy):
        # add one ACTIVE site assigned to the given "host:port" proxy
        rr.table('sites').insert({
            'proxy': proxy, 'status': 'ACTIVE',
            'last_disclaimed': doublethink.utcnow()}).run()

    def wipe_tables():
        rr.table('sites').delete().run()
        rr.table('services').delete().run()

    # wait for tables and indexes to be ready, to avoid this kind of error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    for table_name in ('sites', 'services'):
        rr.table(table_name).wait().run()
    for table_name in ('sites', 'services'):
        rr.table(table_name).index_wait().run()

    # clean slate
    wipe_tables()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    for host, port, load in (
            ('host1', 8000, 0),
            ('host2', 8000, 0),
            ('host2', 8001, 0),
            ('host3', 8000, 0),
            ('host4', 8000, 1)):
        register_warcprox(host, port, load)

    for proxy in ('host1:8000', 'host1:8000', 'host2:8000', 'host2:8001'):
        add_active_site(proxy)

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host3'
    assert chosen['port'] == 8000
    add_active_site('host3:8000')

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host4'
    assert chosen['port'] == 8000

    # clean up
    wipe_tables()
Beispiel #13
0
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    '''
    Brozzles a small test site with the given proxy configuration and checks
    the outcome.

    Verifies that the site finishes, that `site.proxy` ends in ':8000' when
    `warcprox_auto` is set and is empty otherwise (both in memory and after
    `site.refresh()`), that exactly the two expected pages were crawled, and
    that the captures table (and pywb playback of file1.txt) reflects the
    crawl only when the proxy in use is warcprox.

    Args:
        httpd: test http server fixture; only `server_port` is used
        proxy: proxy setting passed through to `BrozzlerWorker`
        warcprox_auto: passed through to `BrozzlerWorker`
        is_warcprox: whether captures are expected to show up in the
            captures table
    '''
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
            proxy, warcprox_auto, is_warcprox,
            datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': page1,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
            frontier, service_registry, max_browsers=1,
            chrome_exe=brozzler.suggest_default_chrome_exe(),
            warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh() # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {page1, page2}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
            c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        payload_path = os.path.join(
            os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt')
        # context manager so the file handle is closed deterministically
        with open(payload_path, 'rb') as f:
            expected_payload = f.read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}
Beispiel #14
0
def test_service_registry(rr):
    '''
    End-to-end test of `ServiceRegistry.heartbeat()`, `available_service()`,
    `available_services()` and `unregister()`.

    Covers rejection of heartbeats with missing fields or an invalid ttl, the
    fields filled in on a successful heartbeat (id, host, pid, first and last
    heartbeat), selection of the least loaded service, expiry of a service
    whose heartbeat is older than its ttl, and unregistering.

    Args:
        rr: rethinker fixture bound to a test database
    '''
    svcreg = doublethink.ServiceRegistry(rr)
    # missing required fields
    for incomplete in (
            {},
            {"role": "foo", "load": 1},
            {"role": "foo", "ttl": 1.0},
            {"ttl": 1.0, "load": 1}):
        with pytest.raises(Exception):
            svcreg.heartbeat(incomplete)

    # invalid ttl (we accept anything for load and role)
    for bad_ttl in (-1, "strang", [], [1], {}, {1: 2}):
        with pytest.raises(Exception):
            svcreg.heartbeat({"ttl": bad_ttl, "role": "foo", "load": 1})

    assert svcreg.available_service("yes-such-role") is None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []
    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert "id" in svc0
    assert "id" in svc1
    assert svc0["id"] != svc1["id"]

    assert svc0["host"] == socket.gethostname()
    assert svc1["host"] == socket.gethostname()

    assert "pid" in svc0
    assert "pid" in svc1
    assert svc0["pid"] == os.getpid()
    assert svc1["pid"] == os.getpid()
    assert "first_heartbeat" in svc0
    assert "first_heartbeat" in svc1
    assert "last_heartbeat" in svc0
    assert "last_heartbeat" in svc1

    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") is None
    assert svcreg.available_services("no-such-role") == []
    # svc0 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 50.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") is None
    # now svc1 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 200.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") is None
    # now svc0 has less load again
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    # keep svc1 alive with heartbeats while svc0 goes silent
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.7)
    assert svcreg.available_service("no-such-role") is None
    # now it's been too long since the last heartbeat from svc0
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 1
    assert len(svcreg.available_services()) == 1

    svcreg.unregister(svc1["id"])
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") is None
    assert svcreg.available_service("yes-such-role") is None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2
    svcreg.unregister(svc0["id"])
    svcreg.unregister(svc1["id"])

    # several roles at once: filtering by role vs listing everything
    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "ttl": 1.2,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc2 = {
        "role": "another-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc3 = {
        "role": "yet-another-such-role",
        "load": 200.0,
        "ttl": 1.2,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    svc2 = svcreg.heartbeat(svc2)
    svc3 = svcreg.heartbeat(svc3)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 4
Beispiel #15
0
def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.

    Parses the command line, builds a `BrozzlerWorker`, installs signal
    handlers (SIGQUIT dumps the stack of every thread to the log, SIGTERM and
    SIGINT stop the worker), then runs the worker in a thread and waits for
    it to finish.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to `sys.argv` when empty or None
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument('-e',
                            '--chrome-exe',
                            dest='chrome_exe',
                            default=suggest_default_chrome_exe(),
                            help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '-n',
        '--max-browsers',
        dest='max_browsers',
        default='1',
        help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument('--proxy',
                            dest='proxy',
                            default=None,
                            help='http proxy')
    arg_parser.add_argument(
        '--warcprox-auto',
        dest='warcprox_auto',
        action='store_true',
        help=('when needed, choose an available instance of warcprox from '
              'the rethinkdb service registry'))
    # the --skip-* switches are hidden from --help
    arg_parser.add_argument('--skip-extract-outlinks',
                            dest='skip_extract_outlinks',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-visit-hashtags',
                            dest='skip_visit_hashtags',
                            action='store_true',
                            help=argparse.SUPPRESS)
    arg_parser.add_argument('--skip-youtube-dl',
                            dest='skip_youtube_dl',
                            action='store_true',
                            help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        # ignore further SIGQUITs while a dump is in progress
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident, thread_frame in frames.items():
                # a frame may belong to a thread that threading.enumerate()
                # does not report; .get() keeps that from raising KeyError
                # and aborting the whole dump
                thread = threads.get(ident)
                if thread is not None:
                    state_strs.append(str(thread))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(thread_frame)
                state_strs.append(''.join(stack))
            logging.info('dumping state (caught signal %s)\n%s',
                         signum, '\n'.join(state_strs))
        except BaseException as e:
            logging.error('exception dumping state: %s', e)
        finally:
            # re-arm the handler for the next SIGQUIT
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
        frontier,
        service_registry,
        max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe,
        proxy=args.proxy,
        warcprox_auto=args.warcprox_auto,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')
Beispiel #16
0
def init_controller(args):
    '''
    Creates a warcprox.controller.WarcproxController configured according to
    the supplied arguments (normally the result of parse_args(sys.argv)).

    Exits the process with status 1 if `args.digest_algorithm` is not
    accepted by hashlib or if a plugin class cannot be loaded.

    Args:
        args: parsed command line arguments

    Returns:
        the configured warcprox.controller.WarcproxController
    '''
    options = warcprox.Options(**vars(args))

    # fail fast on an unusable digest algorithm
    try:
        hashlib.new(args.digest_algorithm)
    except Exception as e:
        logging.fatal(e)
        # sys.exit, not the `exit` builtin: that one is injected by the
        # `site` module and is not guaranteed to exist
        sys.exit(1)

    listeners = []

    # dedup backend: the first configured option wins
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
    elif args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
    elif args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
    elif args.cdxserver_dedup:
        dedup_db = warcprox.dedup.CdxServerDedup(cdx_url=args.cdxserver_dedup)
    elif args.dedup_db_file in (None, '', '/dev/null'):
        logging.info('deduplication disabled')
        dedup_db = None
    else:
        dedup_db = warcprox.dedup.DedupDb(args.dedup_db_file, options=options)
    if dedup_db:
        listeners.append(dedup_db)

    # stats backend
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsDb(options=options)
        listeners.append(stats_db)
    elif args.stats_db_file in (None, '', '/dev/null'):
        logging.info('statistics tracking disabled')
        stats_db = None
    else:
        stats_db = warcprox.stats.StatsDb(args.stats_db_file, options=options)
        listeners.append(stats_db)

    recorded_url_q = warcprox.TimestampedQueue(maxsize=args.queue_size)

    # CA name is truncated to 64 characters
    ca_name = 'Warcprox CA on {}'.format(socket.gethostname())[:64]
    ca = certauth.certauth.CertificateAuthority(args.cacert,
                                                args.certs_dir,
                                                ca_name=ca_name)

    proxy = warcprox.warcproxy.WarcProxy(ca=ca,
                                         recorded_url_q=recorded_url_q,
                                         stats_db=stats_db,
                                         options=options)

    if args.playback_port is not None:
        playback_index_db = warcprox.playback.PlaybackIndexDb(
            args.playback_index_db_file, options=options)
        playback_proxy = warcprox.playback.PlaybackProxy(
            ca=ca, playback_index_db=playback_index_db, options=options)
        listeners.append(playback_index_db)
    else:
        playback_index_db = None
        playback_proxy = None

    if args.crawl_log_dir:
        listeners.append(
            warcprox.crawl_log.CrawlLogger(args.crawl_log_dir,
                                           options=options))

    # plugins are given as fully qualified class names
    for qualname in args.plugins or []:
        try:
            (module_name, class_name) = qualname.rsplit('.', 1)
            module_ = importlib.import_module(module_name)
            class_ = getattr(module_, class_name)
            listener = class_()
            listener.notify  # make sure it has this method
            listeners.append(listener)
        except Exception as e:
            logging.fatal('problem with plugin class %r: %s', qualname, e)
            sys.exit(1)

    writer_pool = warcprox.writer.WarcWriterPool(options=options)
    # number of warc writer threads = sqrt(proxy.max_threads)
    # I came up with this out of thin air because it strikes me as reasonable
    # 1=>1 2=>1 5=>2 10=>3 50=>7 100=>10 200=>14 500=>22 1000=>32 2000=>45
    num_writer_threads = args.writer_threads or int(proxy.max_threads**0.5)
    logging.debug('initializing %d warc writer threads', num_writer_threads)
    warc_writer_threads = [
        warcprox.writerthread.WarcWriterThread(name='WarcWriterThread%03d' % i,
                                               recorded_url_q=recorded_url_q,
                                               writer_pool=writer_pool,
                                               dedup_db=dedup_db,
                                               listeners=listeners,
                                               options=options)
        for i in range(num_writer_threads)
    ]

    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
    else:
        svcreg = None

    controller = warcprox.controller.WarcproxController(
        proxy,
        warc_writer_threads,
        playback_proxy,
        service_registry=svcreg,
        options=options)

    return controller
Beispiel #17
0
def ensure_rethinkdb_tables(argv=None):
    '''
    Creates rethinkdb tables if they don't already exist. Warcprox normally
    creates the tables it needs on demand at startup, but if multiple instances
    are starting up at the same time, you can end up with duplicate broken
    tables. So it's a good idea to use this utility at an early step when
    spinning up a cluster.

    For every --rethinkdb-* option supplied, the corresponding warcprox or
    doublethink object is instantiated; an error is logged if none of the
    options was given.

    Args:
        argv: full argument vector including the program name at index 0;
            falls back to `sys.argv` when empty or None
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        '--rethinkdb-stats-url',
        dest='rethinkdb_stats_url',
        help=('rethinkdb stats table url, e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/my_stats_table'))
    # the three dedup backends are mutually exclusive
    group = arg_parser.add_mutually_exclusive_group()
    group.add_argument(
        '--rethinkdb-dedup-url',
        dest='rethinkdb_dedup_url',
        help=('rethinkdb dedup url, e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/my_dedup_table'))
    group.add_argument(
        '--rethinkdb-big-table-url',
        dest='rethinkdb_big_table_url',
        help=('rethinkdb big table url (table will be populated with '
              'various capture information and is suitable for use as '
              'index for playback), e.g. rethinkdb://db0.foo.org,'
              'db1.foo.org:38015/my_warcprox_db/captures'))
    group.add_argument(
        '--rethinkdb-trough-db-url',
        dest='rethinkdb_trough_db_url',
        help=('🐷   url pointing to trough configuration rethinkdb database, '
              'e.g. rethinkdb://db0.foo.org,db1.foo.org:38015'
              '/trough_configuration'))
    arg_parser.add_argument(
        '--rethinkdb-services-url',
        dest='rethinkdb_services_url',
        help=('rethinkdb service registry table url; if provided, warcprox '
              'will create and heartbeat entry for itself'))
    # -q and -v share one destination; whichever is given sets the log level
    arg_parser.add_argument('-q',
                            '--quiet',
                            dest='log_level',
                            action='store_const',
                            default=logging.INFO,
                            const=logging.WARN)
    arg_parser.add_argument('-v',
                            '--verbose',
                            dest='log_level',
                            action='store_const',
                            default=logging.INFO,
                            const=logging.DEBUG)
    args = arg_parser.parse_args(args=argv[1:])

    logging.basicConfig(
        stream=sys.stdout,
        level=args.log_level,
        format=('%(asctime)s %(levelname)s %(name)s.%(funcName)s'
                '(%(filename)s:%(lineno)d) %(message)s'))

    options = warcprox.Options(**vars(args))

    # the objects below are created only for their side effects; the local
    # names are never read again
    did_something = False
    if args.rethinkdb_services_url:
        parsed = doublethink.parse_rethinkdb_url(
            options.rethinkdb_services_url)
        rr = doublethink.Rethinker(servers=parsed.hosts, db=parsed.database)
        svcreg = doublethink.ServiceRegistry(rr, table=parsed.table)
        did_something = True
    if args.rethinkdb_stats_url:
        stats_db = warcprox.stats.RethinkStatsProcessor(options=options)
        stats_db._ensure_db_table()
        did_something = True
    if args.rethinkdb_dedup_url:
        dedup_db = warcprox.dedup.RethinkDedupDb(options=options)
        did_something = True
    if args.rethinkdb_big_table_url:
        dedup_db = warcprox.bigtable.RethinkCapturesDedup(options=options)
        did_something = True
    if args.rethinkdb_trough_db_url:
        dedup_db = warcprox.dedup.TroughDedupDb(options)
        # logging.warn is a deprecated alias of logging.warning
        logging.warning(
            'trough is responsible for creating most of the rethinkdb '
            'tables that it uses')
        did_something = True

    if not did_something:
        logging.error('nothing to do, no --rethinkdb-* options supplied')
Beispiel #18
0
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and wait.

    Args:
        httpd: test http server (presumably supplied as a pytest fixture);
            its `server_port` is used to build the seed url
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0

    warcprox1 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
            service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
            target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
            target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    def count_pages():
        # number of pages currently in the frontier for the test site
        return len(list(frontier.site_pages(site.id)))

    def wait_for(condition, timeout):
        # poll every half second, refreshing `site` after each sleep, until
        # `condition()` is true or `timeout` seconds have elapsed; the caller
        # asserts on the outcome afterwards
        start = time.time()
        while not condition() and time.time() - start < timeout:
            time.sleep(0.5)
            site.refresh()

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warning('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert count_pages() == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance
        wait_for(lambda: site.proxy, 30)
        # guard against site.proxy still being unset after the timeout, so
        # the failure is an assertion error rather than an AttributeError
        assert site.proxy and site.proxy.endswith(
                ':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        wait_for(lambda: count_pages() > 1, 60)
        assert count_pages() > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        wait_for(
                lambda: site.proxy and site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port),
                30)
        assert site.proxy and site.proxy.endswith(
                ':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = count_pages()
        assert page_count > 1

        # check that it is waiting for a warcprox to appear
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert count_pages() == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        wait_for(lambda: site.status.startswith('FINISHED'), 120)
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        try:
            warcprox1.stop.set()
            warcprox2.stop.set()
            # Thread.join() raises RuntimeError on a thread that was never
            # started, which would mask the original test failure; only join
            # threads that are actually running
            for thread in (warcprox1_thread, warcprox2_thread):
                if thread.is_alive():
                    thread.join()
        finally:
            # always bring the system warcprox back, even if cleanup failed
            start_service('warcprox')
Beispiel #19
0
def service_registry():
    '''
    Returns the module-level `doublethink.ServiceRegistry`.

    The registry is built from the global `rr` the first time this is called
    (or whenever the cached `_svc_reg` is falsy) and stored in the global
    `_svc_reg`, so later calls return the same object.
    '''
    global _svc_reg
    if _svc_reg:
        # already created, reuse it
        return _svc_reg
    _svc_reg = doublethink.ServiceRegistry(rr)
    return _svc_reg
Beispiel #20
0
def test_service_registry(rr):
    svcreg = doublethink.ServiceRegistry(rr)
    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []
    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert "id" in svc0
    assert "id" in svc1
    assert svc0["id"] != svc1["id"]

    assert svc0["host"] == socket.gethostname()
    assert svc1["host"] == socket.gethostname()

    assert "pid" in svc0
    assert "pid" in svc1
    assert svc0["pid"] == os.getpid()
    assert svc1["pid"] == os.getpid()
    assert "first_heartbeat" in svc0
    assert "first_heartbeat" in svc1
    assert "last_heartbeat" in svc0
    assert "last_heartbeat" in svc1

    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_services("no-such-role") == []
    # svc0 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 50.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc1 has less load
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1["load"] = 200.0
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    # now svc0 has less load again
    assert svcreg.available_service("yes-such-role")["id"] == svc0["id"]
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2

    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.2)
    svc1 = svcreg.heartbeat(svc1)
    time.sleep(0.7)
    assert svcreg.available_service("no-such-role") == None
    # now it's been too long since the last heartbeat from svc0
    assert svcreg.available_service("yes-such-role")["id"] == svc1["id"]
    assert len(svcreg.available_services("yes-such-role")) == 1
    assert len(svcreg.available_services()) == 1

    svcreg.unregister(svc1["id"])
    time.sleep(0.2)
    assert svcreg.available_service("no-such-role") == None
    assert svcreg.available_service("yes-such-role") == None
    assert svcreg.available_services("yes-such-role") == []
    assert svcreg.available_services() == []

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 2
    svcreg.unregister(svc0["id"])
    svcreg.unregister(svc1["id"])

    svc0 = {
        "role": "yes-such-role",
        "load": 100.0,
        "heartbeat_interval": 0.4,
    }
    svc1 = {
        "role": "yes-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc2 = {
        "role": "another-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc3 = {
        "role": "yet-another-such-role",
        "load": 200.0,
        "heartbeat_interval": 0.4,
    }
    svc0 = svcreg.heartbeat(svc0)
    svc1 = svcreg.heartbeat(svc1)
    svc2 = svcreg.heartbeat(svc2)
    svc3 = svcreg.heartbeat(svc3)
    assert len(svcreg.available_services("yes-such-role")) == 2
    assert len(svcreg.available_services()) == 4