def test_hashtag_links():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent_page = frontier.seed_page(site.id)
    assert not parent_page.hashtags
    outlinks = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 3
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0
    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz', '#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1
    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12

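# A minimal illustration of what the frontier is doing with those links:
# outlinks that differ only by fragment collapse to a single page whose
# hashtags accumulate. This sketch uses only the standard library and is not
# brozzler's actual implementation.
def _example_split_hashtag(url):
    from urllib.parse import urldefrag
    # urldefrag('http://example.org/bar#baz')
    #   -> ('http://example.org/bar', 'baz')
    base, fragment = urldefrag(url)
    return base, ('#' + fragment if fragment else None)
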
def test_login(httpd):
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}},
        'username': '******', 'password': '******'})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rr.table('captures').filter(
        {'test_id':test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in htdocs/site2/login.html but only one that
    # brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url

def brozzler_new_job():
    '''
    Command line utility entry point for queuing a new brozzler job. Takes a
    yaml brozzler job configuration file, creates job, sites, and pages
    objects in rethinkdb, which brozzler-workers will look at and start
    crawling.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzler-new-job - queue new job with brozzler',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument(
        'job_conf_file', metavar='JOB_CONF_FILE',
        help='brozzler job configuration file in yaml')
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.job.new_job_file(frontier, args.job_conf_file)

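# For reference, a minimal JOB_CONF_FILE needs little more than a list of
# seed urls; this mirrors the dict form passed to brozzler.new_job() in the
# tests below. Any field beyond 'seeds'/'url' would come from the full job
# configuration schema and is not shown here.
#
#     seeds:
#       - url: http://example.com/
#       - url: https://example.org/
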
def test_claim_site():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    claimed_site = frontier.claim_site(worker_id='test_claim_site')
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow() - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_site(worker_id='test_claim_site')

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed_site = frontier.claim_site(worker_id='test_claim_site')
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()

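# A sketch of the claim rule the assertions above imply: a site can be
# claimed if it is not currently claimed, or if its last_claimed timestamp is
# more than an hour stale (the previous worker is presumed dead). Illustration
# only, not brozzler's actual query.
def _example_is_claimable(site, now):
    # one hour matches the 55-minute / 65-minute boundary exercised above
    return (not site.claimed
            or now - site.last_claimed > datetime.timedelta(hours=1))
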
def test_redirect_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site9/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        'http://localhost:%s/site9/redirect.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#hash1','#hash2',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == 'http://localhost:%s/site9/redirect.html' % httpd.server_port
        and c['http_method'] == 'GET']
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags

def test_scope_and_schedule_outlinks():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/'})
    parent_page = brozzler.Page(rr, {
        'hops_from_seed': 1, 'url': 'http://example.com/whatever'})
    outlinks = [
        'https://example.com/',
        'https://example.com/foo',
        'http://example.com/bar',
        'HTtp://exAMPle.COm/bar',
        'HTtp://exAMPle.COm/BAr',
        'HTtp://exAMPle.COm/BAZZZZ',]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    assert sorted(parent_page.outlinks['rejected']) == [
        'https://example.com/', 'https://example.com/foo']
    assert sorted(parent_page.outlinks['accepted']) == [
        'http://example.com/BAZZZZ', 'http://example.com/BAr',
        'http://example.com/bar']
    assert parent_page.outlinks['blocked'] == []

    pp = brozzler.Page.load(rr, parent_page.id)
    assert pp == parent_page

    for url in parent_page.outlinks['rejected']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id) is None
    for url in parent_page.outlinks['accepted']:
        id = brozzler.Page.compute_id(site.id, url)
        assert brozzler.Page.load(rr, id)

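# The try/finally monkeypatching of brozzler.is_permitted_by_robots above
# recurs in several tests below. A context manager like this (a suggested
# helper, not part of brozzler) expresses the same pattern more compactly:
import contextlib

@contextlib.contextmanager
def _example_ignore_robots():
    # temporarily answer "permitted" for every robots.txt check
    orig = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        yield
    finally:
        brozzler.is_permitted_by_robots = orig

# usage:
#     with _example_ignore_robots():
#         frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
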
def test_seed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()

    assert frontier.seed_page(site.id) is None

    page1 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1})
    page1.save()

    assert frontier.seed_page(site.id) is None

    page0 = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0})
    page0.save()

    assert frontier.seed_page(site.id) == page0

def test_basics():
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [
        {'url': 'http://example.com'}, {'url': 'https://example.org/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id
    assert job.starts_and_stops
    assert job.starts_and_stops[0]['start']
    assert job == {
        'id': job.id,
        'conf': {
            'seeds': [
                {'url': 'http://example.com'},
                {'url': 'https://example.org/'}
            ]
        },
        'status': 'ACTIVE',
        'starts_and_stops': [
            {
                'start': job.starts_and_stops[0]['start'],
                'stop': None
            }
        ]
    }

    sites = sorted(list(frontier.job_sites(job.id)), key=lambda x: x.seed)
    assert len(sites) == 2
    assert sites[0].starts_and_stops[0]['start']
    assert sites[1].starts_and_stops[0]['start']
    assert sites[0] == {
        'claimed': False,
        'id': sites[0].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'http://(com,example,)/'
        },
        'seed': 'http://example.com',
        'starts_and_stops': [
            {
                'start': sites[0].starts_and_stops[0]['start'],
                'stop': None
            }
        ],
        'status': 'ACTIVE'
    }
    assert sites[1] == {
        'claimed': False,
        'id': sites[1].id,
        'job_id': job.id,
        'last_claimed': brozzler.EPOCH_UTC,
        'last_disclaimed': brozzler.EPOCH_UTC,
        'scope': {
            'surt': 'https://(org,example,)/',
        },
        'seed': 'https://example.org/',
        'starts_and_stops': [
            {
                'start': sites[1].starts_and_stops[0]['start'],
                'stop': None,
            },
        ],
        'status': 'ACTIVE',
    }

    pages = list(frontier.site_pages(sites[0].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[0].id, 'http://example.com'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[0].id,
        'url': 'http://example.com',
    }
    pages = list(frontier.site_pages(sites[1].id))
    assert len(pages) == 1
    assert pages[0] == {
        'brozzle_count': 0,
        'claimed': False,
        'hops_from_seed': 0,
        'hops_off_surt': 0,
        'id': brozzler.Page.compute_id(sites[1].id, 'https://example.org/'),
        'job_id': job.id,
        'needs_robots_check': True,
        'priority': 1000,
        'site_id': sites[1].id,
        'url': 'https://example.org/',
    }

    # test "brozzled" parameter of frontier.site_pages
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 0
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 1
    pages[0].brozzle_count = 1
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0
    pages[0].brozzle_count = 32819
    pages[0].save()
    assert len(list(frontier.site_pages(sites[1].id))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=True))) == 1
    assert len(list(frontier.site_pages(sites[1].id, brozzled=False))) == 0

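# The "brozzled" assertions above imply a simple predicate: a page counts as
# brozzled once brozzle_count is nonzero, whether that is 1 or 32819. The
# equivalent filter in plain Python (illustration only, not the actual ReQL):
def _example_filter_brozzled(pages, brozzled):
    return [p for p in pages if (p.brozzle_count > 0) == brozzled]
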
def test_brozzle_site(httpd):
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert robots in captures_by_url
    assert page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
        'rb').read()
    assert requests.get(wb_url).content == expected_payload

    url = 'screenshot:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'

    url = 'thumbnail:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'

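# The pywb checks above build replay urls by hand three times. A small helper
# capturing the same construction (a suggested refactor, not existing code;
# the host and collection prefix are what this test environment assumes):
def _example_wb_url(capture, url, prefix='http://localhost:8880/brozzler'):
    # pywb replay urls embed a 14-digit timestamp, YYYYmmddHHMMSS
    t14 = capture['timestamp'].strftime('%Y%m%d%H%M%S')
    return '%s/%s/%s' % (prefix, t14, url)
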
def test_hashtags(httpd):
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site7/')
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}
        }
    })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [make_url(httpd, '/site7/foo.html')]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site7/foo.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#boosh', '#ignored', '#whee',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert make_url(httpd, '/site7/foo.html') in captures_by_url
    assert make_url(httpd, '/site7/whee.txt') in captures_by_url
    assert make_url(httpd, '/site7/boosh.txt') in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url
    assert 'thumbnail:%s' % make_url(httpd, '/site7/foo.html') in captures_by_url

def test_max_hops_off():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'scope': {
            'max_hops_off_surt': 1,
            'blocks': [{'ssurt': 'domain,bad,'}]
        }
    })
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # renamed this param
    assert not 'max_hops_off_surt' in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither('https://example.com/toot', seed_page) is None
    assert site.accept_reject_or_neither('http://example.com/toot', seed_page) is True
    assert site.accept_reject_or_neither('https://some.bad.domain/something', seed_page) is False

    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot', 'https://some.bad.domain/something'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert not 'hops_off_surt' in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'}
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something']
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # next hop is past max_hops_off, but the normal in-scope url is in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(
            site, foo_page, ['http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages

def brozzler_purge(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-purge - purge crawl state from rethinkdb',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--job', dest='job', metavar='JOB_ID',
        help=('purge crawl state from rethinkdb for a job, including all '
              'sites and pages'))
    group.add_argument(
        '--site', dest='site', metavar='SITE_ID',
        help=('purge crawl state from rethinkdb for a site, including all '
              'pages'))
    group.add_argument(
        '--finished-before', dest='finished_before', metavar='YYYY-MM-DD',
        help=('purge crawl state from rethinkdb for jobs that ended '
              'before this date'))
    arg_parser.add_argument(
        '--force', dest='force', action='store_true',
        help=('purge even if the job or site still has status ACTIVE'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
        except ValueError:
            job_id = args.job
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('no such job %r', job_id)
            sys.exit(1)
        if job.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'job %s has status ACTIVE, purging anyway because '
                    '--force was supplied', job_id)
            else:
                logging.fatal(
                    'refusing to purge job %s because status is ACTIVE '
                    '(override with --force)', job_id)
                sys.exit(1)
        _purge_job(rr, job_id)
    elif args.site:
        site_id = args.site
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('no such site %r', site_id)
            sys.exit(1)
        if site.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'site %s has status ACTIVE, purging anyway because '
                    '--force was supplied', site_id)
            else:
                logging.fatal(
                    'refusing to purge site %s because status is ACTIVE '
                    '(override with --force)', site_id)
                sys.exit(1)
        _purge_site(rr, site_id)
    elif args.finished_before:
        finished_before = datetime.datetime.strptime(
            args.finished_before, '%Y-%m-%d').replace(tzinfo=doublethink.UTC)
        reql = rr.table('jobs').filter(
            r.row['finished'].default(r.maxval).lt(finished_before).or_(
                r.row['starts_and_stops'].nth(-1)['stop'].default(
                    r.maxval).lt(finished_before)))
        logging.debug(
            'retrieving jobs older than %s: %s', finished_before, reql)
        for job in reql.run():
            # logging.info(
            #     'job %s finished=%s starts_and_stops[-1]["stop"]=%s',
            #     job['id'], job.get('finished'),
            #     job.get('starts_and_stops', [{'stop':None}])[-1]['stop'])
            _purge_job(rr, job['id'])

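# The --finished-before ReQL above keeps a job if either its 'finished'
# timestamp or the 'stop' of its last starts_and_stops entry predates the
# cutoff, with missing values defaulting to r.maxval so they never match.
# The same predicate in plain Python, as an illustration of what the query
# expresses:
def _example_finished_before(job_doc, cutoff):
    maxval = datetime.datetime.max.replace(tzinfo=doublethink.UTC)
    finished = job_doc.get('finished') or maxval
    stops = job_doc.get('starts_and_stops') or [{'stop': None}]
    last_stop = stops[-1]['stop'] or maxval
    return finished < cutoff or last_stop < cutoff
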
def brozzler_purge(argv=None):
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-purge - purge crawl state from rethinkdb',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    group = arg_parser.add_mutually_exclusive_group(required=True)
    group.add_argument(
        '--job', dest='job', metavar='JOB_ID',
        help=('purge crawl state from rethinkdb for a job, including all '
              'sites and pages'))
    group.add_argument(
        '--site', dest='site', metavar='SITE_ID',
        help=('purge crawl state from rethinkdb for a site, including all '
              'pages'))
    arg_parser.add_argument(
        '--force', dest='force', action='store_true',
        help=('purge even if the job or site still has status ACTIVE'))
    add_rethinkdb_options(arg_parser)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    if args.job:
        try:
            job_id = int(args.job)
        except ValueError:
            job_id = args.job
        job = brozzler.Job.load(rr, job_id)
        if not job:
            logging.fatal('no such job %r', job_id)
            sys.exit(1)
        if job.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'job %s has status ACTIVE, purging anyway because '
                    '--force was supplied', job_id)
            else:
                logging.fatal(
                    'refusing to purge job %s because status is ACTIVE '
                    '(override with --force)', job_id)
                sys.exit(1)
        _purge_job(rr, job_id)
    elif args.site:
        site_id = args.site
        site = brozzler.Site.load(rr, site_id)
        if not site:
            logging.fatal('no such site %r', site_id)
            sys.exit(1)
        if site.status == 'ACTIVE':
            if args.force:
                logging.warning(
                    'site %s has status ACTIVE, purging anyway because '
                    '--force was supplied', site_id)
            else:
                logging.fatal(
                    'refusing to purge site %s because status is ACTIVE '
                    '(override with --force)', site_id)
                sys.exit(1)
        _purge_site(rr, site_id)

def brozzler_worker():
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(__file__),
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '-n', '--max-browsers', dest='max_browsers', default='1',
        help='max number of chrome instances simultaneously browsing pages')
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    def sigterm(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGTERM)')
    def sigint(signum, frame):
        raise brozzler.ShutdownRequested('shutdown requested (caught SIGINT)')

    # do not print in signal handler to avoid RuntimeError: reentrant call
    state_dump_msgs = []
    def queue_state_dump(signum, frame):
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            state_dump_msgs.append(
                'dumping state (caught signal %s)\n%s' % (
                    signum, '\n'.join(state_strs)))
        except BaseException as e:
            state_dump_msgs.append('exception dumping state: %s' % e)
        finally:
            signal.signal(signal.SIGQUIT, queue_state_dump)

    signal.signal(signal.SIGQUIT, queue_state_dump)
    signal.signal(signal.SIGTERM, sigterm)
    signal.signal(signal.SIGINT, sigint)

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(','), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    service_registry = rethinkstuff.ServiceRegistry(r)
    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry, max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe)

    worker.start()
    try:
        while worker.is_alive():
            while state_dump_msgs:
                logging.warning(state_dump_msgs.pop(0))
            time.sleep(0.5)
        logging.critical('worker thread has died, shutting down')
    except brozzler.ShutdownRequested as e:
        pass
    finally:
        worker.shutdown_now()

    logging.info('brozzler-worker is all done, exiting')

def test_parent_url_scoping():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # scope rules that look at parent page url should consider both the
    # original url and the redirect url, if any, of the parent page
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/foo/',
        'scope': {
            'accepts': [{
                'parent_url_regex': '^http://example.com/acceptme/.*$'}],
            'blocks': [{
                'parent_url_regex': '^http://example.com/blockme/.*$'}],
        },
        'remember_outlinks': True})
    site.save()

    # an outlink that would not otherwise be in scope
    outlinks = ['https://some-random-url.com/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page redirect_url matches accept parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/acceptme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # an outlink that would normally be in scope
    outlinks = ['http://example.com/foo/whatever/']

    # parent page does not match any parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/foo/spluh'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == []
    assert parent_page.outlinks['accepted'] == outlinks

    # parent page url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

    # parent page redirect_url matches block parent_url_regex
    parent_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/toot/blah',
        'redirect_url':'http://example.com/blockme/futz'})
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, parent_page, outlinks)
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    assert parent_page.outlinks['rejected'] == outlinks
    assert parent_page.outlinks['accepted'] == []

def test_completed_page():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # redirect that changes scope surt
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url':'http://example.com/b/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/b/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because destination is covered
    # by the original surt
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 0,
        'redirect_url':'http://example.com/a/x/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

    # redirect that doesn't change scope surt because page is not the seed
    # page
    site = brozzler.Site(rr, {'seed':'http://example.com/a/'})
    site.save()
    page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/c/',
        'claimed': True,
        'brozzle_count': 0,
        'hops_from_seed': 1,
        'redirect_url':'http://example.com/d/', })
    page.save()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    frontier.completed_page(site, page)
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    site.refresh()
    assert site.scope == {'surt': 'http://(com,example,)/a/'}
    assert page.brozzle_count == 1
    assert page.claimed == False
    page.refresh()
    assert page.brozzle_count == 1
    assert page.claimed == False

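# What the three cases above exercise: completing a seed page (hops_from_seed
# == 0) whose redirect_url falls outside the current scope surt re-derives
# the surt from the redirect target; otherwise scope is untouched. A sketch
# of that rule (illustration only; surt_from_url is a hypothetical stand-in
# for brozzler's actual surt canonicalization):
def _example_maybe_update_scope(site, page, surt_from_url):
    if (page.hops_from_seed == 0 and page.redirect_url
            and not surt_from_url(page.redirect_url).startswith(
                site.scope['surt'])):
        site.scope['surt'] = surt_from_url(page.redirect_url)
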
def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[0]['start']

def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # avoid this kind of error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    rr.table('sites').wait().run()
    rr.table('services').wait().run()
    rr.table('sites').index_wait().run()
    rr.table('services').index_wait().run()

    # clean slate
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host1', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host2', 'port': 8001,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host3', 'port': 8000,
        'load': 0, 'ttl': 60}).run()
    rr.table('services').insert({
        'role': 'warcprox',
        'first_heartbeat': doublethink.utcnow(),
        'last_heartbeat': doublethink.utcnow(),
        'host': 'host4', 'port': 8000,
        'load': 1, 'ttl': 60}).run()

    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host1:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    rr.table('sites').insert({
        'proxy': 'host2:8001', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()

    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000

    rr.table('sites').insert({
        'proxy': 'host3:8000', 'status': 'ACTIVE',
        'last_disclaimed': doublethink.utcnow()}).run()
    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()

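# The assertions above imply the selection heuristic: prefer the warcprox
# instance with the fewest ACTIVE sites assigned to it, breaking ties by the
# instance's own 'load' (host3 wins while it has no sites; once host3 has a
# site, host4 with zero sites wins despite load 1). A sketch of that idea,
# not the actual implementation:
def _example_choose_warcprox(warcproxes, active_sites):
    def key(svc):
        proxy = '%s:%s' % (svc['host'], svc['port'])
        assigned = sum(1 for site in active_sites
                       if site.get('proxy') == proxy)
        return (assigned, svc['load'])
    return min(warcproxes, key=key) if warcproxes else None
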
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site. Takes a
    seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=('Warcprox-Meta http request header to send with each request; '
              'must be a json blob, ignored unless warcprox features are '
              'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        'behavior_parameters': json.loads(
            args.behavior_parameters) if args.behavior_parameters else None,
        'username': args.username,
        'password': args.password})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

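# Example invocation, built from the arguments defined above (the option
# values are illustrative, not from any real crawl):
#
#     brozzler-new-site --time-limit 3600 --ignore-robots \
#         --username alice --password secret \
#         http://example.com/
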
def test_stop_crawl(httpd):
    test_id = 'test_stop_crawl_job-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)

    # create a new job with three sites that could be crawled forever
    job_conf = {'seeds': [
        {'url': make_url(httpd, '/infinite/foo/')},
        {'url': make_url(httpd, '/infinite/bar/')},
        {'url': make_url(httpd, '/infinite/baz/')}]}
    job = brozzler.new_job(frontier, job_conf)
    assert job.id

    sites = list(frontier.job_sites(job.id))
    assert not sites[0].stop_requested
    assert not sites[1].stop_requested

    # request crawl stop for one site using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(
        ['brozzler-stop-crawl', '--site=%s' % sites[0].id])
    sites[0].refresh()
    assert sites[0].stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not sites[0].status.startswith(
            'FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'

    # but the other sites and the job as a whole should still be crawling
    sites[1].refresh()
    assert sites[1].status == 'ACTIVE'
    sites[2].refresh()
    assert sites[2].status == 'ACTIVE'
    job.refresh()
    assert job.status == 'ACTIVE'

    # request crawl stop for the job using the command line entrypoint
    brozzler.cli.brozzler_stop_crawl(
        ['brozzler-stop-crawl', '--job=%s' % job.id])
    job.refresh()
    assert job.stop_requested

    # stop request should be honored quickly
    start = time.time()
    while not job.status.startswith('FINISHED') and time.time() - start < 120:
        time.sleep(0.5)
        job.refresh()
    assert job.status == 'FINISHED'

    # the other sites should also be FINISHED_STOP_REQUESTED
    sites[0].refresh()
    assert sites[0].status == 'FINISHED_STOP_REQUESTED'
    sites[1].refresh()
    assert sites[1].status == 'FINISHED_STOP_REQUESTED'
    sites[2].refresh()
    assert sites[2].status == 'FINISHED_STOP_REQUESTED'

def brozzler_worker(argv=None):
    '''
    Main entry point for brozzler, gets sites and pages to brozzle from
    rethinkdb, brozzles them.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    arg_parser.add_argument(
        '-n', '--max-browsers', dest='max_browsers', default='1',
        help='max number of chrome instances simultaneously browsing pages')
    arg_parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
        '--warcprox-auto', dest='warcprox_auto', action='store_true',
        help=('when needed, choose an available instance of warcprox from '
              'the rethinkdb service registry'))
    arg_parser.add_argument(
        '--skip-extract-outlinks', dest='skip_extract_outlinks',
        action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
        '--skip-visit-hashtags', dest='skip_visit_hashtags',
        action='store_true', help=argparse.SUPPRESS)
    arg_parser.add_argument(
        '--skip-youtube-dl', dest='skip_youtube_dl',
        action='store_true', help=argparse.SUPPRESS)
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    def dump_state(signum, frame):
        signal.signal(signal.SIGQUIT, signal.SIG_IGN)
        try:
            state_strs = []
            frames = sys._current_frames()
            threads = {th.ident: th for th in threading.enumerate()}
            for ident in frames:
                if threads[ident]:
                    state_strs.append(str(threads[ident]))
                else:
                    state_strs.append('<???:thread:ident=%s>' % ident)
                stack = traceback.format_stack(frames[ident])
                state_strs.append(''.join(stack))
            logging.info(
                'dumping state (caught signal %s)\n%s' % (
                    signum, '\n'.join(state_strs)))
        except BaseException as e:
            logging.error('exception dumping state: %s' % e)
        finally:
            signal.signal(signal.SIGQUIT, dump_state)

    rr = rethinker(args)
    frontier = brozzler.RethinkDbFrontier(rr)
    service_registry = doublethink.ServiceRegistry(rr)
    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry, max_browsers=int(args.max_browsers),
        chrome_exe=args.chrome_exe, proxy=args.proxy,
        warcprox_auto=args.warcprox_auto,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    signal.signal(signal.SIGQUIT, dump_state)
    signal.signal(signal.SIGTERM, lambda s, f: worker.stop())
    signal.signal(signal.SIGINT, lambda s, f: worker.stop())

    th = threading.Thread(target=worker.run, name='BrozzlerWorkerThread')
    th.start()
    th.join()
    logging.info('brozzler-worker is all done, exiting')

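# Operational note: because dump_state above is installed as the SIGQUIT
# handler, a running brozzler-worker can be asked for a thread dump without
# stopping it, e.g.:
#
#     kill -QUIT <brozzler-worker pid>    # or Ctrl-\ in its terminal
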
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
        proxy, warcprox_auto, is_warcprox,
        datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry, max_browsers=1,
        chrome_exe=brozzler.suggest_default_chrome_exe(),
        warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh()  # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        expected_payload = open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
            'rb').read()
        assert requests.get(wb_url).content == expected_payload
    else:
        assert captures_by_url == {}

def test_resume_job():
    '''
    Tests that the right stuff gets twiddled in rethinkdb when we "start" and
    "finish" crawling a job. Doesn't actually crawl anything.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker(db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    job_conf = {'seeds': [{'url': 'http://example.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 1
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    job.refresh()

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop']
    assert job.starts_and_stops[1]['stop'] > job.starts_and_stops[1]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']

    # resuming a job == resuming all of its sites
    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop'] is None

    frontier.finished(site, 'FINISHED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert len(job.starts_and_stops) == 3
    assert job.starts_and_stops[2]['start']
    assert job.starts_and_stops[2]['stop']
    assert job.starts_and_stops[2]['stop'] > job.starts_and_stops[2]['start']
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 3
    assert site.starts_and_stops[2]['start']
    assert site.starts_and_stops[2]['stop']
    assert site.starts_and_stops[2]['stop'] > site.starts_and_stops[2]['start']

    frontier.resume_job(job)
    site = list(frontier.job_sites(job.id))[0]

    assert job.status == 'ACTIVE'
    assert len(job.starts_and_stops) == 4
    assert job.starts_and_stops[3]['start']
    assert job.starts_and_stops[3]['stop'] is None
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 4
    assert site.starts_and_stops[3]['start']
    assert site.starts_and_stops[3]['stop'] is None

    # simulate a job stop request
    job_conf = {'seeds': [
        {'url': 'http://example.com/'}, {'url': 'http://example_2.com/'}]}
    job = brozzler.new_job(frontier, job_conf)
    assert len(list(frontier.job_sites(job.id))) == 2
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    job.stop_requested = datetime.datetime.utcnow().replace(
        tzinfo=doublethink.UTC)
    job.save()

    # should raise a CrawlStopped
    with pytest.raises(brozzler.CrawlStopped):
        frontier.honor_stop_request(site1)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    frontier.finished(site2, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'FINISHED'
    assert job.stop_requested
    assert len(job.starts_and_stops) == 1
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop']
    assert job.starts_and_stops[0]['stop'] > job.starts_and_stops[0]['start']
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert site2.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 1
    assert len(site2.starts_and_stops) == 1
    assert site1.starts_and_stops[0]['start']
    assert site1.starts_and_stops[0]['stop']
    assert site1.starts_and_stops[0]['stop'] > site1.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['start']
    assert site2.starts_and_stops[0]['stop']
    assert site2.starts_and_stops[0]['stop'] > site2.starts_and_stops[0]['start']

    # simulate job resume after a stop request
    frontier.resume_job(job)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate a site stop request
    site1.stop_requested = datetime.datetime.utcnow().replace(
        tzinfo=doublethink.UTC)
    site1.save()

    # should not raise a CrawlStopped
    frontier.honor_stop_request(site2)

    frontier.finished(site1, 'FINISHED_STOP_REQUESTED')
    job.refresh()

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'FINISHED_STOP_REQUESTED'
    assert len(site1.starts_and_stops) == 2
    assert site1.starts_and_stops[1]['start']
    assert site1.starts_and_stops[1]['stop']
    assert site1.starts_and_stops[1]['stop'] > site1.starts_and_stops[1]['start']
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

    # simulate site resume after a stop request
    frontier.resume_site(site1)
    site1 = list(frontier.job_sites(job.id))[0]
    site2 = list(frontier.job_sites(job.id))[1]

    assert job.status == 'ACTIVE'
    assert job.stop_requested is None
    assert len(job.starts_and_stops) == 2
    assert job.starts_and_stops[1]['start']
    assert job.starts_and_stops[1]['stop'] is None
    assert site1.status == 'ACTIVE'
    assert site1.stop_requested is None
    assert len(site1.starts_and_stops) == 3
    assert site1.starts_and_stops[2]['start']
    assert site1.starts_and_stops[2]['stop'] is None
    assert site2.status == 'ACTIVE'
    assert len(site2.starts_and_stops) == 2
    assert site2.starts_and_stops[1]['start']
    assert site2.starts_and_stops[1]['stop'] is None

def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0

    warcprox1 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warning('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
                and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(':%s' % warcprox2.proxy.server_port))
                and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')

def test_time_limit():
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.active_brozzling_time = 0.2  # this is why the time limit will be hit
    try:
        frontier.enforce_time_limit(site)
    except brozzler.ReachedTimeLimit:
        frontier.finished(site, 'FINISHED_TIME_LIMIT')

    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']

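# The sequence above implies the enforcement rule: once a site's accumulated
# active_brozzling_time exceeds its time_limit, enforce_time_limit raises
# ReachedTimeLimit and the caller marks the site FINISHED_TIME_LIMIT. A
# sketch of that check (illustration only, not brozzler's actual code):
def _example_enforce_time_limit(site):
    if site.time_limit and site.active_brozzling_time > site.time_limit:
        raise brozzler.ReachedTimeLimit
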