def test_seed_redirect(httpd):
    '''
    Crawl a seed that redirects and check that the site's surt scope is
    expanded to cover the redirect destination.
    '''
    # bug fix: test_id previously read 'test_login-%s', which mislabeled
    # this test's rows in the captures table
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
def test_hashtags(httpd):
    '''
    Crawl site7 and verify that link hashtags are recorded on the page and
    that the hashtagged resources end up in the captures table.
    '''
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(frontier.site_pages(site.id), key=lambda p: p.url)
    assert len(pages) == 2
    seed_page, foo_page = pages
    assert seed_page.url == seed_url
    assert seed_page.hops_from_seed == 0
    assert seed_page.brozzle_count == 1
    assert seed_page.outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port
    ]
    assert not seed_page.hashtags
    assert foo_page.url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert foo_page.hops_from_seed == 1
    assert foo_page.brozzle_count == 1
    assert sorted(foo_page.hashtags) == [
        '#boosh',
        '#ignored',
        '#whee',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    captures_by_url = {
        c['url']: c
        for c in captures if c['http_method'] != 'HEAD'
    }
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
def test_hashtag_seed():
    '''
    A hashtag on the seed url should be stripped from the page url and
    recorded in page.hashtags (surt-scope version).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # seed without a hashtag
    plain_site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, plain_site)
    assert plain_site.scope['surt'] == 'http://(org,example,)/'
    plain_pages = list(frontier.site_pages(plain_site.id))
    assert len(plain_pages) == 1
    assert plain_pages[0].url == 'http://example.org/'
    assert not plain_pages[0].hashtags

    # seed with a hashtag
    tagged_site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, tagged_site)
    assert tagged_site.scope['surt'] == 'http://(org,example,)/'
    tagged_pages = list(frontier.site_pages(tagged_site.id))
    assert len(tagged_pages) == 1
    assert tagged_pages[0].url == 'http://example.org/'
    assert tagged_pages[0].hashtags == ['#hash',]
def test_hashtag_seed():
    '''
    A hashtag on the seed url should be stripped from the page url and
    recorded in page.hashtags (ssurt-scope version).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # seed without a hashtag
    bare_site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, bare_site)
    assert bare_site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
    bare_pages = list(frontier.site_pages(bare_site.id))
    assert len(bare_pages) == 1
    assert bare_pages[0].url == 'http://example.org/'
    assert not bare_pages[0].hashtags

    # seed with a hashtag
    hashy_site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, hashy_site)
    assert hashy_site.scope == {'accepts': [{'ssurt': 'org,example,//http:/'}]}
    hashy_pages = list(frontier.site_pages(hashy_site.id))
    assert len(hashy_pages) == 1
    assert hashy_pages[0].url == 'http://example.org/'
    assert hashy_pages[0].hashtags == ['#hash',]
def test_seed_redirect(httpd):
    '''
    Crawl a seed that redirects and check that the site's ssurt scope gains
    an accept rule for the redirect destination.
    '''
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.scope == {'accepts': [{'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port}]}

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = sorted(frontier.site_pages(site.id),
                   key=lambda page: page.hops_from_seed)
    assert len(pages) == 2
    first, second = pages
    assert first.hops_from_seed == 0
    assert first.url == seed_url
    assert first.redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert second.hops_from_seed == 1
    assert second.url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly
    assert site.scope == {'accepts': [
        {'ssurt': 'localhost,//%s:http:/site5/redirect/' % httpd.server_port},
        {'ssurt': 'localhost,//%s:http:/site5/destination/' % httpd.server_port}]}
def test_obey_robots(httpd):
    '''
    With a user agent blocked by robots.txt, only robots.txt itself should be
    fetched, and the seed page should be marked blocked_by_robots.
    '''
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')

    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site1/'),
            'user_agent': 'im a badbot',  # robots.txt blocks badbot
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        initial_pages = list(frontier.site_pages(site.id))
        assert len(initial_pages) == 1
        assert initial_pages[0].url == site.seed
        assert initial_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(
        os.path.join(os.path.dirname(__file__), 'htdocs', 'robots.txt'),
        'rb').read()
    assert requests.get(wb_url, allow_redirects=False).content == expected_payload
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of the test httpd root: expect the seed page, robots.txt
    and file1.txt to be brozzled, captured by warcprox, and replayable in pywb.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        r = rethinkstuff.Rethinker('localhost', db='brozzler')
        frontier = brozzler.RethinkDbFrontier(r)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 3
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port,
        'http://localhost:%s/robots.txt' % httpd.server_port,
        'http://localhost:%s/file1.txt' % httpd.server_port}

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    arg_parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    # json-valued options are parsed lazily so that omitting them yields None
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        'behavior_parameters': json.loads(
            args.behavior_parameters) if args.behavior_parameters else None,
        'username': args.username,
        'password': args.password})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
def test_login(httpd):
    '''
    Crawl site2 with credentials configured and verify that brozzler's login
    heuristic submits the one matching form (action='00') and captures the
    expected requests.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(
        rr, {
            'seed': 'http://localhost:%s/site2/' % httpd.server_port,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            },
            'username': '******',
            'password': '******'
        })

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(
        rr.table('captures').filter({
            'test_id': test_id
        }).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
    assert (
        'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'
        % httpd.server_port) in meth_url
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and enforce_time_limit behavior.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
def test_redirect_hashtags(httpd):
    '''
    Crawl site9 where a link with hashtags redirects; the redirect target
    should record the hashtags, and the redirect url should be captured
    exactly twice (youtube-dl + browser), with no hashtag variants.
    '''
    # bug fix: test_id previously read 'test_hashtags-%s', colliding with
    # test_hashtags' rows in the captures table
    test_id = 'test_redirect_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(
        rr, {
            'seed': seed_url,
            'warcprox_meta': {
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')
    ]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == [
        '#hash1',
        '#hash2',
    ]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET'
    ]
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and _enforce_time_limit, including
    the FINISHED_TIME_LIMIT transition once the limit is exceeded.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'time_limit': 99999
    })
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    first_interval = site.starts_and_stops[0]
    assert first_interval['start']
    assert first_interval['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    first_interval = site.starts_and_stops[0]
    assert first_interval['start']
    assert first_interval['stop']
    assert first_interval['stop'] > first_interval['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()

    time.sleep(0.1)
    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[0]['start']
def test_obey_robots(httpd):
    '''
    With a user agent blocked by robots.txt, only robots.txt itself should be
    captured, and the one site page should remain the seed (rethinkstuff era).
    '''
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        user_agent='im a badbot',  # robots.txt blocks badbot
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        r = rethinkstuff.Rethinker('localhost', db='brozzler')
        frontier = brozzler.RethinkDbFrontier(r)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        queued_pages = list(frontier.site_pages(site.id))
        assert len(queued_pages) == 1
        assert queued_pages[0].url == site.seed
        assert queued_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port}

    # take a look at the captures table
    time.sleep(2)  # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(r.table('captures').filter({'test_id':test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
    assert requests.get(
        wb_url, allow_redirects=False).content == expected_payload
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping and enforce_time_limit, expecting
    ReachedTimeLimit once the limit is exceeded.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed':'http://example.com/', 'time_limit':99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    interval = site.starts_and_stops[0]
    assert interval['start']
    assert interval['stop'] is None

    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    interval = site.starts_and_stops[0]
    assert interval['start']
    assert interval['stop']
    assert interval['stop'] > interval['start']

    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    time.sleep(0.1)

    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    arg_parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # json-valued options are parsed only if supplied
    site = brozzler.Site(
        seed=args.seed, proxy=args.proxy,
        time_limit=int(args.time_limit) if args.time_limit else None,
        ignore_robots=args.ignore_robots,
        enable_warcprox_features=args.enable_warcprox_features,
        warcprox_meta=json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        behavior_parameters=json.loads(
            args.behavior_parameters) if args.behavior_parameters else None)

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
def test_ydl_stitching(httpd):
    '''
    Crawl site10 and verify youtube-dl stitches segmented video: six videos
    recorded on the page, including the stitched WARCPROX_WRITE_RECORD one.
    '''
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': make_url(httpd, '/site10/'),
            'warcprox_meta': {
                'warc-prefix': 'test_ydl_stitching',
                'captures-table-extra-fields': {
                    'test_id': test_id
                }
            }
        })
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    # videos may land a little after the crawl finishes; keep polling
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    stitched_captures = [c for c in captures if c['url'] == stitched_url]
    assert len(stitched_captures) == 1
    stitched = stitched_captures[0]
    assert stitched['filename'].startswith('test_ydl_stitching')
    assert stitched['content_type'] == 'video/mp4'
    assert stitched['http_method'] == 'WARCPROX_WRITE_RECORD'
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of the test httpd root (rethinkstuff era): expect the
    seed page and file1.txt to be brozzled, captured, and replayable in pywb.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
        seed='http://localhost:%s/' % httpd.server_port,
        proxy='localhost:8000', enable_warcprox_features=True,
        warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port,
        'http://localhost:%s/file1.txt' % httpd.server_port
    }

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def test_hashtags(httpd):
    '''
    Crawl site7 and verify hashtags are recorded on the linked page and the
    hashtagged resources show up in the captures table (compact-style era).
    '''
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we the page we expected
    pages = sorted(frontier.site_pages(site.id), key=lambda p: p.url)
    assert len(pages) == 2
    seed_page, foo_page = pages
    assert seed_page.url == seed_url
    assert seed_page.hops_from_seed == 0
    assert seed_page.brozzle_count == 1
    assert seed_page.outlinks['accepted'] == ['http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not seed_page.hashtags
    assert foo_page.url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert foo_page.hops_from_seed == 1
    assert foo_page.brozzle_count == 1
    assert sorted(foo_page.hashtags) == ['#boosh','#ignored','#whee',]

    time.sleep(2)  # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    arg_parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzler-new-site - register site to brozzle',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(arg_parser)
    _add_proxy_options(arg_parser)
    arg_parser.add_argument(
        '--time-limit', dest='time_limit', default=None,
        help='time limit in seconds for this site')
    arg_parser.add_argument(
        '--ignore-robots', dest='ignore_robots', action='store_true',
        help='ignore robots.txt for this site')
    arg_parser.add_argument(
        '--warcprox-meta', dest='warcprox_meta',
        help=(
            'Warcprox-Meta http request header to send with each request; '
            'must be a json blob, ignored unless warcprox features are '
            'enabled'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # warcprox_meta is parsed from json only when supplied
    site = brozzler.Site(
        seed=args.seed, proxy=args.proxy,
        time_limit=int(args.time_limit) if args.time_limit else None,
        ignore_robots=args.ignore_robots,
        enable_warcprox_features=args.enable_warcprox_features,
        warcprox_meta=(
            json.loads(args.warcprox_meta) if args.warcprox_meta else None))

    r = rethinkstuff.Rethinker(
        args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
def test_claim_site():
    '''
    Verify claim_sites claims a new site once, refuses to reclaim while the
    claim is fresh (< 1 hour), and reclaims a stale claim (> 1 hour).
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    rr.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    claimed = frontier.claim_sites()
    assert len(claimed) == 1
    claimed_site = claimed[0]
    assert claimed_site.id == site.id
    assert claimed_site.claimed
    assert claimed_site.last_claimed >= doublethink.utcnow(
    ) - datetime.timedelta(minutes=1)
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    claimed_site.last_claimed = doublethink.utcnow() - datetime.timedelta(
        minutes=55)
    claimed_site.save()
    with pytest.raises(brozzler.NothingToClaim):
        claimed_site = frontier.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = claimed_site
    claimed_site = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    claimed = frontier.claim_sites()
    assert len(claimed) == 1
    claimed_site = claimed[0]
    assert claimed_site.id == site.id

    # clean up
    rr.table('sites').get(claimed_site.id).delete().run()
def test_ydl_stitching(httpd):
    '''
    Crawl site10 and verify that the youtube-dl stitched video shows up in
    page.videos and in the warcprox captures table.
    '''
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rethinker = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rethinker)
    site = brozzler.Site(rethinker, {
        'seed': 'http://localhost:%s/site10/' % httpd.server_port,
        'warcprox_meta': {
            'warc-prefix': 'test_ydl_stitching',
            'captures-table-extra-fields': {'test_id': test_id}}})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly (five minute timeout)
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:http://localhost:%s/site10/' % httpd.server_port
    expected_video = {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    }
    assert expected_video in page.videos

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(
        rethinker.table('captures').filter({'test_id': test_id}).run())
    matches = [c for c in captures if c['url'] == stitched_url]
    assert len(matches) == 1
    capture = matches[0]
    assert capture['filename'].startswith('test_ydl_stitching')
    assert capture['content_type'] == 'video/mp4'
    assert capture['http_method'] == 'WARCPROX_WRITE_RECORD'
def test_claim_site():
    '''
    Check frontier.claim_sites() behavior: raises NothingToClaim on an
    empty/fully-claimed frontier, and honors the one-hour reclaim window.
    '''
    db = doublethink.Rethinker('localhost', db='ignoreme')
    front = brozzler.RethinkDbFrontier(db)
    db.table('sites').delete().run()  # clean slate

    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    site = brozzler.Site(db, {'seed': 'http://example.org/'})
    brozzler.new_site(front, site)

    winners = front.claim_sites()
    assert len(winners) == 1
    winner = winners[0]
    assert winner.id == site.id
    assert winner.claimed
    assert winner.last_claimed >= (
        doublethink.utcnow() - datetime.timedelta(minutes=1))

    # a claimed site may not be claimed again right away
    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    # site last_claimed less than 1 hour ago still not to be reclaimed
    winner.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=55)
    winner.save()
    with pytest.raises(brozzler.NothingToClaim):
        winner = front.claim_sites()

    # site last_claimed more than 1 hour ago can be reclaimed
    site = winner
    winner = None
    site.last_claimed = doublethink.utcnow() - datetime.timedelta(minutes=65)
    site.save()
    winners = front.claim_sites()
    assert len(winners) == 1
    winner = winners[0]
    assert winner.id == site.id

    # clean up
    db.table('sites').get(winner.id).delete().run()
def test_hashtag_links():
    '''
    Outlinks that differ only by fragment collapse onto one page; the
    fragments are recorded in that page's hashtags list.
    '''
    rethinker = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rethinker)
    site = brozzler.Site(rethinker, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)
    parent = frontier.seed_page(site.id)
    assert not parent.hashtags

    frontier.scope_and_schedule_outlinks(site, parent, [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ])

    pages = sorted(frontier.site_pages(site.id), key=lambda pg: pg.url)
    assert len(pages) == 3
    seed, bar, zuh = pages

    assert seed.url == 'http://example.org/'
    assert sorted(seed.outlinks['accepted']) == [
        'http://example.org/', 'http://example.org/bar',
        'http://example.org/zuh']
    assert not seed.outlinks['blocked']
    assert not seed.outlinks['rejected']
    assert seed.hashtags == ['#foo',]
    assert seed.hops_from_seed == 0

    assert bar.url == 'http://example.org/bar'
    assert sorted(bar.hashtags) == ['#baz', '#quux']
    assert bar.priority == 36
    assert bar.hops_from_seed == 1

    assert zuh.url == 'http://example.org/zuh'
    assert zuh.hashtags == ['#buh']
    assert zuh.priority == 12
def test_login(httpd):
    '''
    Crawl site2 with credentials and verify brozzler's login-form heuristic
    submitted exactly the expected form, alongside the normal crawl requests.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rethinker = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rethinker, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields': {'test_id': test_id}},
        'username': '******',
        'password': '******'})
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly (five minute timeout)
    deadline = time.time() + 300
    while site.status != 'FINISHED' and time.time() < deadline:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rethinker.table('captures').filter(
        {'test_id': test_id}).order_by('timestamp').run())
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    for template in (
            'GET http://localhost:%s/robots.txt',
            'GET http://localhost:%s/site2/',
            'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/',
            'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/',
            'GET http://localhost:%s/site2/login.html',
            'WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html',
            'WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html'):
        assert (template % httpd.server_port) in meth_url
def test_hashtag_links():
    '''
    Scheduling outlinks with fragments: same-url links merge into a single
    page whose hashtags collect the fragments; priorities reflect scoping.
    '''
    db = doublethink.Rethinker('localhost', db='ignoreme')
    front = brozzler.RethinkDbFrontier(db)
    site = brozzler.Site(db, {'seed': 'http://example.org/'})
    brozzler.new_site(front, site)
    parent_page = front.seed_page(site.id)
    assert not parent_page.hashtags

    links = [
        'http://example.org/#foo',
        'http://example.org/bar',
        'http://example.org/bar#baz',
        'http://example.org/bar#quux',
        'http://example.org/zuh#buh',
    ]
    front.scope_and_schedule_outlinks(site, parent_page, links)

    pages = sorted(front.site_pages(site.id), key=lambda pg: pg.url)
    assert len(pages) == 3

    # page 0: the seed, with '#foo' folded in as a hashtag
    assert pages[0].url == 'http://example.org/'
    assert sorted(pages[0].outlinks['accepted']) == [
        'http://example.org/',
        'http://example.org/bar',
        'http://example.org/zuh']
    assert not pages[0].outlinks['blocked']
    assert not pages[0].outlinks['rejected']
    assert pages[0].hashtags == ['#foo',]
    assert pages[0].hops_from_seed == 0

    # page 1: /bar accumulated both of its fragments
    assert pages[1].url == 'http://example.org/bar'
    assert sorted(pages[1].hashtags) == ['#baz', '#quux']
    assert pages[1].priority == 36
    assert pages[1].hops_from_seed == 1

    # page 2: /zuh with its single fragment
    assert pages[2].url == 'http://example.org/zuh'
    assert pages[2].hashtags == ['#buh']
    assert pages[2].priority == 12
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox; port 0 lets the OS pick free ports
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0
    warcprox1 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox2 = warcprox.controller.WarcproxController(
        service_registry=svcreg, options=opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl; /infinite/ keeps generating outlinks so
    # the crawl never finishes on its own
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warn('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance (poll up to 30s)
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear: no proxy
        # assigned and no new pages discovered while none is running
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        # shut down both warcprox instances and restore the system service
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
def _test_proxy_setting(
        httpd, proxy=None, warcprox_auto=False, is_warcprox=False):
    '''
    Shared helper: crawl site1 with the given proxy configuration and
    assert on the resulting site proxy setting, pages, and (when warcprox
    is in play) the captures table and pywb playback.
    '''
    test_id = 'test_proxy=%s_warcprox_auto=%s_is_warcprox=%s-%s' % (
        proxy, warcprox_auto, is_warcprox,
        datetime.datetime.utcnow().isoformat())

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    rr = doublethink.Rethinker('localhost', db='brozzler')
    service_registry = doublethink.ServiceRegistry(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})
    assert site.id is None
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # brozzle the site in-process with a single browser rather than relying
    # on the brozzler-worker service, so the proxy args take effect
    worker = brozzler.worker.BrozzlerWorker(
        frontier, service_registry, max_browsers=1,
        chrome_exe=brozzler.suggest_default_chrome_exe(),
        warcprox_auto=warcprox_auto, proxy=proxy)
    browser = worker._browser_pool.acquire()
    worker.brozzle_site(browser, site)
    worker._browser_pool.release(browser)

    # check proxy is set
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy
    site.refresh()  # check that these things were persisted
    assert site.status == 'FINISHED'
    if warcprox_auto:
        assert site.proxy[-5:] == ':8000'
    else:
        assert not site.proxy

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table (HEAD requests are uninteresting)
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    if is_warcprox:
        assert robots in captures_by_url
        assert page1 in captures_by_url
        assert page2 in captures_by_url
        assert 'screenshot:%s' % page1 in captures_by_url
        assert 'thumbnail:%s' % page1 in captures_by_url

        # check pywb playback of page2 matches the file on disk
        t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
        wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
        expected_payload = open(os.path.join(
            os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
            'rb').read()
        assert requests.get(wb_url).content == expected_payload
    else:
        # without warcprox in the loop, nothing should have been captured
        assert captures_by_url == {}
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of site1 via the brozzler-worker service: verifies the
    discovered pages, the warcprox captures table, and pywb playback of the
    page payload, screenshot, and thumbnail.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site1/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/site1/' % httpd.server_port
    page2 = 'http://localhost:%s/site1/file1.txt' % httpd.server_port
    robots = 'http://localhost:%s/robots.txt' % httpd.server_port

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        # queuing a new site creates exactly one (seed) page
        assert len(list(frontier.site_pages(site.id))) == 1
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly (five minute timeout)
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/site1/' % httpd.server_port,
        'http://localhost:%s/site1/file1.txt' % httpd.server_port}

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table (HEAD requests are uninteresting)
    captures = rr.table('captures').filter({'test_id':test_id}).run()
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert robots in captures_by_url
    assert page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb: page2 playback should match the file on disk
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'site1', 'file1.txt'),
        'rb').read()
    assert requests.get(wb_url).content == expected_payload

    # screenshot record plays back as a jpeg
    url = 'screenshot:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'

    # thumbnail record plays back as a jpeg
    url = 'thumbnail:%s' % page1
    t14 = captures_by_url[url]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, url)
    response = requests.get(wb_url)
    assert response.status_code == 200
    assert response.headers['content-type'] == 'image/jpeg'
def test_max_hops_off():
    '''
    Exercise the scope 'max_hops_off' setting (renamed from
    'max_hops_off_surt'): out-of-scope links are still scheduled up to that
    many hops away from in-scope pages, while blocks still apply.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/',
        'scope': {
            'max_hops_off_surt': 1,
            'blocks': [{'ssurt': 'domain,bad,'}]}})
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # renamed this param
    assert not 'max_hops_off_surt' in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    # neither: out of scope but potentially crawlable via max_hops_off;
    # True: in scope; False: blocked
    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither(
        'https://example.com/toot', seed_page) is None
    assert site.accept_reject_or_neither(
        'http://example.com/toot', seed_page) is True
    assert site.accept_reject_or_neither(
        'https://some.bad.domain/something', seed_page) is False

    # bypass robots.txt checking for the duration of the scheduling call
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot',
            'https://some.bad.domain/something'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert not 'hops_off_surt' in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'}
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something']
    # in-scope link: hops_off stays 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    # out-of-scope links accepted with hops_off incremented to 1
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # next hop is past max_hops_off, but normal in scope url is in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(site, foo_page, [
            'http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    # foo_page itself unchanged apart from the recorded outlinks
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    # back in scope via foo.org: hops_off resets to 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages
def test_warcprox_outage_resiliency(httpd):
    '''
    Tests resiliency to warcprox outage.

    If no instances of warcprox are healthy when starting to crawl a site,
    brozzler-worker should sit there and wait until a healthy instance
    appears.

    If an instance goes down, sites assigned to that instance should bounce
    over to a healthy instance.

    If all instances of warcprox go down, brozzler-worker should sit and
    wait.
    '''
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    svcreg = doublethink.ServiceRegistry(rr)

    # run two instances of warcprox; port 0 lets the OS pick free ports,
    # and they register themselves in the rethinkdb services table
    opts = warcprox.Options()
    opts.address = '0.0.0.0'
    opts.port = 0
    opts.rethinkdb_services_url = 'rethinkdb://localhost/brozzler/services'
    warcprox1 = warcprox.controller.WarcproxController(opts)
    warcprox2 = warcprox.controller.WarcproxController(opts)
    warcprox1_thread = threading.Thread(
        target=warcprox1.run_until_shutdown, name='warcprox1')
    warcprox2_thread = threading.Thread(
        target=warcprox2.run_until_shutdown, name='warcprox2')

    # put together a site to crawl; /infinite/ keeps generating outlinks so
    # the crawl never finishes on its own
    test_id = 'test_warcprox_death-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/infinite/' % httpd.server_port,
        'warcprox_meta': {'captures-table-extra-fields':{'test_id':test_id}}})

    try:
        # we manage warcprox instances ourselves, so stop the one running on
        # the system, if any
        try:
            stop_service('warcprox')
        except Exception as e:
            logging.warn('problem stopping warcprox service: %s', e)

        # queue the site for brozzling
        brozzler.new_site(frontier, site)

        # check that nothing happens
        # XXX tail brozzler-worker.log or something?
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == 1

        # start one instance of warcprox
        warcprox1_thread.start()

        # check that it started using that instance (poll up to 30s)
        start = time.time()
        while not site.proxy and time.time() - start < 30:
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox1.proxy.server_port)

        # check that the site accumulates pages in the frontier, confirming
        # that crawling is really happening
        start = time.time()
        while (len(list(frontier.site_pages(site.id))) <= 1
               and time.time() - start < 60):
            time.sleep(0.5)
            site.refresh()
        assert len(list(frontier.site_pages(site.id))) > 1

        # stop warcprox #1, start warcprox #2
        warcprox2_thread.start()
        warcprox1.stop.set()
        warcprox1_thread.join()

        # check that it switched over to warcprox #2
        start = time.time()
        while ((not site.proxy
                or not site.proxy.endswith(
                    ':%s' % warcprox2.proxy.server_port))
               and time.time() - start < 30):
            time.sleep(0.5)
            site.refresh()
        assert site.proxy.endswith(':%s' % warcprox2.proxy.server_port)

        # stop warcprox #2
        warcprox2.stop.set()
        warcprox2_thread.join()

        page_count = len(list(frontier.site_pages(site.id)))
        assert page_count > 1

        # check that it is waiting for a warcprox to appear: no proxy
        # assigned and no new pages discovered while none is running
        time.sleep(30)
        site.refresh()
        assert site.status == 'ACTIVE'
        assert not site.proxy
        assert len(list(frontier.site_pages(site.id))) == page_count

        # stop crawling the site, else it can pollute subsequent test runs
        brozzler.cli.brozzler_stop_crawl([
            'brozzler-stop-crawl', '--site=%s' % site.id])
        site.refresh()
        assert site.stop_requested

        # stop request should be honored quickly
        start = time.time()
        while not site.status.startswith(
                'FINISHED') and time.time() - start < 120:
            time.sleep(0.5)
            site.refresh()
        assert site.status == 'FINISHED_STOP_REQUESTED'
    finally:
        # shut down both warcprox instances and restore the system service
        warcprox1.stop.set()
        warcprox2.stop.set()
        warcprox1_thread.join()
        warcprox2_thread.join()
        start_service('warcprox')
def test_max_hops_off():
    '''
    Exercise the scope 'max_hops_off' setting (renamed from
    'max_hops_off_surt'): out-of-scope links are still scheduled up to that
    many hops away from in-scope pages, while blocks still apply.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(
        rr, {
            'seed': 'http://example.com/',
            'scope': {
                'max_hops_off_surt': 1,
                'blocks': [{'ssurt': 'domain,bad,'}]}})
    brozzler.new_site(frontier, site)
    site.refresh()  # get it back from the db

    # renamed this param
    assert not 'max_hops_off_surt' in site.scope
    assert site.scope['max_hops_off'] == 1

    seed_page = frontier.seed_page(site.id)

    # neither: out of scope but potentially crawlable via max_hops_off;
    # True: in scope; False: blocked
    assert site.accept_reject_or_neither('http://foo.org/', seed_page) is None
    assert site.accept_reject_or_neither(
        'https://example.com/toot', seed_page) is None
    assert site.accept_reject_or_neither(
        'http://example.com/toot', seed_page) is True
    assert site.accept_reject_or_neither(
        'https://some.bad.domain/something', seed_page) is False

    # bypass robots.txt checking for the duration of the scheduling call
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        # two of these are in scope because of max_hops_off
        frontier.scope_and_schedule_outlinks(site, seed_page, [
            'http://foo.org/', 'https://example.com/toot',
            'http://example.com/toot',
            'https://some.bad.domain/something'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots

    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)

    assert len(pages) == 4
    assert pages[0].url == 'http://example.com/'
    assert pages[0].hops_off == 0
    assert not 'hops_off_surt' in pages[0]
    assert set(pages[0].outlinks['accepted']) == {
        'https://example.com/toot', 'http://foo.org/',
        'http://example.com/toot'}
    assert pages[0].outlinks['blocked'] == []
    assert pages[0].outlinks['rejected'] == [
        'https://some.bad.domain/something']
    # in-scope link: hops_off stays 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://example.com/toot',
        'via_page_id': seed_page.id
    } in pages
    # out-of-scope links accepted with hops_off incremented to 1
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id
    } in pages
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'https://example.com/toot'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'https://example.com/toot',
        'via_page_id': seed_page.id
    } in pages

    # next hop is past max_hops_off, but normal in scope url is in scope
    foo_page = [pg for pg in pages if pg.url == 'http://foo.org/'][0]
    orig_is_permitted_by_robots = brozzler.is_permitted_by_robots
    brozzler.is_permitted_by_robots = lambda *args: True
    try:
        frontier.scope_and_schedule_outlinks(
            site, foo_page, ['http://foo.org/bar', 'http://example.com/blah'])
    finally:
        brozzler.is_permitted_by_robots = orig_is_permitted_by_robots
    # foo_page itself unchanged apart from the recorded outlinks
    assert foo_page == {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 1,
        'hops_off': 1,
        'id': brozzler.Page.compute_id(site.id, 'http://foo.org/'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 12,
        'site_id': site.id,
        'url': 'http://foo.org/',
        'via_page_id': seed_page.id,
        'outlinks': {
            'accepted': ['http://example.com/blah'],
            'blocked': [],
            'rejected': ['http://foo.org/bar'],
        }
    }
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 5
    # back in scope via foo.org: hops_off resets to 0
    assert {
        'brozzle_count': 0,
        'claimed': False,
        'hashtags': [],
        'hops_from_seed': 2,
        'hops_off': 0,
        'id': brozzler.Page.compute_id(site.id, 'http://example.com/blah'),
        'job_id': None,
        'needs_robots_check': False,
        'priority': 11,
        'site_id': site.id,
        'url': 'http://example.com/blah',
        'via_page_id': foo_page.id
    } in pages