def test_start_stop_backwards_compat():
    '''
    Check that Site and Job migrate legacy time-tracking fields
    ('start_time' on sites, 'started'/'finished' on jobs) into the newer
    'starts_and_stops' list of {'start': ..., 'stop': ...} intervals,
    and drop the legacy fields.
    '''
    # idiom fix throughout: 'x' not in y instead of not 'x' in y (PEP 8)

    # site with no explicit start time gets one open interval
    site = brozzler.Site(None, {'seed': 'http://example.com/'})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None
    assert 'start_time' not in site  # legacy field must be gone

    # legacy 'start_time' is migrated into starts_and_stops[0]['start']
    site = brozzler.Site(None, {
        'seed': 'http://example.com/',
        'start_time': datetime.datetime(2017, 1, 1)})
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert site.starts_and_stops[0]['stop'] is None
    assert 'start_time' not in site

    # job with no legacy fields
    job = brozzler.Job(None, {'seeds': [{'url': 'https://example.com/'}]})
    assert job.starts_and_stops[0]['start']
    assert job.starts_and_stops[0]['stop'] is None
    assert 'started' not in job
    assert 'finished' not in job

    # legacy 'started'/'finished' are migrated
    job = brozzler.Job(None, {
        'seeds': [{'url': 'https://example.com/'}],
        'started': datetime.datetime(2017, 1, 1),
        'finished': datetime.datetime(2017, 1, 2)})
    assert job.starts_and_stops[0]['start'] == datetime.datetime(2017, 1, 1)
    assert job.starts_and_stops[0]['stop'] == datetime.datetime(2017, 1, 2)
    assert 'started' not in job
    assert 'finished' not in job
def test_seed_redirect():
    '''
    A seed redirect should widen the site's scope: a same-host scheme
    flip adds the other scheme at the seed path, while a cross-host
    redirect adds the full redirect target.
    '''
    # http -> https on the same host: both schemes accepted at the root
    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://foo.com/a/b/c')
    expected = {'accepts': [
        {'ssurt': 'com,foo,//http:/'},
        {'ssurt': 'com,foo,//https:/'}]}
    assert site.scope == expected

    # https -> http on the same host
    site = brozzler.Site(None, {'seed': 'https://foo.com/'})
    site.note_seed_redirect('http://foo.com/a/b/c')
    expected = {'accepts': [
        {'ssurt': 'com,foo,//https:/'},
        {'ssurt': 'com,foo,//http:/'}]}
    assert site.scope == expected

    # redirect to a different host: accept the exact redirect target
    site = brozzler.Site(None, {'seed': 'http://foo.com/'})
    site.note_seed_redirect('https://bar.com/a/b/c')
    expected = {'accepts': [
        {'ssurt': 'com,foo,//http:/'},
        {'ssurt': 'com,bar,//https:/a/b/c'}]}
    assert site.scope == expected
def test_hashtag_seed():
    '''
    A hashtag fragment on a seed url should be stripped from the seed
    page's url but remembered in the page's hashtags list, and should
    not affect the site's scope surt.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)

    # no hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/'})
    brozzler.new_site(frontier, site)

    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert not pages[0].hashtags

    # yes hash tag
    site = brozzler.Site(rr, {'seed': 'http://example.org/#hash'})
    brozzler.new_site(frontier, site)

    # same scope surt as without the fragment
    assert site.scope['surt'] == 'http://(org,example,)/'

    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    assert pages[0].url == 'http://example.org/'
    assert pages[0].hashtags == ['#hash',]
def test_robots(httpd):
    '''
    Basic test of robots.txt user-agent substring matching: an agent
    containing "badbot" is denied, any other agent is allowed.
    '''
    url = 'http://localhost:%s/' % httpd.server_port

    good = brozzler.Site(
            None, {'seed': url, 'user_agent': 'im/a/GoOdbot/yep'})
    bad = brozzler.Site(
            None, {'seed': url, 'user_agent': 'im/a bAdBOt/uh huh'})

    assert brozzler.is_permitted_by_robots(good, url)
    assert not brozzler.is_permitted_by_robots(bad, url)
def test_hashtags(httpd):
    '''
    End-to-end crawl of test site 7, checking that hashtag-only links
    are recorded on the existing page rather than creating new pages,
    and that the hashtag variants show up in warcprox's captures table.
    '''
    # tag all captures with a unique id so they can be found afterwards
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site7/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        'http://localhost:%s/site7/foo.html' % httpd.server_port]
    assert not pages[0].hashtags
    assert pages[1].url == 'http://localhost:%s/site7/foo.html' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    # hashtag links were folded into the one page
    assert sorted(pages[1].hashtags) == ['#boosh', '#ignored', '#whee',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    # ignore HEAD requests -- presumably preflight probes; TODO confirm
    captures_by_url = {
        c['url']: c for c in captures if c['http_method'] != 'HEAD'}
    assert seed_url in captures_by_url
    assert 'http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/whee.txt' % httpd.server_port in captures_by_url
    assert 'http://localhost:%s/site7/boosh.txt' % httpd.server_port in captures_by_url
    assert 'screenshot:%s' % seed_url in captures_by_url
    assert 'thumbnail:%s' % seed_url in captures_by_url
    assert 'screenshot:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
    assert 'thumbnail:http://localhost:%s/site7/foo.html' % httpd.server_port in captures_by_url
def test_page_videos(httpd):
    '''
    Brozzle a single page of test site 6 and check that both the
    youtube-dl-discovered mp4 and the browser-fetched webm are recorded
    in page.videos.
    '''
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
            None, {'url': 'http://localhost:%s/site6/' % httpd.server_port})
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % httpd.server_port,
    }
    assert page.videos[1] == {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % httpd.server_port,
    }
def test_seed_redirect(httpd):
    '''
    Crawl test site 5, whose seed url redirects, and check that the
    pages table and the site scope surt follow the redirect.
    '''
    # fix: the id previously said 'test_login-...' (copy-paste leftover);
    # it is only written to warcprox_meta, so correcting it is safe
    test_id = 'test_seed_redirect-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = 'http://localhost:%s/site5/redirect/' % httpd.server_port
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site5/redirect/' % httpd.server_port,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})
    # scope initially covers the pre-redirect seed path
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/redirect/' % httpd.server_port

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the pages table
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    pages.sort(key=lambda page: page.hops_from_seed)
    assert pages[0].hops_from_seed == 0
    assert pages[0].url == seed_url
    assert pages[0].redirect_url == 'http://localhost:%s/site5/destination/' % httpd.server_port
    assert pages[1].hops_from_seed == 1
    assert pages[1].url == 'http://localhost:%s/site5/destination/page2.html' % httpd.server_port

    # check that scope has been updated properly to the redirect target
    assert site.scope['surt'] == 'http://(localhost:%s,)/site5/destination/' % httpd.server_port
def new_job(frontier, job_conf):
    '''Returns new Job.

    Validates *job_conf*, persists the job, then creates and registers
    one site per seed.
    '''
    validate_conf(job_conf)

    job = Job(frontier.rr, {
            "conf": job_conf, "status": "ACTIVE",
            "started": doublethink.utcnow()})
    # honor optional overrides from the conf
    for key in ("id", "max_claimed_sites"):
        if key in job_conf:
            setattr(job, key, job_conf[key])
    job.save()

    def _site_for(seed_conf):
        # per-seed conf merged with job-level settings
        conf = merge(seed_conf, job_conf)
        conf.pop("seeds")
        conf["job_id"] = job.id
        conf["seed"] = conf.pop("url")
        return brozzler.Site(frontier.rr, conf)

    for site in [_site_for(seed_conf) for seed_conf in job_conf["seeds"]]:
        new_site(frontier, site)
    return job
def test_robots_http_statuses():
    '''
    is_permitted_by_robots() should treat every http status here --
    success, client error, and server error alike -- as "allow".
    '''
    for status in (
            200, 204, 400, 401, 402, 403, 404, 405,
            500, 501, 502, 503, 504, 505):
        class Handler(http.server.BaseHTTPRequestHandler):
            def do_GET(self):
                # hand-rolled response so we control the status line exactly
                response = (('HTTP/1.1 %s Meaningless message\r\n'
                             + 'Content-length: 0\r\n'
                             + '\r\n') % status).encode('utf-8')
                self.connection.sendall(response)
                # self.send_response(status)
                # self.end_headers()

        httpd = http.server.HTTPServer(('localhost', 0), Handler)
        httpd_thread = threading.Thread(
                name='httpd', target=httpd.serve_forever)
        httpd_thread.start()

        try:
            url = 'http://localhost:%s/' % httpd.server_port
            site = brozzler.Site(None, {'seed': url})
            assert brozzler.is_permitted_by_robots(site, url)
        finally:
            # tear down the throwaway server for this status code
            httpd.shutdown()
            httpd.server_close()
            httpd_thread.join()
def test_seed_page():
    '''
    frontier.seed_page() should return the page with hops_from_seed == 0,
    or None while no such page exists.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {'seed': 'http://example.com/a/'})
    site.save()

    # no pages at all yet
    assert frontier.seed_page(site.id) is None

    # a non-seed page alone doesn't count
    deep_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/b/',
        'hops_from_seed': 1})
    deep_page.save()
    assert frontier.seed_page(site.id) is None

    # once the hops==0 page exists, it is returned
    zero_hop_page = brozzler.Page(rr, {
        'site_id': site.id,
        'url': 'http://example.com/a/',
        'hops_from_seed': 0})
    zero_hop_page.save()
    assert frontier.seed_page(site.id) == zero_hop_page
def new_job(frontier, job_conf):
    '''
    Create and queue a new job from *job_conf*: builds one Site per
    seed, registers all the sites, then registers the job with the
    frontier.

    NOTE(review): unlike later versions of this function, this one does
    not return the job -- confirm callers don't need it before relying
    on that.
    '''
    job = Job(id=job_conf.get("id"), conf=job_conf, status="ACTIVE",
            started=rethinkstuff.utcnow())

    sites = []
    for seed_conf in job_conf["seeds"]:
        # combine seed-level and job-level settings; precedence is
        # whatever merge() implements -- see its docs
        merged_conf = merge(seed_conf, job_conf)
        # XXX check for unknown settings, invalid url, etc
        site = brozzler.Site(
                job_id=job.id,
                seed=merged_conf["url"],
                scope=merged_conf.get("scope"),
                time_limit=merged_conf.get("time_limit"),
                proxy=merged_conf.get("proxy"),
                ignore_robots=merged_conf.get("ignore_robots"),
                enable_warcprox_features=merged_conf.get(
                    "enable_warcprox_features"),
                warcprox_meta=merged_conf.get("warcprox_meta"),
                metadata=merged_conf.get("metadata"),
                remember_outlinks=merged_conf.get("remember_outlinks"))
        sites.append(site)

    # insert all the sites into database before the job
    for site in sites:
        new_site(frontier, site)
    frontier.new_job(job)
def new_job(frontier, job_conf):
    '''Returns new Job.

    Validates *job_conf*, saves the job, then builds all sites and
    their seed pages and bulk-inserts them in batches.
    '''
    validate_conf(job_conf)
    job = Job(frontier.rr, {
        "conf": job_conf, "status": "ACTIVE",
        "started": doublethink.utcnow()})
    if "id" in job_conf:
        job.id = job_conf["id"]
    if "max_claimed_sites" in job_conf:
        job.max_claimed_sites = job_conf["max_claimed_sites"]
    job.save()

    sites = []
    pages = []
    for seed_conf in job_conf["seeds"]:
        merged_conf = merge(seed_conf, job_conf)
        merged_conf.pop("seeds")
        merged_conf["job_id"] = job.id
        merged_conf["seed"] = merged_conf.pop("url")
        site = brozzler.Site(frontier.rr, merged_conf)
        # assign the id up front so the seed page can reference it
        # before the site row is inserted
        site.id = str(uuid.uuid4())
        sites.append(site)
        pages.append(new_seed_page(frontier, site))

    # insert in batches to avoid this error
    # rethinkdb.errors.ReqlDriverError: Query size (167883036) greater than maximum (134217727) in:
    # (fix: dropped the unused `result` assignments)
    for batch in (pages[i:i+500] for i in range(0, len(pages), 500)):
        logging.info('inserting batch of %s pages', len(batch))
        frontier.rr.table('pages').insert(batch).run()
    for batch in (sites[i:i+100] for i in range(0, len(sites), 100)):
        logging.info('inserting batch of %s sites', len(batch))
        frontier.rr.table('sites').insert(batch).run()
    logging.info('job %s fully started', job.id)

    return job
def test_robots_socket_timeout():
    '''
    A hanging robots.txt server should not hang the crawl: after the
    (shortened) session timeout, is_permitted_by_robots() falls back to
    "allow".
    '''
    stop_hanging = threading.Event()

    class Handler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # hold the request open until the test finishes (or 60s)
            stop_hanging.wait(60)
            self.connection.sendall(
                    b'HTTP/1.1 200 OK\r\nContent-length: 0\r\n\r\n')

    # remember the session timeout so it can be restored afterwards
    orig_timeout = brozzler.robots._SessionRaiseOn420.timeout

    httpd = http.server.HTTPServer(('localhost', 0), Handler)
    httpd_thread = threading.Thread(name='httpd', target=httpd.serve_forever)
    httpd_thread.start()

    try:
        url = 'http://localhost:%s/' % httpd.server_port
        site = brozzler.Site(None, {'seed': url})
        # shorten the timeout so the test doesn't take a minute
        brozzler.robots._SessionRaiseOn420.timeout = 2
        assert brozzler.is_permitted_by_robots(site, url)
    finally:
        brozzler.robots._SessionRaiseOn420.timeout = orig_timeout
        # unblock the handler so the server thread can exit
        stop_hanging.set()
        httpd.shutdown()
        httpd.server_close()
        httpd_thread.join()
def test_obey_robots(httpd):
    '''
    Crawl test site 1 with a user agent its robots.txt blocks, and
    check that only the seed page is recorded (marked blocked), that
    only robots.txt itself was captured, and that the capture replays
    via pywb.
    '''
    test_id = 'test_obey_robots-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': make_url(httpd, '/site1/'),
        'user_agent': 'im a badbot',   # robots.txt blocks badbot
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})

    # so we can examine rethinkdb before it does anything
    try:
        stop_service('brozzler-worker')

        assert site.id is None
        frontier = brozzler.RethinkDbFrontier(rr)
        brozzler.new_site(frontier, site)
        assert site.id is not None
        site_pages = list(frontier.site_pages(site.id))
        assert len(site_pages) == 1
        assert site_pages[0].url == site.seed
        assert site_pages[0].needs_robots_check
    finally:
        start_service('brozzler-worker')

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that only the one page is in rethinkdb
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    assert page.url == make_url(httpd, '/site1/')
    assert page.blocked_by_robots

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    robots_url = make_url(httpd, '/robots.txt')
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    assert len(captures) == 1
    assert captures[0]['url'] == robots_url

    # check pywb
    t14 = captures[0]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, robots_url)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'robots.txt'), 'rb').read()
    assert requests.get(
            wb_url, allow_redirects=False).content == expected_payload
def claim_sites(self, n=1):
    '''
    Atomically claim up to *n* claimable ACTIVE sites and return them
    as a list of brozzler.Site.

    A site is claimable when it is unclaimed or its claim is stale
    (last claimed over an hour ago), and its job (if any) is within
    max_claimed_sites. Raises brozzler.NothingToClaim when no site
    qualifies.
    '''
    # The ReQL below, logically:
    #  - scan ACTIVE sites via the sites_last_disclaimed index, ordered
    #    so unclaimed / least-recently-disclaimed sites come first
    #  - fold over them keeping a per-job count of sites seen, emitting
    #    a site's id when it is claimable and within its job's cap
    #  - take up to n ids and atomically mark those rows claimed
    result = (
        self.rr.table('sites').get_all(
            r.args(
                r.db(self.rr.dbname).table(
                    'sites', read_mode='majority').between(
                        ['ACTIVE', r.minval], ['ACTIVE', r.maxval],
                        index='sites_last_disclaimed').order_by(
                            r.desc('claimed'), 'last_disclaimed').
                fold(
                    {},  # accumulator: {job_id (string): count}
                    lambda acc, site: acc.merge(
                        r.branch(
                            site.has_fields('job_id'),
                            r.object(
                                site['job_id'].coerce_to('string'),
                                acc[site['job_id'].coerce_to('string')].
                                default(0).add(1)), {})),
                    # emit the site id only if claimable and under cap
                    emit=lambda acc, site, new_acc: r.branch(
                        r.and_(
                            r.or_(
                                site['claimed'].not_(),
                                site['last_claimed'].lt(
                                    r.now().sub(60 * 60))),
                            r.or_(
                                site.has_fields(
                                    'max_claimed_sites').not_(),
                                new_acc[site['job_id'].coerce_to(
                                    'string')].le(
                                        site['max_claimed_sites']))),
                        [site['id']], [])).limit(n))).
        update(
            # try to avoid a race condition resulting in multiple
            # brozzler-workers claiming the same site
            # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
            r.branch(
                r.or_(
                    r.row['claimed'].not_(),
                    r.row['last_claimed'].lt(r.now().sub(60 * 60))),
                {'claimed': True, 'last_claimed': r.now()}, {}),
            return_changes=True)).run()
    # anywhere from 0 to n rows may have been updated
    self._vet_result(
            result, replaced=list(range(n + 1)),
            unchanged=list(range(n + 1)))
    sites = []
    for i in range(result["replaced"]):
        if result["changes"][i]["old_val"]["claimed"]:
            self.logger.warn(
                    "re-claimed site that was still marked 'claimed' "
                    "because it was last claimed a long time ago "
                    "at %s, and presumably some error stopped it from "
                    "being disclaimed",
                    result["changes"][i]["old_val"]["last_claimed"])
        site = brozzler.Site(self.rr, result["changes"][i]["new_val"])
        sites.append(site)
    if sites:
        return sites
    else:
        raise brozzler.NothingToClaim
def site(self, id):
    '''
    Look up and return the site with the given *id*, or None when *id*
    is None or no matching row exists.
    '''
    if id is None:
        return None
    row = self.r.table("sites").get(id).run()
    return brozzler.Site(**row) if row else None
def test_login(httpd):
    '''
    Crawl test site 2 with login credentials and check, via warcprox's
    captures table, that brozzler's login heuristic submitted the right
    form and that the rest of the crawl proceeded normally.
    '''
    test_id = 'test_login-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    site = brozzler.Site(rr, {
        'seed': 'http://localhost:%s/site2/' % httpd.server_port,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}},
        'username': '******', 'password': '******'})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # take a look at the captures table
    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # NOTE(review): robots_url is assigned but never used below
    robots_url = 'http://localhost:%s/robots.txt' % httpd.server_port
    captures = list(rr.table('captures').filter(
        {'test_id': test_id}).order_by('timestamp').run())
    # "METHOD url" strings make the membership asserts below simple
    meth_url = ['%s %s' % (c['http_method'], c['url']) for c in captures]

    # there are several forms in in htdocs/site2/login.html but only one
    # that brozzler's heuristic should match and try to submit, and it has
    # action='00', so we can check for that here
    assert ('POST http://localhost:%s/site2/00' % httpd.server_port) in meth_url

    # sanity check the rest of the crawl
    assert ('GET http://localhost:%s/robots.txt' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/' % httpd.server_port) in meth_url
    assert ('GET http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD screenshot:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
    assert ('WARCPROX_WRITE_RECORD thumbnail:http://localhost:%s/site2/login.html' % httpd.server_port) in meth_url
def test_scoping():
    '''
    Exercise url scope rules: regex/surt/substring accept rules, the
    parent_url_regex conditional accept, and a domain+regex block rule.
    '''
    # fix: use yaml.safe_load -- yaml.load without an explicit Loader is
    # deprecated and unsafe on untrusted input
    test_scope = yaml.safe_load('''
max_hops: 100
accepts:
- url_match: REGEX_MATCH
  value: ^.*/audio_file/.*\.mp3$
- url_match: SURT_MATCH
  value: http://(com,vimeocdn,
- url_match: STRING_MATCH
  value: ec-media.soundcloud.com
- regex: ^https?://twitter\.com.*$
- substring: facebook.com
- regex: ^https?://(www.)?youtube.com/watch?.*$
  parent_url_regex: ^https?://(www.)?youtube.com/user/.*$
blocks:
- domain: twitter.com
  url_match: REGEX_MATCH
  value: ^.*lang=(?!en).*$
''')

    site = brozzler.Site(None, {
        'id': 1,
        'seed': 'http://example.com/foo/bar?baz=quux#monkey',
        'scope': test_scope})
    page = brozzler.Page(None, {
        'url': 'http://example.com/foo/bar?baz=quux#monkey',
        'site_id': site.id})

    # the seed itself is in scope, but not siblings under the same path
    assert site.is_in_scope('http://example.com/foo/bar', page)
    assert not site.is_in_scope('http://example.com/foo/baz', page)

    # regex accept requires /audio_file/ in the path
    assert not site.is_in_scope('http://foo.com/some.mp3', page)
    assert site.is_in_scope('http://foo.com/blah/audio_file/some.mp3', page)

    # surt accept matches http only
    assert site.is_in_scope('http://a.b.vimeocdn.com/blahblah', page)
    assert not site.is_in_scope('https://a.b.vimeocdn.com/blahblah', page)

    # block rule: twitter.com urls with lang= other than en are blocked
    assert site.is_in_scope('https://twitter.com/twit', page)
    assert site.is_in_scope('https://twitter.com/twit?lang=en', page)
    assert not site.is_in_scope('https://twitter.com/twit?lang=es', page)

    # substring accept
    assert site.is_in_scope('https://www.facebook.com/whatevz', page)

    # parent_url_regex: watch pages are in scope only when linked from a
    # /user/ page
    assert not site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', page)
    yt_user_page = brozzler.Page(None, {
        'url': 'https://www.youtube.com/user/SonoraSantaneraVEVO',
        'site_id': site.id, 'hops_from_seed': 10})
    assert site.is_in_scope(
            'https://www.youtube.com/watch?v=dUIn5OAPS5s', yt_user_page)
def brozzler_new_site(argv=None):
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb,
    which brozzler-workers will look at and start crawling.
    '''
    argv = argv or sys.argv
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('seed', metavar='SEED', help='seed url')
    add_rethinkdb_options(arg_parser)
    arg_parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    arg_parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    arg_parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta',
            help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    arg_parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    arg_parser.add_argument(
            '--username', dest='username', default=None,
            help='use this username to try to log in if a login form is found')
    arg_parser.add_argument(
            '--password', dest='password', default=None,
            help='use this password to try to log in if a login form is found')
    add_common_options(arg_parser, argv)

    args = arg_parser.parse_args(args=argv[1:])
    configure_logging(args)

    rr = rethinker(args)
    # json-typed options are parsed here; invalid json will raise from
    # json.loads before anything is written to the database
    site = brozzler.Site(rr, {
        'seed': args.seed,
        'time_limit': int(args.time_limit) if args.time_limit else None,
        'ignore_robots': args.ignore_robots,
        'warcprox_meta': json.loads(
            args.warcprox_meta) if args.warcprox_meta else None,
        'behavior_parameters': json.loads(
            args.behavior_parameters) if args.behavior_parameters else None,
        'username': args.username,
        'password': args.password})

    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page.
    Opens url in a browser, running some javascript behaviors, and
    prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    # NOTE(review): 'suggest_default_chome_exe' ("chome") looks like a
    # typo but presumably matches the helper's actual name elsewhere in
    # this codebase -- confirm before renaming
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # throwaway site/page objects, not persisted anywhere
    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # build a filesystem-safe filename from the url plus a timestamp
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        # logging.info('len(screenshot_png)=%s', len(screenshot_png))
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        # always tear the browser down, even on failure
        browser.stop()
def test_time_limit():
    '''
    Exercise starts_and_stops bookkeeping across finish/resume, and
    frontier.enforce_time_limit() with no limit set, a limit not yet
    reached, and a limit that has been exceeded.
    '''
    # XXX test not thoroughly adapted to change in time accounting, since
    # starts_and_stops is no longer used to enforce time limits

    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/', 'time_limit': 99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    # finishing closes the open interval
    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    # resuming opens a second interval
    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # no time limit set
    frontier.enforce_time_limit(site)

    site.time_limit = 10
    site.claimed = True
    site.save()

    # time limit not reached yet
    frontier.enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # now exceed the limit; enforce_time_limit should raise
    site.time_limit = 0.1
    time.sleep(0.1)
    with pytest.raises(brozzler.ReachedTimeLimit):
        frontier.enforce_time_limit(site)
def test_redirect_hashtags(httpd):
    '''
    Crawl test site 9, where a page redirects, and check that hashtag
    variants of the redirecting page are tracked and refetched.
    '''
    # NOTE(review): test_id says 'test_hashtags-...' -- looks like a
    # copy-paste from test_hashtags; the timestamp keeps it unique but
    # consider renaming
    test_id = 'test_hashtags-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    seed_url = make_url(httpd, '/site9/')
    site = brozzler.Site(rr, {
        'seed': seed_url,
        'warcprox_meta': {
            'captures-table-extra-fields': {'test_id': test_id}}})
    frontier = brozzler.RethinkDbFrontier(rr)
    brozzler.new_site(frontier, site)
    assert site.id

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check that we got the pages we expected
    pages = sorted(list(frontier.site_pages(site.id)), key=lambda p: p.url)
    assert len(pages) == 2
    assert pages[0].url == seed_url
    assert pages[0].hops_from_seed == 0
    assert pages[0].brozzle_count == 1
    assert pages[0].outlinks['accepted'] == [
        make_url(httpd, '/site9/redirect.html')]
    assert not pages[0].hashtags
    assert pages[1].url == make_url(httpd, '/site9/redirect.html')
    assert pages[1].hops_from_seed == 1
    assert pages[1].brozzle_count == 1
    assert sorted(pages[1].hashtags) == ['#hash1', '#hash2',]

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = rr.table('captures').filter({'test_id': test_id}).run()
    redirect_captures = [
        c for c in captures
        if c['url'] == make_url(httpd, '/site9/redirect.html')
        and c['http_method'] == 'GET']
    assert len(redirect_captures) == 2  # youtube-dl + browser, no hashtags
def test_time_limit():
    '''
    Check starts_and_stops bookkeeping and that _enforce_time_limit()
    transitions a site to FINISHED_TIME_LIMIT once its time limit has
    elapsed.
    '''
    # vagrant brozzler-worker isn't configured to look at the "ignoreme" db
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': 'http://example.com/', 'time_limit': 99999})
    brozzler.new_site(frontier, site)

    site.refresh()  # get it back from the db
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop'] is None

    # finishing closes the open interval
    frontier.finished(site, 'FINISHED')
    assert site.status == 'FINISHED'
    assert len(site.starts_and_stops) == 1
    assert site.starts_and_stops[0]['start']
    assert site.starts_and_stops[0]['stop']
    assert site.starts_and_stops[0]['stop'] > site.starts_and_stops[0]['start']

    # resuming opens a second interval
    frontier.resume_site(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    # time limit not reached yet
    frontier._enforce_time_limit(site)
    assert site.status == 'ACTIVE'
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop'] is None

    site.time_limit = 0.1
    site.claimed = True
    site.save()
    time.sleep(0.1)

    frontier._enforce_time_limit(site)
    assert site.status == 'FINISHED_TIME_LIMIT'
    assert not site.claimed
    assert len(site.starts_and_stops) == 2
    assert site.starts_and_stops[1]['start']
    assert site.starts_and_stops[1]['stop']
    # fix: compare the final interval's stop with its own start (it was
    # mistakenly compared with starts_and_stops[0]['start'], which is
    # trivially earlier and so weakened the assertion)
    assert site.starts_and_stops[1]['stop'] > site.starts_and_stops[1]['start']
def test_extract_outlinks(httpd):
    '''
    Brozzle one page of test site 8 and verify the exact set of
    extracted outlinks, including offsite and fragment variants.
    '''
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(
            None, {'url': 'http://localhost:%s/site8/' % httpd.server_port})
    expected = {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % httpd.server_port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % httpd.server_port,
    }
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        assert worker.brozzle_page(browser, site, page) == expected
def test_ydl_stitching(httpd):
    '''
    Crawl test site 10 and check that youtube-dl's stitched-together
    video is recorded both in page.videos and in the captures table.
    '''
    test_id = 'test_ydl_stitching-%s' % datetime.datetime.utcnow().isoformat()
    rr = doublethink.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(rr)
    site = brozzler.Site(rr, {
        'seed': make_url(httpd, '/site10/'),
        'warcprox_meta': {
            'warc-prefix': 'test_ydl_stitching',
            'captures-table-extra-fields': {'test_id': test_id}}})
    brozzler.new_site(frontier, site)

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site.refresh()
    assert site.status == 'FINISHED'

    # check page.videos
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 1
    page = pages[0]
    # videos may be recorded after the page finishes; wait for them
    while time.time() - start < 600 and not page.videos:
        time.sleep(0.5)
        page.refresh()
    assert len(page.videos) == 6
    stitched_url = 'youtube-dl:00001:%s' % make_url(httpd, '/site10/')
    assert {
        'blame': 'youtube-dl',
        'content-length': 267900,
        'content-type': 'video/mp4',
        'response_code': 204,
        'url': stitched_url,
    } in page.videos

    time.sleep(2)   # in case warcprox hasn't finished processing urls
    # take a look at the captures table
    captures = list(rr.table('captures').filter({'test_id': test_id}).run())
    # fix: renamed the ambiguous single-letter variable `l` (flake8 E741)
    stitched_captures = [c for c in captures if c['url'] == stitched_url]
    assert len(stitched_captures) == 1
    capture = stitched_captures[0]
    assert capture['filename'].startswith('test_ydl_stitching')
    assert capture['content_type'] == 'video/mp4'
    assert capture['http_method'] == 'WARCPROX_WRITE_RECORD'
def test_brozzle_site(httpd):
    '''
    End-to-end crawl of the top-level test site through warcprox:
    checks the pages table, the captures table, and pywb replay.
    '''
    test_id = 'test_brozzle_site-%s' % datetime.datetime.utcnow().isoformat()
    site = brozzler.Site(
            seed='http://localhost:%s/' % httpd.server_port,
            proxy='localhost:8000', enable_warcprox_features=True,
            warcprox_meta={'captures-table-extra-fields':{'test_id':test_id}})

    # the two pages we expect to be crawled
    page1 = 'http://localhost:%s/' % httpd.server_port
    page2 = 'http://localhost:%s/file1.txt' % httpd.server_port

    assert site.id is None
    r = rethinkstuff.Rethinker('localhost', db='brozzler')
    frontier = brozzler.RethinkDbFrontier(r)
    brozzler.new_site(frontier, site)
    assert site.id is not None
    assert len(list(frontier.site_pages(site.id))) == 1

    # the site should be brozzled fairly quickly
    start = time.time()
    while site.status != 'FINISHED' and time.time() - start < 300:
        time.sleep(0.5)
        site = frontier.site(site.id)
    assert site.status == 'FINISHED'

    # check that we got the two pages we expected
    pages = list(frontier.site_pages(site.id))
    assert len(pages) == 2
    assert {page.url for page in pages} == {
        'http://localhost:%s/' % httpd.server_port,
        'http://localhost:%s/file1.txt' % httpd.server_port}

    # take a look at the captures table
    captures = r.table('captures').filter({'test_id':test_id}).run()
    # ignore HEAD requests -- presumably preflight probes; TODO confirm
    captures_by_url = {
        c['url']:c for c in captures if c['http_method'] != 'HEAD'}
    assert page1 in captures_by_url
    assert '%srobots.txt' % page1 in captures_by_url
    assert page2 in captures_by_url
    assert 'screenshot:%s' % page1 in captures_by_url
    assert 'thumbnail:%s' % page1 in captures_by_url
    # no screenshots of plaintext

    # check pywb
    t14 = captures_by_url[page2]['timestamp'].strftime('%Y%m%d%H%M%S')
    wb_url = 'http://localhost:8880/brozzler/%s/%s' % (t14, page2)
    expected_payload = open(os.path.join(
        os.path.dirname(__file__), 'htdocs', 'file1.txt'), 'rb').read()
    assert requests.get(wb_url).content == expected_payload
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy
    is down.

    This test needs to cover every possible fetch through the proxy
    other than fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening the port (nobody listens on on port 4 :))
    - port bound but not accepting connections
    '''
    # fix: close the bound socket when done (it was previously leaked);
    # socket objects are context managers
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in (
                '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
            worker = brozzler.BrozzlerWorker(
                    frontier=None, proxy=not_listening_proxy)
            site = brozzler.Site(None, {
                'id': str(uuid.uuid4()), 'seed': 'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            # robots.txt fetch
            with pytest.raises(brozzler.ProxyError):
                brozzler.is_permitted_by_robots(
                        site, 'http://example.com/',
                        proxy=not_listening_proxy)

            # youtube-dl fetch
            with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
                ydl = worker._youtube_dl(tempdir, site)
                with pytest.raises(brozzler.ProxyError):
                    worker._try_youtube_dl(ydl, site, page)

            # raw fetch
            with pytest.raises(brozzler.ProxyError):
                worker._fetch_url(site, page)

            # WARCPROX_WRITE_RECORD
            with pytest.raises(brozzler.ProxyError):
                worker._warcprox_write_record(
                        warcprox_address=not_listening_proxy,
                        url='test://proxy_down/warcprox_write_record',
                        warc_type='metadata', content_type='text/plain',
                        payload=b'''payload doesn't matter here''')
def test_robots_empty_response():
    '''
    A server that hangs up without sending any response bytes for
    /robots.txt should be treated as permitting the fetch.
    '''
    class AbruptCloseHandler(http.server.BaseHTTPRequestHandler):
        def do_GET(self):
            # tear down the connection immediately, sending nothing
            self.connection.shutdown(socket.SHUT_RDWR)
            self.connection.close()

    server = http.server.HTTPServer(('localhost', 0), AbruptCloseHandler)
    server_thread = threading.Thread(
            name='httpd', target=server.serve_forever)
    server_thread.start()

    try:
        seed_url = 'http://localhost:%s/' % server.server_port
        site = brozzler.Site(None, {'seed': seed_url})
        assert brozzler.is_permitted_by_robots(site, seed_url)
    finally:
        # always tear the server down so the thread doesn't linger
        server.shutdown()
        server.server_close()
        server_thread.join()
def claim_site(self, worker_id):
    '''
    Atomically claim an ACTIVE site for this worker to brozzle.

    Picks the least-recently-disclaimed ACTIVE site that is either
    unclaimed or whose claim is stale (last claimed more than 2 hours
    ago), and marks it claimed by `worker_id`.

    Args:
        worker_id: identifier recorded as `last_claimed_by` on the site

    Returns:
        brozzler.Site: the newly claimed site

    Raises:
        brozzler.NothingToClaim: if no eligible site exists
    '''
    # XXX keep track of aggregate priority and prioritize sites accordingly?
    while True:
        result = (
            self.r.table("sites", read_mode="majority")
            .between(
                ["ACTIVE",rethinkdb.minval],
                ["ACTIVE",rethinkdb.maxval],
                index="sites_last_disclaimed")
            .order_by(index="sites_last_disclaimed")
            .filter(
                (rethinkdb.row["claimed"] != True) |
                (rethinkdb.row["last_claimed"] < rethinkdb.now() - 2*60*60))
            .limit(1)
            .update(
                # try to avoid a race condition resulting in multiple
                # brozzler-workers claiming the same site
                # see https://github.com/rethinkdb/rethinkdb/issues/3235#issuecomment-60283038
                rethinkdb.branch(
                    (rethinkdb.row["claimed"] != True) |
                    (rethinkdb.row["last_claimed"] < rethinkdb.now() - 2*60*60), {
                        "claimed": True,
                        "last_claimed_by": worker_id,
                        "last_claimed": rethinkstuff.utcnow()
                    }, {}),
                return_changes=True)).run()
        self._vet_result(result, replaced=[0,1], unchanged=[0,1])
        if result["replaced"] == 1:
            if result["changes"][0]["old_val"]["claimed"]:
                # Logger.warn is a deprecated alias; use warning()
                self.logger.warning(
                        "re-claimed site that was still marked 'claimed' "
                        "because it was last claimed a long time ago "
                        "at %s, and presumably some error stopped it from "
                        "being disclaimed",
                        result["changes"][0]["old_val"]["last_claimed"])
            site = brozzler.Site(**result["changes"][0]["new_val"])
        else:
            raise brozzler.NothingToClaim
        # XXX This is the only place we enforce time limit for now. Worker
        # loop should probably check time limit. Maybe frontier needs a
        # housekeeping thread to ensure that time limits get enforced in a
        # timely fashion.
        # If the time limit kicks in, loop around and claim another site.
        if not self._enforce_time_limit(site):
            return site
def brozzler_new_site():
    '''
    Command line utility entry point for queuing a new brozzler site.
    Takes a seed url and creates a site and page object in rethinkdb, which
    brozzler-workers will look at and start crawling.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzler-new-site - register site to brozzle',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('seed', metavar='SEED', help='seed url')
    _add_rethinkdb_options(parser)
    _add_proxy_options(parser)
    parser.add_argument(
            '--time-limit', dest='time_limit', default=None,
            help='time limit in seconds for this site')
    parser.add_argument(
            '--ignore-robots', dest='ignore_robots', action='store_true',
            help='ignore robots.txt for this site')
    parser.add_argument(
            '--warcprox-meta', dest='warcprox_meta', help=(
                'Warcprox-Meta http request header to send with each request; '
                'must be a json blob, ignored unless warcprox features are '
                'enabled'))
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    # optional args arrive as strings (or None); normalize before building
    # the site
    time_limit = int(args.time_limit) if args.time_limit else None
    warcprox_meta = (
            json.loads(args.warcprox_meta) if args.warcprox_meta else None)
    site = brozzler.Site(
            seed=args.seed, proxy=args.proxy, time_limit=time_limit,
            ignore_robots=args.ignore_robots,
            enable_warcprox_features=args.enable_warcprox_features,
            warcprox_meta=warcprox_meta)

    rethinker = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(","), args.rethinkdb_db)
    frontier = brozzler.RethinkDbFrontier(rethinker)
    brozzler.new_site(frontier, site)