def test_page_videos(httpd):
    # test depends on behavior of youtube-dl and chromium, could fail and need
    # to be adjusted on youtube-dl or chromium updates
    port = httpd.server_port
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': 'http://localhost:%s/site6/' % port})
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        worker.brozzle_page(browser, site, page)
    assert page.videos
    assert len(page.videos) == 2
    # first video is captured by youtube-dl, second by the browser itself
    expected = [{
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % port,
    }, {
        'blame': 'browser',
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % port,
    }]
    assert page.videos[0] == expected[0]
    assert page.videos[1] == expected[1]
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    arg_parser.add_argument('url', metavar='URL', help='page url')
    arg_parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            # fixed typo'd helper name (was suggest_default_chome_exe); the
            # helper is spelled suggest_default_chrome_exe elsewhere
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    arg_parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    arg_parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(arg_parser)
    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # build a filesystem-safe filename from the url plus a timestamp
        OK_CHARS = (string.ascii_letters + string.digits)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                ''.join(ch if ch in OK_CHARS else '_' for ch in args.url),
                datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        # start() moved inside try so a failed startup still runs stop() and
        # doesn't leak a chrome process
        browser.start(proxy=site.proxy)
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
def test_limit_failures():
    # a page that has never been attempted
    page = mock.Mock()
    page.failed_attempts = None
    page.brozzle_count = 0

    site = mock.Mock()
    site.status = 'ACTIVE'
    site.active_brozzling_time = 0
    site.starts_and_stops = [{'start': datetime.datetime.utcnow()}]

    # fake rethinkdb connection whose queries all return []
    query = mock.Mock(run=mock.Mock(return_value=[]))
    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=query)
    rr.table_list = mock.Mock(return_value=query)
    limited = mock.Mock(limit=mock.Mock(return_value=query))
    betweened = mock.Mock(between=mock.Mock(return_value=limited))
    rr.table = mock.Mock(return_value=betweened)
    assert rr.table().between().limit().run() == []

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()
    worker = brozzler.BrozzlerWorker(frontier)
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # first two failures just bump failed_attempts and leave the site active
    for expected_failures in (1, 2):
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == expected_failures
        assert page.brozzle_count == 0
        assert site.status == 'ACTIVE'

    # third failure gives up on the page and finishes the site
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'
def test_extract_outlinks(httpd):
    # brozzle a page with known links and check the outlink set
    base = 'http://localhost:%s/site8/' % httpd.server_port
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': base})
    with brozzler.Browser(
            chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
        outlinks = worker.brozzle_page(browser, site, page)
    expected = {
        'http://example.com/offsite',
        base + 'baz/zuh',
        base + 'fdjisapofdjisap#1',
        base + 'fdjisapofdjisap#2',
    }
    assert outlinks == expected
def test_proxy_down():
    '''
    Verifies that every kind of proxied fetch raises `brozzler.ProxyError`
    when the proxy is down. This covers every possible fetch through the
    proxy other than fetches from the browser. For those, see
    test_brozzling.py.

    Two flavors of connection failure are exercised:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    bound_but_deaf = '127.0.0.1:%s' % sock.getsockname()[1]
    for dead_proxy in ('127.0.0.1:4', bound_but_deaf):
        worker = brozzler.BrozzlerWorker(frontier=None, proxy=dead_proxy)
        site = brozzler.Site(None, {
            'id': str(uuid.uuid4()),
            'seed': 'http://example.com/'
        })
        page = brozzler.Page(None, {'url': 'http://example.com/'})

        # robots.txt fetch
        with pytest.raises(brozzler.ProxyError):
            brozzler.is_permitted_by_robots(
                    site, 'http://example.com/', proxy=dead_proxy)

        # youtube-dl fetch
        with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
            ydl = worker._youtube_dl(tempdir, site)
            with pytest.raises(brozzler.ProxyError):
                worker._try_youtube_dl(ydl, site, page)

        # raw fetch
        with pytest.raises(brozzler.ProxyError):
            worker._fetch_url(site, page)

        # WARCPROX_WRITE_RECORD
        with pytest.raises(brozzler.ProxyError):
            worker._warcprox_write_record(
                    warcprox_address=dead_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata',
                    content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Two flavors of connection failure are exercised:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    sock = socket.socket()
    sock.bind(('127.0.0.1', 0))
    bound_but_deaf = '127.0.0.1:%s' % sock.getsockname()[1]
    for dead_proxy in ('127.0.0.1:4', bound_but_deaf):
        site = brozzler.Site(None, {'seed': 'http://example.com/'})
        page = brozzler.Page(None, {'url': 'http://example.com/'})
        worker = brozzler.BrozzlerWorker(frontier=None, proxy=dead_proxy)
        with brozzler.Browser(
                chrome_exe=brozzler.suggest_default_chrome_exe()) as browser:
            with pytest.raises(brozzler.ProxyError):
                worker.brozzle_page(browser, site, page)
def test_choose_warcprox():
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    # wait for tables/indexes to be ready, to avoid this kind of error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    # then start from a clean slate
    for table in ('sites', 'services'):
        rr.table(table).wait().run()
        rr.table(table).index_wait().run()
        rr.table(table).delete().run()

    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    # no warcprox instances registered yet
    assert worker._choose_warcprox() is None

    def register_warcprox(host, port, load):
        # register a live warcprox service instance
        rr.table('services').insert({
            'role': 'warcprox',
            'first_heartbeat': doublethink.utcnow(),
            'last_heartbeat': doublethink.utcnow(),
            'host': host, 'port': port, 'load': load, 'ttl': 60}).run()

    def add_active_site(proxy):
        # add an active site assigned to the given warcprox proxy
        rr.table('sites').insert({
            'proxy': proxy, 'status': 'ACTIVE',
            'last_disclaimed': doublethink.utcnow()}).run()

    register_warcprox('host1', 8000, 0)
    register_warcprox('host2', 8000, 0)
    register_warcprox('host2', 8001, 0)
    register_warcprox('host3', 8000, 0)
    register_warcprox('host4', 8000, 1)

    add_active_site('host1:8000')
    add_active_site('host1:8000')
    add_active_site('host2:8000')
    add_active_site('host2:8001')

    # host3 has load 0 and no sites assigned
    instance = worker._choose_warcprox()
    assert instance['host'] == 'host3'
    assert instance['port'] == 8000

    add_active_site('host3:8000')
    # now host4 is the instance with no sites assigned
    instance = worker._choose_warcprox()
    assert instance['host'] == 'host4'
    assert instance['port'] == 8000

    # clean up
    rr.table('sites').delete().run()
    rr.table('services').delete().run()
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page. Opens url in
    a browser, running some javascript behaviors, and prints outlinks.
    '''
    argv = argv or sys.argv
    parser = argparse.ArgumentParser(
            prog=os.path.basename(argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chrome_exe(),
            help='executable to use to invoke chrome')
    parser.add_argument(
            '--behavior-parameters', dest='behavior_parameters',
            default=None, help=(
                'json blob of parameters to populate the javascript behavior '
                'template, e.g. {"parameter_username":"******",'
                '"parameter_password":"******"}'))
    parser.add_argument(
            '--username', dest='username', default=None,
            help='use this username to try to log in if a login form is found')
    parser.add_argument(
            '--password', dest='password', default=None,
            help='use this password to try to log in if a login form is found')
    parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    # hidden knobs, mainly for testing
    for skippable in ('extract-outlinks', 'visit-hashtags', 'youtube-dl'):
        parser.add_argument(
                '--skip-' + skippable,
                dest='skip_' + skippable.replace('-', '_'),
                action='store_true', help=argparse.SUPPRESS)
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)

    behavior_parameters = (
            json.loads(args.behavior_parameters)
            if args.behavior_parameters else {})
    site = brozzler.Site(None, {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password})
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
            frontier=None, proxy=args.proxy,
            skip_extract_outlinks=args.skip_extract_outlinks,
            skip_visit_hashtags=args.skip_visit_hashtags,
            skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        # build a filesystem-safe filename from the url plus a timestamp
        ok_chars = string.ascii_letters + string.digits
        safe_url = ''.join(
                ch if ch in ok_chars else '_' for ch in args.url)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                safe_url, datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot,
                enable_youtube_dl=not args.skip_youtube_dl)
        logging.info('outlinks: \n\t%s', '\n\t'.join(sorted(outlinks)))
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()