def test_page_videos(httpd):
    """Brozzle the site6 page and check the videos recorded on the page.

    This test depends on the behavior of youtube-dl and chromium, so it could
    fail and need to be adjusted when either of them is updated.
    """
    port = httpd.server_port
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': 'http://localhost:%s/site6/' % port})

    expected_mp4 = {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % port,
    }
    # An alternative expectation for the webm entry was left disabled in the
    # original test: 'response_code': 206 together with
    # 'content-range': 'bytes 0-229454/229455'.
    expected_webm = {
        'blame': 'browser',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % port,
    }

    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)

    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == expected_mp4
    assert page.videos[1] == expected_webm
def test_420(httpd):
    """Browsing the test server's /420 url raises `brozzler.ReachedLimit`.

    The raised exception's `warcprox_meta` must equal `WARCPROX_META_420`.
    """
    chrome_exe = brozzler.suggest_default_chrome_exe()
    limit_url = 'http://localhost:%s/420' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as excinfo:
            browser.browse_page(limit_url)
        # inspect the captured exception only after the raises block exits
        raised = excinfo.value
        assert raised.warcprox_meta == WARCPROX_META_420
def test_try_login(httpd):
    """Check which urls are fetched with and without login credentials.

    Every response url reported through the `on_response` callback is
    recorded and compared against the expected request sequence.
    """
    response_urls = []

    def on_response(msg):
        # collect the url of each response reported while browsing
        response_urls.append(msg['params']['response']['url'])

    port = httpd.server_port
    chrome_exe = brozzler.suggest_default_chrome_exe()
    form_url = 'http://localhost:%s/site11/form1.html' % port
    favicon_url = 'http://localhost:%s/favicon.ico' % port
    login_url = 'http://localhost:%s/login-action' % port
    form_without_login_url = (
        'http://localhost:%s/site11/form-no-login.html' % port)
    username = '******'
    password = '******'

    # When username and password are defined and initial page has login form,
    # detect login form, submit login, and then return to the initial page.
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_url, username=username, password=password,
            on_response=on_response)
    assert response_urls == [form_url, favicon_url, login_url, form_url]

    # When username and password are not defined, just load the initial page.
    response_urls.clear()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(form_url, on_response=on_response)
    assert response_urls == [form_url, favicon_url]

    # when the page doesn't have a form with username/password, don't submit it
    response_urls.clear()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(
            form_without_login_url, username=username, password=password,
            on_response=on_response)
    assert response_urls == [form_without_login_url, favicon_url]
def test_js_dialogs(httpd):
    """Browse the site4 alert, confirm and prompt pages without hanging.

    Before commit d2ed6b97a24 these would hang and eventually raise
    brozzler.browser.BrowsingTimeout, which would cause this test to fail.
    """
    chrome_exe = brozzler.suggest_default_chrome_exe()
    # shared prefix of the three pages; formatted once instead of per call
    site_url = 'http://localhost:%s/site4/' % httpd.server_port
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # same browser instance and same order as before
        for page_name in ('alert.html', 'confirm.html', 'prompt.html'):
            browser.browse_page(site_url + page_name)
def test_on_response(httpd):
    """The `on_response` callback sees the page, its svg and the favicon.

    Only the first three reported response urls are checked, in order.
    """
    response_urls = []

    def on_response(msg):
        # record the url of each response reported by the browser
        response_urls.append(msg['params']['response']['url'])

    port = httpd.server_port
    chrome_exe = brozzler.suggest_default_chrome_exe()
    url = 'http://localhost:%s/site3/page.html' % port
    expected_in_order = (
        url,
        'http://localhost:%s/site3/brozzler.svg' % port,
        'http://localhost:%s/favicon.ico' % port,
    )

    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        browser.browse_page(url, on_response=on_response)

    for position, expected_url in enumerate(expected_in_order):
        assert response_urls[position] == expected_url
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page.

    Parses `sys.argv`, opens the url in a browser, runs the page through a
    `brozzler.BrozzlerWorker`, and logs the resulting outlinks. Screenshots
    handed to the `on_screenshot` callback are written under /tmp.
    '''
    parser = argparse.ArgumentParser(
        prog=os.path.basename(sys.argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    # NOTE(review): the helper name is spelled "chome" here; confirm it
    # matches the definition before renaming.
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chome_exe(),
        help='executable to use to invoke chrome')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    parser.add_argument(
        '--enable-warcprox-features', dest='enable_warcprox_features',
        action='store_true',
        help=(
            'enable special features that assume the configured proxy '
            'is warcprox'))
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
        id=-1, seed=args.url, proxy=args.proxy,
        enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # Build a filename from the url, replacing every character that is
        # not an ascii letter or digit with '_', plus a timestamp.
        safe_chars = string.ascii_letters + string.digits
        sanitized_url = ''.join(
            c if c in safe_chars else '_' for c in args.url)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            sanitized_url, datetime.datetime.now())
        with open(filename, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
            browser, site, page, on_screenshot=on_screenshot)
        listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        # always shut the browser down once it has been started
        browser.stop()
def test_extract_outlinks(httpd):
    """Brozzling the site8 page returns exactly the expected outlink set."""
    port = httpd.server_port
    chrome_exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': 'http://localhost:%s/site8/' % port})

    expected_outlinks = {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % port,
    }

    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)

    assert outlinks == expected_outlinks
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    chrome_exe = brozzler.suggest_default_chrome_exe()
    # The socket is bound but never put into listening mode, so its port
    # refuses connections. It has to stay open for the whole loop; the `with`
    # block closes it afterwards instead of leaking it.
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        bound_port = sock.getsockname()[1]
        for not_listening_proxy in (
                '127.0.0.1:4', '127.0.0.1:%s' % bound_port):
            site = brozzler.Site(None, {'seed': 'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})
            worker = brozzler.BrozzlerWorker(
                frontier=None, proxy=not_listening_proxy)
            with brozzler.Browser(chrome_exe=chrome_exe) as browser:
                with pytest.raises(brozzler.ProxyError):
                    worker.brozzle_page(browser, site, page)
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page.

    Opens the url in a browser, runs the page through a
    `brozzler.BrozzlerWorker`, and logs the resulting outlinks. Screenshots
    handed to the `on_screenshot` callback are written under /tmp.

    `argv` is the full argument list with the program name at index 0; when
    it is omitted or empty, `sys.argv` is used.
    '''
    if not argv:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters',
        default=None, help=(
            'json blob of parameters to populate the javascript behavior '
            'template, e.g. {"parameter_username":"******",'
            '"parameter_password":"******"}'))
    parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    # the --skip-* switches are suppressed from the help output
    parser.add_argument(
        '--skip-extract-outlinks', dest='skip_extract_outlinks',
        action='store_true', help=argparse.SUPPRESS)
    parser.add_argument(
        '--skip-visit-hashtags', dest='skip_visit_hashtags',
        action='store_true', help=argparse.SUPPRESS)
    parser.add_argument(
        '--skip-youtube-dl', dest='skip_youtube_dl',
        action='store_true', help=argparse.SUPPRESS)
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)

    # an absent or empty --behavior-parameters value means no parameters
    behavior_parameters = (
        json.loads(args.behavior_parameters)
        if args.behavior_parameters else {})

    site_settings = {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password,
    }
    site = brozzler.Site(None, site_settings)
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None, proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        # Build a filename from the url, replacing every character that is
        # not an ascii letter or digit with '_', plus a timestamp.
        safe_chars = string.ascii_letters + string.digits
        sanitized_url = ''.join(
            c if c in safe_chars else '_' for c in args.url)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            sanitized_url, datetime.datetime.now())
        with open(filename, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        # start() is inside the try, so stop() runs even if startup fails
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
            browser, site, page, on_screenshot=on_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
def test_page_interstitial_exception(httpd):
    """Browsing the test server's /401 url raises `PageInterstitialShown`."""
    exe = brozzler.suggest_default_chrome_exe()
    unauthorized_url = 'http://localhost:%s/401' % httpd.server_port
    with brozzler.Browser(chrome_exe=exe) as browser, \
            pytest.raises(brozzler.PageInterstitialShown):
        browser.browse_page(unauthorized_url)
def test_aw_snap_hes_dead_jim():
    """Browsing 'chrome://crash' raises `brozzler.BrowsingException`."""
    exe = brozzler.suggest_default_chrome_exe()
    crash_url = 'chrome://crash'
    with brozzler.Browser(chrome_exe=exe) as browser, \
            pytest.raises(brozzler.BrowsingException):
        browser.browse_page(crash_url)