Exemple #1
0
def test_page_videos(httpd):
    """Brozzle the site6 page and check the two video records left on it."""
    # NOTE: the expectations below depend on the behavior of youtube-dl and
    # chromium; they may need adjusting when either of them is updated.
    port = httpd.server_port
    exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': 'http://localhost:%s/site6/' % port})

    with brozzler.Browser(chrome_exe=exe) as browser:
        worker.brozzle_page(browser, site, page)

    expected_youtube_dl_video = {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': 'http://localhost:%s/site6/small.mp4' % port,
    }
    expected_browser_video = {
        'blame': 'browser',
        # alternative expectation, kept for reference:
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': 'http://localhost:%s/site6/small.webm' % port,
    }

    assert page.videos
    assert len(page.videos) == 2
    assert page.videos[0] == expected_youtube_dl_video
    assert page.videos[1] == expected_browser_video
Exemple #2
0
def test_420(httpd):
    """Browsing the /420 url raises `brozzler.ReachedLimit`.

    The raised exception must carry `WARCPROX_META_420` as its
    `warcprox_meta` attribute.
    """
    target = 'http://localhost:%s/420' % httpd.server_port
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        with pytest.raises(brozzler.ReachedLimit) as raised:
            browser.browse_page(target)
        assert raised.value.warcprox_meta == WARCPROX_META_420
Exemple #3
0
def test_try_login(httpd):
    """Test try_login behavior.

    Each scenario browses one url in a fresh browser and compares the urls
    reported through the `on_response` callback, in order, with the
    expected sequence.
    """
    port = httpd.server_port
    chrome_exe = brozzler.suggest_default_chrome_exe()

    def browse_and_collect(url, **credentials):
        # Browse `url` in a new browser and return the response urls seen.
        seen = []

        def on_response(msg):
            seen.append(msg['params']['response']['url'])

        with brozzler.Browser(chrome_exe=chrome_exe) as browser:
            browser.browse_page(url, on_response=on_response, **credentials)
        return seen

    form_url = 'http://localhost:%s/site11/form1.html' % port
    favicon_url = 'http://localhost:%s/favicon.ico' % port
    login_url = 'http://localhost:%s/login-action' % port
    username = '******'
    password = '******'

    # When username and password are defined and initial page has login form,
    # detect login form, submit login, and then return to the initial page.
    response_urls = browse_and_collect(
        form_url, username=username, password=password)
    assert response_urls == [form_url, favicon_url, login_url, form_url]

    # When username and password are not defined, just load the initial page.
    response_urls = browse_and_collect(form_url)
    assert response_urls == [form_url, favicon_url]

    # when the page doesn't have a form with username/password, don't submit it
    form_without_login_url = (
        'http://localhost:%s/site11/form-no-login.html' % port)
    response_urls = browse_and_collect(
        form_without_login_url, username=username, password=password)
    assert response_urls == [form_without_login_url, favicon_url]
Exemple #4
0
def test_js_dialogs(httpd):
    """Browse the site4 alert, confirm and prompt pages in one browser.

    There are no explicit assertions: the test passes as long as none of
    the `browse_page` calls raises.
    """
    chrome_exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        # before commit d2ed6b97a24 these would hang and eventually raise
        # brozzler.browser.BrowsingTimeout, which would cause this test to fail
        for page_name in ('alert.html', 'confirm.html', 'prompt.html'):
            browser.browse_page('http://localhost:%s/site4/%s' %
                                (httpd.server_port, page_name))
def test_on_response(httpd):
    """Check the first three urls reported through `on_response`.

    Browses the site3 page and expects responses for the page itself, its
    svg image and the favicon, in that order.
    """
    port = httpd.server_port
    seen_urls = []

    def on_response(msg):
        seen_urls.append(msg['params']['response']['url'])

    url = 'http://localhost:%s/site3/page.html' % port
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        browser.browse_page(url, on_response=on_response)

    expected_urls = (
        'http://localhost:%s/site3/page.html' % port,
        'http://localhost:%s/site3/brozzler.svg' % port,
        'http://localhost:%s/favicon.ico' % port,
    )
    # Index explicitly so a too-short list fails the same way as before.
    for position, expected in enumerate(expected_urls):
        assert seen_urls[position] == expected
Exemple #6
0
def brozzle_page():
    '''
    Command line utility entry point for brozzling a single page.

    Parses the command line from `sys.argv`, starts a browser using the
    site's proxy, brozzles the requested url and logs the sorted outlinks.
    Screenshots handed to the callback are written to a png file under
    /tmp. A `brozzler.ReachedLimit` is logged as an error instead of being
    propagated, and the browser is stopped in every case.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true',
            help=(
                'enable special features that assume the configured proxy '
                'is warcprox'))
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # Build the file name from the url, replacing every character that
        # is not an ascii letter or digit with '_', plus a timestamp.
        allowed = string.ascii_letters + string.digits
        safe_url = ''.join(c if c in allowed else '_' for c in args.url)
        path = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                safe_url, datetime.datetime.now())
        with open(path, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', path)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Exemple #7
0
def test_extract_outlinks(httpd):
    """Brozzle the site8 page and check the exact set of outlinks returned."""
    port = httpd.server_port
    exe = brozzler.suggest_default_chrome_exe()
    worker = brozzler.BrozzlerWorker(None)
    site = brozzler.Site(None, {})
    page = brozzler.Page(None, {'url': 'http://localhost:%s/site8/' % port})

    with brozzler.Browser(chrome_exe=exe) as browser:
        found = worker.brozzle_page(browser, site, page)

    expected = {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % port,
    }
    assert found == expected
Exemple #8
0
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    chrome_exe = brozzler.suggest_default_chrome_exe()

    # The socket is bound but listen() is never called on it. The `with`
    # block keeps it bound for the whole loop and closes it afterwards, so
    # the test no longer leaks the socket.
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in ('127.0.0.1:4',
                                    '127.0.0.1:%s' % sock.getsockname()[1]):
            site = brozzler.Site(None, {'seed': 'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            worker = brozzler.BrozzlerWorker(frontier=None,
                                             proxy=not_listening_proxy)

            with brozzler.Browser(chrome_exe=chrome_exe) as browser:
                with pytest.raises(brozzler.ProxyError):
                    worker.brozzle_page(browser, site, page)
Exemple #9
0
def brozzle_page(argv=None):
    '''
    Command line utility entry point for brozzling a single page.

    Opens the url in a browser, brozzles it and logs the sorted outlinks.
    `argv` is the full argument vector including the program name; when it
    is empty or None, `sys.argv` is used. Screenshots handed to the
    callback are written to a png file under /tmp. A
    `brozzler.ReachedLimit` is logged as an error instead of being
    propagated, and the browser is stopped in every case.
    '''
    if not argv:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=('json blob of parameters to populate the javascript behavior '
              'template, e.g. {"parameter_username":"******",'
              '"parameter_password":"******"}'))
    parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    # Flags that are accepted but suppressed from the help output.
    hidden_flags = (
        ('--skip-extract-outlinks', 'skip_extract_outlinks'),
        ('--skip-visit-hashtags', 'skip_visit_hashtags'),
        ('--skip-youtube-dl', 'skip_youtube_dl'),
    )
    for option, dest in hidden_flags:
        parser.add_argument(
            option, dest=dest, action='store_true', help=argparse.SUPPRESS)
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)

    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    else:
        behavior_parameters = {}

    site_settings = {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password
    }
    site = brozzler.Site(None, site_settings)
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None,
        proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        # Build the file name from the url, replacing every character that
        # is not an ascii letter or digit with '_', plus a timestamp.
        allowed = string.ascii_letters + string.digits
        safe_url = ''.join(c if c in allowed else '_' for c in args.url)
        path = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            safe_url, datetime.datetime.now())
        with open(path, 'wb') as out:
            out.write(screenshot_png)
        logging.info('wrote screenshot to %s', path)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
            browser, site, page,
            on_screenshot=on_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        browser.stop()
Exemple #10
0
def test_page_interstitial_exception(httpd):
    """Browsing the /401 url raises `brozzler.PageInterstitialShown`."""
    target = 'http://localhost:%s/401' % httpd.server_port
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        with pytest.raises(brozzler.PageInterstitialShown):
            browser.browse_page(target)
Exemple #11
0
def test_aw_snap_hes_dead_jim():
    """Browsing chrome://crash raises `brozzler.BrowsingException`."""
    crash_url = 'chrome://crash'
    exe = brozzler.suggest_default_chrome_exe()
    with brozzler.Browser(chrome_exe=exe) as browser:
        with pytest.raises(brozzler.BrowsingException):
            browser.browse_page(crash_url)