Ejemplo n.º 1
0
def test_page_videos(httpd):
    '''
    Brozzle the site6 fixture page and check what ends up in `page.videos`.

    Two entries are expected, in this order: the mp4 blamed on youtube-dl and
    the webm blamed on the browser.
    '''
    # NOTE: the expected values below depend on the behavior of youtube-dl
    # and chromium; the test could fail and need to be adjusted when either
    # of them is updated
    site6_url = 'http://localhost:%s/site6/' % httpd.server_port

    expected_mp4 = {
        'blame': 'youtube-dl',
        'response_code': 200,
        'content-length': 383631,
        'content-type': 'video/mp4',
        'url': site6_url + 'small.mp4',
    }
    expected_webm = {
        'blame': 'browser',
        # alternative expectation carried over from the original, disabled:
        # 'response_code': 206,
        # 'content-range': 'bytes 0-229454/229455',
        'response_code': 200,
        'content-length': 229455,
        'content-type': 'video/webm',
        'url': site6_url + 'small.webm',
    }

    page = brozzler.Page(None, {'url': site6_url})
    site = brozzler.Site(None, {})
    worker = brozzler.BrozzlerWorker(None)
    chrome_exe = brozzler.suggest_default_chrome_exe()

    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        worker.brozzle_page(browser, site, page)

    assert page.videos
    assert len(page.videos) == 2
    first_video, second_video = page.videos[0], page.videos[1]
    assert first_video == expected_mp4
    assert second_video == expected_webm
Ejemplo n.º 2
0
def brozzle_page():
    '''
    Entry point of the command line utility that brozzles a single page.

    Reads its arguments from `sys.argv`, opens the url in a browser, running
    some javascript behaviors, and logs the outlinks. A callback is handed to
    the worker that saves the screenshot it receives as a png under /tmp.
    '''
    parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            description='brozzle-page - brozzle a single page',
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    # NOTE(review): `suggest_default_chome_exe` looks like a misspelling of
    # "chrome" -- confirm the helper's real name before renaming it
    parser.add_argument(
            '-e', '--chrome-exe', dest='chrome_exe',
            default=suggest_default_chome_exe(),
            help='executable to use to invoke chrome')
    parser.add_argument(
            '--proxy', dest='proxy', default=None, help='http proxy')
    warcprox_features_help = (
            'enable special features that assume the configured proxy '
            'is warcprox')
    parser.add_argument(
            '--enable-warcprox-features', dest='enable_warcprox_features',
            action='store_true', help=warcprox_features_help)
    _add_common_options(parser)

    args = parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    site = brozzler.Site(
            id=-1, seed=args.url, proxy=args.proxy,
            enable_warcprox_features=args.enable_warcprox_features)
    page = brozzler.Page(url=args.url, site_id=site.id)
    worker = brozzler.BrozzlerWorker(frontier=None)

    def on_screenshot(screenshot_png):
        # every character of the url that is not an ascii letter or a digit
        # is replaced with '_' to build the file name
        allowed_chars = string.ascii_letters + string.digits
        sanitized_url = ''.join(
                ch if ch in allowed_chars else '_' for ch in args.url)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
                sanitized_url, datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    browser.start(proxy=site.proxy)
    try:
        outlinks = worker.brozzle_page(
                browser, site, page, on_screenshot=on_screenshot)
        outlinks_listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', outlinks_listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        # the browser is stopped whether or not brozzling succeeded
        browser.stop()
Ejemplo n.º 3
0
def test_limit_failures():
    '''
    Check what `BrozzlerWorker.brozzle_site` does with a page that keeps
    failing.

    `worker.brozzle_page` is mocked to always raise. The first two calls to
    `brozzle_site` are expected to only bump `page.failed_attempts`; after the
    third, `page.brozzle_count` is expected to be 1 and the site status
    'FINISHED'.
    '''
    page = mock.Mock(failed_attempts=None, brozzle_count=0)
    site = mock.Mock(
            status='ACTIVE', active_brozzling_time=0,
            starts_and_stops=[{'start':datetime.datetime.utcnow()}])

    # stand-in for a rethinkdb query: run() always gives back an empty list
    rethink_query = mock.Mock()
    rethink_query.run = mock.Mock(return_value=[])

    # rr.table().between().limit() resolves to that same stand-in query
    between_result = mock.Mock(limit=mock.Mock(return_value=rethink_query))
    table_result = mock.Mock(between=mock.Mock(return_value=between_result))

    rr = mock.Mock()
    rr.servers = [mock.Mock()]
    rr.db_list = mock.Mock(return_value=rethink_query)
    rr.table_list = mock.Mock(return_value=rethink_query)
    rr.table = mock.Mock(return_value=table_result)
    assert rr.table().between().limit().run() == []

    frontier = brozzler.RethinkDbFrontier(rr)
    frontier.enforce_time_limit = mock.Mock()
    frontier.honor_stop_request = mock.Mock()
    frontier.claim_page = mock.Mock(return_value=page)
    frontier._maybe_finish_job = mock.Mock()

    browser = mock.Mock()

    worker = brozzler.BrozzlerWorker(frontier)
    # every attempt to brozzle the page fails
    worker.brozzle_page = mock.Mock(side_effect=Exception)

    # starting state
    assert page.failed_attempts is None
    assert page.brozzle_count == 0
    assert site.status == 'ACTIVE'

    # first and second failure: only the failure counter moves
    for expected_failures in (1, 2):
        worker.brozzle_site(browser, site)
        assert page.failed_attempts == expected_failures
        assert page.brozzle_count == 0
        assert site.status == 'ACTIVE'

    # third failure
    worker.brozzle_site(browser, site)
    assert page.failed_attempts == 3
    assert page.brozzle_count == 1
    assert site.status == 'FINISHED'
Ejemplo n.º 4
0
def test_extract_outlinks(httpd):
    '''
    Brozzle the site8 fixture page and check the set of outlinks returned by
    `BrozzlerWorker.brozzle_page`.
    '''
    port = httpd.server_port
    expected_outlinks = {
        'http://example.com/offsite',
        'http://localhost:%s/site8/baz/zuh' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#1' % port,
        'http://localhost:%s/site8/fdjisapofdjisap#2' % port,
    }

    page = brozzler.Page(None, {'url': 'http://localhost:%s/site8/' % port})
    site = brozzler.Site(None, {})
    worker = brozzler.BrozzlerWorker(None)
    chrome_exe = brozzler.suggest_default_chrome_exe()

    with brozzler.Browser(chrome_exe=chrome_exe) as browser:
        outlinks = worker.brozzle_page(browser, site, page)

    assert outlinks == expected_outlinks
Ejemplo n.º 5
0
def test_proxy_down():
    '''
    Test all fetching scenarios raise `brozzler.ProxyError` when proxy is down.

    This test needs to cover every possible fetch through the proxy other than
    fetches from the browser. For that, see test_brozzling.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    # The socket is bound but never put into listening mode, which provides
    # the "port bound but not accepting connections" case. It has to stay
    # open for the whole loop; the `with` block closes it afterwards so the
    # test does not leak the socket.
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in (
                '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
            worker = brozzler.BrozzlerWorker(frontier=None,
                                             proxy=not_listening_proxy)
            site = brozzler.Site(None, {
                'id': str(uuid.uuid4()),
                'seed': 'http://example.com/'
            })
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            # robots.txt fetch
            with pytest.raises(brozzler.ProxyError):
                brozzler.is_permitted_by_robots(site,
                                                'http://example.com/',
                                                proxy=not_listening_proxy)

            # youtube-dl fetch
            with tempfile.TemporaryDirectory(prefix='brzl-ydl-') as tempdir:
                ydl = worker._youtube_dl(tempdir, site)
                with pytest.raises(brozzler.ProxyError):
                    worker._try_youtube_dl(ydl, site, page)

            # raw fetch
            with pytest.raises(brozzler.ProxyError):
                worker._fetch_url(site, page)

            # WARCPROX_WRITE_RECORD
            with pytest.raises(brozzler.ProxyError):
                worker._warcprox_write_record(
                    warcprox_address=not_listening_proxy,
                    url='test://proxy_down/warcprox_write_record',
                    warc_type='metadata',
                    content_type='text/plain',
                    payload=b'''payload doesn't matter here''')
Ejemplo n.º 6
0
def test_proxy_down():
    '''
    Test that browsing raises `brozzler.ProxyError` when proxy is down.

    See also `test_proxy_down` in test_units.py.

    Tests two different kinds of connection error:
    - nothing listening on the port (nobody listens on port 4 :))
    - port bound but not accepting connections
    '''
    # does not depend on the proxy under test, so look it up once
    chrome_exe = brozzler.suggest_default_chrome_exe()

    # The socket is bound but never put into listening mode, which provides
    # the "port bound but not accepting connections" case. It has to stay
    # open for the whole loop; the `with` block closes it afterwards so the
    # test does not leak the socket.
    with socket.socket() as sock:
        sock.bind(('127.0.0.1', 0))
        for not_listening_proxy in (
                '127.0.0.1:4', '127.0.0.1:%s' % sock.getsockname()[1]):
            site = brozzler.Site(None, {'seed': 'http://example.com/'})
            page = brozzler.Page(None, {'url': 'http://example.com/'})

            worker = brozzler.BrozzlerWorker(frontier=None,
                                             proxy=not_listening_proxy)

            with brozzler.Browser(chrome_exe=chrome_exe) as browser:
                with pytest.raises(brozzler.ProxyError):
                    worker.brozzle_page(browser, site, page)
Ejemplo n.º 7
0
def test_choose_warcprox():
    '''
    Exercise `BrozzlerWorker._choose_warcprox` against the 'services' and
    'sites' tables of the 'ignoreme' database on localhost.

    With no services registered the result is expected to be None. With five
    warcprox instances registered and active sites assigned to some of them,
    host3:8000 is expected first, then host4:8000 once host3:8000 has an
    active site too.
    '''
    rr = doublethink.Rethinker('localhost', db='ignoreme')
    svcreg = doublethink.ServiceRegistry(rr)
    frontier = brozzler.RethinkDbFrontier(rr)

    def register_warcprox(host, port, load):
        # add one warcprox entry to the services table
        rr.table('services').insert({
            'role': 'warcprox',
            'first_heartbeat': doublethink.utcnow(),
            'last_heartbeat': doublethink.utcnow(),
            'host': host, 'port': port,
            'load': load, 'ttl': 60}).run()

    def add_active_site(proxy):
        # add one ACTIVE site that uses the given proxy
        rr.table('sites').insert({
            'proxy': proxy, 'status': 'ACTIVE',
            'last_disclaimed': doublethink.utcnow()}).run()

    # wait for the tables and then their indexes, to avoid this error:
    # https://travis-ci.org/internetarchive/brozzler/jobs/330991786#L1021
    for table_name in ('sites', 'services'):
        rr.table(table_name).wait().run()
    for table_name in ('sites', 'services'):
        rr.table(table_name).index_wait().run()

    # clean slate
    for table_name in ('sites', 'services'):
        rr.table(table_name).delete().run()
    worker = brozzler.BrozzlerWorker(frontier, svcreg)
    assert worker._choose_warcprox() is None

    warcprox_instances = (
        ('host1', 8000, 0),
        ('host2', 8000, 0),
        ('host2', 8001, 0),
        ('host3', 8000, 0),
        ('host4', 8000, 1),
    )
    for host, port, load in warcprox_instances:
        register_warcprox(host, port, load)

    site_proxies = ('host1:8000', 'host1:8000', 'host2:8000', 'host2:8001')
    for proxy in site_proxies:
        add_active_site(proxy)

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host3'
    assert chosen['port'] == 8000

    add_active_site('host3:8000')

    chosen = worker._choose_warcprox()
    assert chosen['host'] == 'host4'
    assert chosen['port'] == 8000

    # clean up
    for table_name in ('sites', 'services'):
        rr.table(table_name).delete().run()
Ejemplo n.º 8
0
def brozzle_page(argv=None):
    '''
    Entry point of the command line utility that brozzles a single page.

    Opens the url in a browser, running some javascript behaviors, and logs
    the outlinks. A callback is handed to the worker that saves the
    screenshot it receives as a png under /tmp.

    `argv` is the full argument vector, program name first; `sys.argv` is
    used when it is None or empty.
    '''
    if not argv:
        argv = sys.argv

    parser = argparse.ArgumentParser(
        prog=os.path.basename(argv[0]),
        description='brozzle-page - brozzle a single page',
        formatter_class=BetterArgumentDefaultsHelpFormatter)
    parser.add_argument('url', metavar='URL', help='page url')
    parser.add_argument(
        '-e', '--chrome-exe', dest='chrome_exe',
        default=suggest_default_chrome_exe(),
        help='executable to use to invoke chrome')
    behavior_parameters_help = (
        'json blob of parameters to populate the javascript behavior '
        'template, e.g. {"parameter_username":"******",'
        '"parameter_password":"******"}')
    parser.add_argument(
        '--behavior-parameters', dest='behavior_parameters', default=None,
        help=behavior_parameters_help)
    parser.add_argument(
        '--username', dest='username', default=None,
        help='use this username to try to log in if a login form is found')
    parser.add_argument(
        '--password', dest='password', default=None,
        help='use this password to try to log in if a login form is found')
    parser.add_argument(
        '--proxy', dest='proxy', default=None, help='http proxy')
    # boolean switches that are left out of the help output
    hidden_switches = (
        ('--skip-extract-outlinks', 'skip_extract_outlinks'),
        ('--skip-visit-hashtags', 'skip_visit_hashtags'),
        ('--skip-youtube-dl', 'skip_youtube_dl'),
    )
    for option, dest in hidden_switches:
        parser.add_argument(
            option, dest=dest, action='store_true', help=argparse.SUPPRESS)
    add_common_options(parser, argv)

    args = parser.parse_args(args=argv[1:])
    configure_logging(args)

    if args.behavior_parameters:
        behavior_parameters = json.loads(args.behavior_parameters)
    else:
        behavior_parameters = {}

    site_settings = {
        'id': -1,
        'seed': args.url,
        'behavior_parameters': behavior_parameters,
        'username': args.username,
        'password': args.password
    }
    site = brozzler.Site(None, site_settings)
    page = brozzler.Page(None, {'url': args.url, 'site_id': site.id})
    worker = brozzler.BrozzlerWorker(
        frontier=None, proxy=args.proxy,
        skip_extract_outlinks=args.skip_extract_outlinks,
        skip_visit_hashtags=args.skip_visit_hashtags,
        skip_youtube_dl=args.skip_youtube_dl)

    def on_screenshot(screenshot_png):
        # every character of the url that is not an ascii letter or a digit
        # is replaced with '_' to build the file name
        allowed_chars = string.ascii_letters + string.digits
        sanitized_url = ''.join(
            ch if ch in allowed_chars else '_' for ch in args.url)
        filename = '/tmp/{}-{:%Y%m%d%H%M%S}.png'.format(
            sanitized_url, datetime.datetime.now())
        with open(filename, 'wb') as f:
            f.write(screenshot_png)
        logging.info('wrote screenshot to %s', filename)

    browser = brozzler.Browser(chrome_exe=args.chrome_exe)
    try:
        browser.start(proxy=args.proxy)
        outlinks = worker.brozzle_page(
            browser, site, page, on_screenshot=on_screenshot,
            enable_youtube_dl=not args.skip_youtube_dl)
        outlinks_listing = '\n\t'.join(sorted(outlinks))
        logging.info('outlinks: \n\t%s', outlinks_listing)
    except brozzler.ReachedLimit as e:
        logging.error('reached limit %s', e)
    finally:
        # stop() runs even when start() or brozzling raised
        browser.stop()